dom_parser_implementation.cpp source code [Velox/build/_deps/simdjson-src/src/arm64/dom_parser_implementation.cpp]

1	#include "simdjson/arm64/begin.h"
2
3	//
4	// Stage 1
5	//
6	namespace simdjson {
7	namespace SIMDJSON_IMPLEMENTATION {
8	namespace {
9
10	using namespace simd;
11
12	struct json_character_block {
13	static simdjson_inline json_character_block classify(const simd::simd8x64<uint8_t>& in);
14
15	simdjson_inline uint64_t whitespace() const noexcept { return _whitespace; }
16	simdjson_inline uint64_t op() const noexcept { return _op; }
17	simdjson_inline uint64_t scalar() const noexcept { return ~(op() \| whitespace()); }
18
19	uint64_t _whitespace;
20	uint64_t _op;
21	};
22
23	simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
24	// Functional programming causes trouble with Visual Studio.
25	// Keeping this version in comments since it is much nicer:
26	// auto v = in.map<uint8_t>([&](simd8<uint8_t> chunk) {
27	// auto nib_lo = chunk & 0xf;
28	// auto nib_hi = chunk.shr<4>();
29	// auto shuf_lo = nib_lo.lookup_16<uint8_t>(16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0);
30	// auto shuf_hi = nib_hi.lookup_16<uint8_t>(8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0);
31	// return shuf_lo & shuf_hi;
32	// });
33	const simd8<uint8_t> table1(`16`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `8`, `12`, `1`, `2`, `9`, `0`, `0`);
34	const simd8<uint8_t> table2(`8`, `0`, `18`, `4`, `0`, `1`, `0`, `1`, `0`, `0`, `0`, `3`, `2`, `1`, `0`, `0`);
35
36	simd8x64<uint8_t> v(
37	(in.chunks[`0`] & `0xf`).lookup_16(lookup_table: table1) & (in.chunks[`0`].shr<`4`>()).lookup_16(lookup_table: table2),
38	(in.chunks[`1`] & `0xf`).lookup_16(lookup_table: table1) & (in.chunks[`1`].shr<`4`>()).lookup_16(lookup_table: table2),
39	(in.chunks[`2`] & `0xf`).lookup_16(lookup_table: table1) & (in.chunks[`2`].shr<`4`>()).lookup_16(lookup_table: table2),
40	(in.chunks[`3`] & `0xf`).lookup_16(lookup_table: table1) & (in.chunks[`3`].shr<`4`>()).lookup_16(lookup_table: table2)
41	);
42
43
44	// We compute whitespace and op separately. If the code later only use one or the
45	// other, given the fact that all functions are aggressively inlined, we can
46	// hope that useless computations will be omitted. This is namely case when
47	// minifying (we only need whitespace). However* if we only need spaces,*
48	// it is likely that we will still compute 'v' above with two lookup_16: one
49	// could do it a bit cheaper. This is in contrast with the x64 implementations
50	// where we can, efficiently, do the white space and structural matching
51	// separately. One reason for this difference is that on ARM NEON, the table
52	// lookups either zero or leave unchanged the characters exceeding 0xF whereas
53	// on x64, the equivalent instruction (pshufb) automatically applies a mask,
54	// ignoring the 4 most significant bits. Thus the x64 implementation is
55	// optimized differently. This being said, if you use this code strictly
56	// just for minification (or just to identify the structural characters),
57	// there is a small untaken optimization opportunity here. We deliberately
58	// do not pick it up.
59
60	uint64_t op = simd8x64<bool>(
61	v.chunks[`0`].any_bits_set(bits: `0x7`),
62	v.chunks[`1`].any_bits_set(bits: `0x7`),
63	v.chunks[`2`].any_bits_set(bits: `0x7`),
64	v.chunks[`3`].any_bits_set(bits: `0x7`)
65	).to_bitmask();
66
67	uint64_t whitespace = simd8x64<bool>(
68	v.chunks[`0`].any_bits_set(bits: `0x18`),
69	v.chunks[`1`].any_bits_set(bits: `0x18`),
70	v.chunks[`2`].any_bits_set(bits: `0x18`),
71	v.chunks[`3`].any_bits_set(bits: `0x18`)
72	).to_bitmask();
73
74	return { ._whitespace: whitespace, ._op: op };
75	}
76
77	simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
78	simd8<uint8_t> bits = input.reduce_or();
79	return bits.max_val() < `0x80u`;
80	}
81
82	simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
83	simd8<bool> is_second_byte = prev1 >= uint8_t(`0xc0u`);
84	simd8<bool> is_third_byte = prev2 >= uint8_t(`0xe0u`);
85	simd8<bool> is_fourth_byte = prev3 >= uint8_t(`0xf0u`);
86	// Use ^ instead of \| for is__byte, because ^ is commutative, and the caller is using ^ as well.*
87	// This will work fine because we only have to report errors for cases with 0-1 lead bytes.
88	// Multiple lead bytes implies 2 overlapping multibyte characters, and if that happens, there is
89	// guaranteed to be at least one* lead byte that is part of only 1 other multibyte character.*
90	// The error will be detected there.
91	return is_second_byte ^ is_third_byte ^ is_fourth_byte;
92	}
93
94	simdjson_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
95	simd8<bool> is_third_byte = prev2 >= uint8_t(`0xe0u`);
96	simd8<bool> is_fourth_byte = prev3 >= uint8_t(`0xf0u`);
97	return is_third_byte ^ is_fourth_byte;
98	}
99
100	} // unnamed namespace
101	} // namespace SIMDJSON_IMPLEMENTATION
102	} // namespace simdjson
103
104	#include "generic/stage1/utf8_lookup4_algorithm.h"
105	#include "generic/stage1/json_structural_indexer.h"
106	#include "generic/stage1/utf8_validator.h"
107
108	//
109	// Stage 2
110	//
111
112	#include "generic/stage2/stringparsing.h"
113	#include "generic/stage2/tape_builder.h"
114
115	//
116	// Implementation-specific overrides
117	//
118	namespace simdjson {
119	namespace SIMDJSON_IMPLEMENTATION {
120	namespace {
121	namespace stage1 {
122
123	simdjson_inline uint64_t json_string_scanner::find_escaped(uint64_t backslash) {
124	// On ARM, we don't short-circuit this if there are no backslashes, because the branch gives us no
125	// benefit and therefore makes things worse.
126	// if (!backslash) { uint64_t escaped = prev_escaped; prev_escaped = 0; return escaped; }
127	return find_escaped_branchless(backslash);
128	}
129
130	} // namespace stage1
131	} // unnamed namespace
132
133	simdjson_warn_unused error_code implementation::minify(const uint8_t buf, size_t len, uint8_t dst, size_t &dst_len) const noexcept {
134	return arm64::stage1::json_minifier::minify<`64`>(buf, len, dst, dst_len);
135	}
136
137	simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t _buf, size_t _len, stage1_mode streaming) noexcept* {
138	this->buf = _buf;
139	this->len = _len;
140	return arm64::stage1::json_structural_indexer::index<`64`>(buf, len, parser&: *this, partial: streaming);
141	}
142
143	simdjson_warn_unused bool implementation::validate_utf8(const char buf, size_t len) const* noexcept {
144	return arm64::stage1::generic_validate_utf8(input: buf,length: len);
145	}
146
147	simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
148	return stage2::tape_builder::parse_document<false>(dom_parser&: *this, doc&: _doc);
149	}
150
151	simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
152	return stage2::tape_builder::parse_document<true>(dom_parser&: *this, doc&: _doc);
153	}
154
155	simdjson_warn_unused uint8_t dom_parser_implementation::parse_string(const* uint8_t src, uint8_t dst, bool allow_replacement) const noexcept {
156	return arm64::stringparsing::parse_string(src, dst, allow_replacement);
157	}
158
159	simdjson_warn_unused uint8_t dom_parser_implementation::parse_wobbly_string(const* uint8_t src, uint8_t dst) const noexcept {
160	return arm64::stringparsing::parse_wobbly_string(src, dst);
161	}
162
163	simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t _buf, size_t _len, dom::document &_doc) noexcept* {
164	auto error = stage1(_buf, _len, streaming: stage1_mode::regular);
165	if (error) { return error; }
166	return stage2(_doc);
167	}
168
169	} // namespace SIMDJSON_IMPLEMENTATION
170	} // namespace simdjson
171
172	#include "simdjson/arm64/end.h"
173

Browse the source code of Velox/build/_deps/simdjson-src/src/arm64/dom_parser_implementation.cpp