1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc. All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9// * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11// * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15// * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: jrm@google.com (Jim Meehan)
32
#include <google/protobuf/stubs/common.h>

#include <string.h>

#include <google/protobuf/stubs/stringpiece.h>
36
37namespace google {
38namespace protobuf {
39namespace internal {
40
// These four-byte entries compactly encode how many bytes 0..255 to delete
// in making a string replacement, how many bytes to add 0..255, and the offset
// 0..64k-1 of the replacement string in remap_string.
//
// NOTE(review): nothing in this file reads these fields; the only remap table
// defined here holds a single all-zero entry.
struct RemapEntry {
  uint8_t delete_bytes;   // Count of source bytes to delete, 0..255
  uint8_t add_bytes;      // Count of replacement bytes to add, 0..255
  uint16_t bytes_offset;  // Offset of the replacement bytes in remap_string
};
49
// Exit type codes for state tables. All but the first get stuffed into
// one-byte (uint8_t) state-table entries. The first is only generated by
// executable code.
// To distinguish them from next-state entries, the codes stored in tables must
// be contiguous and lie in kExitIllegalStructure..kExitNone: the scanner treats
// every table entry >= kExitIllegalStructure as an exit code and every smaller
// entry as a next-state number.
typedef enum {
  kExitDstSpaceFull = 239,
  kExitIllegalStructure,   // 240
  kExitOK,                 // 241
  kExitReject,             // 242
  kExitReplace1,           // 243
  kExitReplace2,           // 244
  kExitReplace3,           // 245
  kExitReplace21,          // 246
  kExitReplace31,          // 247
  kExitReplace32,          // 248
  kExitReplaceOffset1,     // 249
  kExitReplaceOffset2,     // 250
  kExitReplace1S0,         // 251
  kExitSpecial,            // 252
  kExitDoAgain,            // 253
  kExitRejectAlt,          // 254
  kExitNone                // 255
} ExitReason;
73
74
// This struct represents one entire state table. The three initialized byte
// areas are state_table, remap_base, and remap_string. state0 and state0_size
// give the byte offset and length within state_table of the initial state --
// table lookups are expected to start and end in this state, but for
// truncated UTF-8 strings, may end in a different state. These allow a quick
// test for that condition. entry_shift is 8 for tables subscripted by a full
// byte value and 6 for space-optimized tables subscripted by only six
// significant bits in UTF-8 continuation bytes.
typedef struct {
  const uint32_t state0;         // Byte offset of the initial state
  const uint32_t state0_size;    // Length in bytes of the initial state
  const uint32_t total_size;     // Total size of state_table
  const int max_expand;          // Not read in this file; presumably the
                                 // worst-case growth of a replacement
  const int entry_shift;         // Shift that turns a state number into an
                                 // offset from the initial state (8 or 6)
  const int bytes_per_entry;     // Not read in this file; presumably the size
                                 // of one state_table entry
  const uint32_t losub;          // Subtracted from each 4-byte word in the
                                 // scanner's fast range check
  const uint32_t hiadd;          // Added to each 4-byte word in the scanner's
                                 // fast range check
  const uint8_t* state_table;    // Next-state / exit-code entries
  const RemapEntry* remap_base;  // Replacement descriptors
  const uint8_t* remap_string;   // Replacement bytes referenced by remap_base
  const uint8_t* fast_state;     // Per-byte table: a zero entry means the byte
                                 // may be skipped by the scanner's fast path
} UTF8StateMachineObj;

// A scan-only user of the state machine shares the same layout.
typedef UTF8StateMachineObj UTF8ScanObj;
99
// Three-character shorthands for the exit codes so that state-table rows line
// up in columns. Only X__ and RJ_ are used by the table in this file. Every
// shorthand is #undef'd again once the tables have been defined.
#define X__ (kExitIllegalStructure)
#define RJ_ (kExitReject)
#define S1_ (kExitReplace1)
#define S2_ (kExitReplace2)
#define S3_ (kExitReplace3)
#define S21 (kExitReplace21)
#define S31 (kExitReplace31)
#define S32 (kExitReplace32)
#define T1_ (kExitReplaceOffset1)
#define T2_ (kExitReplaceOffset2)
#define S11 (kExitReplace1S0)
#define SP_ (kExitSpecial)
#define D__ (kExitDoAgain)
#define RJA (kExitRejectAlt)
114
// Parameters of the utf8acceptnonsurrogates state machine.
// Entire table has 9 state blocks of 256 entries each (9 * 256 = 2304).
// Initial state is state[0], at offset 0 in the table.
static const unsigned int utf8acceptnonsurrogates_STATE0 = 0;
// The initial state spans one 256-entry block.
static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256;
static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
// NOTE(review): presumably zero because this table never replaces bytes.
static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
// States are subscripted by a full byte value.
static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
// Subtracted from every byte of a 4-byte word in the scanner's fast range
// check, so that bytes below 0x20 trip the check as well as bytes >= 0x80.
static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
// Added to every 4-byte word in the same check; zero here.
static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
124
// State table: 9 states of 256 entries each, subscripted by the input byte.
// An entry below kExitIllegalStructure is the number of the next state; X__
// marks a structurally illegal byte and RJ_ a rejected (surrogate) sequence.
// Each group of four rows below covers 64 byte values: 0x00..0x3f, 0x40..0x7f,
// 0x80..0xbf and 0xc0..0xff.
static const uint8_t utf8acceptnonsurrogates[] = {
// state[0] 0x000000 Byte 1
// ASCII stays in state 0; lead bytes pick the state for the following byte.
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// 0xc0 and 0xc1 are illegal; 0xe0, 0xed, 0xf0 and 0xf4 get dedicated states
// that restrict the second byte; 0xf5..0xff are illegal.
X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3,
 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[1] 0x000080 Byte 2 of 2
// Final continuation byte: 0x80..0xbf returns to state 0.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[2] 0x000000 Byte 2 of 3
// After lead byte 0xe0: only 0xa0..0xbf is accepted.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[3] 0x001000 Byte 2 of 3
// Any continuation byte 0x80..0xbf is accepted; one more byte follows.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[4] 0x000000 Byte 2 of 4
// After lead byte 0xf0: only 0x90..0xbf is accepted.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[5] 0x040000 Byte 2 of 4
// After lead bytes 0xf1..0xf3: any continuation byte is accepted.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[6] 0x100000 Byte 2 of 4
// After lead byte 0xf4: only 0x80..0x8f is accepted.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[7] 0x00d000 Byte 2 of 3
// After lead byte 0xed: 0x80..0x9f continues normally, 0xa0..0xbf (the
// surrogate range) goes to state 8.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

// state[8] 0x00d800 Byte 3 of 3
// Third byte of a surrogate encoding: a continuation byte is rejected (RJ_),
// anything else is structurally illegal.
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,

RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,

X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
};
315
316// Remap base[0] = (del, add, string_offset)
317static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
318{.delete_bytes: 0, .add_bytes: 0, .bytes_offset: 0} };
319
// Remap string[0]: one zero byte, the placeholder replacement text.
static const unsigned char utf8acceptnonsurrogates_remap_string[] = { 0 };
323
// Fast-path byte classification, subscripted by the input byte.
// 0 for 0x00..0x7f: the byte can be skipped without consulting the state table.
// 1 for 0x80..0xff: the byte-at-a-time state-table scan must take over.
static const unsigned char utf8acceptnonsurrogates_fast[256] = {
// 0x00..0x3f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

// 0x40..0x7f
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

// 0x80..0xbf
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

// 0xc0..0xff
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
345
346static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
347 .state0: utf8acceptnonsurrogates_STATE0,
348 .state0_size: utf8acceptnonsurrogates_STATE0_SIZE,
349 .total_size: utf8acceptnonsurrogates_TOTAL_SIZE,
350 .max_expand: utf8acceptnonsurrogates_MAX_EXPAND_X4,
351 .entry_shift: utf8acceptnonsurrogates_SHIFT,
352 .bytes_per_entry: utf8acceptnonsurrogates_BYTES,
353 .losub: utf8acceptnonsurrogates_LOSUB,
354 .hiadd: utf8acceptnonsurrogates_HIADD,
355 .state_table: utf8acceptnonsurrogates,
356 .remap_base: utf8acceptnonsurrogates_remap_base,
357 .remap_string: utf8acceptnonsurrogates_remap_string,
358 .fast_state: utf8acceptnonsurrogates_fast
359};
360
361
// The table shorthands are only meant for the table definitions; remove them
// so the short names do not leak into the code that follows.
#undef X__
#undef RJ_
#undef S1_
#undef S2_
#undef S3_
#undef S21
#undef S31
#undef S32
#undef T1_
#undef T2_
#undef S11
#undef SP_
#undef D__
#undef RJA
376
377// Return true if current Tbl pointer is within state0 range
378// Note that unsigned compare checks both ends of range simultaneously
379static inline bool InStateZero(const UTF8ScanObj* st, const uint8_t* Tbl) {
380 const uint8_t* Tbl0 = &st->state_table[st->state0];
381 return (static_cast<uint32_t>(Tbl - Tbl0) < st->state0_size);
382}
383
384namespace {
385
386// Scan a UTF-8 string based on state table.
387// Always scan complete UTF-8 characters
388// Set number of bytes scanned. Return reason for exiting
389int UTF8GenericScan(const UTF8ScanObj* st,
390 const char * str,
391 int str_length,
392 int* bytes_consumed) {
393 *bytes_consumed = 0;
394 if (str_length == 0) return kExitOK;
395
396 int eshift = st->entry_shift;
397 const uint8_t* isrc = reinterpret_cast<const uint8_t*>(str);
398 const uint8_t* src = isrc;
399 const uint8_t* srclimit = isrc + str_length;
400 const uint8_t* srclimit8 = str_length < 7 ? isrc : srclimit - 7;
401 const uint8_t* Tbl_0 = &st->state_table[st->state0];
402
403 DoAgain:
404 // Do state-table scan
405 int e = 0;
406 uint8_t c;
407 const uint8_t* Tbl2 = &st->fast_state[0];
408 const uint32_t losub = st->losub;
409 const uint32_t hiadd = st->hiadd;
410 // Check initial few bytes one at a time until 8-byte aligned
411 //----------------------------
412 while ((((uintptr_t)src & 0x07) != 0) &&
413 (src < srclimit) &&
414 Tbl2[src[0]] == 0) {
415 src++;
416 }
417 if (((uintptr_t)src & 0x07) == 0) {
418 // Do fast for groups of 8 identity bytes.
419 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
420 // including slowing slightly on cr/lf/ht
421 //----------------------------
422 while (src < srclimit8) {
423 uint32_t s0123 = (reinterpret_cast<const uint32_t *>(src))[0];
424 uint32_t s4567 = (reinterpret_cast<const uint32_t *>(src))[1];
425 src += 8;
426 // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
427 uint32_t temp = (s0123 - losub) | (s0123 + hiadd) |
428 (s4567 - losub) | (s4567 + hiadd);
429 if ((temp & 0x80808080) != 0) {
430 // We typically end up here on cr/lf/ht; src was incremented
431 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
432 (Tbl2[src[-6]] | Tbl2[src[-5]]);
433 if (e0123 != 0) {
434 src -= 8;
435 break;
436 } // Exit on Non-interchange
437 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
438 (Tbl2[src[-2]] | Tbl2[src[-1]]);
439 if (e0123 != 0) {
440 src -= 4;
441 break;
442 } // Exit on Non-interchange
443 // Else OK, go around again
444 }
445 }
446 }
447 //----------------------------
448
449 // Byte-at-a-time scan
450 //----------------------------
451 const uint8_t* Tbl = Tbl_0;
452 while (src < srclimit) {
453 c = *src;
454 e = Tbl[c];
455 src++;
456 if (e >= kExitIllegalStructure) {break;}
457 Tbl = &Tbl_0[e << eshift];
458 }
459 //----------------------------
460
461 // Exit possibilities:
462 // Some exit code, !state0, back up over last char
463 // Some exit code, state0, back up one byte exactly
464 // source consumed, !state0, back up over partial char
465 // source consumed, state0, exit OK
466 // For illegal byte in state0, avoid backup up over PREVIOUS char
467 // For truncated last char, back up to beginning of it
468
469 if (e >= kExitIllegalStructure) {
470 // Back up over exactly one byte of rejected/illegal UTF-8 character
471 src--;
472 // Back up more if needed
473 if (!InStateZero(st, Tbl)) {
474 do {
475 src--;
476 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
477 }
478 } else if (!InStateZero(st, Tbl)) {
479 // Back up over truncated UTF-8 character
480 e = kExitIllegalStructure;
481 do {
482 src--;
483 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
484 } else {
485 // Normal termination, source fully consumed
486 e = kExitOK;
487 }
488
489 if (e == kExitDoAgain) {
490 // Loop back up to the fast scan
491 goto DoAgain;
492 }
493
494 *bytes_consumed = src - isrc;
495 return e;
496}
497
498int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
499 const char * str,
500 int str_length,
501 int* bytes_consumed) {
502 *bytes_consumed = 0;
503 if (str_length == 0) return kExitOK;
504
505 const uint8_t* isrc = reinterpret_cast<const uint8_t*>(str);
506 const uint8_t* src = isrc;
507 const uint8_t* srclimit = isrc + str_length;
508 const uint8_t* srclimit8 = str_length < 7 ? isrc : srclimit - 7;
509 int n;
510 int rest_consumed;
511 int exit_reason;
512 do {
513 // Check initial few bytes one at a time until 8-byte aligned
514 while ((((uintptr_t)src & 0x07) != 0) &&
515 (src < srclimit) && (src[0] < 0x80)) {
516 src++;
517 }
518 if (((uintptr_t)src & 0x07) == 0) {
519 while ((src < srclimit8) &&
520 (((reinterpret_cast<const uint32_t*>(src)[0] |
521 reinterpret_cast<const uint32_t*>(src)[1]) &
522 0x80808080) == 0)) {
523 src += 8;
524 }
525 }
526 while ((src < srclimit) && (src[0] < 0x80)) {
527 src++;
528 }
529 // Run state table on the rest
530 n = src - isrc;
531 exit_reason = UTF8GenericScan(st, str: str + n, str_length: str_length - n, bytes_consumed: &rest_consumed);
532 src += rest_consumed;
533 } while ( exit_reason == kExitDoAgain );
534
535 *bytes_consumed = src - isrc;
536 return exit_reason;
537}
538
// Hack: On some compilers the static tables are initialized at startup.
// We can't use them until they are initialized.  However, some Protocol
// Buffer parsing happens at static init time and may try to validate
// UTF-8 strings.  Since UTF-8 validation is only used for debugging
// anyway, we simply always return success if initialization hasn't
// occurred yet.

// False until init_detector's constructor has run; the validation entry
// points check this flag and report success while it is still false.
bool module_initialized_ = false;

// Its constructor flips module_initialized_ to true.
struct InitDetector {
  InitDetector() {
    module_initialized_ = true;
  }
};
// The one instance; constructing it is what sets the flag.
InitDetector init_detector;
554
555} // namespace
556
557bool IsStructurallyValidUTF8(const char* buf, int len) {
558 if (!module_initialized_) return true;
559
560 int bytes_consumed = 0;
561 UTF8GenericScanFastAscii(st: &utf8acceptnonsurrogates_obj,
562 str: buf, str_length: len, bytes_consumed: &bytes_consumed);
563 return (bytes_consumed == len);
564}
565
566int UTF8SpnStructurallyValid(StringPiece str) {
567 if (!module_initialized_) return str.size();
568
569 int bytes_consumed = 0;
570 UTF8GenericScanFastAscii(st: &utf8acceptnonsurrogates_obj,
571 str: str.data(), str_length: str.size(), bytes_consumed: &bytes_consumed);
572 return bytes_consumed;
573}
574
575// Coerce UTF-8 byte string in src_str to be
576// a structurally-valid equal-length string by selectively
577// overwriting illegal bytes with replace_char (typically blank).
578// replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
579// src_str is read-only. If any overwriting is needed, a modified byte string
580// is created in idst, length isrclen.
581//
582// Returns pointer to output buffer, isrc if no changes were made,
583// or idst if some bytes were changed.
584//
585// Fast case: all is structurally valid and no byte copying is done.
586//
587char* UTF8CoerceToStructurallyValid(StringPiece src_str, char* idst,
588 const char replace_char) {
589 const char* isrc = src_str.data();
590 const int len = src_str.length();
591 int n = UTF8SpnStructurallyValid(str: src_str);
592 if (n == len) { // Normal case -- all is cool, return
593 return const_cast<char*>(isrc);
594 } else { // Unusual case -- copy w/o bad bytes
595 const char* src = isrc;
596 const char* srclimit = isrc + len;
597 char* dst = idst;
598 memmove(dest: dst, src: src, n: n); // Copy initial good chunk
599 src += n;
600 dst += n;
601 while (src < srclimit) { // src points to bogus byte or is off the end
602 dst[0] = replace_char; // replace one bad byte
603 src++;
604 dst++;
605 StringPiece str2(src, srclimit - src);
606 n = UTF8SpnStructurallyValid(str: str2); // scan the remainder
607 memmove(dest: dst, src: src, n: n); // copy next good chunk
608 src += n;
609 dst += n;
610 }
611 }
612 return idst;
613}
614
615} // namespace internal
616} // namespace protobuf
617} // namespace google
618