Line data Source code
1 : // Copyright 2005-2008 Google Inc. All Rights Reserved.
2 : // Author: jrm@google.com (Jim Meehan)
3 :
4 : #include <google/protobuf/stubs/common.h>
5 :
6 : namespace google {
7 : namespace protobuf {
8 : namespace internal {
9 :
10 : // These four-byte entries compactly encode how many bytes 0..255 to delete
11 : // in making a string replacement, how many bytes to add 0..255, and the offset
12 : // 0..64k-1 of the replacement string in remap_string.
13 : struct RemapEntry {
14 : uint8 delete_bytes; // number of bytes to delete, 0..255
15 : uint8 add_bytes; // number of bytes to add, 0..255
16 : uint16 bytes_offset; // offset of the replacement string within remap_string, 0..64k-1
17 : };
18 :
19 : // Exit type codes for state tables. All but the first (kExitDstSpaceFull) get stuffed into
20 : // one-byte table entries. The first is only generated by executable code. NOTE(review): this comment used to say "signed" entries, but the table in this file is declared uint8 -- confirm.
21 : // To distinguish from next-state entries, these must be contiguous and
22 : // all <= kExitNone. The scanners in this file treat any table entry >= kExitIllegalStructure as an exit code.
23 : typedef enum {
24 : kExitDstSpaceFull = 239,
25 : kExitIllegalStructure, // 240; aliased as X__ in the table; also returned for a truncated final character
26 : kExitOK, // 241; returned for empty input or when the whole input was consumed
27 : kExitReject, // 242; aliased as RJ_ in the table
28 : kExitReplace1, // 243
29 : kExitReplace2, // 244
30 : kExitReplace3, // 245
31 : kExitReplace21, // 246
32 : kExitReplace31, // 247
33 : kExitReplace32, // 248
34 : kExitReplaceOffset1, // 249
35 : kExitReplaceOffset2, // 250
36 : kExitReplace1S0, // 251
37 : kExitSpecial, // 252
38 : kExitDoAgain, // 253; makes UTF8GenericScan loop back to its fast scan
39 : kExitRejectAlt, // 254
40 : kExitNone // 255
41 : } ExitReason;
42 :
43 :
44 : // This struct represents one entire state table. The three initialized byte
45 : // areas are state_table, remap_base, and remap_string. state0 and state0_size
46 : // give the byte offset and length within state_table of the initial state --
47 : // table lookups are expected to start and end in this state, but for
48 : // truncated UTF-8 strings, may end in a different state. These allow a quick
49 : // test for that condition. entry_shift is 8 for tables subscripted by a full
50 : // byte value and 6 for space-optimized tables subscripted by only six
51 : // significant bits in UTF-8 continuation bytes.
52 : typedef struct {
53 : const uint32 state0; // byte offset of the initial state within state_table
54 : const uint32 state0_size; // length of the initial state's block within state_table
55 : const uint32 total_size; // 2304 (9 blocks * 256) for the table in this file; not read by the scanners here
56 : const int max_expand; // not read by the scanners in this file
57 : const int entry_shift; // a next-state number is shifted left by this to index state_table (8 or 6, see above)
58 : const int bytes_per_entry; // 1 for the table in this file; not read by the scanners here
59 : const uint32 losub; // subtracted from each 4-byte word in UTF8GenericScan's fast range check
60 : const uint32 hiadd; // added to each 4-byte word in UTF8GenericScan's fast range check
61 : const uint8* state_table; // next-state numbers and exit codes, one entry per (state, input byte)
62 : const RemapEntry* remap_base; // replacement descriptors; not read by the scanners in this file
63 : const uint8* remap_string; // replacement bytes; not read by the scanners in this file
64 : const uint8* fast_state; // per-byte table; a nonzero entry stops UTF8GenericScan's fast pre-scan
65 : } UTF8StateMachineObj;
66 : // Name under which the scanning functions in this file take a state table.
67 : typedef UTF8StateMachineObj UTF8ScanObj;
68 : // Three-character aliases for the exit codes, used as entries in the state table below (only X__ and RJ_ actually appear in it).
69 : #define X__ (kExitIllegalStructure)
70 : #define RJ_ (kExitReject)
71 : #define S1_ (kExitReplace1)
72 : #define S2_ (kExitReplace2)
73 : #define S3_ (kExitReplace3)
74 : #define S21 (kExitReplace21)
75 : #define S31 (kExitReplace31)
76 : #define S32 (kExitReplace32)
77 : #define T1_ (kExitReplaceOffset1)
78 : #define T2_ (kExitReplaceOffset2)
79 : #define S11 (kExitReplace1S0)
80 : #define SP_ (kExitSpecial)
81 : #define D__ (kExitDoAgain)
82 : #define RJA (kExitRejectAlt)
83 :
84 : // Entire table has 9 state blocks of 256 entries each. These constants initialize utf8acceptnonsurrogates_obj, in this order.
85 : static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0], the initial state, starts at offset 0
86 : static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // one 256-entry block; state[1] starts at this offset
87 : static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304; // 9 blocks * 256 entries
88 : static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0; // stored as max_expand; not read by the scanners in this file
89 : static const unsigned int utf8acceptnonsurrogates_SHIFT = 8; // entry_shift: next state << 8 selects a 256-entry block
90 : static const unsigned int utf8acceptnonsurrogates_BYTES = 1; // bytes_per_entry
91 : static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020; // losub: 0x20 per byte, the lower bound of the fast range check in UTF8GenericScan
92 : static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000; // hiadd: 0, so the upper bound of that check stays at 0x80
93 :
94 : static const uint8 utf8acceptnonsurrogates[] = { // each row below covers 16 consecutive input byte values; each state block covers 0x00-0xff
95 : // state[0] 0x000000 Byte 1 -- initial state: 0x00-0x7f stay in state 0; 0xc2-0xdf -> 1; 0xe0 -> 2; 0xe1-0xec, 0xee, 0xef -> 3; 0xed -> 7; 0xf0 -> 4; 0xf1-0xf3 -> 5; 0xf4 -> 6; 0x80-0xc1 and 0xf5-0xff exit with X__
96 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100 :
101 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
103 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 :
106 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
107 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
108 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
109 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
110 :
111 : X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
112 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 : 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3,
114 : 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
115 :
116 : // state[1] 0x000080 Byte 2 of 2 -- 0x80-0xbf return to state 0, every other byte exits with X__; also entered from states 2, 3 and 7 for the last byte of a longer sequence
117 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
118 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
119 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
120 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
121 :
122 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
123 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
124 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
125 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
126 :
127 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
128 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
129 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131 :
132 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
133 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
134 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
135 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
136 :
137 : // state[2] 0x000000 Byte 2 of 3 -- entered after lead byte 0xe0: only 0xa0-0xbf continue (to state 1); 0x80-0x9f and everything else exit with X__
138 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
139 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
140 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
141 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
142 :
143 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
144 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
145 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
146 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
147 :
148 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
149 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
150 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
151 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
152 :
153 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
154 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
155 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
156 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
157 :
158 : // state[3] 0x001000 Byte 2 of 3 -- entered after lead bytes 0xe1-0xec, 0xee, 0xef and from states 4, 5 and 6: 0x80-0xbf continue to state 1; everything else exits with X__
159 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
160 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
161 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
162 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
163 :
164 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
165 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
166 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
167 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
168 :
169 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
170 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
171 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
172 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 :
174 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
175 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
176 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
177 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
178 :
179 : // state[4] 0x000000 Byte 2 of 4 -- entered after lead byte 0xf0: only 0x90-0xbf continue (to state 3); 0x80-0x8f and everything else exit with X__
180 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
181 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
182 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
183 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
184 :
185 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
186 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
187 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
188 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
189 :
190 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
191 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
192 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
193 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
194 :
195 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
196 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
197 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
198 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
199 :
200 : // state[5] 0x040000 Byte 2 of 4 -- entered after lead bytes 0xf1-0xf3: 0x80-0xbf continue to state 3; everything else exits with X__
201 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
202 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
203 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
204 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
205 :
206 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
207 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
208 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
209 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
210 :
211 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
212 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
213 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
214 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215 :
216 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
217 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
218 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
219 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
220 :
221 : // state[6] 0x100000 Byte 2 of 4 -- entered after lead byte 0xf4: only 0x80-0x8f continue (to state 3); 0x90-0xbf and everything else exit with X__
222 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
223 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
224 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
225 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
226 :
227 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
228 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
229 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
230 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
231 :
232 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
233 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
234 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
235 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
236 :
237 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
238 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
239 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
240 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
241 :
242 : // state[7] 0x00d000 Byte 2 of 3 -- entered after lead byte 0xed: 0x80-0x9f continue to state 1; 0xa0-0xbf go to state 8; everything else exits with X__
243 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
244 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
245 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
246 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
247 :
248 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
249 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
250 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
251 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
252 :
253 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
254 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
255 : 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
256 : 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
257 :
258 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
259 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
260 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
261 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
262 :
263 : // state[8] 0x00d800 Byte 3 of 3 -- entered after 0xed followed by 0xa0-0xbf (the surrogate range U+D800..U+DFFF): 0x80-0xbf exit with RJ_ (kExitReject); everything else exits with X__
264 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
265 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
266 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
267 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
268 :
269 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
270 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
271 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
272 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
273 :
274 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
275 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
276 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
277 : RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
278 :
279 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
280 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
281 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
282 : X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
283 : };
284 :
285 : // Remap base[0] = (del, add, string_offset). A single all-zero entry; the scanners in this file never read remap_base.
286 : static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
287 : {0, 0, 0} };
288 :
289 : // Remap string[0]. A single zero byte; the scanners in this file never read remap_string.
290 : static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
291 : 0 };
292 : // Per-byte table used as fast_state by UTF8GenericScan's fast pre-scan: 0 for 0x00-0x7f, 1 for 0x80-0xff. A nonzero entry ends the fast scan.
293 : static const unsigned char utf8acceptnonsurrogates_fast[256] = {
294 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
295 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
296 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
297 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
298 :
299 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
300 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
301 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
302 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
303 :
304 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
305 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
306 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
307 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
308 :
309 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
310 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
311 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
312 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
313 : };
314 : // Scanner descriptor that IsStructurallyValidUTF8 passes to UTF8GenericScanFastAscii. The initializers are positional, so their order must match the field order of UTF8StateMachineObj.
315 : static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
316 : utf8acceptnonsurrogates_STATE0, // state0
317 : utf8acceptnonsurrogates_STATE0_SIZE, // state0_size
318 : utf8acceptnonsurrogates_TOTAL_SIZE, // total_size
319 : utf8acceptnonsurrogates_MAX_EXPAND_X4, // max_expand
320 : utf8acceptnonsurrogates_SHIFT, // entry_shift
321 : utf8acceptnonsurrogates_BYTES, // bytes_per_entry
322 : utf8acceptnonsurrogates_LOSUB, // losub
323 : utf8acceptnonsurrogates_HIADD, // hiadd
324 : utf8acceptnonsurrogates, // state_table
325 : utf8acceptnonsurrogates_remap_base, // remap_base
326 : utf8acceptnonsurrogates_remap_string, // remap_string
327 : utf8acceptnonsurrogates_fast // fast_state
328 : };
329 :
330 : // Undefine the three-character exit-code aliases now that the state table has been written out.
331 : #undef X__
332 : #undef RJ_
333 : #undef S1_
334 : #undef S2_
335 : #undef S3_
336 : #undef S21
337 : #undef S31
338 : #undef S32
339 : #undef T1_
340 : #undef T2_
341 : #undef S11
342 : #undef SP_
343 : #undef D__
344 : #undef RJA
345 :
346 : // Return true if the current Tbl pointer lies within the initial-state block of st's table, i.e. within state0_size entries starting at &state_table[state0].
347 : // Note that the unsigned compare checks both ends of the range simultaneously: a Tbl below the block start gives a negative difference, which converts to a large unsigned value.
348 0 : static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
349 0 : const uint8* Tbl0 = &st->state_table[st->state0]; // start of the initial-state block
350 0 : return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
351 : }
352 :
353 : // Scan the UTF-8 string str[0 .. str_length) based on the state table described by st.
354 : // Always scan complete UTF-8 characters: on an exit code or a truncated final character the position is backed up to the start of that character.
355 : // Set *bytes_consumed to the number of bytes scanned. Return the reason for exiting: kExitOK if the whole input was consumed (or is empty), kExitIllegalStructure for a truncated final character, otherwise the exit code read from the table.
356 138 : int UTF8GenericScan(const UTF8ScanObj* st,
357 : const char * str,
358 : int str_length,
359 : int* bytes_consumed) {
360 138 : *bytes_consumed = 0;
361 138 : if (str_length == 0) return kExitOK;
362 :
363 0 : int eshift = st->entry_shift;
364 0 : const uint8* isrc = reinterpret_cast<const uint8*>(str);
365 0 : const uint8* src = isrc;
366 0 : const uint8* srclimit = isrc + str_length;
367 0 : const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7; // for inputs shorter than 7 bytes, srclimit - 7 would point before the buffer (undefined behavior); isrc disables the 8-byte loop just the same
368 0 : const uint8* Tbl_0 = &st->state_table[st->state0];
369 :
370 : DoAgain:
371 : // Do state-table scan
372 0 : int e = 0;
373 : uint8 c;
374 0 : const uint8* Tbl2 = &st->fast_state[0];
375 0 : const uint32 losub = st->losub;
376 0 : const uint32 hiadd = st->hiadd;
377 : // Check initial few bytes one at a time until 8-byte aligned
378 : //----------------------------
379 0 : while ((((uintptr_t)src & 0x07) != 0) &&
380 0 : (src < srclimit) &&
381 0 : Tbl2[src[0]] == 0) {
382 0 : src++;
383 : }
384 0 : if (((uintptr_t)src & 0x07) == 0) {
385 : // Do fast for groups of 8 identity bytes.
386 : // This covers a lot of 7-bit ASCII ~8x faster than the 1-byte loop,
387 : // including slowing slightly on cr/lf/ht
388 : //----------------------------
389 0 : while (src < srclimit8) {
390 0 : uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0]; // src is 8-byte aligned here (checked above)
391 0 : uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
392 0 : src += 8;
393 : // This is a fast range check for all bytes in [losub..0x80-hiadd)
394 0 : uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
395 0 : (s4567 - losub) | (s4567 + hiadd);
396 0 : if ((temp & 0x80808080) != 0) {
397 : // We typically end up here on cr/lf/ht; src was incremented
398 0 : int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
399 0 : (Tbl2[src[-6]] | Tbl2[src[-5]]);
400 0 : if (e0123 != 0) {
401 0 : src -= 8;
402 0 : break;
403 : } // Exit on Non-interchange
404 0 : e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
405 0 : (Tbl2[src[-2]] | Tbl2[src[-1]]);
406 0 : if (e0123 != 0) {
407 0 : src -= 4;
408 0 : break;
409 : } // Exit on Non-interchange
410 : // Else OK, go around again
411 : }
412 : }
413 : }
414 : //----------------------------
415 :
416 : // Byte-at-a-time scan
417 : //----------------------------
418 0 : const uint8* Tbl = Tbl_0;
419 0 : while (src < srclimit) {
420 0 : c = *src;
421 0 : e = Tbl[c];
422 0 : src++;
423 0 : if (e >= kExitIllegalStructure) {break;} // exit code: Tbl still points at the state the code was read in
424 0 : Tbl = &Tbl_0[e << eshift]; // otherwise e is the next state number
425 : }
426 : //----------------------------
427 :
428 :
429 : // Exit possibilities:
430 : // Some exit code, !state0, back up over last char
431 : // Some exit code, state0, back up one byte exactly
432 : // source consumed, !state0, back up over partial char
433 : // source consumed, state0, exit OK
434 : // For illegal byte in state0, avoid backing up over PREVIOUS char
435 : // For truncated last char, back up to beginning of it
436 :
437 0 : if (e >= kExitIllegalStructure) {
438 : // Back up over exactly one byte of rejected/illegal UTF-8 character
439 0 : src--;
440 : // Back up more if needed
441 0 : if (!InStateZero(st, Tbl)) {
442 0 : do {
443 0 : src--;
444 0 : } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); // skip back over continuation bytes (10xxxxxx)
445 : }
446 0 : } else if (!InStateZero(st, Tbl)) {
447 : // Back up over truncated UTF-8 character
448 0 : e = kExitIllegalStructure;
449 0 : do {
450 0 : src--;
451 0 : } while ((src > isrc) && ((src[0] & 0xc0) == 0x80)); // skip back over continuation bytes (10xxxxxx)
452 : } else {
453 : // Normal termination, source fully consumed
454 0 : e = kExitOK;
455 : }
456 :
457 0 : if (e == kExitDoAgain) {
458 : // Loop back up to the fast scan
459 0 : goto DoAgain;
460 : }
461 :
462 0 : *bytes_consumed = src - isrc;
463 0 : return e;
464 : }
465 : // Like UTF8GenericScan, but first skips leading 7-bit ASCII (bytes < 0x80) -- eight bytes at a time once src is 8-byte aligned -- and runs the state table only on the rest. Sets *bytes_consumed to the total number of bytes scanned and returns UTF8GenericScan's exit reason (kExitOK for empty input).
466 138 : int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
467 : const char * str,
468 : int str_length,
469 : int* bytes_consumed) {
470 138 : *bytes_consumed = 0;
471 138 : if (str_length == 0) return kExitOK;
472 :
473 138 : const uint8* isrc = reinterpret_cast<const uint8*>(str);
474 138 : const uint8* src = isrc;
475 138 : const uint8* srclimit = isrc + str_length;
476 138 : const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7; // for inputs shorter than 7 bytes, srclimit - 7 would point before the buffer (undefined behavior); isrc disables the 8-byte loop just the same
477 : int n;
478 : int rest_consumed;
479 : int exit_reason;
480 0 : do {
481 : // Check initial few bytes one at a time until 8-byte aligned
482 138 : while ((((uintptr_t)src & 0x07) != 0) &&
483 138 : (src < srclimit) && (src[0] < 0x80)) {
484 0 : src++;
485 : }
486 138 : if (((uintptr_t)src & 0x07) == 0) {
487 615 : while ((src < srclimit8) && // at least 8 bytes remain
488 318 : (((reinterpret_cast<const uint32*>(src)[0] |
489 318 : reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) { // no byte of the 8 has its high bit set
490 159 : src += 8;
491 : }
492 : }
493 1104 : while ((src < srclimit) && (src[0] < 0x80)) {
494 483 : src++;
495 : }
496 : // Run state table on the rest
497 138 : n = src - isrc;
498 138 : exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
499 138 : src += rest_consumed;
500 138 : } while ( exit_reason == kExitDoAgain ); // UTF8GenericScan restarts internally on kExitDoAgain and does not return it, so this body runs once
501 :
502 138 : *bytes_consumed = src - isrc;
503 138 : return exit_reason;
504 : }
505 :
506 : // Hack: On some compilers the static tables are initialized at startup.
507 : // We can't use them until they are initialized. However, some Protocol
508 : // Buffer parsing happens at static init time and may try to validate
509 : // UTF-8 strings. Since UTF-8 validation is only used for debugging
510 : // anyway, we simply always return success if initialization hasn't
511 : // occurred yet.
512 : namespace {
513 : // False until init_detector below has been constructed; IsStructurallyValidUTF8 returns true unconditionally while it is false.
514 : bool module_initialized_ = false;
515 : // Its constructor does nothing except set module_initialized_.
516 : struct InitDetector {
517 3 : InitDetector() {
518 3 : module_initialized_ = true;
519 3 : }
520 : };
521 3 : InitDetector init_detector; // defined after the tables, so any dynamic initialization they need in this translation unit runs before this constructor
522 :
523 : } // namespace
524 : // Returns true if scanning the len bytes at buf with utf8acceptnonsurrogates_obj consumes all of them. Also returns true, without scanning, when called before init_detector has been constructed.
525 738 : bool IsStructurallyValidUTF8(const char* buf, int len) {
526 738 : if (!module_initialized_) return true;
527 : // The scanner's exit code is ignored: the result depends only on whether every byte was consumed.
528 138 : int bytes_consumed = 0;
529 : UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
530 138 : buf, len, &bytes_consumed);
531 138 : return (bytes_consumed == len);
532 : }
533 :
534 : } // namespace internal
535 : } // namespace protobuf
536 : } // namespace google
|