Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 2007-2012, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : * file name: bmpset.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2007jan29
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #include "unicode/utypes.h"
20 : #include "unicode/uniset.h"
21 : #include "unicode/utf8.h"
22 : #include "unicode/utf16.h"
23 : #include "cmemory.h"
24 : #include "bmpset.h"
25 : #include "uassert.h"
26 :
27 : U_NAMESPACE_BEGIN
28 :
29 0 : BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
30 0 : list(parentList), listLength(parentListLength) {
31 0 : uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
32 0 : uprv_memset(table7FF, 0, sizeof(table7FF));
33 0 : uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
34 :
35 : /*
36 : * Set the list indexes for binary searches for
37 : * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38 : * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39 : * looked up in the bit tables.
40 : * The last pair of indexes is for finding supplementary code points.
41 : */
42 0 : list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
43 : int32_t i;
44 0 : for(i=1; i<=0x10; ++i) {
45 0 : list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
46 : }
47 0 : list4kStarts[0x11]=listLength-1;
48 :
49 0 : initBits();
50 0 : overrideIllegal();
51 0 : }
52 :
53 0 : BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
54 0 : list(newParentList), listLength(newParentListLength) {
55 0 : uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
56 0 : uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
57 0 : uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
58 0 : uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
59 0 : }
60 :
61 0 : BMPSet::~BMPSet() {
62 0 : }
63 :
/*
 * Set bits in a bit rectangle in "vertical" bit organization.
 *
 * table[trail] holds one bit per lead value: the value (lead<<6)|trail is
 * represented by bit number `lead` of table[trail]. This function sets the
 * bits for every value in [start, limit).
 *
 * @param table 64 words, indexed by the low 6 bits of a value.
 * @param start First value to set (inclusive).
 * @param limit End of the range (exclusive).
 *
 * Requires start<limit<=0x800.
 */
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    U_ASSERT(start<limit);
    U_ASSERT(limit<=0x800);

    int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
    int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.

    // Set one bit indicating an all-one block.
    uint32_t bits=(uint32_t)1<<lead;
    if((start+1)==limit) {  // Single-character shortcut.
        table[trail]|=bits;
        return;
    }

    int32_t limitLead=limit>>6;
    int32_t limitTrail=limit&0x3f;

    if(lead==limitLead) {
        // Partial vertical bit column.
        while(trail<limitTrail) {
            table[trail++]|=bits;
        }
    } else {
        // Partial vertical bit column,
        // followed by a bit rectangle,
        // followed by another partial vertical bit column.
        if(trail>0) {
            do {
                table[trail++]|=bits;
            } while(trail<64);
            ++lead;
        }
        if(lead<limitLead) {
            // lead and limitLead can be 31 here. Build the masks in unsigned
            // arithmetic: with a signed 1, (1<<31)-1 would overflow int.
            bits=~(((uint32_t)1<<lead)-1);
            if(limitLead<0x20) {
                bits&=((uint32_t)1<<limitLead)-1;
            }
            for(trail=0; trail<64; ++trail) {
                table[trail]|=bits;
            }
        }
        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
        // Shifting by 32 would be undefined, so the shift count is clamped to 31;
        // the resulting bits value is not used in that case because
        // trail<limitTrail is already false.
        bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
        for(trail=0; trail<limitTrail; ++trail) {
            table[trail]|=bits;
        }
    }
}
118 :
119 0 : void BMPSet::initBits() {
120 : UChar32 start, limit;
121 0 : int32_t listIndex=0;
122 :
123 : // Set asciiBytes[].
124 0 : do {
125 0 : start=list[listIndex++];
126 0 : if(listIndex<listLength) {
127 0 : limit=list[listIndex++];
128 : } else {
129 0 : limit=0x110000;
130 : }
131 0 : if(start>=0x80) {
132 0 : break;
133 : }
134 0 : do {
135 0 : asciiBytes[start++]=1;
136 0 : } while(start<limit && start<0x80);
137 0 : } while(limit<=0x80);
138 :
139 : // Set table7FF[].
140 0 : while(start<0x800) {
141 0 : set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
142 0 : if(limit>0x800) {
143 0 : start=0x800;
144 0 : break;
145 : }
146 :
147 0 : start=list[listIndex++];
148 0 : if(listIndex<listLength) {
149 0 : limit=list[listIndex++];
150 : } else {
151 0 : limit=0x110000;
152 : }
153 : }
154 :
155 : // Set bmpBlockBits[].
156 0 : int32_t minStart=0x800;
157 0 : while(start<0x10000) {
158 0 : if(limit>0x10000) {
159 0 : limit=0x10000;
160 : }
161 :
162 0 : if(start<minStart) {
163 0 : start=minStart;
164 : }
165 0 : if(start<limit) { // Else: Another range entirely in a known mixed-value block.
166 0 : if(start&0x3f) {
167 : // Mixed-value block of 64 code points.
168 0 : start>>=6;
169 0 : bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
170 0 : start=(start+1)<<6; // Round up to the next block boundary.
171 0 : minStart=start; // Ignore further ranges in this block.
172 : }
173 0 : if(start<limit) {
174 0 : if(start<(limit&~0x3f)) {
175 : // Multiple all-ones blocks of 64 code points each.
176 0 : set32x64Bits(bmpBlockBits, start>>6, limit>>6);
177 : }
178 :
179 0 : if(limit&0x3f) {
180 : // Mixed-value block of 64 code points.
181 0 : limit>>=6;
182 0 : bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
183 0 : limit=(limit+1)<<6; // Round up to the next block boundary.
184 0 : minStart=limit; // Ignore further ranges in this block.
185 : }
186 : }
187 : }
188 :
189 0 : if(limit==0x10000) {
190 0 : break;
191 : }
192 :
193 0 : start=list[listIndex++];
194 0 : if(listIndex<listLength) {
195 0 : limit=list[listIndex++];
196 : } else {
197 0 : limit=0x110000;
198 : }
199 : }
200 0 : }
201 :
202 : /*
203 : * Override some bits and bytes to the result of contains(FFFD)
204 : * for faster validity checking at runtime.
205 : * No need to set 0 values where they were reset to 0 in the constructor
206 : * and not modified by initBits().
207 : * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
208 : * Need to set 0 values for surrogates D800..DFFF.
209 : */
210 0 : void BMPSet::overrideIllegal() {
211 : uint32_t bits, mask;
212 : int32_t i;
213 :
214 0 : if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
215 : // contains(FFFD)==TRUE
216 0 : for(i=0x80; i<0xc0; ++i) {
217 0 : asciiBytes[i]=1;
218 : }
219 :
220 0 : bits=3; // Lead bytes 0xC0 and 0xC1.
221 0 : for(i=0; i<64; ++i) {
222 0 : table7FF[i]|=bits;
223 : }
224 :
225 0 : bits=1; // Lead byte 0xE0.
226 0 : for(i=0; i<32; ++i) { // First half of 4k block.
227 0 : bmpBlockBits[i]|=bits;
228 : }
229 :
230 0 : mask=~(0x10001<<0xd); // Lead byte 0xED.
231 0 : bits=1<<0xd;
232 0 : for(i=32; i<64; ++i) { // Second half of 4k block.
233 0 : bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
234 : }
235 : } else {
236 : // contains(FFFD)==FALSE
237 0 : mask=~(0x10001<<0xd); // Lead byte 0xED.
238 0 : for(i=32; i<64; ++i) { // Second half of 4k block.
239 0 : bmpBlockBits[i]&=mask;
240 : }
241 : }
242 0 : }
243 :
244 0 : int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
245 : /* Examples:
246 : findCodePoint(c)
247 : set list[] c=0 1 3 4 7 8
248 : === ============== ===========
249 : [] [110000] 0 0 0 0 0 0
250 : [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
251 : [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
252 : [:Any:] [0, 110000] 1 1 1 1 1 1
253 : */
254 :
255 : // Return the smallest i such that c < list[i]. Assume
256 : // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
257 0 : if (c < list[lo])
258 0 : return lo;
259 : // High runner test. c is often after the last range, so an
260 : // initial check for this condition pays off.
261 0 : if (lo >= hi || c >= list[hi-1])
262 0 : return hi;
263 : // invariant: c >= list[lo]
264 : // invariant: c < list[hi]
265 : for (;;) {
266 0 : int32_t i = (lo + hi) >> 1;
267 0 : if (i == lo) {
268 0 : break; // Found!
269 0 : } else if (c < list[i]) {
270 0 : hi = i;
271 : } else {
272 0 : lo = i;
273 : }
274 0 : }
275 0 : return hi;
276 : }
277 :
278 : UBool
279 0 : BMPSet::contains(UChar32 c) const {
280 0 : if((uint32_t)c<=0x7f) {
281 0 : return (UBool)asciiBytes[c];
282 0 : } else if((uint32_t)c<=0x7ff) {
283 0 : return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
284 0 : } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
285 0 : int lead=c>>12;
286 0 : uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
287 0 : if(twoBits<=1) {
288 : // All 64 code points with the same bits 15..6
289 : // are either in the set or not.
290 0 : return (UBool)twoBits;
291 : } else {
292 : // Look up the code point in its 4k block of code points.
293 0 : return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
294 : }
295 0 : } else if((uint32_t)c<=0x10ffff) {
296 : // surrogate or supplementary code point
297 0 : return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
298 : } else {
299 : // Out-of-range code points get FALSE, consistent with long-standing
300 : // behavior of UnicodeSet::contains(c).
301 0 : return FALSE;
302 : }
303 : }
304 :
305 : /*
306 : * Check for sufficient length for trail unit for each surrogate pair.
307 : * Handle single surrogates as surrogate code points as usual in ICU.
308 : */
// Scan UTF-16 text forward from s and stop at the first code point whose
// set membership does not match spanCondition.
//   s, limit       the text is [s, limit). The first unit is read before s is
//                  ever compared with limit, so the caller must pass s<limit.
//   spanCondition  nonzero: advance while code points are in the set;
//                  zero: advance while code points are NOT in the set.
// Returns a pointer to the start of the code point that ended the span,
// or the position reached when the loop runs out of text.
309 : const UChar *
310 0 : BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
311 : UChar c, c2;
312 :
313 0 : if(spanCondition) {
314 : // span
315 0 : do {
316 0 : c=*s;
317 0 : if(c<=0x7f) {
318 0 : if(!asciiBytes[c]) {
319 0 : break;
320 : }
321 0 : } else if(c<=0x7ff) {
322 0 : if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
323 0 : break;
324 : }
325 0 : } else if(c<0xd800 || c>=0xe000) {
326 0 : int lead=c>>12;
327 0 : uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
328 0 : if(twoBits<=1) {
329 : // All 64 code points with the same bits 15..6
330 : // are either in the set or not.
331 0 : if(twoBits==0) {
332 0 : break;
333 : }
334 : } else {
335 : // Look up the code point in its 4k block of code points.
336 0 : if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
337 0 : break;
338 : }
339 0 : }
// c is a surrogate here. It is treated as a lone surrogate when it is a trail
// surrogate, when it is the last unit, or when the next unit is not a trail surrogate.
340 0 : } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
341 : // surrogate code point
342 0 : if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
343 0 : break;
344 : }
345 : } else {
346 : // surrogate pair
347 0 : if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
348 0 : break;
349 : }
// Skip the trail surrogate; the loop condition then skips the lead surrogate.
// This comes after the break so that a failing pair leaves s on its lead surrogate.
350 0 : ++s;
351 : }
352 : } while(++s<limit);
353 : } else {
354 : // span not
// Same loop as above with every membership test inverted.
355 0 : do {
356 0 : c=*s;
357 0 : if(c<=0x7f) {
358 0 : if(asciiBytes[c]) {
359 0 : break;
360 : }
361 0 : } else if(c<=0x7ff) {
362 0 : if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
363 0 : break;
364 : }
365 0 : } else if(c<0xd800 || c>=0xe000) {
366 0 : int lead=c>>12;
367 0 : uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
368 0 : if(twoBits<=1) {
369 : // All 64 code points with the same bits 15..6
370 : // are either in the set or not.
371 0 : if(twoBits!=0) {
372 0 : break;
373 : }
374 : } else {
375 : // Look up the code point in its 4k block of code points.
376 0 : if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
377 0 : break;
378 : }
379 0 : }
380 0 : } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
381 : // surrogate code point
382 0 : if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
383 0 : break;
384 : }
385 : } else {
386 : // surrogate pair
387 0 : if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
388 0 : break;
389 : }
390 0 : ++s;
391 : }
392 : } while(++s<limit);
393 : }
394 0 : return s;
395 : }
396 :
397 : /* Symmetrical with span(). */
// Scan UTF-16 text backward from limit toward s and stop at the first code
// point whose set membership does not match spanCondition.
//   s, limit       the text is [s, limit). limit is decremented and read before
//                  any comparison with s, so the caller must pass s<limit.
//   spanCondition  nonzero: retreat while code points are in the set;
//                  zero: retreat while code points are NOT in the set.
// Returns s when everything down to s matched, otherwise a pointer just past
// the code point that ended the span.
398 : const UChar *
399 0 : BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
400 : UChar c, c2;
401 :
402 0 : if(spanCondition) {
403 : // span
404 : for(;;) {
405 0 : c=*(--limit);
406 0 : if(c<=0x7f) {
407 0 : if(!asciiBytes[c]) {
408 0 : break;
409 : }
410 0 : } else if(c<=0x7ff) {
411 0 : if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
412 0 : break;
413 : }
414 0 : } else if(c<0xd800 || c>=0xe000) {
415 0 : int lead=c>>12;
416 0 : uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
417 0 : if(twoBits<=1) {
418 : // All 64 code points with the same bits 15..6
419 : // are either in the set or not.
420 0 : if(twoBits==0) {
421 0 : break;
422 : }
423 : } else {
424 : // Look up the code point in its 4k block of code points.
425 0 : if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
426 0 : break;
427 : }
428 0 : }
// c is a surrogate and limit now points at it. It is treated as a lone surrogate when it
// is a lead surrogate, when it is the first unit, or when the preceding unit is not a lead surrogate.
429 0 : } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
430 : // surrogate code point
431 0 : if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
432 0 : break;
433 : }
434 : } else {
435 : // surrogate pair
436 0 : if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
437 0 : break;
438 : }
// Step over the lead surrogate as well. This comes after the break so that
// a failing pair leaves limit on its trail surrogate.
439 0 : --limit;
440 : }
441 0 : if(s==limit) {
442 0 : return s;
443 : }
444 0 : }
445 : } else {
446 : // span not
// Same loop as above with every membership test inverted.
447 : for(;;) {
448 0 : c=*(--limit);
449 0 : if(c<=0x7f) {
450 0 : if(asciiBytes[c]) {
451 0 : break;
452 : }
453 0 : } else if(c<=0x7ff) {
454 0 : if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
455 0 : break;
456 : }
457 0 : } else if(c<0xd800 || c>=0xe000) {
458 0 : int lead=c>>12;
459 0 : uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
460 0 : if(twoBits<=1) {
461 : // All 64 code points with the same bits 15..6
462 : // are either in the set or not.
463 0 : if(twoBits!=0) {
464 0 : break;
465 : }
466 : } else {
467 : // Look up the code point in its 4k block of code points.
468 0 : if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
469 0 : break;
470 : }
471 0 : }
472 0 : } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
473 : // surrogate code point
474 0 : if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
475 0 : break;
476 : }
477 : } else {
478 : // surrogate pair
479 0 : if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
480 0 : break;
481 : }
482 0 : --limit;
483 : }
484 0 : if(s==limit) {
485 0 : return s;
486 : }
487 0 : }
488 : }
// Reached only via break: limit points at the last unit of the code point that
// ended the span, so the span begins one unit after it.
489 0 : return limit+1;
490 : }
491 :
492 : /*
493 : * Precheck for sufficient trail bytes at end of string only once per span.
494 : * Check validity.
495 : */
// Scan UTF-8 text forward from s and stop at the first character whose set
// membership does not match spanCondition.
//   s, length      the bytes [s, s+length). *s and *(limit-1) are read
//                  unconditionally, so the caller must pass length>0.
//   spanCondition  USET_SPAN_NOT_CONTAINED spans characters not in the set;
//                  any other value is treated as USET_SPAN_CONTAINED.
// Returns a pointer to the first byte of the character that ended the span,
// or limit0 (see below) when the scan reaches the end of the text.
// Illegal bytes and sequences are judged by asciiBytes[0x80], which
// overrideIllegal() sets to the result of contains(U+FFFD).
496 : const uint8_t *
497 0 : BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
498 0 : const uint8_t *limit=s+length;
499 0 : uint8_t b=*s;
500 0 : if((int8_t)b>=0) {
501 : // Initial all-ASCII span.
502 0 : if(spanCondition) {
503 0 : do {
504 0 : if(!asciiBytes[b] || ++s==limit) {
505 0 : return s;
506 : }
507 0 : b=*s;
508 0 : } while((int8_t)b>=0);
509 : } else {
510 0 : do {
511 0 : if(asciiBytes[b] || ++s==limit) {
512 0 : return s;
513 : }
514 0 : b=*s;
515 0 : } while((int8_t)b>=0);
516 : }
// Remaining length from the first non-ASCII byte; used by the truncation checks below.
517 0 : length=(int32_t)(limit-s);
518 : }
519 :
// From here on spanCondition is compared directly with 0/1 table values.
520 0 : if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
521 0 : spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
522 : }
523 :
// limit0 is what gets returned when the scan runs up to limit.
// It starts as the true end of the text and is pulled back below only if a
// truncated trailing sequence does not belong to the span.
524 0 : const uint8_t *limit0=limit;
525 :
526 : /*
527 : * Make sure that the last 1/2/3/4-byte sequence before limit is complete
528 : * or runs into a lead byte.
529 : * In the span loop compare s with limit only once
530 : * per multi-byte character.
531 : *
532 : * Give a trailing illegal sequence the same value as the result of contains(FFFD),
533 : * including it if that is part of the span, otherwise set limit0 to before
534 : * the truncated sequence.
535 : */
536 0 : b=*(limit-1);
537 0 : if((int8_t)b<0) {
538 : // b>=0x80: lead or trail byte
539 0 : if(b<0xc0) {
540 : // single trail byte, check for preceding 3- or 4-byte lead byte
541 0 : if(length>=2 && (b=*(limit-2))>=0xe0) {
542 0 : limit-=2;
543 0 : if(asciiBytes[0x80]!=spanCondition) {
544 0 : limit0=limit;
545 : }
546 0 : } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
547 : // 4-byte lead byte with only two trail bytes
548 0 : limit-=3;
549 0 : if(asciiBytes[0x80]!=spanCondition) {
550 0 : limit0=limit;
551 : }
552 : }
553 : } else {
554 : // lead byte with no trail bytes
555 0 : --limit;
556 0 : if(asciiBytes[0x80]!=spanCondition) {
557 0 : limit0=limit;
558 : }
559 : }
560 : }
561 :
// Trail byte values with 0x80 subtracted; a value <=0x3f means a well-formed trail byte.
562 : uint8_t t1, t2, t3;
563 :
564 0 : while(s<limit) {
565 0 : b=*s;
566 0 : if(b<0xc0) {
567 : // ASCII; or trail bytes with the result of contains(FFFD).
568 0 : if(spanCondition) {
569 0 : do {
570 0 : if(!asciiBytes[b]) {
571 0 : return s;
572 0 : } else if(++s==limit) {
573 0 : return limit0;
574 : }
575 0 : b=*s;
576 0 : } while(b<0xc0);
577 : } else {
578 0 : do {
579 0 : if(asciiBytes[b]) {
580 0 : return s;
581 0 : } else if(++s==limit) {
582 0 : return limit0;
583 : }
584 0 : b=*s;
585 0 : } while(b<0xc0);
586 : }
587 : }
// b is a lead byte (>=0xc0) here. After the increment s points at the first
// trail byte, which is why every early return below uses s-1.
588 0 : ++s; // Advance past the lead byte.
589 0 : if(b>=0xe0) {
590 0 : if(b<0xf0) {
591 0 : if( /* handle U+0000..U+FFFF inline */
592 0 : (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
593 0 : (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
594 : ) {
// Low 4 bits of the lead byte: used as the shift count into bmpBlockBits[] and the index into list4kStarts[].
595 0 : b&=0xf;
596 0 : uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
597 0 : if(twoBits<=1) {
598 : // All 64 code points with this lead byte and middle trail byte
599 : // are either in the set or not.
600 0 : if(twoBits!=(uint32_t)spanCondition) {
601 0 : return s-1;
602 : }
603 : } else {
604 : // Look up the code point in its 4k block of code points.
605 0 : UChar32 c=(b<<12)|(t1<<6)|t2;
606 0 : if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
607 0 : return s-1;
608 : }
609 : }
610 0 : s+=2;
611 0 : continue;
612 : }
613 0 : } else if( /* handle U+10000..U+10FFFF inline */
614 0 : (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
615 0 : (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
616 0 : (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
617 : ) {
618 : // Give an illegal sequence the same value as the result of contains(FFFD).
619 0 : UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
620 0 : if( ( (0x10000<=c && c<=0x10ffff) ?
621 0 : containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
622 0 : asciiBytes[0x80]
623 0 : ) != spanCondition
624 : ) {
625 0 : return s-1;
626 : }
627 0 : s+=3;
628 0 : continue;
629 : }
630 : } else /* 0xc0<=b<0xe0 */ {
631 0 : if( /* handle U+0000..U+07FF inline */
632 0 : (t1=(uint8_t)(*s-0x80)) <= 0x3f
633 : ) {
634 0 : if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
635 0 : return s-1;
636 : }
637 0 : ++s;
638 0 : continue;
639 : }
640 : }
641 :
// Reached when the trail bytes after the lead byte were not well-formed.
// Only the lead byte has been consumed; the following bytes are re-examined
// on the next loop iteration.
642 : // Give an illegal sequence the same value as the result of contains(FFFD).
643 : // Handle each byte of an illegal sequence separately to simplify the code;
644 : // no need to optimize error handling.
645 0 : if(asciiBytes[0x80]!=spanCondition) {
646 0 : return s-1;
647 : }
648 : }
649 :
650 0 : return limit0;
651 : }
652 :
653 : /*
654 : * While going backwards through UTF-8 optimize only for ASCII.
655 : * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
656 : * possible to tell from the last byte in a multi-byte sequence how many
657 : * preceding bytes there should be. Therefore, going backwards through UTF-8
658 : * is much harder than going forward.
659 : */
// Scan UTF-8 text backward from its end and stop at the first character whose
// set membership does not match spanCondition.
//   s, length      the bytes s[0..length). s[--length] is read before length is
//                  tested, so the caller must pass length>0.
//   spanCondition  USET_SPAN_NOT_CONTAINED spans characters not in the set;
//                  any other value is treated as USET_SPAN_CONTAINED.
// Returns the index in s at which the trailing span starts:
// 0 when everything matched, otherwise the index just past the character
// that ended the span.
660 : int32_t
661 0 : BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
662 0 : if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
663 0 : spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
664 : }
665 :
666 : uint8_t b;
667 :
668 0 : do {
// After this line, length is the index of byte b.
669 0 : b=s[--length];
670 0 : if((int8_t)b>=0) {
671 : // ASCII sub-span
672 0 : if(spanCondition) {
673 0 : do {
674 0 : if(!asciiBytes[b]) {
675 0 : return length+1;
676 0 : } else if(length==0) {
677 0 : return 0;
678 : }
679 0 : b=s[--length];
680 0 : } while((int8_t)b>=0);
681 : } else {
682 0 : do {
683 0 : if(asciiBytes[b]) {
684 0 : return length+1;
685 0 : } else if(length==0) {
686 0 : return 0;
687 : }
688 0 : b=s[--length];
689 0 : } while((int8_t)b>=0);
690 : }
691 : }
692 :
// prev is the index of b, the last byte of the character about to be decoded.
// It is saved because the call below updates length through the pointer,
// and prev+1 is what gets returned if this character ends the span.
693 0 : int32_t prev=length;
694 : UChar32 c;
695 : // trail byte: collect a multi-byte character
696 : // (or lead byte in last-trail position)
// NOTE(review): the meaning of the last argument (-3) is defined by
// utf8_prevCharSafeBody, which is not visible in this file -- confirm in utf8.h.
697 0 : c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
698 : // c is a valid code point, not ASCII, not a surrogate
699 0 : if(c<=0x7ff) {
700 0 : if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
701 0 : return prev+1;
702 : }
703 0 : } else if(c<=0xffff) {
704 0 : int lead=c>>12;
705 0 : uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
706 0 : if(twoBits<=1) {
707 : // All 64 code points with the same bits 15..6
708 : // are either in the set or not.
709 0 : if(twoBits!=(uint32_t)spanCondition) {
710 0 : return prev+1;
711 : }
712 : } else {
713 : // Look up the code point in its 4k block of code points.
714 0 : if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
715 0 : return prev+1;
716 : }
717 : }
718 : } else {
// Supplementary code point: search the tail of the list.
719 0 : if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
720 0 : return prev+1;
721 : }
722 : }
723 0 : } while(length>0);
724 0 : return 0;
725 : }
726 :
727 : U_NAMESPACE_END
|