LCOV - code coverage report
Current view: top level - intl/icu/source/common - bmpset.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 371 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 13 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : ******************************************************************************
       5             : *
       6             : *   Copyright (C) 2007-2012, International Business Machines
       7             : *   Corporation and others.  All Rights Reserved.
       8             : *
       9             : ******************************************************************************
      10             : *   file name:  bmpset.cpp
      11             : *   encoding:   UTF-8
      12             : *   tab size:   8 (not used)
      13             : *   indentation:4
      14             : *
      15             : *   created on: 2007jan29
      16             : *   created by: Markus W. Scherer
      17             : */
      18             : 
      19             : #include "unicode/utypes.h"
      20             : #include "unicode/uniset.h"
      21             : #include "unicode/utf8.h"
      22             : #include "unicode/utf16.h"
      23             : #include "cmemory.h"
      24             : #include "bmpset.h"
      25             : #include "uassert.h"
      26             : 
      27             : U_NAMESPACE_BEGIN
      28             : 
      29           0 : BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
      30           0 :         list(parentList), listLength(parentListLength) {
      31           0 :     uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
      32           0 :     uprv_memset(table7FF, 0, sizeof(table7FF));
      33           0 :     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
      34             : 
      35             :     /*
      36             :      * Set the list indexes for binary searches for
      37             :      * U+0800, U+1000, U+2000, .., U+F000, U+10000.
      38             :      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
      39             :      * looked up in the bit tables.
      40             :      * The last pair of indexes is for finding supplementary code points.
      41             :      */
      42           0 :     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
      43             :     int32_t i;
      44           0 :     for(i=1; i<=0x10; ++i) {
      45           0 :         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
      46             :     }
      47           0 :     list4kStarts[0x11]=listLength-1;
      48             : 
      49           0 :     initBits();
      50           0 :     overrideIllegal();
      51           0 : }
      52             : 
      53           0 : BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
      54           0 :         list(newParentList), listLength(newParentListLength) {
      55           0 :     uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
      56           0 :     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
      57           0 :     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
      58           0 :     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
      59           0 : }
      60             : 
      61           0 : BMPSet::~BMPSet() {
      62           0 : }
      63             : 
      64             : /*
      65             :  * Set bits in a bit rectangle in "vertical" bit organization.
      66             :  * start<limit<=0x800
      67             :  */
      68           0 : static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
      69           0 :     U_ASSERT(start<limit);
      70           0 :     U_ASSERT(limit<=0x800);
      71             : 
      72           0 :     int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
      73           0 :     int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
      74             : 
      75             :     // Set one bit indicating an all-one block.
      76           0 :     uint32_t bits=(uint32_t)1<<lead;
      77           0 :     if((start+1)==limit) {  // Single-character shortcut.
      78           0 :         table[trail]|=bits;
      79           0 :         return;
      80             :     }
      81             : 
      82           0 :     int32_t limitLead=limit>>6;
      83           0 :     int32_t limitTrail=limit&0x3f;
      84             : 
      85           0 :     if(lead==limitLead) {
      86             :         // Partial vertical bit column.
      87           0 :         while(trail<limitTrail) {
      88           0 :             table[trail++]|=bits;
      89             :         }
      90             :     } else {
      91             :         // Partial vertical bit column,
      92             :         // followed by a bit rectangle,
      93             :         // followed by another partial vertical bit column.
      94           0 :         if(trail>0) {
      95           0 :             do {
      96           0 :                 table[trail++]|=bits;
      97           0 :             } while(trail<64);
      98           0 :             ++lead;
      99             :         }
     100           0 :         if(lead<limitLead) {
     101           0 :             bits=~((1<<lead)-1);
     102           0 :             if(limitLead<0x20) {
     103           0 :                 bits&=(1<<limitLead)-1;
     104             :             }
     105           0 :             for(trail=0; trail<64; ++trail) {
     106           0 :                 table[trail]|=bits;
     107             :             }
     108             :         }
     109             :         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
     110             :         // In that case, bits=1<<limitLead is undefined but the bits value
     111             :         // is not used because trail<limitTrail is already false.
     112           0 :         bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
     113           0 :         for(trail=0; trail<limitTrail; ++trail) {
     114           0 :             table[trail]|=bits;
     115             :         }
     116             :     }
     117             : }
     118             : 
     119           0 : void BMPSet::initBits() {
     120             :     UChar32 start, limit;
     121           0 :     int32_t listIndex=0;
     122             : 
     123             :     // Set asciiBytes[].
     124           0 :     do {
     125           0 :         start=list[listIndex++];
     126           0 :         if(listIndex<listLength) {
     127           0 :             limit=list[listIndex++];
     128             :         } else {
     129           0 :             limit=0x110000;
     130             :         }
     131           0 :         if(start>=0x80) {
     132           0 :             break;
     133             :         }
     134           0 :         do {
     135           0 :             asciiBytes[start++]=1;
     136           0 :         } while(start<limit && start<0x80);
     137           0 :     } while(limit<=0x80);
     138             : 
     139             :     // Set table7FF[].
     140           0 :     while(start<0x800) {
     141           0 :         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
     142           0 :         if(limit>0x800) {
     143           0 :             start=0x800;
     144           0 :             break;
     145             :         }
     146             : 
     147           0 :         start=list[listIndex++];
     148           0 :         if(listIndex<listLength) {
     149           0 :             limit=list[listIndex++];
     150             :         } else {
     151           0 :             limit=0x110000;
     152             :         }
     153             :     }
     154             : 
     155             :     // Set bmpBlockBits[].
     156           0 :     int32_t minStart=0x800;
     157           0 :     while(start<0x10000) {
     158           0 :         if(limit>0x10000) {
     159           0 :             limit=0x10000;
     160             :         }
     161             : 
     162           0 :         if(start<minStart) {
     163           0 :             start=minStart;
     164             :         }
     165           0 :         if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
     166           0 :             if(start&0x3f) {
     167             :                 // Mixed-value block of 64 code points.
     168           0 :                 start>>=6;
     169           0 :                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
     170           0 :                 start=(start+1)<<6;  // Round up to the next block boundary.
     171           0 :                 minStart=start;      // Ignore further ranges in this block.
     172             :             }
     173           0 :             if(start<limit) {
     174           0 :                 if(start<(limit&~0x3f)) {
     175             :                     // Multiple all-ones blocks of 64 code points each.
     176           0 :                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);
     177             :                 }
     178             : 
     179           0 :                 if(limit&0x3f) {
     180             :                     // Mixed-value block of 64 code points.
     181           0 :                     limit>>=6;
     182           0 :                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
     183           0 :                     limit=(limit+1)<<6;  // Round up to the next block boundary.
     184           0 :                     minStart=limit;      // Ignore further ranges in this block.
     185             :                 }
     186             :             }
     187             :         }
     188             : 
     189           0 :         if(limit==0x10000) {
     190           0 :             break;
     191             :         }
     192             : 
     193           0 :         start=list[listIndex++];
     194           0 :         if(listIndex<listLength) {
     195           0 :             limit=list[listIndex++];
     196             :         } else {
     197           0 :             limit=0x110000;
     198             :         }
     199             :     }
     200           0 : }
     201             : 
     202             : /*
     203             :  * Override some bits and bytes to the result of contains(FFFD)
     204             :  * for faster validity checking at runtime.
     205             :  * No need to set 0 values where they were reset to 0 in the constructor
     206             :  * and not modified by initBits().
     207             :  * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
     208             :  * Need to set 0 values for surrogates D800..DFFF.
     209             :  */
     210           0 : void BMPSet::overrideIllegal() {
     211             :     uint32_t bits, mask;
     212             :     int32_t i;
     213             : 
     214           0 :     if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
     215             :         // contains(FFFD)==TRUE
     216           0 :         for(i=0x80; i<0xc0; ++i) {
     217           0 :             asciiBytes[i]=1;
     218             :         }
     219             : 
     220           0 :         bits=3;                 // Lead bytes 0xC0 and 0xC1.
     221           0 :         for(i=0; i<64; ++i) {
     222           0 :             table7FF[i]|=bits;
     223             :         }
     224             : 
     225           0 :         bits=1;                 // Lead byte 0xE0.
     226           0 :         for(i=0; i<32; ++i) {   // First half of 4k block.
     227           0 :             bmpBlockBits[i]|=bits;
     228             :         }
     229             : 
     230           0 :         mask=~(0x10001<<0xd);   // Lead byte 0xED.
     231           0 :         bits=1<<0xd;
     232           0 :         for(i=32; i<64; ++i) {  // Second half of 4k block.
     233           0 :             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
     234             :         }
     235             :     } else {
     236             :         // contains(FFFD)==FALSE
     237           0 :         mask=~(0x10001<<0xd);   // Lead byte 0xED.
     238           0 :         for(i=32; i<64; ++i) {  // Second half of 4k block.
     239           0 :             bmpBlockBits[i]&=mask;
     240             :         }
     241             :     }
     242           0 : }
     243             : 
     244           0 : int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
     245             :     /* Examples:
     246             :                                        findCodePoint(c)
     247             :        set              list[]         c=0 1 3 4 7 8
     248             :        ===              ==============   ===========
     249             :        []               [110000]         0 0 0 0 0 0
     250             :        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
     251             :        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
     252             :        [:Any:]          [0, 110000]      1 1 1 1 1 1
     253             :      */
     254             : 
     255             :     // Return the smallest i such that c < list[i].  Assume
     256             :     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
     257           0 :     if (c < list[lo])
     258           0 :         return lo;
     259             :     // High runner test.  c is often after the last range, so an
     260             :     // initial check for this condition pays off.
     261           0 :     if (lo >= hi || c >= list[hi-1])
     262           0 :         return hi;
     263             :     // invariant: c >= list[lo]
     264             :     // invariant: c < list[hi]
     265             :     for (;;) {
     266           0 :         int32_t i = (lo + hi) >> 1;
     267           0 :         if (i == lo) {
     268           0 :             break; // Found!
     269           0 :         } else if (c < list[i]) {
     270           0 :             hi = i;
     271             :         } else {
     272           0 :             lo = i;
     273             :         }
     274           0 :     }
     275           0 :     return hi;
     276             : }
     277             : 
     278             : UBool
     279           0 : BMPSet::contains(UChar32 c) const {
     280           0 :     if((uint32_t)c<=0x7f) {
     281           0 :         return (UBool)asciiBytes[c];
     282           0 :     } else if((uint32_t)c<=0x7ff) {
     283           0 :         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
     284           0 :     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
     285           0 :         int lead=c>>12;
     286           0 :         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
     287           0 :         if(twoBits<=1) {
     288             :             // All 64 code points with the same bits 15..6
     289             :             // are either in the set or not.
     290           0 :             return (UBool)twoBits;
     291             :         } else {
     292             :             // Look up the code point in its 4k block of code points.
     293           0 :             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
     294             :         }
     295           0 :     } else if((uint32_t)c<=0x10ffff) {
     296             :         // surrogate or supplementary code point
     297           0 :         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
     298             :     } else {
     299             :         // Out-of-range code points get FALSE, consistent with long-standing
     300             :         // behavior of UnicodeSet::contains(c).
     301           0 :         return FALSE;
     302             :     }
     303             : }
     304             : 
     305             : /*
     306             :  * Check for sufficient length for trail unit for each surrogate pair.
     307             :  * Handle single surrogates as surrogate code points as usual in ICU.
     308             :  */
     309             : const UChar *
     310           0 : BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
     311             :     UChar c, c2;
     312             : 
     313           0 :     if(spanCondition) {
     314             :         // span
     315           0 :         do {
     316           0 :             c=*s;
     317           0 :             if(c<=0x7f) {
     318           0 :                 if(!asciiBytes[c]) {
     319           0 :                     break;
     320             :                 }
     321           0 :             } else if(c<=0x7ff) {
     322           0 :                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
     323           0 :                     break;
     324             :                 }
     325           0 :             } else if(c<0xd800 || c>=0xe000) {
     326           0 :                 int lead=c>>12;
     327           0 :                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
     328           0 :                 if(twoBits<=1) {
     329             :                     // All 64 code points with the same bits 15..6
     330             :                     // are either in the set or not.
     331           0 :                     if(twoBits==0) {
     332           0 :                         break;
     333             :                     }
     334             :                 } else {
     335             :                     // Look up the code point in its 4k block of code points.
     336           0 :                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
     337           0 :                         break;
     338             :                     }
     339           0 :                 }
     340           0 :             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
     341             :                 // surrogate code point
     342           0 :                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
     343           0 :                     break;
     344             :                 }
     345             :             } else {
     346             :                 // surrogate pair
     347           0 :                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
     348           0 :                     break;
     349             :                 }
     350           0 :                 ++s;
     351             :             }
     352             :         } while(++s<limit);
     353             :     } else {
     354             :         // span not
     355           0 :         do {
     356           0 :             c=*s;
     357           0 :             if(c<=0x7f) {
     358           0 :                 if(asciiBytes[c]) {
     359           0 :                     break;
     360             :                 }
     361           0 :             } else if(c<=0x7ff) {
     362           0 :                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
     363           0 :                     break;
     364             :                 }
     365           0 :             } else if(c<0xd800 || c>=0xe000) {
     366           0 :                 int lead=c>>12;
     367           0 :                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
     368           0 :                 if(twoBits<=1) {
     369             :                     // All 64 code points with the same bits 15..6
     370             :                     // are either in the set or not.
     371           0 :                     if(twoBits!=0) {
     372           0 :                         break;
     373             :                     }
     374             :                 } else {
     375             :                     // Look up the code point in its 4k block of code points.
     376           0 :                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
     377           0 :                         break;
     378             :                     }
     379           0 :                 }
     380           0 :             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
     381             :                 // surrogate code point
     382           0 :                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
     383           0 :                     break;
     384             :                 }
     385             :             } else {
     386             :                 // surrogate pair
     387           0 :                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
     388           0 :                     break;
     389             :                 }
     390           0 :                 ++s;
     391             :             }
     392             :         } while(++s<limit);
     393             :     }
     394           0 :     return s;
     395             : }
     396             : 
     397             : /* Symmetrical with span(). */
     398             : const UChar *
     399           0 : BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
     400             :     UChar c, c2;
     401             : 
     402           0 :     if(spanCondition) {
     403             :         // span
     404             :         for(;;) {
     405           0 :             c=*(--limit);
     406           0 :             if(c<=0x7f) {
     407           0 :                 if(!asciiBytes[c]) {
     408           0 :                     break;
     409             :                 }
     410           0 :             } else if(c<=0x7ff) {
     411           0 :                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
     412           0 :                     break;
     413             :                 }
     414           0 :             } else if(c<0xd800 || c>=0xe000) {
     415           0 :                 int lead=c>>12;
     416           0 :                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
     417           0 :                 if(twoBits<=1) {
     418             :                     // All 64 code points with the same bits 15..6
     419             :                     // are either in the set or not.
     420           0 :                     if(twoBits==0) {
     421           0 :                         break;
     422             :                     }
     423             :                 } else {
     424             :                     // Look up the code point in its 4k block of code points.
     425           0 :                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
     426           0 :                         break;
     427             :                     }
     428           0 :                 }
     429           0 :             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
     430             :                 // surrogate code point
     431           0 :                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
     432           0 :                     break;
     433             :                 }
     434             :             } else {
     435             :                 // surrogate pair
     436           0 :                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
     437           0 :                     break;
     438             :                 }
     439           0 :                 --limit;
     440             :             }
     441           0 :             if(s==limit) {
     442           0 :                 return s;
     443             :             }
     444           0 :         }
     445             :     } else {
     446             :         // span not
     447             :         for(;;) {
     448           0 :             c=*(--limit);
     449           0 :             if(c<=0x7f) {
     450           0 :                 if(asciiBytes[c]) {
     451           0 :                     break;
     452             :                 }
     453           0 :             } else if(c<=0x7ff) {
     454           0 :                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
     455           0 :                     break;
     456             :                 }
     457           0 :             } else if(c<0xd800 || c>=0xe000) {
     458           0 :                 int lead=c>>12;
     459           0 :                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
     460           0 :                 if(twoBits<=1) {
     461             :                     // All 64 code points with the same bits 15..6
     462             :                     // are either in the set or not.
     463           0 :                     if(twoBits!=0) {
     464           0 :                         break;
     465             :                     }
     466             :                 } else {
     467             :                     // Look up the code point in its 4k block of code points.
     468           0 :                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
     469           0 :                         break;
     470             :                     }
     471           0 :                 }
     472           0 :             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
     473             :                 // surrogate code point
     474           0 :                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
     475           0 :                     break;
     476             :                 }
     477             :             } else {
     478             :                 // surrogate pair
     479           0 :                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
     480           0 :                     break;
     481             :                 }
     482           0 :                 --limit;
     483             :             }
     484           0 :             if(s==limit) {
     485           0 :                 return s;
     486             :             }
     487           0 :         }
     488             :     }
     489           0 :     return limit+1;
     490             : }
     491             : 
     492             : /*
     493             :  * Precheck for sufficient trail bytes at end of string only once per span.
     494             :  * Check validity.
     495             :  */
     496             : const uint8_t *
     497           0 : BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
     498           0 :     const uint8_t *limit=s+length;
     499           0 :     uint8_t b=*s;
     500           0 :     if((int8_t)b>=0) {
     501             :         // Initial all-ASCII span.
     502           0 :         if(spanCondition) {
     503           0 :             do {
     504           0 :                 if(!asciiBytes[b] || ++s==limit) {
     505           0 :                     return s;
     506             :                 }
     507           0 :                 b=*s;
     508           0 :             } while((int8_t)b>=0);
     509             :         } else {
     510           0 :             do {
     511           0 :                 if(asciiBytes[b] || ++s==limit) {
     512           0 :                     return s;
     513             :                 }
     514           0 :                 b=*s;
     515           0 :             } while((int8_t)b>=0);
     516             :         }
     517           0 :         length=(int32_t)(limit-s);
     518             :     }
     519             : 
     520           0 :     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
     521           0 :         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
     522             :     }
     523             : 
     524           0 :     const uint8_t *limit0=limit;
     525             : 
     526             :     /*
     527             :      * Make sure that the last 1/2/3/4-byte sequence before limit is complete
     528             :      * or runs into a lead byte.
     529             :      * In the span loop compare s with limit only once
     530             :      * per multi-byte character.
     531             :      *
     532             :      * Give a trailing illegal sequence the same value as the result of contains(FFFD),
     533             :      * including it if that is part of the span, otherwise set limit0 to before
     534             :      * the truncated sequence.
     535             :      */
     536           0 :     b=*(limit-1);
     537           0 :     if((int8_t)b<0) {
     538             :         // b>=0x80: lead or trail byte
     539           0 :         if(b<0xc0) {
     540             :             // single trail byte, check for preceding 3- or 4-byte lead byte
     541           0 :             if(length>=2 && (b=*(limit-2))>=0xe0) {
     542           0 :                 limit-=2;
     543           0 :                 if(asciiBytes[0x80]!=spanCondition) {
     544           0 :                     limit0=limit;
     545             :                 }
     546           0 :             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
     547             :                 // 4-byte lead byte with only two trail bytes
     548           0 :                 limit-=3;
     549           0 :                 if(asciiBytes[0x80]!=spanCondition) {
     550           0 :                     limit0=limit;
     551             :                 }
     552             :             }
     553             :         } else {
     554             :             // lead byte with no trail bytes
     555           0 :             --limit;
     556           0 :             if(asciiBytes[0x80]!=spanCondition) {
     557           0 :                 limit0=limit;
     558             :             }
     559             :         }
     560             :     }
     561             : 
     562             :     uint8_t t1, t2, t3;
     563             : 
     564           0 :     while(s<limit) {
     565           0 :         b=*s;
     566           0 :         if(b<0xc0) {
     567             :             // ASCII; or trail bytes with the result of contains(FFFD).
     568           0 :             if(spanCondition) {
     569           0 :                 do {
     570           0 :                     if(!asciiBytes[b]) {
     571           0 :                         return s;
     572           0 :                     } else if(++s==limit) {
     573           0 :                         return limit0;
     574             :                     }
     575           0 :                     b=*s;
     576           0 :                 } while(b<0xc0);
     577             :             } else {
     578           0 :                 do {
     579           0 :                     if(asciiBytes[b]) {
     580           0 :                         return s;
     581           0 :                     } else if(++s==limit) {
     582           0 :                         return limit0;
     583             :                     }
     584           0 :                     b=*s;
     585           0 :                 } while(b<0xc0);
     586             :             }
     587             :         }
     588           0 :         ++s;  // Advance past the lead byte.
     589           0 :         if(b>=0xe0) {
     590           0 :             if(b<0xf0) {
     591           0 :                 if( /* handle U+0000..U+FFFF inline */
     592           0 :                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
     593           0 :                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
     594             :                 ) {
     595           0 :                     b&=0xf;
     596           0 :                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
     597           0 :                     if(twoBits<=1) {
     598             :                         // All 64 code points with this lead byte and middle trail byte
     599             :                         // are either in the set or not.
     600           0 :                         if(twoBits!=(uint32_t)spanCondition) {
     601           0 :                             return s-1;
     602             :                         }
     603             :                     } else {
     604             :                         // Look up the code point in its 4k block of code points.
     605           0 :                         UChar32 c=(b<<12)|(t1<<6)|t2;
     606           0 :                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
     607           0 :                             return s-1;
     608             :                         }
     609             :                     }
     610           0 :                     s+=2;
     611           0 :                     continue;
     612             :                 }
     613           0 :             } else if( /* handle U+10000..U+10FFFF inline */
     614           0 :                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
     615           0 :                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
     616           0 :                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
     617             :             ) {
     618             :                 // Give an illegal sequence the same value as the result of contains(FFFD).
     619           0 :                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
     620           0 :                 if( (   (0x10000<=c && c<=0x10ffff) ?
     621           0 :                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
     622           0 :                             asciiBytes[0x80]
     623           0 :                     ) != spanCondition
     624             :                 ) {
     625           0 :                     return s-1;
     626             :                 }
     627           0 :                 s+=3;
     628           0 :                 continue;
     629             :             }
     630             :         } else /* 0xc0<=b<0xe0 */ {
     631           0 :             if( /* handle U+0000..U+07FF inline */
     632           0 :                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
     633             :             ) {
     634           0 :                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
     635           0 :                     return s-1;
     636             :                 }
     637           0 :                 ++s;
     638           0 :                 continue;
     639             :             }
     640             :         }
     641             : 
     642             :         // Give an illegal sequence the same value as the result of contains(FFFD).
     643             :         // Handle each byte of an illegal sequence separately to simplify the code;
     644             :         // no need to optimize error handling.
     645           0 :         if(asciiBytes[0x80]!=spanCondition) {
     646           0 :             return s-1;
     647             :         }
     648             :     }
     649             : 
     650           0 :     return limit0;
     651             : }
     652             : 
     653             : /*
     654             :  * While going backwards through UTF-8 optimize only for ASCII.
     655             :  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
     656             :  * possible to tell from the last byte in a multi-byte sequence how many
     657             :  * preceding bytes there should be. Therefore, going backwards through UTF-8
     658             :  * is much harder than going forward.
                     :  *
                     :  * spanBackUTF8() scans s[0..length) from its end toward its start for as
                     :  * long as each character matches spanCondition, and returns the byte
                     :  * index at which that trailing span begins:
                     :  *   - 0 when every character back to the start of s matched;
                     :  *   - otherwise the index one past the last byte of the first character
                     :  *     (counting from the end) that did not match.
                     :  *
                     :  * Parameters:
                     :  *   s             - start of the UTF-8 text; only bytes below index length
                     :  *                   are read here.
                     :  *   length        - number of bytes in s. The loop reads s[length-1] before
                     :  *                   it tests length, so this code appears to rely on
                     :  *                   length>0 -- NOTE(review): confirm against the callers.
                     :  *   spanCondition - any value other than USET_SPAN_NOT_CONTAINED is
                     :  *                   treated as USET_SPAN_CONTAINED.
                     :  *
                     :  * How each character is tested:
                     :  *   - A byte with the high bit clear (ASCII) is looked up in asciiBytes[].
                     :  *   - Any other byte is passed to utf8_prevCharSafeBody(), which yields a
                     :  *     code point c and moves length back; c is then looked up in
                     :  *     table7FF (c<=0x7ff), in bmpBlockBits (c<=0xffff, with containsSlow()
                     :  *     for blocks whose 64 code points are not all alike), or via
                     :  *     containsSlow() over list4kStarts[0x10]..list4kStarts[0x11].
     659             :  */
     660             : int32_t
     661           0 : BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
     662           0 :     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
     663           0 :         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values so it can be compared directly with the boolean lookup results below.
     664             :     }
     665             : 
     666             :     uint8_t b;
     667             : 
     668           0 :     do {
     669           0 :         b=s[--length];
     670           0 :         if((int8_t)b>=0) {
     671             :             // ASCII sub-span: test bytes against asciiBytes[] until a mismatch, the start of s, or a non-ASCII byte.
     672           0 :             if(spanCondition) {
     673           0 :                 do {
     674           0 :                     if(!asciiBytes[b]) {
     675           0 :                         return length+1;
     676           0 :                     } else if(length==0) {
     677           0 :                         return 0;
     678             :                     }
     679           0 :                     b=s[--length];
     680           0 :                 } while((int8_t)b>=0);
     681             :             } else {
     682           0 :                 do {
     683           0 :                     if(asciiBytes[b]) {
     684           0 :                         return length+1;
     685           0 :                     } else if(length==0) {
     686           0 :                         return 0;
     687             :                     }
     688           0 :                     b=s[--length];
     689           0 :                 } while((int8_t)b>=0);
     690             :             }
     691             :         }
     692             : 
     693           0 :         int32_t prev=length;
     694             :         UChar32 c;
     695             :         // trail byte: collect a multi-byte character; prev keeps the index of its last byte for the return value
     696             :         // (or lead byte in last-trail position)
     697           0 :         c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
     698             :         // c is a valid code point, not ASCII, not a surrogate (NOTE(review): presumably guaranteed by the -3 argument; confirm in utf8_prevCharSafeBody)
     699           0 :         if(c<=0x7ff) {
     700           0 :             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
     701           0 :                 return prev+1;
     702             :             }
     703           0 :         } else if(c<=0xffff) {
     704           0 :             int lead=c>>12;
     705           0 :             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
     706           0 :             if(twoBits<=1) {
     707             :                 // All 64 code points with the same bits 15..6
     708             :                 // are either in the set or not; twoBits (0 or 1) is then compared with the pinned spanCondition.
     709           0 :                 if(twoBits!=(uint32_t)spanCondition) {
     710           0 :                     return prev+1;
     711             :                 }
     712             :             } else {
     713             :                 // Mixed block: look up the code point in its 4k block of code points, bounded by list4kStarts[lead] and list4kStarts[lead+1].
     714           0 :                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
     715           0 :                     return prev+1;
     716             :                 }
     717             :             }
     718             :         } else {
     719           0 :             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
     720           0 :                 return prev+1;
     721             :             }
     722             :         }
     723           0 :     } while(length>0);
     724           0 :     return 0;
     725             : }
     726             : 
     727             : U_NAMESPACE_END

Generated by: LCOV version 1.13