LCOV - code coverage report
Current view: top level - intl/icu/source/common - unisetspan.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 696 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 30 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : ******************************************************************************
       5             : *
       6             : *   Copyright (C) 2007-2012, International Business Machines
       7             : *   Corporation and others.  All Rights Reserved.
       8             : *
       9             : ******************************************************************************
      10             : *   file name:  unisetspan.cpp
      11             : *   encoding:   UTF-8
      12             : *   tab size:   8 (not used)
      13             : *   indentation:4
      14             : *
      15             : *   created on: 2007mar01
      16             : *   created by: Markus W. Scherer
      17             : */
      18             : 
      19             : #include "unicode/utypes.h"
      20             : #include "unicode/uniset.h"
      21             : #include "unicode/ustring.h"
      22             : #include "unicode/utf8.h"
      23             : #include "unicode/utf16.h"
      24             : #include "cmemory.h"
      25             : #include "uvector.h"
      26             : #include "unisetspan.h"
      27             : 
      28             : U_NAMESPACE_BEGIN
      29             : 
      30             : /*
      31             :  * List of offsets from the current position from where to try matching
      32             :  * a code point or a string.
      33             :  * Store offsets rather than indexes to simplify the code and use the same list
      34             :  * for both increments (in span()) and decrements (in spanBack()).
      35             :  *
      36             :  * Assumption: The maximum offset is limited, and the offsets that are stored
      37             :  * at any one time are relatively dense, that is, there are normally no gaps of
      38             :  * hundreds or thousands of offset values.
      39             :  *
      40             :  * The implementation uses a circular buffer of byte flags,
      41             :  * each indicating whether the corresponding offset is in the list.
      42             :  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
      43             :  * physically moving part of the list.
      44             :  *
      45             :  * Note: In principle, the caller should setMaxLength() to the maximum of the
      46             :  * max string length and U16_LENGTH/U8_LENGTH to account for
      47             :  * "long" single code points.
      48             :  * However, this implementation uses at least a staticList with more than
      49             :  * U8_LENGTH entries anyway.
      50             :  *
      51             :  * Note: If maxLength were guaranteed to be no more than 32 or 64,
      52             :  * the list could be stored as bit flags in a single integer.
      53             :  * Rather than handling a circular buffer with a start list index,
      54             :  * the integer would simply be shifted when lower offsets are removed.
      55             :  * UnicodeSet does not have a limit on the lengths of strings.
      56             :  */
      57             : class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
      58             : public:
      59           0 :     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
      60             : 
      61           0 :     ~OffsetList() {
      62           0 :         if(list!=staticList) {
      63           0 :             uprv_free(list);
      64             :         }
      65           0 :     }
      66             : 
      67             :     // Call exactly once if the list is to be used.
      68           0 :     void setMaxLength(int32_t maxLength) {
      69           0 :         if(maxLength<=(int32_t)sizeof(staticList)) {
      70           0 :             capacity=(int32_t)sizeof(staticList);
      71             :         } else {
      72           0 :             UBool *l=(UBool *)uprv_malloc(maxLength);
      73           0 :             if(l!=NULL) {
      74           0 :                 list=l;
      75           0 :                 capacity=maxLength;
      76             :             }
      77             :         }
      78           0 :         uprv_memset(list, 0, capacity);
      79           0 :     }
      80             : 
      81             :     void clear() {
      82             :         uprv_memset(list, 0, capacity);
      83             :         start=length=0;
      84             :     }
      85             : 
      86           0 :     UBool isEmpty() const {
      87           0 :         return (UBool)(length==0);
      88             :     }
      89             : 
      90             :     // Reduce all stored offsets by delta, used when the current position
      91             :     // moves by delta.
      92             :     // There must not be any offsets lower than delta.
      93             :     // If there is an offset equal to delta, it is removed.
      94             :     // delta=[1..maxLength]
      95           0 :     void shift(int32_t delta) {
      96           0 :         int32_t i=start+delta;
      97           0 :         if(i>=capacity) {
      98           0 :             i-=capacity;
      99             :         }
     100           0 :         if(list[i]) {
     101           0 :             list[i]=FALSE;
     102           0 :             --length;
     103             :         }
     104           0 :         start=i;
     105           0 :     }
     106             : 
     107             :     // Add an offset. The list must not contain it yet.
     108             :     // offset=[1..maxLength]
     109           0 :     void addOffset(int32_t offset) {
     110           0 :         int32_t i=start+offset;
     111           0 :         if(i>=capacity) {
     112           0 :             i-=capacity;
     113             :         }
     114           0 :         list[i]=TRUE;
     115           0 :         ++length;
     116           0 :     }
     117             : 
     118             :     // offset=[1..maxLength]
     119           0 :     UBool containsOffset(int32_t offset) const {
     120           0 :         int32_t i=start+offset;
     121           0 :         if(i>=capacity) {
     122           0 :             i-=capacity;
     123             :         }
     124           0 :         return list[i];
     125             :     }
     126             : 
     127             :     // Find the lowest stored offset from a non-empty list, remove it,
     128             :     // and reduce all other offsets by this minimum.
     129             :     // Returns [1..maxLength].
     130           0 :     int32_t popMinimum() {
     131             :         // Look for the next offset in list[start+1..capacity-1].
     132           0 :         int32_t i=start, result;
     133           0 :         while(++i<capacity) {
     134           0 :             if(list[i]) {
     135           0 :                 list[i]=FALSE;
     136           0 :                 --length;
     137           0 :                 result=i-start;
     138           0 :                 start=i;
     139           0 :                 return result;
     140             :             }
     141             :         }
     142             :         // i==capacity
     143             : 
     144             :         // Wrap around and look for the next offset in list[0..start].
     145             :         // Since the list is not empty, there will be one.
     146           0 :         result=capacity-start;
     147           0 :         i=0;
     148           0 :         while(!list[i]) {
     149           0 :             ++i;
     150             :         }
     151           0 :         list[i]=FALSE;
     152           0 :         --length;
     153           0 :         start=i;
     154           0 :         return result+=i;
     155             :     }
     156             : 
     157             : private:
     158             :     UBool *list;
     159             :     int32_t capacity;
     160             :     int32_t length;
     161             :     int32_t start;
     162             : 
     163             :     UBool staticList[16];
     164             : };
     165             : 
     166             : // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
     167             : static int32_t
     168           0 : getUTF8Length(const UChar *s, int32_t length) {
     169           0 :     UErrorCode errorCode=U_ZERO_ERROR;
     170           0 :     int32_t length8=0;
     171           0 :     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
     172           0 :     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
     173           0 :         return length8;
     174             :     } else {
     175             :         // The string contains an unpaired surrogate.
     176             :         // Ignore this string.
     177           0 :         return 0;
     178             :     }
     179             : }
     180             : 
     181             : // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
     182             : static int32_t
     183           0 : appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
     184           0 :     UErrorCode errorCode=U_ZERO_ERROR;
     185           0 :     int32_t length8=0;
     186           0 :     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
     187           0 :     if(U_SUCCESS(errorCode)) {
     188           0 :         return length8;
     189             :     } else {
     190             :         // The string contains an unpaired surrogate.
     191             :         // Ignore this string.
     192           0 :         return 0;
     193             :     }
     194             : }
     195             : 
     196             : static inline uint8_t
     197           0 : makeSpanLengthByte(int32_t spanLength) {
     198             :     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
     199           0 :     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
     200             : }
     201             : 
     202             : // Construct for all variants of span(), or only for any one variant.
     203             : // Initialize as little as possible, for single use.
     204           0 : UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
     205             :                                            const UVector &setStrings,
     206           0 :                                            uint32_t which)
     207             :         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
     208             :           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
     209             :           utf8Length(0),
     210             :           maxLength16(0), maxLength8(0),
     211           0 :           all((UBool)(which==ALL)) {
     212           0 :     spanSet.retainAll(set);
     213           0 :     if(which&NOT_CONTAINED) {
     214             :         // Default to the same sets.
     215             :         // addToSpanNotSet() will create a separate set if necessary.
     216           0 :         pSpanNotSet=&spanSet;
     217             :     }
     218             : 
     219             :     // Determine if the strings even need to be taken into account at all for span() etc.
     220             :     // If any string is relevant, then all strings need to be used for
     221             :     // span(longest match) but only the relevant ones for span(while contained).
     222             :     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
     223             :     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
     224             :     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
     225             :     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
     226           0 :     int32_t stringsLength=strings.size();
     227             : 
     228             :     int32_t i, spanLength;
     229           0 :     UBool someRelevant=FALSE;
     230           0 :     for(i=0; i<stringsLength; ++i) {
     231           0 :         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
     232           0 :         const UChar *s16=string.getBuffer();
     233           0 :         int32_t length16=string.length();
     234             :         UBool thisRelevant;
     235           0 :         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
     236           0 :         if(spanLength<length16) {  // Relevant string.
     237           0 :             someRelevant=thisRelevant=TRUE;
     238             :         } else {
     239           0 :             thisRelevant=FALSE;
     240             :         }
     241           0 :         if((which&UTF16) && length16>maxLength16) {
     242           0 :             maxLength16=length16;
     243             :         }
     244           0 :         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
     245           0 :             int32_t length8=getUTF8Length(s16, length16);
     246           0 :             utf8Length+=length8;
     247           0 :             if(length8>maxLength8) {
     248           0 :                 maxLength8=length8;
     249             :             }
     250             :         }
     251             :     }
     252           0 :     if(!someRelevant) {
     253           0 :         maxLength16=maxLength8=0;
     254           0 :         return;
     255             :     }
     256             : 
     257             :     // Freeze after checking for the need to use strings at all because freezing
     258             :     // a set takes some time and memory which are wasted if there are no relevant strings.
     259           0 :     if(all) {
     260           0 :         spanSet.freeze();
     261             :     }
     262             : 
     263             :     uint8_t *spanBackLengths;
     264             :     uint8_t *spanUTF8Lengths;
     265             :     uint8_t *spanBackUTF8Lengths;
     266             : 
     267             :     // Allocate a block of meta data.
     268             :     int32_t allocSize;
     269           0 :     if(all) {
     270             :         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
     271           0 :         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
     272             :     } else {
     273           0 :         allocSize=stringsLength;  // One set of span lengths.
     274           0 :         if(which&UTF8) {
     275             :             // UTF-8 lengths and UTF-8 strings.
     276           0 :             allocSize+=stringsLength*4+utf8Length;
     277             :         }
     278             :     }
     279           0 :     if(allocSize<=(int32_t)sizeof(staticLengths)) {
     280           0 :         utf8Lengths=staticLengths;
     281             :     } else {
     282           0 :         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
     283           0 :         if(utf8Lengths==NULL) {
     284           0 :             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
     285           0 :             return;  // Out of memory.
     286             :         }
     287             :     }
     288             : 
     289           0 :     if(all) {
     290             :         // Store span lengths for all span() variants.
     291           0 :         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
     292           0 :         spanBackLengths=spanLengths+stringsLength;
     293           0 :         spanUTF8Lengths=spanBackLengths+stringsLength;
     294           0 :         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
     295           0 :         utf8=spanBackUTF8Lengths+stringsLength;
     296             :     } else {
     297             :         // Store span lengths for only one span() variant.
     298           0 :         if(which&UTF8) {
     299           0 :             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
     300           0 :             utf8=spanLengths+stringsLength;
     301             :         } else {
     302           0 :             spanLengths=(uint8_t *)utf8Lengths;
     303             :         }
     304           0 :         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
     305             :     }
     306             : 
     307             :     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
     308           0 :     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
     309             : 
     310           0 :     for(i=0; i<stringsLength; ++i) {
     311           0 :         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
     312           0 :         const UChar *s16=string.getBuffer();
     313           0 :         int32_t length16=string.length();
     314           0 :         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
     315           0 :         if(spanLength<length16) {  // Relevant string.
     316           0 :             if(which&UTF16) {
     317           0 :                 if(which&CONTAINED) {
     318           0 :                     if(which&FWD) {
     319           0 :                         spanLengths[i]=makeSpanLengthByte(spanLength);
     320             :                     }
     321           0 :                     if(which&BACK) {
     322           0 :                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
     323           0 :                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
     324             :                     }
     325             :                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
     326           0 :                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
     327             :                 }
     328             :             }
     329           0 :             if(which&UTF8) {
     330           0 :                 uint8_t *s8=utf8+utf8Count;
     331           0 :                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
     332           0 :                 utf8Count+=utf8Lengths[i]=length8;
     333           0 :                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
     334           0 :                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
     335             :                 } else {  // Relevant for UTF-8.
     336           0 :                     if(which&CONTAINED) {
     337           0 :                         if(which&FWD) {
     338           0 :                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
     339           0 :                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
     340             :                         }
     341           0 :                         if(which&BACK) {
     342           0 :                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
     343           0 :                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
     344             :                         }
     345             :                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
     346           0 :                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
     347             :                     }
     348             :                 }
     349             :             }
     350           0 :             if(which&NOT_CONTAINED) {
     351             :                 // Add string start and end code points to the spanNotSet so that
     352             :                 // a span(while not contained) stops before any string.
     353             :                 UChar32 c;
     354           0 :                 if(which&FWD) {
     355           0 :                     int32_t len=0;
     356           0 :                     U16_NEXT(s16, len, length16, c);
     357           0 :                     addToSpanNotSet(c);
     358             :                 }
     359           0 :                 if(which&BACK) {
     360           0 :                     int32_t len=length16;
     361           0 :                     U16_PREV(s16, 0, len, c);
     362           0 :                     addToSpanNotSet(c);
     363             :                 }
     364             :             }
     365             :         } else {  // Irrelevant string.
     366           0 :             if(which&UTF8) {
     367           0 :                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
     368           0 :                     uint8_t *s8=utf8+utf8Count;
     369           0 :                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
     370           0 :                     utf8Count+=utf8Lengths[i]=length8;
     371             :                 } else {
     372           0 :                     utf8Lengths[i]=0;
     373             :                 }
     374             :             }
     375           0 :             if(all) {
     376           0 :                 spanLengths[i]=spanBackLengths[i]=
     377           0 :                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
     378             :                         (uint8_t)ALL_CP_CONTAINED;
     379             :             } else {
     380             :                 // All spanXYZLengths pointers contain the same address.
     381           0 :                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
     382             :             }
     383             :         }
     384             :     }
     385             : 
     386             :     // Finish.
     387           0 :     if(all) {
     388           0 :         pSpanNotSet->freeze();
     389             :     }
     390             : }
     391             : 
     392             : // Copy constructor. Assumes which==ALL for a frozen set.
     393           0 : UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
     394           0 :                                            const UVector &newParentSetStrings)
     395             :         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
     396             :           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
     397           0 :           utf8Length(otherStringSpan.utf8Length),
     398           0 :           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
     399           0 :           all(TRUE) {
     400           0 :     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
     401           0 :         pSpanNotSet=&spanSet;
     402             :     } else {
     403           0 :         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
     404             :     }
     405             : 
     406             :     // Allocate a block of meta data.
     407             :     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
     408           0 :     int32_t stringsLength=strings.size();
     409           0 :     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
     410           0 :     if(allocSize<=(int32_t)sizeof(staticLengths)) {
     411           0 :         utf8Lengths=staticLengths;
     412             :     } else {
     413           0 :         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
     414           0 :         if(utf8Lengths==NULL) {
     415           0 :             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
     416           0 :             return;  // Out of memory.
     417             :         }
     418             :     }
     419             : 
     420           0 :     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
     421           0 :     utf8=spanLengths+stringsLength*4;
     422           0 :     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
     423             : }
     424             : 
     425           0 : UnicodeSetStringSpan::~UnicodeSetStringSpan() {
     426           0 :     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
     427           0 :         delete pSpanNotSet;
     428             :     }
     429           0 :     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
     430           0 :         uprv_free(utf8Lengths);
     431             :     }
     432           0 : }
     433             : 
     434           0 : void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
     435           0 :     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
     436           0 :         if(spanSet.contains(c)) {
     437           0 :             return;  // Nothing to do.
     438             :         }
     439           0 :         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
     440           0 :         if(newSet==NULL) {
     441           0 :             return;  // Out of memory.
     442             :         } else {
     443           0 :             pSpanNotSet=newSet;
     444             :         }
     445             :     }
     446           0 :     pSpanNotSet->add(c);
     447             : }
     448             : 
     449             : // Compare strings without any argument checks. Requires length>0.
     450             : static inline UBool
     451           0 : matches16(const UChar *s, const UChar *t, int32_t length) {
     452           0 :     do {
     453           0 :         if(*s++!=*t++) {
     454           0 :             return FALSE;
     455             :         }
     456             :     } while(--length>0);
     457           0 :     return TRUE;
     458             : }
     459             : 
     460             : static inline UBool
     461           0 : matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
     462           0 :     do {
     463           0 :         if(*s++!=*t++) {
     464           0 :             return FALSE;
     465             :         }
     466             :     } while(--length>0);
     467           0 :     return TRUE;
     468             : }
     469             : 
     470             : // Compare 16-bit Unicode strings (which may be malformed UTF-16)
     471             : // at code point boundaries.
     472             : // That is, each edge of a match must not be in the middle of a surrogate pair.
     473             : static inline UBool
     474           0 : matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
     475           0 :     s+=start;
     476           0 :     limit-=start;
     477           0 :     return matches16(s, t, length) &&
     478           0 :            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
     479           0 :            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
     480             : }
     481             : 
     482             : // Does the set contain the next code point?
     483             : // If so, return its length; otherwise return its negative length.
     484             : static inline int32_t
     485           0 : spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
     486           0 :     UChar c=*s, c2;
     487           0 :     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
     488           0 :         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
     489             :     }
     490           0 :     return set.contains(c) ? 1 : -1;
     491             : }
     492             : 
     493             : static inline int32_t
     494           0 : spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
     495           0 :     UChar c=s[length-1], c2;
     496           0 :     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
     497           0 :         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
     498             :     }
     499           0 :     return set.contains(c) ? 1 : -1;
     500             : }
     501             : 
     502             : static inline int32_t
     503           0 : spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
     504           0 :     UChar32 c=*s;
     505           0 :     if((int8_t)c>=0) {
     506           0 :         return set.contains(c) ? 1 : -1;
     507             :     }
     508             :     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
     509           0 :     int32_t i=0;
     510           0 :     U8_NEXT_OR_FFFD(s, i, length, c);
     511           0 :     return set.contains(c) ? i : -i;
     512             : }
     513             : 
     514             : static inline int32_t
     515           0 : spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
     516           0 :     UChar32 c=s[length-1];
     517           0 :     if((int8_t)c>=0) {
     518           0 :         return set.contains(c) ? 1 : -1;
     519             :     }
     520           0 :     int32_t i=length-1;
     521           0 :     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
     522           0 :     length-=i;
     523           0 :     return set.contains(c) ? length : -length;
     524             : }
     525             : 
     526             : /*
     527             :  * Note: In span() when spanLength==0 (after a string match, or at the beginning
     528             :  * after an empty code point span) and in spanNot() and spanNotUTF8(),
     529             :  * string matching could use a binary search
     530             :  * because all string matches are done from the same start index.
     531             :  *
     532             :  * For UTF-8, this would require a comparison function that returns UTF-16 order.
     533             :  *
     534             :  * This optimization should not be necessary for normal UnicodeSets because
     535             :  * most sets have no strings, and most sets with strings have
     536             :  * very few very short strings.
     537             :  * For cases with many strings, it might be better to use a different API
     538             :  * and implementation with a DFA (state machine).
     539             :  */
     540             : 
     541             : /*
     542             :  * Algorithm for span(USET_SPAN_CONTAINED)
     543             :  *
     544             :  * Theoretical algorithm:
     545             :  * - Iterate through the string, and at each code point boundary:
     546             :  *   + If the code point there is in the set, then remember to continue after it.
     547             :  *   + If a set string matches at the current position, then remember to continue after it.
     548             :  *   + Either recursively span for each code point or string match,
     549             :  *     or recursively span for all but the shortest one and
     550             :  *     iteratively continue the span with the shortest local match.
     551             :  *   + Remember the longest recursive span (the farthest end point).
     552             :  *   + If there is no match at the current position, neither for the code point there
     553             :  *     nor for any set string, then stop and return the longest recursive span length.
     554             :  *
     555             :  * Optimized implementation:
     556             :  *
     557             :  * (We assume that most sets will have very few very short strings.
     558             :  * A span using a string-less set is extremely fast.)
     559             :  *
     560             :  * Create and cache a spanSet which contains all of the single code points
     561             :  * of the original set but none of its strings.
     562             :  *
     563             :  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
     564             :  * - Loop:
     565             :  *   + Try to match each set string at the end of the spanLength.
     566             :  *     ~ Set strings that start with set-contained code points must be matched
     567             :  *       with a partial overlap because the recursive algorithm would have tried
     568             :  *       to match them at every position.
     569             :  *     ~ Set strings that entirely consist of set-contained code points
     570             :  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
     571             :  *       recursive algorithm would continue after them anyway
     572             :  *       and find the longest recursive match from their end.
     573             :  *     ~ Rather than recursing, note each end point of a set string match.
     574             :  *   + If no set string matched after spanSet.span(), then return
     575             :  *     with where the spanSet.span() ended.
     576             :  *   + If at least one set string matched after spanSet.span(), then
     577             :  *     pop the shortest string match end point and continue
     578             :  *     the loop, trying to match all set strings from there.
     579             :  *   + If at least one more set string matched after a previous string match,
     580             :  *     then test if the code point after the previous string match is also
     581             :  *     contained in the set.
     582             :  *     Continue the loop with the shortest end point of either this code point
     583             :  *     or a matching set string.
     584             :  *   + If no more set string matched after a previous string match,
     585             :  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
     586             :  *     Stop if spanLength==0, otherwise continue the loop.
     587             :  *
     588             :  * By noting each end point of a set string match,
     589             :  * the function visits each string position at most once and finishes
     590             :  * in linear time.
     591             :  *
     592             :  * The recursive algorithm may visit the same string position many times
     593             :  * if multiple paths lead to it and finishes in exponential time.
     594             :  */
     595             : 
     596             : /*
     597             :  * Algorithm for span(USET_SPAN_SIMPLE)
     598             :  *
     599             :  * Theoretical algorithm:
     600             :  * - Iterate through the string, and at each code point boundary:
     601             :  *   + If the code point there is in the set, then remember to continue after it.
     602             :  *   + If a set string matches at the current position, then remember to continue after it.
     603             :  *   + Continue from the farthest match position and ignore all others.
     604             :  *   + If there is no match at the current position,
     605             :  *     then stop and return the current position.
     606             :  *
     607             :  * Optimized implementation:
     608             :  *
     609             :  * (Same assumption and spanSet as above.)
     610             :  *
     611             :  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
     612             :  * - Loop:
     613             :  *   + Try to match each set string at the end of the spanLength.
     614             :  *     ~ Set strings that start with set-contained code points must be matched
     615             :  *       with a partial overlap because the standard algorithm would have tried
     616             :  *       to match them earlier.
     617             :  *     ~ Set strings that entirely consist of set-contained code points
     618             :  *       must be matched with a full overlap because the longest-match algorithm
     619             :  *       would hide set string matches that end earlier.
     620             :  *       Such set strings need not be matched earlier inside the code point span
     621             :  *       because the standard algorithm would then have continued after
     622             :  *       the set string match anyway.
     623             :  *     ~ Remember the longest set string match (farthest end point) from the earliest
     624             :  *       starting point.
     625             :  *   + If no set string matched after spanSet.span(), then return
     626             :  *     with where the spanSet.span() ended.
     627             :  *   + If at least one set string matched, then continue the loop after the
     628             :  *     longest match from the earliest position.
     629             :  *   + If no more set string matched after a previous string match,
     630             :  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
     631             :  *     Stop if spanLength==0, otherwise continue the loop.
     632             :  */
     633             : 
     634           0 : int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
     635           0 :     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
     636           0 :         return spanNot(s, length);
     637             :     }
     638           0 :     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
     639           0 :     if(spanLength==length) {
     640           0 :         return length;
     641             :     }
     642             : 
     643             :     // Consider strings; they may overlap with the span.
     644           0 :     OffsetList offsets;
     645           0 :     if(spanCondition==USET_SPAN_CONTAINED) {
     646             :         // Use offset list to try all possibilities.
     647           0 :         offsets.setMaxLength(maxLength16);
     648             :     }
     649           0 :     int32_t pos=spanLength, rest=length-pos;
     650           0 :     int32_t i, stringsLength=strings.size();
     651             :     for(;;) {
     652           0 :         if(spanCondition==USET_SPAN_CONTAINED) {
     653           0 :             for(i=0; i<stringsLength; ++i) {
     654           0 :                 int32_t overlap=spanLengths[i];
     655           0 :                 if(overlap==ALL_CP_CONTAINED) {
     656           0 :                     continue;  // Irrelevant string.
     657             :                 }
     658           0 :                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
     659           0 :                 const UChar *s16=string.getBuffer();
     660           0 :                 int32_t length16=string.length();
     661             : 
     662             :                 // Try to match this string at pos-overlap..pos.
     663           0 :                 if(overlap>=LONG_SPAN) {
     664           0 :                     overlap=length16;
     665             :                     // While contained: No point matching fully inside the code point span.
     666           0 :                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
     667             :                 }
     668           0 :                 if(overlap>spanLength) {
     669           0 :                     overlap=spanLength;
     670             :                 }
     671           0 :                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
     672             :                 for(;;) {
     673           0 :                     if(inc>rest) {
     674           0 :                         break;
     675             :                     }
     676             :                     // Try to match if the increment is not listed already.
     677           0 :                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
     678           0 :                         if(inc==rest) {
     679           0 :                             return length;  // Reached the end of the string.
     680             :                         }
     681           0 :                         offsets.addOffset(inc);
     682             :                     }
     683           0 :                     if(overlap==0) {
     684           0 :                         break;
     685             :                     }
     686           0 :                     --overlap;
     687           0 :                     ++inc;
     688             :                 }
     689             :             }
     690             :         } else /* USET_SPAN_SIMPLE */ {
     691           0 :             int32_t maxInc=0, maxOverlap=0;
     692           0 :             for(i=0; i<stringsLength; ++i) {
     693           0 :                 int32_t overlap=spanLengths[i];
     694             :                 // For longest match, we do need to try to match even an all-contained string
     695             :                 // to find the match from the earliest start.
     696             : 
     697           0 :                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
     698           0 :                 const UChar *s16=string.getBuffer();
     699           0 :                 int32_t length16=string.length();
     700             : 
     701             :                 // Try to match this string at pos-overlap..pos.
     702           0 :                 if(overlap>=LONG_SPAN) {
     703           0 :                     overlap=length16;
     704             :                     // Longest match: Need to match fully inside the code point span
     705             :                     // to find the match from the earliest start.
     706             :                 }
     707           0 :                 if(overlap>spanLength) {
     708           0 :                     overlap=spanLength;
     709             :                 }
     710           0 :                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
     711             :                 for(;;) {
     712           0 :                     if(inc>rest || overlap<maxOverlap) {
     713             :                         break;
     714             :                     }
     715             :                     // Try to match if the string is longer or starts earlier.
     716           0 :                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
     717           0 :                         matches16CPB(s, pos-overlap, length, s16, length16)
     718             :                     ) {
     719           0 :                         maxInc=inc;  // Longest match from earliest start.
     720           0 :                         maxOverlap=overlap;
     721           0 :                         break;
     722             :                     }
     723           0 :                     --overlap;
     724           0 :                     ++inc;
     725             :                 }
     726             :             }
     727             : 
     728           0 :             if(maxInc!=0 || maxOverlap!=0) {
     729             :                 // Longest-match algorithm, and there was a string match.
     730             :                 // Simply continue after it.
     731           0 :                 pos+=maxInc;
     732           0 :                 rest-=maxInc;
     733           0 :                 if(rest==0) {
     734           0 :                     return length;  // Reached the end of the string.
     735             :                 }
     736           0 :                 spanLength=0;  // Match strings from after a string match.
     737           0 :                 continue;
     738             :             }
     739             :         }
     740             :         // Finished trying to match all strings at pos.
     741             : 
     742           0 :         if(spanLength!=0 || pos==0) {
     743             :             // The position is after an unlimited code point span (spanLength!=0),
     744             :             // not after a string match.
     745             :             // The only position where spanLength==0 after a span is pos==0.
     746             :             // Otherwise, an unlimited code point span is only tried again when no
     747             :             // strings match, and if such a non-initial span fails we stop.
     748           0 :             if(offsets.isEmpty()) {
     749           0 :                 return pos;  // No strings matched after a span.
     750             :             }
     751             :             // Match strings from after the next string match.
     752             :         } else {
     753             :             // The position is after a string match (or a single code point).
     754           0 :             if(offsets.isEmpty()) {
     755             :                 // No more strings matched after a previous string match.
     756             :                 // Try another code point span from after the last string match.
     757           0 :                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
     758           0 :                 if( spanLength==rest || // Reached the end of the string, or
     759             :                     spanLength==0       // neither strings nor span progressed.
     760             :                 ) {
     761           0 :                     return pos+spanLength;
     762             :                 }
     763           0 :                 pos+=spanLength;
     764           0 :                 rest-=spanLength;
     765           0 :                 continue;  // spanLength>0: Match strings from after a span.
     766             :             } else {
     767             :                 // Try to match only one code point from after a string match if some
     768             :                 // string matched beyond it, so that we try all possible positions
     769             :                 // and don't overshoot.
     770           0 :                 spanLength=spanOne(spanSet, s+pos, rest);
     771           0 :                 if(spanLength>0) {
     772           0 :                     if(spanLength==rest) {
     773           0 :                         return length;  // Reached the end of the string.
     774             :                     }
     775             :                     // Match strings after this code point.
     776             :                     // There cannot be any increments below it because UnicodeSet strings
     777             :                     // contain multiple code points.
     778           0 :                     pos+=spanLength;
     779           0 :                     rest-=spanLength;
     780           0 :                     offsets.shift(spanLength);
     781           0 :                     spanLength=0;
     782           0 :                     continue;  // Match strings from after a single code point.
     783             :                 }
     784             :                 // Match strings from after the next string match.
     785             :             }
     786             :         }
     787           0 :         int32_t minOffset=offsets.popMinimum();
     788           0 :         pos+=minOffset;
     789           0 :         rest-=minOffset;
     790           0 :         spanLength=0;  // Match strings from after a string match.
     791           0 :     }
     792             : }
     793             : 
     794           0 : int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
     795           0 :     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
     796           0 :         return spanNotBack(s, length);
     797             :     }
     798           0 :     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
     799           0 :     if(pos==0) {
     800           0 :         return 0;
     801             :     }
     802           0 :     int32_t spanLength=length-pos;
     803             : 
     804             :     // Consider strings; they may overlap with the span.
     805           0 :     OffsetList offsets;
     806           0 :     if(spanCondition==USET_SPAN_CONTAINED) {
     807             :         // Use offset list to try all possibilities.
     808           0 :         offsets.setMaxLength(maxLength16);
     809             :     }
     810           0 :     int32_t i, stringsLength=strings.size();
     811           0 :     uint8_t *spanBackLengths=spanLengths;
     812           0 :     if(all) {
     813           0 :         spanBackLengths+=stringsLength;
     814             :     }
     815             :     for(;;) {
     816           0 :         if(spanCondition==USET_SPAN_CONTAINED) {
     817           0 :             for(i=0; i<stringsLength; ++i) {
     818           0 :                 int32_t overlap=spanBackLengths[i];
     819           0 :                 if(overlap==ALL_CP_CONTAINED) {
     820           0 :                     continue;  // Irrelevant string.
     821             :                 }
     822           0 :                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
     823           0 :                 const UChar *s16=string.getBuffer();
     824           0 :                 int32_t length16=string.length();
     825             : 
     826             :                 // Try to match this string at pos-(length16-overlap)..pos-length16.
     827           0 :                 if(overlap>=LONG_SPAN) {
     828           0 :                     overlap=length16;
     829             :                     // While contained: No point matching fully inside the code point span.
     830           0 :                     int32_t len1=0;
     831           0 :                     U16_FWD_1(s16, len1, overlap);
     832           0 :                     overlap-=len1;  // Length of the string minus the first code point.
     833             :                 }
     834           0 :                 if(overlap>spanLength) {
     835           0 :                     overlap=spanLength;
     836             :                 }
     837           0 :                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
     838             :                 for(;;) {
     839           0 :                     if(dec>pos) {
     840           0 :                         break;
     841             :                     }
     842             :                     // Try to match if the decrement is not listed already.
     843           0 :                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
     844           0 :                         if(dec==pos) {
     845           0 :                             return 0;  // Reached the start of the string.
     846             :                         }
     847           0 :                         offsets.addOffset(dec);
     848             :                     }
     849           0 :                     if(overlap==0) {
     850           0 :                         break;
     851             :                     }
     852           0 :                     --overlap;
     853           0 :                     ++dec;
     854             :                 }
     855             :             }
     856             :         } else /* USET_SPAN_SIMPLE */ {
     857           0 :             int32_t maxDec=0, maxOverlap=0;
     858           0 :             for(i=0; i<stringsLength; ++i) {
     859           0 :                 int32_t overlap=spanBackLengths[i];
     860             :                 // For longest match, we do need to try to match even an all-contained string
     861             :                 // to find the match from the latest end.
     862             : 
     863           0 :                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
     864           0 :                 const UChar *s16=string.getBuffer();
     865           0 :                 int32_t length16=string.length();
     866             : 
     867             :                 // Try to match this string at pos-(length16-overlap)..pos-length16.
     868           0 :                 if(overlap>=LONG_SPAN) {
     869           0 :                     overlap=length16;
     870             :                     // Longest match: Need to match fully inside the code point span
     871             :                     // to find the match from the latest end.
     872             :                 }
     873           0 :                 if(overlap>spanLength) {
     874           0 :                     overlap=spanLength;
     875             :                 }
     876           0 :                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
     877             :                 for(;;) {
     878           0 :                     if(dec>pos || overlap<maxOverlap) {
     879             :                         break;
     880             :                     }
     881             :                     // Try to match if the string is longer or ends later.
     882           0 :                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
     883           0 :                         matches16CPB(s, pos-dec, length, s16, length16)
     884             :                     ) {
     885           0 :                         maxDec=dec;  // Longest match from latest end.
     886           0 :                         maxOverlap=overlap;
     887           0 :                         break;
     888             :                     }
     889           0 :                     --overlap;
     890           0 :                     ++dec;
     891             :                 }
     892             :             }
     893             : 
     894           0 :             if(maxDec!=0 || maxOverlap!=0) {
     895             :                 // Longest-match algorithm, and there was a string match.
     896             :                 // Simply continue before it.
     897           0 :                 pos-=maxDec;
     898           0 :                 if(pos==0) {
     899           0 :                     return 0;  // Reached the start of the string.
     900             :                 }
     901           0 :                 spanLength=0;  // Match strings from before a string match.
     902           0 :                 continue;
     903             :             }
     904             :         }
     905             :         // Finished trying to match all strings at pos.
     906             : 
     907           0 :         if(spanLength!=0 || pos==length) {
     908             :             // The position is before an unlimited code point span (spanLength!=0),
     909             :             // not before a string match.
     910             :             // The only position where spanLength==0 before a span is pos==length.
     911             :             // Otherwise, an unlimited code point span is only tried again when no
     912             :             // strings match, and if such a non-initial span fails we stop.
     913           0 :             if(offsets.isEmpty()) {
     914           0 :                 return pos;  // No strings matched before a span.
     915             :             }
     916             :             // Match strings from before the next string match.
     917             :         } else {
     918             :             // The position is before a string match (or a single code point).
     919           0 :             if(offsets.isEmpty()) {
     920             :                 // No more strings matched before a previous string match.
     921             :                 // Try another code point span from before the last string match.
     922           0 :                 int32_t oldPos=pos;
     923           0 :                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
     924           0 :                 spanLength=oldPos-pos;
     925           0 :                 if( pos==0 ||           // Reached the start of the string, or
     926             :                     spanLength==0       // neither strings nor span progressed.
     927             :                 ) {
     928           0 :                     return pos;
     929             :                 }
     930           0 :                 continue;  // spanLength>0: Match strings from before a span.
     931             :             } else {
     932             :                 // Try to match only one code point from before a string match if some
     933             :                 // string matched beyond it, so that we try all possible positions
     934             :                 // and don't overshoot.
     935           0 :                 spanLength=spanOneBack(spanSet, s, pos);
     936           0 :                 if(spanLength>0) {
     937           0 :                     if(spanLength==pos) {
     938           0 :                         return 0;  // Reached the start of the string.
     939             :                     }
     940             :                     // Match strings before this code point.
     941             :                     // There cannot be any decrements below it because UnicodeSet strings
     942             :                     // contain multiple code points.
     943           0 :                     pos-=spanLength;
     944           0 :                     offsets.shift(spanLength);
     945           0 :                     spanLength=0;
     946           0 :                     continue;  // Match strings from before a single code point.
     947             :                 }
     948             :                 // Match strings from before the next string match.
     949             :             }
     950             :         }
     951           0 :         pos-=offsets.popMinimum();
     952           0 :         spanLength=0;  // Match strings from before a string match.
     953           0 :     }
     954             : }
     955             : 
     956           0 : int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
     957           0 :     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
     958           0 :         return spanNotUTF8(s, length);
     959             :     }
     960           0 :     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
     961           0 :     if(spanLength==length) {
     962           0 :         return length;
     963             :     }
     964             : 
     965             :     // Consider strings; they may overlap with the span.
     966           0 :     OffsetList offsets;
     967           0 :     if(spanCondition==USET_SPAN_CONTAINED) {
     968             :         // Use offset list to try all possibilities.
     969           0 :         offsets.setMaxLength(maxLength8);
     970             :     }
     971           0 :     int32_t pos=spanLength, rest=length-pos;
     972           0 :     int32_t i, stringsLength=strings.size();
     973           0 :     uint8_t *spanUTF8Lengths=spanLengths;
     974           0 :     if(all) {
     975           0 :         spanUTF8Lengths+=2*stringsLength;
     976             :     }
     977             :     for(;;) {
     978           0 :         const uint8_t *s8=utf8;
     979             :         int32_t length8;
     980           0 :         if(spanCondition==USET_SPAN_CONTAINED) {
     981           0 :             for(i=0; i<stringsLength; ++i) {
     982           0 :                 length8=utf8Lengths[i];
     983           0 :                 if(length8==0) {
     984           0 :                     continue;  // String not representable in UTF-8.
     985             :                 }
     986           0 :                 int32_t overlap=spanUTF8Lengths[i];
     987           0 :                 if(overlap==ALL_CP_CONTAINED) {
     988           0 :                     s8+=length8;
     989           0 :                     continue;  // Irrelevant string.
     990             :                 }
     991             : 
     992             :                 // Try to match this string at pos-overlap..pos.
     993           0 :                 if(overlap>=LONG_SPAN) {
     994           0 :                     overlap=length8;
     995             :                     // While contained: No point matching fully inside the code point span.
     996           0 :                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
     997             :                 }
     998           0 :                 if(overlap>spanLength) {
     999           0 :                     overlap=spanLength;
    1000             :                 }
    1001           0 :                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
    1002             :                 for(;;) {
    1003           0 :                     if(inc>rest) {
    1004           0 :                         break;
    1005             :                     }
    1006             :                     // Try to match if the increment is not listed already.
    1007             :                     // Match at code point boundaries. (The UTF-8 strings were converted
    1008             :                     // from UTF-16 and are guaranteed to be well-formed.)
    1009           0 :                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
    1010           0 :                         !offsets.containsOffset(inc) &&
    1011           0 :                         matches8(s+pos-overlap, s8, length8)
    1012             :                         
    1013             :                     ) {
    1014           0 :                         if(inc==rest) {
    1015           0 :                             return length;  // Reached the end of the string.
    1016             :                         }
    1017           0 :                         offsets.addOffset(inc);
    1018             :                     }
    1019           0 :                     if(overlap==0) {
    1020           0 :                         break;
    1021             :                     }
    1022           0 :                     --overlap;
    1023           0 :                     ++inc;
    1024             :                 }
    1025           0 :                 s8+=length8;
    1026             :             }
    1027             :         } else /* USET_SPAN_SIMPLE */ {
    1028           0 :             int32_t maxInc=0, maxOverlap=0;
    1029           0 :             for(i=0; i<stringsLength; ++i) {
    1030           0 :                 length8=utf8Lengths[i];
    1031           0 :                 if(length8==0) {
    1032           0 :                     continue;  // String not representable in UTF-8.
    1033             :                 }
    1034           0 :                 int32_t overlap=spanUTF8Lengths[i];
    1035             :                 // For longest match, we do need to try to match even an all-contained string
    1036             :                 // to find the match from the earliest start.
    1037             : 
    1038             :                 // Try to match this string at pos-overlap..pos.
    1039           0 :                 if(overlap>=LONG_SPAN) {
    1040           0 :                     overlap=length8;
    1041             :                     // Longest match: Need to match fully inside the code point span
    1042             :                     // to find the match from the earliest start.
    1043             :                 }
    1044           0 :                 if(overlap>spanLength) {
    1045           0 :                     overlap=spanLength;
    1046             :                 }
    1047           0 :                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
    1048             :                 for(;;) {
    1049           0 :                     if(inc>rest || overlap<maxOverlap) {
    1050             :                         break;
    1051             :                     }
    1052             :                     // Try to match if the string is longer or starts earlier.
    1053             :                     // Match at code point boundaries. (The UTF-8 strings were converted
    1054             :                     // from UTF-16 and are guaranteed to be well-formed.)
    1055           0 :                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
    1056           0 :                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
    1057           0 :                         matches8(s+pos-overlap, s8, length8)
    1058             :                         
    1059             :                     ) {
    1060           0 :                         maxInc=inc;  // Longest match from earliest start.
    1061           0 :                         maxOverlap=overlap;
    1062           0 :                         break;
    1063             :                     }
    1064           0 :                     --overlap;
    1065           0 :                     ++inc;
    1066             :                 }
    1067           0 :                 s8+=length8;
    1068             :             }
    1069             : 
    1070           0 :             if(maxInc!=0 || maxOverlap!=0) {
    1071             :                 // Longest-match algorithm, and there was a string match.
    1072             :                 // Simply continue after it.
    1073           0 :                 pos+=maxInc;
    1074           0 :                 rest-=maxInc;
    1075           0 :                 if(rest==0) {
    1076           0 :                     return length;  // Reached the end of the string.
    1077             :                 }
    1078           0 :                 spanLength=0;  // Match strings from after a string match.
    1079           0 :                 continue;
    1080             :             }
    1081             :         }
    1082             :         // Finished trying to match all strings at pos.
    1083             : 
    1084           0 :         if(spanLength!=0 || pos==0) {
    1085             :             // The position is after an unlimited code point span (spanLength!=0),
    1086             :             // not after a string match.
    1087             :             // The only position where spanLength==0 after a span is pos==0.
    1088             :             // Otherwise, an unlimited code point span is only tried again when no
    1089             :             // strings match, and if such a non-initial span fails we stop.
    1090           0 :             if(offsets.isEmpty()) {
    1091           0 :                 return pos;  // No strings matched after a span.
    1092             :             }
    1093             :             // Match strings from after the next string match.
    1094             :         } else {
    1095             :             // The position is after a string match (or a single code point).
    1096           0 :             if(offsets.isEmpty()) {
    1097             :                 // No more strings matched after a previous string match.
    1098             :                 // Try another code point span from after the last string match.
    1099           0 :                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
    1100           0 :                 if( spanLength==rest || // Reached the end of the string, or
    1101             :                     spanLength==0       // neither strings nor span progressed.
    1102             :                 ) {
    1103           0 :                     return pos+spanLength;
    1104             :                 }
    1105           0 :                 pos+=spanLength;
    1106           0 :                 rest-=spanLength;
    1107           0 :                 continue;  // spanLength>0: Match strings from after a span.
    1108             :             } else {
    1109             :                 // Try to match only one code point from after a string match if some
    1110             :                 // string matched beyond it, so that we try all possible positions
    1111             :                 // and don't overshoot.
    1112           0 :                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
    1113           0 :                 if(spanLength>0) {
    1114           0 :                     if(spanLength==rest) {
    1115           0 :                         return length;  // Reached the end of the string.
    1116             :                     }
    1117             :                     // Match strings after this code point.
    1118             :                     // There cannot be any increments below it because UnicodeSet strings
    1119             :                     // contain multiple code points.
    1120           0 :                     pos+=spanLength;
    1121           0 :                     rest-=spanLength;
    1122           0 :                     offsets.shift(spanLength);
    1123           0 :                     spanLength=0;
    1124           0 :                     continue;  // Match strings from after a single code point.
    1125             :                 }
    1126             :                 // Match strings from after the next string match.
    1127             :             }
    1128             :         }
    1129           0 :         int32_t minOffset=offsets.popMinimum();
    1130           0 :         pos+=minOffset;
    1131           0 :         rest-=minOffset;
    1132           0 :         spanLength=0;  // Match strings from after a string match.
    1133           0 :     }
    1134             : }
    1135             : 
    1136           0 : int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
    1137           0 :     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
    1138           0 :         return spanNotBackUTF8(s, length);
    1139             :     }
    1140           0 :     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
    1141           0 :     if(pos==0) {
    1142           0 :         return 0;
    1143             :     }
    1144           0 :     int32_t spanLength=length-pos;
    1145             : 
    1146             :     // Consider strings; they may overlap with the span.
    1147           0 :     OffsetList offsets;
    1148           0 :     if(spanCondition==USET_SPAN_CONTAINED) {
    1149             :         // Use offset list to try all possibilities.
    1150           0 :         offsets.setMaxLength(maxLength8);
    1151             :     }
    1152           0 :     int32_t i, stringsLength=strings.size();
    1153           0 :     uint8_t *spanBackUTF8Lengths=spanLengths;
    1154           0 :     if(all) {
    1155           0 :         spanBackUTF8Lengths+=3*stringsLength;
    1156             :     }
    1157             :     for(;;) {
    1158           0 :         const uint8_t *s8=utf8;
    1159             :         int32_t length8;
    1160           0 :         if(spanCondition==USET_SPAN_CONTAINED) {
    1161           0 :             for(i=0; i<stringsLength; ++i) {
    1162           0 :                 length8=utf8Lengths[i];
    1163           0 :                 if(length8==0) {
    1164           0 :                     continue;  // String not representable in UTF-8.
    1165             :                 }
    1166           0 :                 int32_t overlap=spanBackUTF8Lengths[i];
    1167           0 :                 if(overlap==ALL_CP_CONTAINED) {
    1168           0 :                     s8+=length8;
    1169           0 :                     continue;  // Irrelevant string.
    1170             :                 }
    1171             : 
    1172             :                 // Try to match this string at pos-(length8-overlap)..pos-length8.
    1173           0 :                 if(overlap>=LONG_SPAN) {
    1174           0 :                     overlap=length8;
    1175             :                     // While contained: No point matching fully inside the code point span.
    1176           0 :                     int32_t len1=0;
    1177           0 :                     U8_FWD_1(s8, len1, overlap);
    1178           0 :                     overlap-=len1;  // Length of the string minus the first code point.
    1179             :                 }
    1180           0 :                 if(overlap>spanLength) {
    1181           0 :                     overlap=spanLength;
    1182             :                 }
    1183           0 :                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
    1184             :                 for(;;) {
    1185           0 :                     if(dec>pos) {
    1186           0 :                         break;
    1187             :                     }
    1188             :                     // Try to match if the decrement is not listed already.
    1189             :                     // Match at code point boundaries. (The UTF-8 strings were converted
    1190             :                     // from UTF-16 and are guaranteed to be well-formed.)
    1191           0 :                     if( !U8_IS_TRAIL(s[pos-dec]) &&
    1192           0 :                         !offsets.containsOffset(dec) &&
    1193           0 :                         matches8(s+pos-dec, s8, length8)
    1194             :                     ) {
    1195           0 :                         if(dec==pos) {
    1196           0 :                             return 0;  // Reached the start of the string.
    1197             :                         }
    1198           0 :                         offsets.addOffset(dec);
    1199             :                     }
    1200           0 :                     if(overlap==0) {
    1201           0 :                         break;
    1202             :                     }
    1203           0 :                     --overlap;
    1204           0 :                     ++dec;
    1205             :                 }
    1206           0 :                 s8+=length8;
    1207             :             }
    1208             :         } else /* USET_SPAN_SIMPLE */ {
    1209           0 :             int32_t maxDec=0, maxOverlap=0;
    1210           0 :             for(i=0; i<stringsLength; ++i) {
    1211           0 :                 length8=utf8Lengths[i];
    1212           0 :                 if(length8==0) {
    1213           0 :                     continue;  // String not representable in UTF-8.
    1214             :                 }
    1215           0 :                 int32_t overlap=spanBackUTF8Lengths[i];
    1216             :                 // For longest match, we do need to try to match even an all-contained string
    1217             :                 // to find the match from the latest end.
    1218             : 
    1219             :                 // Try to match this string at pos-(length8-overlap)..pos-length8.
    1220           0 :                 if(overlap>=LONG_SPAN) {
    1221           0 :                     overlap=length8;
    1222             :                     // Longest match: Need to match fully inside the code point span
    1223             :                     // to find the match from the latest end.
    1224             :                 }
    1225           0 :                 if(overlap>spanLength) {
    1226           0 :                     overlap=spanLength;
    1227             :                 }
    1228           0 :                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
    1229             :                 for(;;) {
    1230           0 :                     if(dec>pos || overlap<maxOverlap) {
    1231             :                         break;
    1232             :                     }
    1233             :                     // Try to match if the string is longer or ends later.
    1234             :                     // Match at code point boundaries. (The UTF-8 strings were converted
    1235             :                     // from UTF-16 and are guaranteed to be well-formed.)
    1236           0 :                     if( !U8_IS_TRAIL(s[pos-dec]) &&
    1237           0 :                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
    1238           0 :                         matches8(s+pos-dec, s8, length8)
    1239             :                     ) {
    1240           0 :                         maxDec=dec;  // Longest match from latest end.
    1241           0 :                         maxOverlap=overlap;
    1242           0 :                         break;
    1243             :                     }
    1244           0 :                     --overlap;
    1245           0 :                     ++dec;
    1246             :                 }
    1247           0 :                 s8+=length8;
    1248             :             }
    1249             : 
    1250           0 :             if(maxDec!=0 || maxOverlap!=0) {
    1251             :                 // Longest-match algorithm, and there was a string match.
    1252             :                 // Simply continue before it.
    1253           0 :                 pos-=maxDec;
    1254           0 :                 if(pos==0) {
    1255           0 :                     return 0;  // Reached the start of the string.
    1256             :                 }
    1257           0 :                 spanLength=0;  // Match strings from before a string match.
    1258           0 :                 continue;
    1259             :             }
    1260             :         }
    1261             :         // Finished trying to match all strings at pos.
    1262             : 
    1263           0 :         if(spanLength!=0 || pos==length) {
    1264             :             // The position is before an unlimited code point span (spanLength!=0),
    1265             :             // not before a string match.
    1266             :             // The only position where spanLength==0 before a span is pos==length.
    1267             :             // Otherwise, an unlimited code point span is only tried again when no
    1268             :             // strings match, and if such a non-initial span fails we stop.
    1269           0 :             if(offsets.isEmpty()) {
    1270           0 :                 return pos;  // No strings matched before a span.
    1271             :             }
    1272             :             // Match strings from before the next string match.
    1273             :         } else {
    1274             :             // The position is before a string match (or a single code point).
    1275           0 :             if(offsets.isEmpty()) {
    1276             :                 // No more strings matched before a previous string match.
    1277             :                 // Try another code point span from before the last string match.
    1278           0 :                 int32_t oldPos=pos;
    1279           0 :                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
    1280           0 :                 spanLength=oldPos-pos;
    1281           0 :                 if( pos==0 ||           // Reached the start of the string, or
    1282             :                     spanLength==0       // neither strings nor span progressed.
    1283             :                 ) {
    1284           0 :                     return pos;
    1285             :                 }
    1286           0 :                 continue;  // spanLength>0: Match strings from before a span.
    1287             :             } else {
    1288             :                 // Try to match only one code point from before a string match if some
    1289             :                 // string matched beyond it, so that we try all possible positions
    1290             :                 // and don't overshoot.
    1291           0 :                 spanLength=spanOneBackUTF8(spanSet, s, pos);
    1292           0 :                 if(spanLength>0) {
    1293           0 :                     if(spanLength==pos) {
    1294           0 :                         return 0;  // Reached the start of the string.
    1295             :                     }
    1296             :                     // Match strings before this code point.
    1297             :                     // There cannot be any decrements below it because UnicodeSet strings
    1298             :                     // contain multiple code points.
    1299           0 :                     pos-=spanLength;
    1300           0 :                     offsets.shift(spanLength);
    1301           0 :                     spanLength=0;
    1302           0 :                     continue;  // Match strings from before a single code point.
    1303             :                 }
    1304             :                 // Match strings from before the next string match.
    1305             :             }
    1306             :         }
    1307           0 :         pos-=offsets.popMinimum();
    1308           0 :         spanLength=0;  // Match strings from before a string match.
    1309           0 :     }
    1310             : }
    1311             : 
    1312             : /*
    1313             :  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
    1314             :  *
    1315             :  * Theoretical algorithm:
    1316             :  * - Iterate through the string, and at each code point boundary:
    1317             :  *   + If the code point there is in the set, then return with the current position.
    1318             :  *   + If a set string matches at the current position, then return with the current position.
    1319             :  *
    1320             :  * Optimized implementation:
    1321             :  *
    1322             :  * (Same assumption as for span() above.)
    1323             :  *
    1324             :  * Create and cache a spanNotSet which contains all of the single code points
    1325             :  * of the original set but none of its strings.
    1326             :  * For each set string add its initial code point to the spanNotSet.
    1327             :  * (Also add its final code point for spanNotBack().)
    1328             :  *
    1329             :  * - Loop:
    1330             :  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
    1331             :  *   + If the current code point is in the original set, then
    1332             :  *     return the current position.
    1333             :  *   + If any set string matches at the current position, then
    1334             :  *     return the current position.
    1335             :  *   + If there is no match at the current position, neither for the code point there
    1336             :  *     nor for any set string, then skip this code point and continue the loop.
    1337             :  *     This happens for set-string-initial code points that were added to spanNotSet
    1338             :  *     when there is not actually a match for such a set string.
    1339             :  */
    1340             : 
    1341           0 : int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
    1342           0 :     int32_t pos=0, rest=length;
    1343           0 :     int32_t i, stringsLength=strings.size();
    1344           0 :     do {
    1345             :         // Span until we find a code point from the set,
    1346             :         // or a code point that starts or ends some string.
    1347           0 :         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
    1348           0 :         if(i==rest) {
    1349           0 :             return length;  // Reached the end of the string.
    1350             :         }
    1351           0 :         pos+=i;
    1352           0 :         rest-=i;
    1353             : 
    1354             :         // Check whether the current code point is in the original set,
    1355             :         // without the string starts and ends.
    1356           0 :         int32_t cpLength=spanOne(spanSet, s+pos, rest);
    1357           0 :         if(cpLength>0) {
    1358           0 :             return pos;  // There is a set element at pos.
    1359             :         }
    1360             : 
    1361             :         // Try to match the strings at pos.
    1362           0 :         for(i=0; i<stringsLength; ++i) {
    1363           0 :             if(spanLengths[i]==ALL_CP_CONTAINED) {
    1364           0 :                 continue;  // Irrelevant string.
    1365             :             }
    1366           0 :             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    1367           0 :             const UChar *s16=string.getBuffer();
    1368           0 :             int32_t length16=string.length();
    1369           0 :             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
    1370           0 :                 return pos;  // There is a set element at pos.
    1371             :             }
    1372             :         }
    1373             : 
    1374             :         // The span(while not contained) ended on a string start/end which is
    1375             :         // not in the original set. Skip this code point and continue.
    1376             :         // cpLength<0
    1377           0 :         pos-=cpLength;
    1378           0 :         rest+=cpLength;
    1379           0 :     } while(rest!=0);
    1380           0 :     return length;  // Reached the end of the string.
    1381             : }
    1382             : 
    1383           0 : int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
    1384           0 :     int32_t pos=length;
    1385           0 :     int32_t i, stringsLength=strings.size();
    1386           0 :     do {
    1387             :         // Span until we find a code point from the set,
    1388             :         // or a code point that starts or ends some string.
    1389           0 :         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
    1390           0 :         if(pos==0) {
    1391           0 :             return 0;  // Reached the start of the string.
    1392             :         }
    1393             : 
    1394             :         // Check whether the current code point is in the original set,
    1395             :         // without the string starts and ends.
    1396           0 :         int32_t cpLength=spanOneBack(spanSet, s, pos);
    1397           0 :         if(cpLength>0) {
    1398           0 :             return pos;  // There is a set element at pos.
    1399             :         }
    1400             : 
    1401             :         // Try to match the strings at pos.
    1402           0 :         for(i=0; i<stringsLength; ++i) {
    1403             :             // Use spanLengths rather than a spanBackLengths pointer because
    1404             :             // it is easier and we only need to know whether the string is irrelevant
    1405             :             // which is the same in either array.
    1406           0 :             if(spanLengths[i]==ALL_CP_CONTAINED) {
    1407           0 :                 continue;  // Irrelevant string.
    1408             :             }
    1409           0 :             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
    1410           0 :             const UChar *s16=string.getBuffer();
    1411           0 :             int32_t length16=string.length();
    1412           0 :             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
    1413           0 :                 return pos;  // There is a set element at pos.
    1414             :             }
    1415             :         }
    1416             : 
    1417             :         // The span(while not contained) ended on a string start/end which is
    1418             :         // not in the original set. Skip this code point and continue.
    1419             :         // cpLength<0
    1420           0 :         pos+=cpLength;
    1421           0 :     } while(pos!=0);
    1422           0 :     return 0;  // Reached the start of the string.
    1423             : }
    1424             : 
    1425           0 : int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
    1426           0 :     int32_t pos=0, rest=length;
    1427           0 :     int32_t i, stringsLength=strings.size();
    1428           0 :     uint8_t *spanUTF8Lengths=spanLengths;
    1429           0 :     if(all) {
    1430           0 :         spanUTF8Lengths+=2*stringsLength;
    1431             :     }
    1432           0 :     do {
    1433             :         // Span until we find a code point from the set,
    1434             :         // or a code point that starts or ends some string.
    1435           0 :         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
    1436           0 :         if(i==rest) {
    1437           0 :             return length;  // Reached the end of the string.
    1438             :         }
    1439           0 :         pos+=i;
    1440           0 :         rest-=i;
    1441             : 
    1442             :         // Check whether the current code point is in the original set,
    1443             :         // without the string starts and ends.
    1444           0 :         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
    1445           0 :         if(cpLength>0) {
    1446           0 :             return pos;  // There is a set element at pos.
    1447             :         }
    1448             : 
    1449             :         // Try to match the strings at pos.
    1450           0 :         const uint8_t *s8=utf8;
    1451             :         int32_t length8;
    1452           0 :         for(i=0; i<stringsLength; ++i) {
    1453           0 :             length8=utf8Lengths[i];
    1454             :             // ALL_CP_CONTAINED: Irrelevant string.
    1455           0 :             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
    1456           0 :                 return pos;  // There is a set element at pos.
    1457             :             }
    1458           0 :             s8+=length8;
    1459             :         }
    1460             : 
    1461             :         // The span(while not contained) ended on a string start/end which is
    1462             :         // not in the original set. Skip this code point and continue.
    1463             :         // cpLength<0
    1464           0 :         pos-=cpLength;
    1465           0 :         rest+=cpLength;
    1466           0 :     } while(rest!=0);
    1467           0 :     return length;  // Reached the end of the string.
    1468             : }
    1469             : 
    1470           0 : int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
    1471           0 :     int32_t pos=length;
    1472           0 :     int32_t i, stringsLength=strings.size();
    1473           0 :     uint8_t *spanBackUTF8Lengths=spanLengths;
    1474           0 :     if(all) {
    1475           0 :         spanBackUTF8Lengths+=3*stringsLength;
    1476             :     }
    1477           0 :     do {
    1478             :         // Span until we find a code point from the set,
    1479             :         // or a code point that starts or ends some string.
    1480           0 :         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
    1481           0 :         if(pos==0) {
    1482           0 :             return 0;  // Reached the start of the string.
    1483             :         }
    1484             : 
    1485             :         // Check whether the current code point is in the original set,
    1486             :         // without the string starts and ends.
    1487           0 :         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
    1488           0 :         if(cpLength>0) {
    1489           0 :             return pos;  // There is a set element at pos.
    1490             :         }
    1491             : 
    1492             :         // Try to match the strings at pos.
    1493           0 :         const uint8_t *s8=utf8;
    1494             :         int32_t length8;
    1495           0 :         for(i=0; i<stringsLength; ++i) {
    1496           0 :             length8=utf8Lengths[i];
    1497             :             // ALL_CP_CONTAINED: Irrelevant string.
    1498           0 :             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
    1499           0 :                 return pos;  // There is a set element at pos.
    1500             :             }
    1501           0 :             s8+=length8;
    1502             :         }
    1503             : 
    1504             :         // The span(while not contained) ended on a string start/end which is
    1505             :         // not in the original set. Skip this code point and continue.
    1506             :         // cpLength<0
    1507           0 :         pos+=cpLength;
    1508           0 :     } while(pos!=0);
    1509           0 :     return 0;  // Reached the start of the string.
    1510             : }
    1511             : 
    1512             : U_NAMESPACE_END

Generated by: LCOV version 1.13