LCOV - output.info - intl/icu/source/common/uniset.cpp

LCOV - code coverage report

Current view:	top level - intl/icu/source/common - uniset.cpp (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	990	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	91	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : **********************************************************************
       5             : *   Copyright (C) 1999-2015, International Business Machines
       6             : *   Corporation and others.  All Rights Reserved.
       7             : **********************************************************************
       8             : *   Date        Name        Description
       9             : *   10/20/99    alan        Creation.
      10             : **********************************************************************
      11             : */
      12             : 
      13             : #include "unicode/utypes.h"
      14             : #include "unicode/parsepos.h"
      15             : #include "unicode/symtable.h"
      16             : #include "unicode/uniset.h"
      17             : #include "unicode/utf8.h"
      18             : #include "unicode/utf16.h"
      19             : #include "ruleiter.h"
      20             : #include "cmemory.h"
      21             : #include "cstring.h"
      22             : #include "patternprops.h"
      23             : #include "uelement.h"
      24             : #include "util.h"
      25             : #include "uvector.h"
      26             : #include "charstr.h"
      27             : #include "ustrfmt.h"
      28             : #include "uassert.h"
      29             : #include "bmpset.h"
      30             : #include "unisetspan.h"
      31             : 
      32             : // Define UChar constants using hex for EBCDIC compatibility
      33             : // Used #define to reduce private static exports and memory access time.
      34             : #define SET_OPEN        ((UChar)0x005B) /*[*/
      35             : #define SET_CLOSE       ((UChar)0x005D) /*]*/
      36             : #define HYPHEN          ((UChar)0x002D) /*-*/
      37             : #define COMPLEMENT      ((UChar)0x005E) /*^*/
      38             : #define COLON           ((UChar)0x003A) /*:*/
      39             : #define BACKSLASH       ((UChar)0x005C) /*\*/
      40             : #define INTERSECTION    ((UChar)0x0026) /*&*/
      41             : #define UPPER_U         ((UChar)0x0055) /*U*/
      42             : #define LOWER_U         ((UChar)0x0075) /*u*/
      43             : #define OPEN_BRACE      ((UChar)123)    /*{*/
      44             : #define CLOSE_BRACE     ((UChar)125)    /*}*/
      45             : #define UPPER_P         ((UChar)0x0050) /*P*/
      46             : #define LOWER_P         ((UChar)0x0070) /*p*/
      47             : #define UPPER_N         ((UChar)78)     /*N*/
      48             : #define EQUALS          ((UChar)0x003D) /*=*/
      49             : 
      50             : // HIGH_VALUE > all valid values. 110000 for codepoints
      51             : #define UNICODESET_HIGH 0x0110000
      52             : 
      53             : // LOW <= all valid values. ZERO for codepoints
      54             : #define UNICODESET_LOW 0x000000
      55             : 
      56             : // initial storage. Must be >= 0
      57             : #define START_EXTRA 16
      58             : 
      59             : // extra amount for growth. Must be >= 0
      60             : #define GROW_EXTRA START_EXTRA
      61             : 
      62             : U_NAMESPACE_BEGIN
      63             : 
      64           0 : SymbolTable::~SymbolTable() {}
      65             : 
      66           0 : UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
      67             : 
      68             : /**
      69             :  * Modify the given UChar32 variable so that it is in range, by
      70             :  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
      71             :  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
      72             :  * It modifies its argument in-place and also returns it.
      73             :  */
      74           0 : static inline UChar32 pinCodePoint(UChar32& c) {
      75           0 :     if (c < UNICODESET_LOW) {
      76           0 :         c = UNICODESET_LOW;
      77           0 :     } else if (c > (UNICODESET_HIGH-1)) {
      78           0 :         c = (UNICODESET_HIGH-1);
      79             :     }
      80           0 :     return c;
      81             : }
      82             : 
      83             : //----------------------------------------------------------------
      84             : // Debugging
      85             : //----------------------------------------------------------------
      86             : 
      87             : // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
      88             : // To enable the debugging, define the symbol DEBUG_MEM in the line
      89             : // below.  This will result in text being sent to stdout that looks
      90             : // like this:
      91             : //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
      92             : //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
      93             : // Each line lists a construction (ct) or destruction (dt) event, the
      94             : // object address, the number of outstanding objects after the event,
      95             : // and the pattern of the object in question.
      96             : 
      97             : // #define DEBUG_MEM
      98             : 
      99             : #ifdef DEBUG_MEM
     100             : #include <stdio.h>
     101             : static int32_t _dbgCount = 0;
     102             : 
     103             : static inline void _dbgct(UnicodeSet* set) {
     104             :     UnicodeString str;
     105             :     set->toPattern(str, TRUE);
     106             :     char buf[40];
     107             :     str.extract(0, 39, buf, "");
     108             :     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
     109             : }
     110             : 
     111             : static inline void _dbgdt(UnicodeSet* set) {
     112             :     UnicodeString str;
     113             :     set->toPattern(str, TRUE);
     114             :     char buf[40];
     115             :     str.extract(0, 39, buf, "");
     116             :     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
     117             : }
     118             : 
     119             : #else
     120             : 
     121             : #define _dbgct(set)
     122             : #define _dbgdt(set)
     123             : 
     124             : #endif
     125             : 
     126             : //----------------------------------------------------------------
     127             : // UnicodeString in UVector support
     128             : //----------------------------------------------------------------
     129             : 
     130           0 : static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
     131           0 :     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
     132           0 : }
     133             : 
     134           0 : static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
     135           0 :     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
     136           0 :     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
     137           0 :     return a.compare(b);
     138             : }
     139             : 
     140             : //----------------------------------------------------------------
     141             : // Constructors &c
     142             : //----------------------------------------------------------------
     143             : 
     144             : /**
     145             :  * Constructs an empty set.
     146             :  */
     147           0 : UnicodeSet::UnicodeSet() :
     148             :     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
     149             :     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
     150           0 :     fFlags(0)
     151             : {
     152           0 :     UErrorCode status = U_ZERO_ERROR;
     153           0 :     allocateStrings(status);
     154           0 :     if (U_FAILURE(status)) {
     155           0 :         return;
     156             :     }
     157           0 :     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
     158           0 :     if(list!=NULL){
     159           0 :         list[0] = UNICODESET_HIGH;
     160             :     } else { // If memory allocation failed, set to bogus state.
     161           0 :         setToBogus();
     162           0 :         return;
     163             :     }
     164             :     _dbgct(this);
     165             : }
     166             : 
     167             : /**
     168             :  * Constructs a set containing the given range. If <code>end >
     169             :  * start</code> then an empty set is created.
     170             :  *
     171             :  * @param start first character, inclusive, of range
     172             :  * @param end last character, inclusive, of range
     173             :  */
     174           0 : UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
     175             :     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
     176             :     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
     177           0 :     fFlags(0)
     178             : {
     179           0 :     UErrorCode status = U_ZERO_ERROR;
     180           0 :     allocateStrings(status);
     181           0 :     if (U_FAILURE(status)) {
     182           0 :         return;
     183             :     }
     184           0 :     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
     185           0 :     if(list!=NULL){
     186           0 :         list[0] = UNICODESET_HIGH;
     187           0 :         complement(start, end);
     188             :     } else { // If memory allocation failed, set to bogus state.
     189           0 :         setToBogus();
     190           0 :         return;
     191             :     }
     192             :     _dbgct(this);
     193             : }
     194             : 
     195             : /**
     196             :  * Constructs a set that is identical to the given UnicodeSet.
     197             :  */
     198           0 : UnicodeSet::UnicodeSet(const UnicodeSet& o) :
     199             :     UnicodeFilter(o),
     200           0 :     len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
     201             :     bmpSet(0),
     202             :     buffer(0), bufferCapacity(0),
     203             :     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
     204           0 :     fFlags(0)
     205             : {
     206           0 :     UErrorCode status = U_ZERO_ERROR;
     207           0 :     allocateStrings(status);
     208           0 :     if (U_FAILURE(status)) {
     209           0 :         return;
     210             :     }
     211           0 :     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
     212           0 :     if(list!=NULL){
     213           0 :         *this = o;
     214             :     } else { // If memory allocation failed, set to bogus state.
     215           0 :         setToBogus();
     216           0 :         return;
     217             :     }
     218             :     _dbgct(this);
     219             : }
     220             : 
     221             : // Copy-construct as thawed.
     222           0 : UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
     223             :     UnicodeFilter(o),
     224           0 :     len(0), capacity(o.len + GROW_EXTRA), list(0),
     225             :     bmpSet(0),
     226             :     buffer(0), bufferCapacity(0),
     227             :     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
     228           0 :     fFlags(0)
     229             : {
     230           0 :     UErrorCode status = U_ZERO_ERROR;
     231           0 :     allocateStrings(status);
     232           0 :     if (U_FAILURE(status)) {
     233           0 :         return;
     234             :     }
     235           0 :     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
     236           0 :     if(list!=NULL){
     237             :         // *this = o except for bmpSet and stringSpan
     238           0 :         len = o.len;
     239           0 :         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
     240           0 :         if (strings != NULL && o.strings != NULL) {
     241           0 :             strings->assign(*o.strings, cloneUnicodeString, status);
     242             :         } else { // Invalid strings.
     243           0 :             setToBogus();
     244           0 :             return;
     245             :         }
     246           0 :         if (o.pat) {
     247           0 :             setPattern(UnicodeString(o.pat, o.patLen));
     248             :         }
     249             :     } else { // If memory allocation failed, set to bogus state.
     250           0 :         setToBogus();
     251           0 :         return;
     252             :     }
     253             :     _dbgct(this);
     254             : }
     255             : 
     256             : /**
     257             :  * Destructs the set.
     258             :  */
     259           0 : UnicodeSet::~UnicodeSet() {
     260             :     _dbgdt(this); // first!
     261           0 :     uprv_free(list);
     262           0 :     delete bmpSet;
     263           0 :     if (buffer) {
     264           0 :         uprv_free(buffer);
     265             :     }
     266           0 :     delete strings;
     267           0 :     delete stringSpan;
     268           0 :     releasePattern();
     269           0 : }
     270             : 
     271             : /**
     272             :  * Assigns this object to be a copy of another.
     273             :  */
     274           0 : UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
     275           0 :     if (this == &o) {
     276           0 :         return *this;
     277             :     }
     278           0 :     if (isFrozen()) {
     279           0 :         return *this;
     280             :     }
     281           0 :     if (o.isBogus()) {
     282           0 :         setToBogus();
     283           0 :         return *this;
     284             :     }
     285           0 :     UErrorCode ec = U_ZERO_ERROR;
     286           0 :     ensureCapacity(o.len, ec);
     287           0 :     if (U_FAILURE(ec)) {
     288           0 :         return *this; // There is no way to report this error :-(
     289             :     }
     290           0 :     len = o.len;
     291           0 :     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
     292           0 :     if (o.bmpSet == NULL) {
     293           0 :         bmpSet = NULL;
     294             :     } else {
     295           0 :         bmpSet = new BMPSet(*o.bmpSet, list, len);
     296           0 :         if (bmpSet == NULL) { // Check for memory allocation error.
     297           0 :             setToBogus();
     298           0 :             return *this;
     299             :         }
     300             :     }
     301           0 :     if (strings != NULL && o.strings != NULL) {
     302           0 :         strings->assign(*o.strings, cloneUnicodeString, ec);
     303             :     } else { // Invalid strings.
     304           0 :         setToBogus();
     305           0 :         return *this;
     306             :     }
     307           0 :     if (o.stringSpan == NULL) {
     308           0 :         stringSpan = NULL;
     309             :     } else {
     310           0 :         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
     311           0 :         if (stringSpan == NULL) { // Check for memory allocation error.
     312           0 :             setToBogus();
     313           0 :             return *this;
     314             :         }
     315             :     }
     316           0 :     releasePattern();
     317           0 :     if (o.pat) {
     318           0 :         setPattern(UnicodeString(o.pat, o.patLen));
     319             :     }
     320           0 :     return *this;
     321             : }
     322             : 
     323             : /**
     324             :  * Returns a copy of this object.  All UnicodeMatcher objects have
     325             :  * to support cloning in order to allow classes using
     326             :  * UnicodeMatchers, such as Transliterator, to implement cloning.
     327             :  */
     328           0 : UnicodeFunctor* UnicodeSet::clone() const {
     329           0 :     return new UnicodeSet(*this);
     330             : }
     331             : 
     332           0 : UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
     333           0 :     return new UnicodeSet(*this, TRUE);
     334             : }
     335             : 
     336             : /**
     337             :  * Compares the specified object with this set for equality.  Returns
     338             :  * <tt>true</tt> if the two sets
     339             :  * have the same size, and every member of the specified set is
     340             :  * contained in this set (or equivalently, every member of this set is
     341             :  * contained in the specified set).
     342             :  *
     343             :  * @param o set to be compared for equality with this set.
     344             :  * @return <tt>true</tt> if the specified set is equal to this set.
     345             :  */
     346           0 : UBool UnicodeSet::operator==(const UnicodeSet& o) const {
     347           0 :     if (len != o.len) return FALSE;
     348           0 :     for (int32_t i = 0; i < len; ++i) {
     349           0 :         if (list[i] != o.list[i]) return FALSE;
     350             :     }
     351           0 :     if (*strings != *o.strings) return FALSE;
     352           0 :     return TRUE;
     353             : }
     354             : 
     355             : /**
     356             :  * Returns the hash code value for this set.
     357             :  *
     358             :  * @return the hash code value for this set.
     359             :  * @see Object#hashCode()
     360             :  */
     361           0 : int32_t UnicodeSet::hashCode(void) const {
     362           0 :     int32_t result = len;
     363           0 :     for (int32_t i = 0; i < len; ++i) {
     364           0 :         result *= 1000003;
     365           0 :         result += list[i];
     366             :     }
     367           0 :     return result;
     368             : }
     369             : 
     370             : //----------------------------------------------------------------
     371             : // Public API
     372             : //----------------------------------------------------------------
     373             : 
     374             : /**
     375             :  * Returns the number of elements in this set (its cardinality),
     376             :  * Note than the elements of a set may include both individual
     377             :  * codepoints and strings.
     378             :  *
     379             :  * @return the number of elements in this set (its cardinality).
     380             :  */
     381           0 : int32_t UnicodeSet::size(void) const {
     382           0 :     int32_t n = 0;
     383           0 :     int32_t count = getRangeCount();
     384           0 :     for (int32_t i = 0; i < count; ++i) {
     385           0 :         n += getRangeEnd(i) - getRangeStart(i) + 1;
     386             :     }
     387           0 :     return n + strings->size();
     388             : }
     389             : 
     390             : /**
     391             :  * Returns <tt>true</tt> if this set contains no elements.
     392             :  *
     393             :  * @return <tt>true</tt> if this set contains no elements.
     394             :  */
     395           0 : UBool UnicodeSet::isEmpty(void) const {
     396           0 :     return len == 1 && strings->size() == 0;
     397             : }
     398             : 
     399             : /**
     400             :  * Returns true if this set contains the given character.
     401             :  * @param c character to be checked for containment
     402             :  * @return true if the test condition is met
     403             :  */
     404           0 : UBool UnicodeSet::contains(UChar32 c) const {
     405             :     // Set i to the index of the start item greater than ch
     406             :     // We know we will terminate without length test!
     407             :     // LATER: for large sets, add binary search
     408             :     //int32_t i = -1;
     409             :     //for (;;) {
     410             :     //    if (c < list[++i]) break;
     411             :     //}
     412           0 :     if (bmpSet != NULL) {
     413           0 :         return bmpSet->contains(c);
     414             :     }
     415           0 :     if (stringSpan != NULL) {
     416           0 :         return stringSpan->contains(c);
     417             :     }
     418           0 :     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
     419           0 :         return FALSE;
     420             :     }
     421           0 :     int32_t i = findCodePoint(c);
     422           0 :     return (UBool)(i & 1); // return true if odd
     423             : }
     424             : 
     425             : /**
     426             :  * Returns the smallest value i such that c < list[i].  Caller
     427             :  * must ensure that c is a legal value or this method will enter
     428             :  * an infinite loop.  This method performs a binary search.
     429             :  * @param c a character in the range MIN_VALUE..MAX_VALUE
     430             :  * inclusive
     431             :  * @return the smallest integer i in the range 0..len-1,
     432             :  * inclusive, such that c < list[i]
     433             :  */
     434           0 : int32_t UnicodeSet::findCodePoint(UChar32 c) const {
     435             :     /* Examples:
     436             :                                        findCodePoint(c)
     437             :        set              list[]         c=0 1 3 4 7 8
     438             :        ===              ==============   ===========
     439             :        []               [110000]         0 0 0 0 0 0
     440             :        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
     441             :        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
     442             :        [:Any:]          [0, 110000]      1 1 1 1 1 1
     443             :      */
     444             : 
     445             :     // Return the smallest i such that c < list[i].  Assume
     446             :     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
     447           0 :     if (c < list[0])
     448           0 :         return 0;
     449             :     // High runner test.  c is often after the last range, so an
     450             :     // initial check for this condition pays off.
     451           0 :     int32_t lo = 0;
     452           0 :     int32_t hi = len - 1;
     453           0 :     if (lo >= hi || c >= list[hi-1])
     454           0 :         return hi;
     455             :     // invariant: c >= list[lo]
     456             :     // invariant: c < list[hi]
     457             :     for (;;) {
     458           0 :         int32_t i = (lo + hi) >> 1;
     459           0 :         if (i == lo) {
     460           0 :             break; // Found!
     461           0 :         } else if (c < list[i]) {
     462           0 :             hi = i;
     463             :         } else {
     464           0 :             lo = i;
     465             :         }
     466           0 :     }
     467           0 :     return hi;
     468             : }
     469             : 
     470             : /**
     471             :  * Returns true if this set contains every character
     472             :  * of the given range.
     473             :  * @param start first character, inclusive, of the range
     474             :  * @param end last character, inclusive, of the range
     475             :  * @return true if the test condition is met
     476             :  */
     477           0 : UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
     478             :     //int32_t i = -1;
     479             :     //for (;;) {
     480             :     //    if (start < list[++i]) break;
     481             :     //}
     482           0 :     int32_t i = findCodePoint(start);
     483           0 :     return ((i & 1) != 0 && end < list[i]);
     484             : }
     485             : 
     486             : /**
     487             :  * Returns <tt>true</tt> if this set contains the given
     488             :  * multicharacter string.
     489             :  * @param s string to be checked for containment
     490             :  * @return <tt>true</tt> if this set contains the specified string
     491             :  */
     492           0 : UBool UnicodeSet::contains(const UnicodeString& s) const {
     493           0 :     if (s.length() == 0) return FALSE;
     494           0 :     int32_t cp = getSingleCP(s);
     495           0 :     if (cp < 0) {
     496           0 :         return strings->contains((void*) &s);
     497             :     } else {
     498           0 :         return contains((UChar32) cp);
     499             :     }
     500             : }
     501             : 
     502             : /**
     503             :  * Returns true if this set contains all the characters and strings
     504             :  * of the given set.
     505             :  * @param c set to be checked for containment
     506             :  * @return true if the test condition is met
     507             :  */
     508           0 : UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
     509             :     // The specified set is a subset if all of its pairs are contained in
     510             :     // this set.  It's possible to code this more efficiently in terms of
     511             :     // direct manipulation of the inversion lists if the need arises.
     512           0 :     int32_t n = c.getRangeCount();
     513           0 :     for (int i=0; i<n; ++i) {
     514           0 :         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
     515           0 :             return FALSE;
     516             :         }
     517             :     }
     518           0 :     if (!strings->containsAll(*c.strings)) return FALSE;
     519           0 :     return TRUE;
     520             : }
     521             : 
     522             : /**
     523             :  * Returns true if this set contains all the characters
     524             :  * of the given string.
     525             :  * @param s string containing characters to be checked for containment
     526             :  * @return true if the test condition is met
     527             :  */
     528           0 : UBool UnicodeSet::containsAll(const UnicodeString& s) const {
     529           0 :     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
     530           0 :                    s.length());
     531             : }
     532             : 
     533             : /**
     534             :  * Returns true if this set contains none of the characters
     535             :  * of the given range.
     536             :  * @param start first character, inclusive, of the range
     537             :  * @param end last character, inclusive, of the range
     538             :  * @return true if the test condition is met
     539             :  */
     540           0 : UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
     541             :     //int32_t i = -1;
     542             :     //for (;;) {
     543             :     //    if (start < list[++i]) break;
     544             :     //}
     545           0 :     int32_t i = findCodePoint(start);
     546           0 :     return ((i & 1) == 0 && end < list[i]);
     547             : }
     548             : 
     549             : /**
     550             :  * Returns true if this set contains none of the characters and strings
     551             :  * of the given set.
     552             :  * @param c set to be checked for containment
     553             :  * @return true if the test condition is met
     554             :  */
     555           0 : UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
     556             :     // The specified set is a subset if all of its pairs are contained in
     557             :     // this set.  It's possible to code this more efficiently in terms of
     558             :     // direct manipulation of the inversion lists if the need arises.
     559           0 :     int32_t n = c.getRangeCount();
     560           0 :     for (int32_t i=0; i<n; ++i) {
     561           0 :         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
     562           0 :             return FALSE;
     563             :         }
     564             :     }
     565           0 :     if (!strings->containsNone(*c.strings)) return FALSE;
     566           0 :     return TRUE;
     567             : }
     568             : 
     569             : /**
     570             :  * Returns true if this set contains none of the characters
     571             :  * of the given string.
     572             :  * @param s string containing characters to be checked for containment
     573             :  * @return true if the test condition is met
     574             :  */
     575           0 : UBool UnicodeSet::containsNone(const UnicodeString& s) const {
     576           0 :     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
     577           0 :                    s.length());
     578             : }
     579             : 
     580             : /**
     581             :  * Returns <tt>true</tt> if this set contains any character whose low byte
     582             :  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
     583             :  * indexing.
     584             :  */
     585           0 : UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
     586             :     /* The index value v, in the range [0,255], is contained in this set if
     587             :      * it is contained in any pair of this set.  Pairs either have the high
     588             :      * bytes equal, or unequal.  If the high bytes are equal, then we have
     589             :      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
     590             :      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
     591             :      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
     592             :      * time zone month containment logic.)
     593             :      */
     594             :     int32_t i;
     595           0 :     int32_t rangeCount=getRangeCount();
     596           0 :     for (i=0; i<rangeCount; ++i) {
     597           0 :         UChar32 low = getRangeStart(i);
     598           0 :         UChar32 high = getRangeEnd(i);
     599           0 :         if ((low & ~0xFF) == (high & ~0xFF)) {
     600           0 :             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
     601           0 :                 return TRUE;
     602             :             }
     603           0 :         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
     604           0 :             return TRUE;
     605             :         }
     606             :     }
     607           0 :     if (strings->size() != 0) {
     608           0 :         for (i=0; i<strings->size(); ++i) {
     609           0 :             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
     610             :             //if (s.length() == 0) {
     611             :             //    // Empty strings match everything
     612             :             //    return TRUE;
     613             :             //}
     614             :             // assert(s.length() != 0); // We enforce this elsewhere
     615           0 :             UChar32 c = s.char32At(0);
     616           0 :             if ((c & 0xFF) == v) {
     617           0 :                 return TRUE;
     618             :             }
     619             :         }
     620             :     }
     621           0 :     return FALSE;
     622             : }
     623             : 
     624             : /**
     625             :  * Implementation of UnicodeMatcher::matches().  Always matches the
     626             :  * longest possible multichar string.
     627             :  */
     628           0 : UMatchDegree UnicodeSet::matches(const Replaceable& text,
     629             :                                  int32_t& offset,
     630             :                                  int32_t limit,
     631             :                                  UBool incremental) {
     632           0 :     if (offset == limit) {
     633             :         // Strings, if any, have length != 0, so we don't worry
     634             :         // about them here.  If we ever allow zero-length strings
     635             :         // we much check for them here.
     636           0 :         if (contains(U_ETHER)) {
     637           0 :             return incremental ? U_PARTIAL_MATCH : U_MATCH;
     638             :         } else {
     639           0 :             return U_MISMATCH;
     640             :         }
     641             :     } else {
     642           0 :         if (strings->size() != 0) { // try strings first
     643             : 
     644             :             // might separate forward and backward loops later
     645             :             // for now they are combined
     646             : 
     647             :             // TODO Improve efficiency of this, at least in the forward
     648             :             // direction, if not in both.  In the forward direction we
     649             :             // can assume the strings are sorted.
     650             : 
     651             :             int32_t i;
     652           0 :             UBool forward = offset < limit;
     653             : 
     654             :             // firstChar is the leftmost char to match in the
     655             :             // forward direction or the rightmost char to match in
     656             :             // the reverse direction.
     657           0 :             UChar firstChar = text.charAt(offset);
     658             : 
     659             :             // If there are multiple strings that can match we
     660             :             // return the longest match.
     661           0 :             int32_t highWaterLength = 0;
     662             : 
     663           0 :             for (i=0; i<strings->size(); ++i) {
     664           0 :                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
     665             : 
     666             :                 //if (trial.length() == 0) {
     667             :                 //    return U_MATCH; // null-string always matches
     668             :                 //}
     669             :                 // assert(trial.length() != 0); // We ensure this elsewhere
     670             : 
     671           0 :                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
     672             : 
     673             :                 // Strings are sorted, so we can optimize in the
     674             :                 // forward direction.
     675           0 :                 if (forward && c > firstChar) break;
     676           0 :                 if (c != firstChar) continue;
     677             : 
     678           0 :                 int32_t matchLen = matchRest(text, offset, limit, trial);
     679             : 
     680           0 :                 if (incremental) {
     681           0 :                     int32_t maxLen = forward ? limit-offset : offset-limit;
     682           0 :                     if (matchLen == maxLen) {
     683             :                         // We have successfully matched but only up to limit.
     684           0 :                         return U_PARTIAL_MATCH;
     685             :                     }
     686             :                 }
     687             : 
     688           0 :                 if (matchLen == trial.length()) {
     689             :                     // We have successfully matched the whole string.
     690           0 :                     if (matchLen > highWaterLength) {
     691           0 :                         highWaterLength = matchLen;
     692             :                     }
     693             :                     // In the forward direction we know strings
     694             :                     // are sorted so we can bail early.
     695           0 :                     if (forward && matchLen < highWaterLength) {
     696           0 :                         break;
     697             :                     }
     698           0 :                     continue;
     699             :                 }
     700             :             }
     701             : 
     702             :             // We've checked all strings without a partial match.
     703             :             // If we have full matches, return the longest one.
     704           0 :             if (highWaterLength != 0) {
     705           0 :                 offset += forward ? highWaterLength : -highWaterLength;
     706           0 :                 return U_MATCH;
     707             :             }
     708             :         }
     709           0 :         return UnicodeFilter::matches(text, offset, limit, incremental);
     710             :     }
     711             : }
     712             : 
     713             : /**
     714             :  * Returns the longest match for s in text at the given position.
     715             :  * If limit > start then match forward from start+1 to limit
     716             :  * matching all characters except s.charAt(0).  If limit < start,
     717             :  * go backward starting from start-1 matching all characters
     718             :  * except s.charAt(s.length()-1).  This method assumes that the
     719             :  * first character, text.charAt(start), matches s, so it does not
     720             :  * check it.
     721             :  * @param text the text to match
     722             :  * @param start the first character to match.  In the forward
     723             :  * direction, text.charAt(start) is matched against s.charAt(0).
     724             :  * In the reverse direction, it is matched against
     725             :  * s.charAt(s.length()-1).
     726             :  * @param limit the limit offset for matching, either last+1 in
     727             :  * the forward direction, or last-1 in the reverse direction,
     728             :  * where last is the index of the last character to match.
     729             :  * @return If part of s matches up to the limit, return |limit -
     730             :  * start|.  If all of s matches before reaching the limit, return
     731             :  * s.length().  If there is a mismatch between s and text, return
     732             :  * 0
     733             :  */
     734           0 : int32_t UnicodeSet::matchRest(const Replaceable& text,
     735             :                               int32_t start, int32_t limit,
     736             :                               const UnicodeString& s) {
     737             :     int32_t i;
     738             :     int32_t maxLen;
     739           0 :     int32_t slen = s.length();
     740           0 :     if (start < limit) {
     741           0 :         maxLen = limit - start;
     742           0 :         if (maxLen > slen) maxLen = slen;
     743           0 :         for (i = 1; i < maxLen; ++i) {
     744           0 :             if (text.charAt(start + i) != s.charAt(i)) return 0;
     745             :         }
     746             :     } else {
     747           0 :         maxLen = start - limit;
     748           0 :         if (maxLen > slen) maxLen = slen;
     749           0 :         --slen; // <=> slen = s.length() - 1;
     750           0 :         for (i = 1; i < maxLen; ++i) {
     751           0 :             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
     752             :         }
     753             :     }
     754           0 :     return maxLen;
     755             : }
     756             : 
     757             : /**
     758             :  * Implement of UnicodeMatcher
     759             :  */
     760           0 : void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
     761           0 :     toUnionTo.addAll(*this);
     762           0 : }
     763             : 
     764             : /**
     765             :  * Returns the index of the given character within this set, where
     766             :  * the set is ordered by ascending code point.  If the character
     767             :  * is not in this set, return -1.  The inverse of this method is
     768             :  * <code>charAt()</code>.
     769             :  * @return an index from 0..size()-1, or -1
     770             :  */
     771           0 : int32_t UnicodeSet::indexOf(UChar32 c) const {
     772           0 :     if (c < MIN_VALUE || c > MAX_VALUE) {
     773           0 :         return -1;
     774             :     }
     775           0 :     int32_t i = 0;
     776           0 :     int32_t n = 0;
     777             :     for (;;) {
     778           0 :         UChar32 start = list[i++];
     779           0 :         if (c < start) {
     780           0 :             return -1;
     781             :         }
     782           0 :         UChar32 limit = list[i++];
     783           0 :         if (c < limit) {
     784           0 :             return n + c - start;
     785             :         }
     786           0 :         n += limit - start;
     787           0 :     }
     788             : }
     789             : 
     790             : /**
     791             :  * Returns the character at the given index within this set, where
     792             :  * the set is ordered by ascending code point.  If the index is
     793             :  * out of range, return (UChar32)-1.  The inverse of this method is
     794             :  * <code>indexOf()</code>.
     795             :  * @param index an index from 0..size()-1
     796             :  * @return the character at the given index, or (UChar32)-1.
     797             :  */
     798           0 : UChar32 UnicodeSet::charAt(int32_t index) const {
     799           0 :     if (index >= 0) {
     800             :         // len2 is the largest even integer <= len, that is, it is len
     801             :         // for even values and len-1 for odd values.  With odd values
     802             :         // the last entry is UNICODESET_HIGH.
     803           0 :         int32_t len2 = len & ~1;
     804           0 :         for (int32_t i=0; i < len2;) {
     805           0 :             UChar32 start = list[i++];
     806           0 :             int32_t count = list[i++] - start;
     807           0 :             if (index < count) {
     808           0 :                 return (UChar32)(start + index);
     809             :             }
     810           0 :             index -= count;
     811             :         }
     812             :     }
     813           0 :     return (UChar32)-1;
     814             : }
     815             : 
     816             : /**
     817             :  * Make this object represent the range <code>start - end</code>.
     818             :  * If <code>end > start</code> then this object is set to an
     819             :  * an empty range.
     820             :  *
     821             :  * @param start first character in the set, inclusive
     822             :  * @rparam end last character in the set, inclusive
     823             :  */
     824           0 : UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
     825           0 :     clear();
     826           0 :     complement(start, end);
     827           0 :     return *this;
     828             : }
     829             : 
     830             : /**
     831             :  * Adds the specified range to this set if it is not already
     832             :  * present.  If this set already contains the specified range,
     833             :  * the call leaves this set unchanged.  If <code>end > start</code>
     834             :  * then an empty range is added, leaving the set unchanged.
     835             :  *
     836             :  * @param start first character, inclusive, of range to be added
     837             :  * to this set.
     838             :  * @param end last character, inclusive, of range to be added
     839             :  * to this set.
     840             :  */
     841           0 : UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
     842           0 :     if (pinCodePoint(start) < pinCodePoint(end)) {
     843           0 :         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
     844           0 :         add(range, 2, 0);
     845           0 :     } else if (start == end) {
     846           0 :         add(start);
     847             :     }
     848           0 :     return *this;
     849             : }
     850             : 
     851             : // #define DEBUG_US_ADD
     852             : 
     853             : #ifdef DEBUG_US_ADD
     854             : #include <stdio.h>
     855             : void dump(UChar32 c) {
     856             :     if (c <= 0xFF) {
     857             :         printf("%c", (char)c);
     858             :     } else {
     859             :         printf("U+%04X", c);
     860             :     }
     861             : }
     862             : void dump(const UChar32* list, int32_t len) {
     863             :     printf("[");
     864             :     for (int32_t i=0; i<len; ++i) {
     865             :         if (i != 0) printf(", ");
     866             :         dump(list[i]);
     867             :     }
     868             :     printf("]");
     869             : }
     870             : #endif
     871             : 
     872             : /**
     873             :  * Adds the specified character to this set if it is not already
     874             :  * present.  If this set already contains the specified character,
     875             :  * the call leaves this set unchanged.
     876             :  */
     877           0 : UnicodeSet& UnicodeSet::add(UChar32 c) {
     878             :     // find smallest i such that c < list[i]
     879             :     // if odd, then it is IN the set
     880             :     // if even, then it is OUT of the set
     881           0 :     int32_t i = findCodePoint(pinCodePoint(c));
     882             : 
     883             :     // already in set?
     884           0 :     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
     885             : 
     886             :     // HIGH is 0x110000
     887             :     // assert(list[len-1] == HIGH);
     888             : 
     889             :     // empty = [HIGH]
     890             :     // [start_0, limit_0, start_1, limit_1, HIGH]
     891             : 
     892             :     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
     893             :     //                             ^
     894             :     //                             list[i]
     895             : 
     896             :     // i == 0 means c is before the first range
     897             : 
     898             : #ifdef DEBUG_US_ADD
     899             :     printf("Add of ");
     900             :     dump(c);
     901             :     printf(" found at %d", i);
     902             :     printf(": ");
     903             :     dump(list, len);
     904             :     printf(" => ");
     905             : #endif
     906             : 
     907           0 :     if (c == list[i]-1) {
     908             :         // c is before start of next range
     909           0 :         list[i] = c;
     910             :         // if we touched the HIGH mark, then add a new one
     911           0 :         if (c == (UNICODESET_HIGH - 1)) {
     912           0 :             UErrorCode status = U_ZERO_ERROR;
     913           0 :             ensureCapacity(len+1, status);
     914           0 :             if (U_FAILURE(status)) {
     915           0 :                 return *this; // There is no way to report this error :-(
     916             :             }
     917           0 :             list[len++] = UNICODESET_HIGH;
     918             :         }
     919           0 :         if (i > 0 && c == list[i-1]) {
     920             :             // collapse adjacent ranges
     921             : 
     922             :             // [..., start_k-1, c, c, limit_k, ..., HIGH]
     923             :             //                     ^
     924             :             //                     list[i]
     925             : 
     926             :             //for (int32_t k=i-1; k<len-2; ++k) {
     927             :             //    list[k] = list[k+2];
     928             :             //}
     929           0 :             UChar32* dst = list + i - 1;
     930           0 :             UChar32* src = dst + 2;
     931           0 :             UChar32* srclimit = list + len;
     932           0 :             while (src < srclimit) *(dst++) = *(src++);
     933             : 
     934           0 :             len -= 2;
     935             :         }
     936             :     }
     937             : 
     938           0 :     else if (i > 0 && c == list[i-1]) {
     939             :         // c is after end of prior range
     940           0 :         list[i-1]++;
     941             :         // no need to check for collapse here
     942             :     }
     943             : 
     944             :     else {
     945             :         // At this point we know the new char is not adjacent to
     946             :         // any existing ranges, and it is not 10FFFF.
     947             : 
     948             : 
     949             :         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
     950             :         //                             ^
     951             :         //                             list[i]
     952             : 
     953             :         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
     954             :         //                             ^
     955             :         //                             list[i]
     956             : 
     957           0 :         UErrorCode status = U_ZERO_ERROR;
     958           0 :         ensureCapacity(len+2, status);
     959           0 :         if (U_FAILURE(status)) {
     960           0 :             return *this; // There is no way to report this error :-(
     961             :         }
     962             : 
     963             :         //for (int32_t k=len-1; k>=i; --k) {
     964             :         //    list[k+2] = list[k];
     965             :         //}
     966           0 :         UChar32* src = list + len;
     967           0 :         UChar32* dst = src + 2;
     968           0 :         UChar32* srclimit = list + i;
     969           0 :         while (src > srclimit) *(--dst) = *(--src);
     970             : 
     971           0 :         list[i] = c;
     972           0 :         list[i+1] = c+1;
     973           0 :         len += 2;
     974             :     }
     975             : 
     976             : #ifdef DEBUG_US_ADD
     977             :     dump(list, len);
     978             :     printf("\n");
     979             : 
     980             :     for (i=1; i<len; ++i) {
     981             :         if (list[i] <= list[i-1]) {
     982             :             // Corrupt array!
     983             :             printf("ERROR: list has been corrupted\n");
     984             :             exit(1);
     985             :         }
     986             :     }
     987             : #endif
     988             : 
     989           0 :     releasePattern();
     990           0 :     return *this;
     991             : }
     992             : 
     993             : /**
     994             :  * Adds the specified multicharacter to this set if it is not already
     995             :  * present.  If this set already contains the multicharacter,
     996             :  * the call leaves this set unchanged.
     997             :  * Thus "ch" => {"ch"}
     998             :  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
     999             :  * @param s the source string
    1000             :  * @return the modified set, for chaining
    1001             :  */
    1002           0 : UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
    1003           0 :     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
    1004           0 :     int32_t cp = getSingleCP(s);
    1005           0 :     if (cp < 0) {
    1006           0 :         if (!strings->contains((void*) &s)) {
    1007           0 :             _add(s);
    1008           0 :             releasePattern();
    1009             :         }
    1010             :     } else {
    1011           0 :         add((UChar32)cp);
    1012             :     }
    1013           0 :     return *this;
    1014             : }
    1015             : 
    1016             : /**
    1017             :  * Adds the given string, in order, to 'strings'.  The given string
    1018             :  * must have been checked by the caller to not be empty and to not
    1019             :  * already be in 'strings'.
    1020             :  */
    1021           0 : void UnicodeSet::_add(const UnicodeString& s) {
    1022           0 :     if (isFrozen() || isBogus()) {
    1023           0 :         return;
    1024             :     }
    1025           0 :     UnicodeString* t = new UnicodeString(s);
    1026           0 :     if (t == NULL) { // Check for memory allocation error.
    1027           0 :         setToBogus();
    1028           0 :         return;
    1029             :     }
    1030           0 :     UErrorCode ec = U_ZERO_ERROR;
    1031           0 :     strings->sortedInsert(t, compareUnicodeString, ec);
    1032           0 :     if (U_FAILURE(ec)) {
    1033           0 :         setToBogus();
    1034           0 :         delete t;
    1035             :     }
    1036             : }
    1037             : 
    1038             : /**
    1039             :  * @return a code point IF the string consists of a single one.
    1040             :  * otherwise returns -1.
    1041             :  * @param string to test
    1042             :  */
    1043           0 : int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
    1044             :     //if (s.length() < 1) {
    1045             :     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
    1046             :     //}
    1047           0 :     if (s.length() > 2) return -1;
    1048           0 :     if (s.length() == 1) return s.charAt(0);
    1049             : 
    1050             :     // at this point, len = 2
    1051           0 :     UChar32 cp = s.char32At(0);
    1052           0 :     if (cp > 0xFFFF) { // is surrogate pair
    1053           0 :         return cp;
    1054             :     }
    1055           0 :     return -1;
    1056             : }
    1057             : 
    1058             : /**
    1059             :  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
    1060             :  * If this set already any particular character, it has no effect on that character.
    1061             :  * @param the source string
    1062             :  * @return the modified set, for chaining
    1063             :  */
    1064           0 : UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
    1065             :     UChar32 cp;
    1066           0 :     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
    1067           0 :         cp = s.char32At(i);
    1068           0 :         add(cp);
    1069             :     }
    1070           0 :     return *this;
    1071             : }
    1072             : 
    1073             : /**
    1074             :  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
    1075             :  * If this set already any particular character, it has no effect on that character.
    1076             :  * @param the source string
    1077             :  * @return the modified set, for chaining
    1078             :  */
    1079           0 : UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
    1080           0 :     UnicodeSet set;
    1081           0 :     set.addAll(s);
    1082           0 :     retainAll(set);
    1083           0 :     return *this;
    1084             : }
    1085             : 
    1086             : /**
    1087             :  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
    1088             :  * If this set already any particular character, it has no effect on that character.
    1089             :  * @param the source string
    1090             :  * @return the modified set, for chaining
    1091             :  */
    1092           0 : UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
    1093           0 :     UnicodeSet set;
    1094           0 :     set.addAll(s);
    1095           0 :     complementAll(set);
    1096           0 :     return *this;
    1097             : }
    1098             : 
    1099             : /**
    1100             :  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
    1101             :  * If this set already any particular character, it has no effect on that character.
    1102             :  * @param the source string
    1103             :  * @return the modified set, for chaining
    1104             :  */
    1105           0 : UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
    1106           0 :     UnicodeSet set;
    1107           0 :     set.addAll(s);
    1108           0 :     removeAll(set);
    1109           0 :     return *this;
    1110             : }
    1111             : 
    1112           0 : UnicodeSet& UnicodeSet::removeAllStrings() {
    1113           0 :     strings->removeAllElements();
    1114           0 :     return *this;
    1115             : }
    1116             : 
    1117             : 
    1118             : /**
    1119             :  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
    1120             :  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
    1121             :  * @param the source string
    1122             :  * @return a newly created set containing the given string
    1123             :  */
    1124           0 : UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
    1125           0 :     UnicodeSet *set = new UnicodeSet();
    1126           0 :     if (set != NULL) { // Check for memory allocation error.
    1127           0 :         set->add(s);
    1128             :     }
    1129           0 :     return set;
    1130             : }
    1131             : 
    1132             : 
    1133             : /**
    1134             :  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
    1135             :  * @param the source string
    1136             :  * @return a newly created set containing the given characters
    1137             :  */
    1138           0 : UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
    1139           0 :     UnicodeSet *set = new UnicodeSet();
    1140           0 :     if (set != NULL) { // Check for memory allocation error.
    1141           0 :         set->addAll(s);
    1142             :     }
    1143           0 :     return set;
    1144             : }
    1145             : 
    1146             : /**
    1147             :  * Retain only the elements in this set that are contained in the
    1148             :  * specified range.  If <code>end > start</code> then an empty range is
    1149             :  * retained, leaving the set empty.
    1150             :  *
    1151             :  * @param start first character, inclusive, of range to be retained
    1152             :  * to this set.
    1153             :  * @param end last character, inclusive, of range to be retained
    1154             :  * to this set.
    1155             :  */
    1156           0 : UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
    1157           0 :     if (pinCodePoint(start) <= pinCodePoint(end)) {
    1158           0 :         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
    1159           0 :         retain(range, 2, 0);
    1160             :     } else {
    1161           0 :         clear();
    1162             :     }
    1163           0 :     return *this;
    1164             : }
    1165             : 
    1166           0 : UnicodeSet& UnicodeSet::retain(UChar32 c) {
    1167           0 :     return retain(c, c);
    1168             : }
    1169             : 
    1170             : /**
    1171             :  * Removes the specified range from this set if it is present.
    1172             :  * The set will not contain the specified range once the call
    1173             :  * returns.  If <code>end > start</code> then an empty range is
    1174             :  * removed, leaving the set unchanged.
    1175             :  *
    1176             :  * @param start first character, inclusive, of range to be removed
    1177             :  * from this set.
    1178             :  * @param end last character, inclusive, of range to be removed
    1179             :  * from this set.
    1180             :  */
    1181           0 : UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
    1182           0 :     if (pinCodePoint(start) <= pinCodePoint(end)) {
    1183           0 :         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
    1184           0 :         retain(range, 2, 2);
    1185             :     }
    1186           0 :     return *this;
    1187             : }
    1188             : 
    1189             : /**
    1190             :  * Removes the specified character from this set if it is present.
    1191             :  * The set will not contain the specified range once the call
    1192             :  * returns.
    1193             :  */
    1194           0 : UnicodeSet& UnicodeSet::remove(UChar32 c) {
    1195           0 :     return remove(c, c);
    1196             : }
    1197             : 
    1198             : /**
    1199             :  * Removes the specified string from this set if it is present.
    1200             :  * The set will not contain the specified character once the call
    1201             :  * returns.
    1202             :  * @param the source string
    1203             :  * @return the modified set, for chaining
    1204             :  */
    1205           0 : UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
    1206           0 :     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
    1207           0 :     int32_t cp = getSingleCP(s);
    1208           0 :     if (cp < 0) {
    1209           0 :         strings->removeElement((void*) &s);
    1210           0 :         releasePattern();
    1211             :     } else {
    1212           0 :         remove((UChar32)cp, (UChar32)cp);
    1213             :     }
    1214           0 :     return *this;
    1215             : }
    1216             : 
    1217             : /**
    1218             :  * Complements the specified range in this set.  Any character in
    1219             :  * the range will be removed if it is in this set, or will be
    1220             :  * added if it is not in this set.  If <code>end > start</code>
    1221             :  * then an empty range is xor'ed, leaving the set unchanged.
    1222             :  *
    1223             :  * @param start first character, inclusive, of range to be removed
    1224             :  * from this set.
    1225             :  * @param end last character, inclusive, of range to be removed
    1226             :  * from this set.
    1227             :  */
    1228           0 : UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
    1229           0 :     if (isFrozen() || isBogus()) {
    1230           0 :         return *this;
    1231             :     }
    1232           0 :     if (pinCodePoint(start) <= pinCodePoint(end)) {
    1233           0 :         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
    1234           0 :         exclusiveOr(range, 2, 0);
    1235             :     }
    1236           0 :     releasePattern();
    1237           0 :     return *this;
    1238             : }
    1239             : 
    1240           0 : UnicodeSet& UnicodeSet::complement(UChar32 c) {
    1241           0 :     return complement(c, c);
    1242             : }
    1243             : 
    1244             : /**
    1245             :  * This is equivalent to
    1246             :  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
    1247             :  */
    1248           0 : UnicodeSet& UnicodeSet::complement(void) {
    1249           0 :     if (isFrozen() || isBogus()) {
    1250           0 :         return *this;
    1251             :     }
    1252           0 :     UErrorCode status = U_ZERO_ERROR;
    1253           0 :     if (list[0] == UNICODESET_LOW) {
    1254           0 :         ensureBufferCapacity(len-1, status);
    1255           0 :         if (U_FAILURE(status)) {
    1256           0 :             return *this;
    1257             :         }
    1258           0 :         uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
    1259           0 :         --len;
    1260             :     } else {
    1261           0 :         ensureBufferCapacity(len+1, status);
    1262           0 :         if (U_FAILURE(status)) {
    1263           0 :             return *this;
    1264             :         }
    1265           0 :         uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
    1266           0 :         buffer[0] = UNICODESET_LOW;
    1267           0 :         ++len;
    1268             :     }
    1269           0 :     swapBuffers();
    1270           0 :     releasePattern();
    1271           0 :     return *this;
    1272             : }
    1273             : 
    1274             : /**
    1275             :  * Complement the specified string in this set.
    1276             :  * The set will not contain the specified string once the call
    1277             :  * returns.
    1278             :  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
    1279             :  * @param s the string to complement
    1280             :  * @return this object, for chaining
    1281             :  */
    1282           0 : UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
    1283           0 :     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
    1284           0 :     int32_t cp = getSingleCP(s);
    1285           0 :     if (cp < 0) {
    1286           0 :         if (strings->contains((void*) &s)) {
    1287           0 :             strings->removeElement((void*) &s);
    1288             :         } else {
    1289           0 :             _add(s);
    1290             :         }
    1291           0 :         releasePattern();
    1292             :     } else {
    1293           0 :         complement((UChar32)cp, (UChar32)cp);
    1294             :     }
    1295           0 :     return *this;
    1296             : }
    1297             : 
    1298             : /**
    1299             :  * Adds all of the elements in the specified set to this set if
    1300             :  * they're not already present.  This operation effectively
    1301             :  * modifies this set so that its value is the <i>union</i> of the two
    1302             :  * sets.  The behavior of this operation is unspecified if the specified
    1303             :  * collection is modified while the operation is in progress.
    1304             :  *
    1305             :  * @param c set whose elements are to be added to this set.
    1306             :  * @see #add(char, char)
    1307             :  */
    1308           0 : UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
    1309           0 :     if ( c.len>0 && c.list!=NULL ) {
    1310           0 :         add(c.list, c.len, 0);
    1311             :     }
    1312             : 
    1313             :     // Add strings in order
    1314           0 :     if ( c.strings!=NULL ) {
    1315           0 :         for (int32_t i=0; i<c.strings->size(); ++i) {
    1316           0 :             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
    1317           0 :             if (!strings->contains((void*) s)) {
    1318           0 :                 _add(*s);
    1319             :             }
    1320             :         }
    1321             :     }
    1322           0 :     return *this;
    1323             : }
    1324             : 
    1325             : /**
    1326             :  * Retains only the elements in this set that are contained in the
    1327             :  * specified set.  In other words, removes from this set all of
    1328             :  * its elements that are not contained in the specified set.  This
    1329             :  * operation effectively modifies this set so that its value is
    1330             :  * the <i>intersection</i> of the two sets.
    1331             :  *
    1332             :  * @param c set that defines which elements this set will retain.
    1333             :  */
    1334           0 : UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
    1335           0 :     if (isFrozen() || isBogus()) {
    1336           0 :         return *this;
    1337             :     }
    1338           0 :     retain(c.list, c.len, 0);
    1339           0 :     strings->retainAll(*c.strings);
    1340           0 :     return *this;
    1341             : }
    1342             : 
    1343             : /**
    1344             :  * Removes from this set all of its elements that are contained in the
    1345             :  * specified set.  This operation effectively modifies this
    1346             :  * set so that its value is the <i>asymmetric set difference</i> of
    1347             :  * the two sets.
    1348             :  *
    1349             :  * @param c set that defines which elements will be removed from
    1350             :  *          this set.
    1351             :  */
    1352           0 : UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
    1353           0 :     if (isFrozen() || isBogus()) {
    1354           0 :         return *this;
    1355             :     }
    1356           0 :     retain(c.list, c.len, 2);
    1357           0 :     strings->removeAll(*c.strings);
    1358           0 :     return *this;
    1359             : }
    1360             : 
    1361             : /**
    1362             :  * Complements in this set all elements contained in the specified
    1363             :  * set.  Any character in the other set will be removed if it is
    1364             :  * in this set, or will be added if it is not in this set.
    1365             :  *
    1366             :  * @param c set that defines which elements will be xor'ed from
    1367             :  *          this set.
    1368             :  */
    1369           0 : UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
    1370           0 :     if (isFrozen() || isBogus()) {
    1371           0 :         return *this;
    1372             :     }
    1373           0 :     exclusiveOr(c.list, c.len, 0);
    1374             : 
    1375           0 :     for (int32_t i=0; i<c.strings->size(); ++i) {
    1376           0 :         void* e = c.strings->elementAt(i);
    1377           0 :         if (!strings->removeElement(e)) {
    1378           0 :             _add(*(const UnicodeString*)e);
    1379             :         }
    1380             :     }
    1381           0 :     return *this;
    1382             : }
    1383             : 
    1384             : /**
    1385             :  * Removes all of the elements from this set.  This set will be
    1386             :  * empty after this call returns.
    1387             :  */
    1388           0 : UnicodeSet& UnicodeSet::clear(void) {
    1389           0 :     if (isFrozen()) {
    1390           0 :         return *this;
    1391             :     }
    1392           0 :     if (list != NULL) {
    1393           0 :         list[0] = UNICODESET_HIGH;
    1394             :     }
    1395           0 :     len = 1;
    1396           0 :     releasePattern();
    1397           0 :     if (strings != NULL) {
    1398           0 :         strings->removeAllElements();
    1399             :     }
    1400           0 :     if (list != NULL && strings != NULL) {
    1401             :         // Remove bogus
    1402           0 :         fFlags = 0;
    1403             :     }
    1404           0 :     return *this;
    1405             : }
    1406             : 
    1407             : /**
    1408             :  * Iteration method that returns the number of ranges contained in
    1409             :  * this set.
    1410             :  * @see #getRangeStart
    1411             :  * @see #getRangeEnd
    1412             :  */
    1413           0 : int32_t UnicodeSet::getRangeCount() const {
    1414           0 :     return len/2;
    1415             : }
    1416             : 
    1417             : /**
    1418             :  * Iteration method that returns the first character in the
    1419             :  * specified range of this set.
    1420             :  * @see #getRangeCount
    1421             :  * @see #getRangeEnd
    1422             :  */
    1423           0 : UChar32 UnicodeSet::getRangeStart(int32_t index) const {
    1424           0 :     return list[index*2];
    1425             : }
    1426             : 
    1427             : /**
    1428             :  * Iteration method that returns the last character in the
    1429             :  * specified range of this set.
    1430             :  * @see #getRangeStart
    1431             :  * @see #getRangeEnd
    1432             :  */
    1433           0 : UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
    1434           0 :     return list[index*2 + 1] - 1;
    1435             : }
    1436             : 
    1437           0 : int32_t UnicodeSet::getStringCount() const {
    1438           0 :     return strings->size();
    1439             : }
    1440             : 
    1441           0 : const UnicodeString* UnicodeSet::getString(int32_t index) const {
    1442           0 :     return (const UnicodeString*) strings->elementAt(index);
    1443             : }
    1444             : 
    1445             : /**
    1446             :  * Reallocate this objects internal structures to take up the least
    1447             :  * possible space, without changing this object's value.
    1448             :  */
    1449           0 : UnicodeSet& UnicodeSet::compact() {
    1450           0 :     if (isFrozen() || isBogus()) {
    1451           0 :         return *this;
    1452             :     }
    1453             :     // Delete buffer first to defragment memory less.
    1454           0 :     if (buffer != NULL) {
    1455           0 :         uprv_free(buffer);
    1456           0 :         buffer = NULL;
    1457             :     }
    1458           0 :     if (len < capacity) {
    1459             :         // Make the capacity equal to len or 1.
    1460             :         // We don't want to realloc of 0 size.
    1461           0 :         int32_t newCapacity = len + (len == 0);
    1462           0 :         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
    1463           0 :         if (temp) {
    1464           0 :             list = temp;
    1465           0 :             capacity = newCapacity;
    1466             :         }
    1467             :         // else what the heck happened?! We allocated less memory!
    1468             :         // Oh well. We'll keep our original array.
    1469             :     }
    1470           0 :     return *this;
    1471             : }
    1472             : 
    1473             : #ifdef DEBUG_SERIALIZE
    1474             : #include <stdio.h>
    1475             : #endif
    1476             : 
    1477             : /**
    1478             :  * Deserialize constructor.
    1479             :  */
    1480           0 : UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
    1481             :   : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
    1482             :     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
    1483           0 :     fFlags(0) {
    1484             : 
    1485           0 :   if(U_FAILURE(ec)) {
    1486           0 :     setToBogus();
    1487           0 :     return;
    1488             :   }
    1489             : 
    1490           0 :   if( (serialization != kSerialized)
    1491           0 :       || (data==NULL)
    1492           0 :       || (dataLen < 1)) {
    1493           0 :     ec = U_ILLEGAL_ARGUMENT_ERROR;
    1494           0 :     setToBogus();
    1495           0 :     return;
    1496             :   }
    1497             : 
    1498           0 :   allocateStrings(ec);
    1499           0 :   if (U_FAILURE(ec)) {
    1500           0 :     setToBogus();
    1501           0 :     return;
    1502             :   }
    1503             : 
    1504             :   // bmp?
    1505           0 :   int32_t headerSize = ((data[0]&0x8000)) ?2:1;
    1506           0 :   int32_t bmpLength = (headerSize==1)?data[0]:data[1];
    1507             : 
    1508           0 :   len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
    1509             : #ifdef DEBUG_SERIALIZE
    1510             :   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
    1511             : #endif
    1512           0 :   capacity = len+1;
    1513           0 :   list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
    1514           0 :   if(!list || U_FAILURE(ec)) {
    1515           0 :     setToBogus();
    1516           0 :     return;
    1517             :   }
    1518             :   // copy bmp
    1519             :   int32_t i;
    1520           0 :   for(i = 0; i< bmpLength;i++) {
    1521           0 :     list[i] = data[i+headerSize];
    1522             : #ifdef DEBUG_SERIALIZE
    1523             :     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
    1524             : #endif
    1525             :   }
    1526             :   // copy smp
    1527           0 :   for(i=bmpLength;i<len;i++) {
    1528           0 :     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
    1529           0 :               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
    1530             : #ifdef DEBUG_SERIALIZE
    1531             :     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
    1532             : #endif
    1533             :   }
    1534             :   // terminator
    1535           0 :   list[len++]=UNICODESET_HIGH;
    1536             : }
    1537             : 
    1538             : 
    1539           0 : int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
    1540             :     int32_t bmpLength, length, destLength;
    1541             : 
    1542           0 :     if (U_FAILURE(ec)) {
    1543           0 :         return 0;
    1544             :     }
    1545             : 
    1546           0 :     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
    1547           0 :         ec=U_ILLEGAL_ARGUMENT_ERROR;
    1548           0 :         return 0;
    1549             :     }
    1550             : 
    1551             :     /* count necessary 16-bit units */
    1552           0 :     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
    1553             :     // assert(length>=0);
    1554           0 :     if (length==0) {
    1555             :         /* empty set */
    1556           0 :         if (destCapacity>0) {
    1557           0 :             *dest=0;
    1558             :         } else {
    1559           0 :             ec=U_BUFFER_OVERFLOW_ERROR;
    1560             :         }
    1561           0 :         return 1;
    1562             :     }
    1563             :     /* now length>0 */
    1564             : 
    1565           0 :     if (this->list[length-1]<=0xffff) {
    1566             :         /* all BMP */
    1567           0 :         bmpLength=length;
    1568           0 :     } else if (this->list[0]>=0x10000) {
    1569             :         /* all supplementary */
    1570           0 :         bmpLength=0;
    1571           0 :         length*=2;
    1572             :     } else {
    1573             :         /* some BMP, some supplementary */
    1574           0 :         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
    1575           0 :         length=bmpLength+2*(length-bmpLength);
    1576             :     }
    1577             : #ifdef DEBUG_SERIALIZE
    1578             :     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
    1579             : #endif
    1580             :     /* length: number of 16-bit array units */
    1581           0 :     if (length>0x7fff) {
    1582             :         /* there are only 15 bits for the length in the first serialized word */
    1583           0 :         ec=U_INDEX_OUTOFBOUNDS_ERROR;
    1584           0 :         return 0;
    1585             :     }
    1586             : 
    1587             :     /*
    1588             :      * total serialized length:
    1589             :      * number of 16-bit array units (length) +
    1590             :      * 1 length unit (always) +
    1591             :      * 1 bmpLength unit (if there are supplementary values)
    1592             :      */
    1593           0 :     destLength=length+((length>bmpLength)?2:1);
    1594           0 :     if (destLength<=destCapacity) {
    1595             :         const UChar32 *p;
    1596             :         int32_t i;
    1597             : 
    1598             : #ifdef DEBUG_SERIALIZE
    1599             :         printf("writeHdr\n");
    1600             : #endif
    1601           0 :         *dest=(uint16_t)length;
    1602           0 :         if (length>bmpLength) {
    1603           0 :             *dest|=0x8000;
    1604           0 :             *++dest=(uint16_t)bmpLength;
    1605             :         }
    1606           0 :         ++dest;
    1607             : 
    1608             :         /* write the BMP part of the array */
    1609           0 :         p=this->list;
    1610           0 :         for (i=0; i<bmpLength; ++i) {
    1611             : #ifdef DEBUG_SERIALIZE
    1612             :           printf("writebmp: %x\n", (int)*p);
    1613             : #endif
    1614           0 :             *dest++=(uint16_t)*p++;
    1615             :         }
    1616             : 
    1617             :         /* write the supplementary part of the array */
    1618           0 :         for (; i<length; i+=2) {
    1619             : #ifdef DEBUG_SERIALIZE
    1620             :           printf("write32: %x\n", (int)*p);
    1621             : #endif
    1622           0 :             *dest++=(uint16_t)(*p>>16);
    1623           0 :             *dest++=(uint16_t)*p++;
    1624             :         }
    1625             :     } else {
    1626           0 :         ec=U_BUFFER_OVERFLOW_ERROR;
    1627             :     }
    1628           0 :     return destLength;
    1629             : }
    1630             : 
    1631             : //----------------------------------------------------------------
    1632             : // Implementation: Utility methods
    1633             : //----------------------------------------------------------------
    1634             : 
    1635             : /**
    1636             :  * Allocate our strings vector and return TRUE if successful.
    1637             :  */
    1638           0 : UBool UnicodeSet::allocateStrings(UErrorCode &status) {
    1639           0 :     if (U_FAILURE(status)) {
    1640           0 :         return FALSE;
    1641             :     }
    1642           0 :     strings = new UVector(uprv_deleteUObject,
    1643           0 :                           uhash_compareUnicodeString, 1, status);
    1644           0 :     if (strings == NULL) { // Check for memory allocation error.
    1645           0 :         status = U_MEMORY_ALLOCATION_ERROR;
    1646           0 :         return FALSE;
    1647             :     }
    1648           0 :     if (U_FAILURE(status)) {
    1649           0 :         delete strings;
    1650           0 :         strings = NULL;
    1651           0 :         return FALSE;
    1652             :     } 
    1653           0 :     return TRUE;
    1654             : }
    1655             : 
    1656           0 : void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
    1657           0 :     if (newLen <= capacity)
    1658           0 :         return;
    1659           0 :     UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
    1660           0 :     if (temp == NULL) {
    1661           0 :         ec = U_MEMORY_ALLOCATION_ERROR;
    1662           0 :         setToBogus();
    1663           0 :         return;
    1664             :     }
    1665           0 :     list = temp;
    1666           0 :     capacity = newLen + GROW_EXTRA;
    1667             :     // else we keep the original contents on the memory failure.
    1668             : }
    1669             : 
    1670           0 : void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
    1671           0 :     if (buffer != NULL && newLen <= bufferCapacity)
    1672           0 :         return;
    1673           0 :     UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
    1674           0 :     if (temp == NULL) {
    1675           0 :         ec = U_MEMORY_ALLOCATION_ERROR;
    1676           0 :         setToBogus();
    1677           0 :         return;
    1678             :     }
    1679           0 :     buffer = temp;
    1680           0 :     bufferCapacity = newLen + GROW_EXTRA;
    1681             :     // else we keep the original contents on the memory failure.
    1682             : }
    1683             : 
    1684             : /**
    1685             :  * Swap list and buffer.
    1686             :  */
    1687           0 : void UnicodeSet::swapBuffers(void) {
    1688             :     // swap list and buffer
    1689           0 :     UChar32* temp = list;
    1690           0 :     list = buffer;
    1691           0 :     buffer = temp;
    1692             : 
    1693           0 :     int32_t c = capacity;
    1694           0 :     capacity = bufferCapacity;
    1695           0 :     bufferCapacity = c;
    1696           0 : }
    1697             : 
    1698           0 : void UnicodeSet::setToBogus() {
    1699           0 :     clear(); // Remove everything in the set.
    1700           0 :     fFlags = kIsBogus;
    1701           0 : }
    1702             : 
    1703             : //----------------------------------------------------------------
    1704             : // Implementation: Fundamental operators
    1705             : //----------------------------------------------------------------
    1706             : 
    1707           0 : static inline UChar32 max(UChar32 a, UChar32 b) {
    1708           0 :     return (a > b) ? a : b;
    1709             : }
    1710             : 
    1711             : // polarity = 0, 3 is normal: x xor y
    1712             : // polarity = 1, 2: x xor ~y == x === y
    1713             : 
    1714           0 : void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
    1715           0 :     if (isFrozen() || isBogus()) {
    1716           0 :         return;
    1717             :     }
    1718           0 :     UErrorCode status = U_ZERO_ERROR;
    1719           0 :     ensureBufferCapacity(len + otherLen, status);
    1720           0 :     if (U_FAILURE(status)) {
    1721           0 :         return;
    1722             :     }
    1723             : 
    1724           0 :     int32_t i = 0, j = 0, k = 0;
    1725           0 :     UChar32 a = list[i++];
    1726             :     UChar32 b;
    1727           0 :     if (polarity == 1 || polarity == 2) {
    1728           0 :         b = UNICODESET_LOW;
    1729           0 :         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
    1730           0 :             ++j;
    1731           0 :             b = other[j];
    1732             :         }
    1733             :     } else {
    1734           0 :         b = other[j++];
    1735             :     }
    1736             :     // simplest of all the routines
    1737             :     // sort the values, discarding identicals!
    1738             :     for (;;) {
    1739           0 :         if (a < b) {
    1740           0 :             buffer[k++] = a;
    1741           0 :             a = list[i++];
    1742           0 :         } else if (b < a) {
    1743           0 :             buffer[k++] = b;
    1744           0 :             b = other[j++];
    1745           0 :         } else if (a != UNICODESET_HIGH) { // at this point, a == b
    1746             :             // discard both values!
    1747           0 :             a = list[i++];
    1748           0 :             b = other[j++];
    1749             :         } else { // DONE!
    1750           0 :             buffer[k++] = UNICODESET_HIGH;
    1751           0 :             len = k;
    1752           0 :             break;
    1753             :         }
    1754             :     }
    1755           0 :     swapBuffers();
    1756           0 :     releasePattern();
    1757             : }
    1758             : 
    1759             : // polarity = 0 is normal: x union y
    1760             : // polarity = 2: x union ~y
    1761             : // polarity = 1: ~x union y
    1762             : // polarity = 3: ~x union ~y
    1763             : 
    1764           0 : void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
    1765           0 :     if (isFrozen() || isBogus() || other==NULL) {
    1766           0 :         return;
    1767             :     }
    1768           0 :     UErrorCode status = U_ZERO_ERROR;
    1769           0 :     ensureBufferCapacity(len + otherLen, status);
    1770           0 :     if (U_FAILURE(status)) {
    1771           0 :         return;
    1772             :     }
    1773             : 
    1774           0 :     int32_t i = 0, j = 0, k = 0;
    1775           0 :     UChar32 a = list[i++];
    1776           0 :     UChar32 b = other[j++];
    1777             :     // change from xor is that we have to check overlapping pairs
    1778             :     // polarity bit 1 means a is second, bit 2 means b is.
    1779             :     for (;;) {
    1780           0 :         switch (polarity) {
    1781             :           case 0: // both first; take lower if unequal
    1782           0 :             if (a < b) { // take a
    1783             :                 // Back up over overlapping ranges in buffer[]
    1784           0 :                 if (k > 0 && a <= buffer[k-1]) {
    1785             :                     // Pick latter end value in buffer[] vs. list[]
    1786           0 :                     a = max(list[i], buffer[--k]);
    1787             :                 } else {
    1788             :                     // No overlap
    1789           0 :                     buffer[k++] = a;
    1790           0 :                     a = list[i];
    1791             :                 }
    1792           0 :                 i++; // Common if/else code factored out
    1793           0 :                 polarity ^= 1;
    1794           0 :             } else if (b < a) { // take b
    1795           0 :                 if (k > 0 && b <= buffer[k-1]) {
    1796           0 :                     b = max(other[j], buffer[--k]);
    1797             :                 } else {
    1798           0 :                     buffer[k++] = b;
    1799           0 :                     b = other[j];
    1800             :                 }
    1801           0 :                 j++;
    1802           0 :                 polarity ^= 2;
    1803             :             } else { // a == b, take a, drop b
    1804           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1805             :                 // This is symmetrical; it doesn't matter if
    1806             :                 // we backtrack with a or b. - liu
    1807           0 :                 if (k > 0 && a <= buffer[k-1]) {
    1808           0 :                     a = max(list[i], buffer[--k]);
    1809             :                 } else {
    1810             :                     // No overlap
    1811           0 :                     buffer[k++] = a;
    1812           0 :                     a = list[i];
    1813             :                 }
    1814           0 :                 i++;
    1815           0 :                 polarity ^= 1;
    1816           0 :                 b = other[j++];
    1817           0 :                 polarity ^= 2;
    1818             :             }
    1819           0 :             break;
    1820             :           case 3: // both second; take higher if unequal, and drop other
    1821           0 :             if (b <= a) { // take a
    1822           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1823           0 :                 buffer[k++] = a;
    1824             :             } else { // take b
    1825           0 :                 if (b == UNICODESET_HIGH) goto loop_end;
    1826           0 :                 buffer[k++] = b;
    1827             :             }
    1828           0 :             a = list[i++];
    1829           0 :             polarity ^= 1;   // factored common code
    1830           0 :             b = other[j++];
    1831           0 :             polarity ^= 2;
    1832           0 :             break;
    1833             :           case 1: // a second, b first; if b < a, overlap
    1834           0 :             if (a < b) { // no overlap, take a
    1835           0 :                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
    1836           0 :             } else if (b < a) { // OVERLAP, drop b
    1837           0 :                 b = other[j++];
    1838           0 :                 polarity ^= 2;
    1839             :             } else { // a == b, drop both!
    1840           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1841           0 :                 a = list[i++];
    1842           0 :                 polarity ^= 1;
    1843           0 :                 b = other[j++];
    1844           0 :                 polarity ^= 2;
    1845             :             }
    1846           0 :             break;
    1847             :           case 2: // a first, b second; if a < b, overlap
    1848           0 :             if (b < a) { // no overlap, take b
    1849           0 :                 buffer[k++] = b;
    1850           0 :                 b = other[j++];
    1851           0 :                 polarity ^= 2;
    1852           0 :             } else  if (a < b) { // OVERLAP, drop a
    1853           0 :                 a = list[i++];
    1854           0 :                 polarity ^= 1;
    1855             :             } else { // a == b, drop both!
    1856           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1857           0 :                 a = list[i++];
    1858           0 :                 polarity ^= 1;
    1859           0 :                 b = other[j++];
    1860           0 :                 polarity ^= 2;
    1861             :             }
    1862           0 :             break;
    1863             :         }
    1864             :     }
    1865             :  loop_end:
    1866           0 :     buffer[k++] = UNICODESET_HIGH;    // terminate
    1867           0 :     len = k;
    1868           0 :     swapBuffers();
    1869           0 :     releasePattern();
    1870             : }
    1871             : 
    1872             : // polarity = 0 is normal: x intersect y
    1873             : // polarity = 2: x intersect ~y == set-minus
    1874             : // polarity = 1: ~x intersect y
    1875             : // polarity = 3: ~x intersect ~y
    1876             : 
    1877           0 : void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
    1878           0 :     if (isFrozen() || isBogus()) {
    1879           0 :         return;
    1880             :     }
    1881           0 :     UErrorCode status = U_ZERO_ERROR;
    1882           0 :     ensureBufferCapacity(len + otherLen, status);
    1883           0 :     if (U_FAILURE(status)) {
    1884           0 :         return;
    1885             :     }
    1886             : 
    1887           0 :     int32_t i = 0, j = 0, k = 0;
    1888           0 :     UChar32 a = list[i++];
    1889           0 :     UChar32 b = other[j++];
    1890             :     // change from xor is that we have to check overlapping pairs
    1891             :     // polarity bit 1 means a is second, bit 2 means b is.
    1892             :     for (;;) {
    1893           0 :         switch (polarity) {
    1894             :           case 0: // both first; drop the smaller
    1895           0 :             if (a < b) { // drop a
    1896           0 :                 a = list[i++];
    1897           0 :                 polarity ^= 1;
    1898           0 :             } else if (b < a) { // drop b
    1899           0 :                 b = other[j++];
    1900           0 :                 polarity ^= 2;
    1901             :             } else { // a == b, take one, drop other
    1902           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1903           0 :                 buffer[k++] = a;
    1904           0 :                 a = list[i++];
    1905           0 :                 polarity ^= 1;
    1906           0 :                 b = other[j++];
    1907           0 :                 polarity ^= 2;
    1908             :             }
    1909           0 :             break;
    1910             :           case 3: // both second; take lower if unequal
    1911           0 :             if (a < b) { // take a
    1912           0 :                 buffer[k++] = a;
    1913           0 :                 a = list[i++];
    1914           0 :                 polarity ^= 1;
    1915           0 :             } else if (b < a) { // take b
    1916           0 :                 buffer[k++] = b;
    1917           0 :                 b = other[j++];
    1918           0 :                 polarity ^= 2;
    1919             :             } else { // a == b, take one, drop other
    1920           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1921           0 :                 buffer[k++] = a;
    1922           0 :                 a = list[i++];
    1923           0 :                 polarity ^= 1;
    1924           0 :                 b = other[j++];
    1925           0 :                 polarity ^= 2;
    1926             :             }
    1927           0 :             break;
    1928             :           case 1: // a second, b first;
    1929           0 :             if (a < b) { // NO OVERLAP, drop a
    1930           0 :                 a = list[i++];
    1931           0 :                 polarity ^= 1;
    1932           0 :             } else if (b < a) { // OVERLAP, take b
    1933           0 :                 buffer[k++] = b;
    1934           0 :                 b = other[j++];
    1935           0 :                 polarity ^= 2;
    1936             :             } else { // a == b, drop both!
    1937           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1938           0 :                 a = list[i++];
    1939           0 :                 polarity ^= 1;
    1940           0 :                 b = other[j++];
    1941           0 :                 polarity ^= 2;
    1942             :             }
    1943           0 :             break;
    1944             :           case 2: // a first, b second; if a < b, overlap
    1945           0 :             if (b < a) { // no overlap, drop b
    1946           0 :                 b = other[j++];
    1947           0 :                 polarity ^= 2;
    1948           0 :             } else  if (a < b) { // OVERLAP, take a
    1949           0 :                 buffer[k++] = a;
    1950           0 :                 a = list[i++];
    1951           0 :                 polarity ^= 1;
    1952             :             } else { // a == b, drop both!
    1953           0 :                 if (a == UNICODESET_HIGH) goto loop_end;
    1954           0 :                 a = list[i++];
    1955           0 :                 polarity ^= 1;
    1956           0 :                 b = other[j++];
    1957           0 :                 polarity ^= 2;
    1958             :             }
    1959           0 :             break;
    1960             :         }
    1961             :     }
    1962             :  loop_end:
    1963           0 :     buffer[k++] = UNICODESET_HIGH;    // terminate
    1964           0 :     len = k;
    1965           0 :     swapBuffers();
    1966           0 :     releasePattern();
    1967             : }
    1968             : 
    1969             : /**
    1970             :  * Append the <code>toPattern()</code> representation of a
    1971             :  * string to the given <code>StringBuffer</code>.
    1972             :  */
    1973           0 : void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
    1974             : escapeUnprintable) {
    1975             :     UChar32 cp;
    1976           0 :     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
    1977           0 :         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
    1978             :     }
    1979           0 : }
    1980             : 
    1981             : /**
    1982             :  * Append the <code>toPattern()</code> representation of a
    1983             :  * character to the given <code>StringBuffer</code>.
    1984             :  */
    1985           0 : void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
    1986             : escapeUnprintable) {
    1987           0 :     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
    1988             :         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
    1989             :         // unprintable
    1990           0 :         if (ICU_Utility::escapeUnprintable(buf, c)) {
    1991           0 :             return;
    1992             :         }
    1993             :     }
    1994             :     // Okay to let ':' pass through
    1995           0 :     switch (c) {
    1996             :     case SET_OPEN:
    1997             :     case SET_CLOSE:
    1998             :     case HYPHEN:
    1999             :     case COMPLEMENT:
    2000             :     case INTERSECTION:
    2001             :     case BACKSLASH:
    2002             :     case OPEN_BRACE:
    2003             :     case CLOSE_BRACE:
    2004             :     case COLON:
    2005             :     case SymbolTable::SYMBOL_REF:
    2006           0 :         buf.append(BACKSLASH);
    2007           0 :         break;
    2008             :     default:
    2009             :         // Escape whitespace
    2010           0 :         if (PatternProps::isWhiteSpace(c)) {
    2011           0 :             buf.append(BACKSLASH);
    2012             :         }
    2013           0 :         break;
    2014             :     }
    2015           0 :     buf.append(c);
    2016             : }
    2017             : 
    2018             : /**
    2019             :  * Append a string representation of this set to result.  This will be
    2020             :  * a cleaned version of the string passed to applyPattern(), if there
    2021             :  * is one.  Otherwise it will be generated.
    2022             :  */
    2023           0 : UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
    2024             :                                       UBool escapeUnprintable) const
    2025             : {
    2026           0 :     if (pat != NULL) {
    2027             :         int32_t i;
    2028           0 :         int32_t backslashCount = 0;
    2029           0 :         for (i=0; i<patLen; ) {
    2030             :             UChar32 c;
    2031           0 :             U16_NEXT(pat, i, patLen, c);
    2032           0 :             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
    2033             :                 // If the unprintable character is preceded by an odd
    2034             :                 // number of backslashes, then it has been escaped.
    2035             :                 // Before unescaping it, we delete the final
    2036             :                 // backslash.
    2037           0 :                 if ((backslashCount % 2) == 1) {
    2038           0 :                     result.truncate(result.length() - 1);
    2039             :                 }
    2040           0 :                 ICU_Utility::escapeUnprintable(result, c);
    2041           0 :                 backslashCount = 0;
    2042             :             } else {
    2043           0 :                 result.append(c);
    2044           0 :                 if (c == BACKSLASH) {
    2045           0 :                     ++backslashCount;
    2046             :                 } else {
    2047           0 :                     backslashCount = 0;
    2048             :                 }
    2049             :             }
    2050             :         }
    2051           0 :         return result;
    2052             :     }
    2053             : 
    2054           0 :     return _generatePattern(result, escapeUnprintable);
    2055             : }
    2056             : 
    2057             : /**
    2058             :  * Returns a string representation of this set.  If the result of
    2059             :  * calling this function is passed to a UnicodeSet constructor, it
    2060             :  * will produce another set that is equal to this one.
    2061             :  */
    2062           0 : UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
    2063             :                                      UBool escapeUnprintable) const
    2064             : {
    2065           0 :     result.truncate(0);
    2066           0 :     return _toPattern(result, escapeUnprintable);
    2067             : }
    2068             : 
    2069             : /**
    2070             :  * Generate and append a string representation of this set to result.
    2071             :  * This does not use this.pat, the cleaned up copy of the string
    2072             :  * passed to applyPattern().
    2073             :  */
    2074           0 : UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
    2075             :                                             UBool escapeUnprintable) const
    2076             : {
    2077           0 :     result.append(SET_OPEN);
    2078             : 
    2079             : //  // Check against the predefined categories.  We implicitly build
    2080             : //  // up ALL category sets the first time toPattern() is called.
    2081             : //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
    2082             : //      if (*this == getCategorySet(cat)) {
    2083             : //          result.append(COLON);
    2084             : //          result.append(CATEGORY_NAMES, cat*2, 2);
    2085             : //          return result.append(CATEGORY_CLOSE);
    2086             : //      }
    2087             : //  }
    2088             : 
    2089           0 :     int32_t count = getRangeCount();
    2090             : 
    2091             :     // If the set contains at least 2 intervals and includes both
    2092             :     // MIN_VALUE and MAX_VALUE, then the inverse representation will
    2093             :     // be more economical.
    2094           0 :     if (count > 1 &&
    2095           0 :         getRangeStart(0) == MIN_VALUE &&
    2096           0 :         getRangeEnd(count-1) == MAX_VALUE) {
    2097             : 
    2098             :         // Emit the inverse
    2099           0 :         result.append(COMPLEMENT);
    2100             : 
    2101           0 :         for (int32_t i = 1; i < count; ++i) {
    2102           0 :             UChar32 start = getRangeEnd(i-1)+1;
    2103           0 :             UChar32 end = getRangeStart(i)-1;
    2104           0 :             _appendToPat(result, start, escapeUnprintable);
    2105           0 :             if (start != end) {
    2106           0 :                 if ((start+1) != end) {
    2107           0 :                     result.append(HYPHEN);
    2108             :                 }
    2109           0 :                 _appendToPat(result, end, escapeUnprintable);
    2110             :             }
    2111             :         }
    2112             :     }
    2113             : 
    2114             :     // Default; emit the ranges as pairs
    2115             :     else {
    2116           0 :         for (int32_t i = 0; i < count; ++i) {
    2117           0 :             UChar32 start = getRangeStart(i);
    2118           0 :             UChar32 end = getRangeEnd(i);
    2119           0 :             _appendToPat(result, start, escapeUnprintable);
    2120           0 :             if (start != end) {
    2121           0 :                 if ((start+1) != end) {
    2122           0 :                     result.append(HYPHEN);
    2123             :                 }
    2124           0 :                 _appendToPat(result, end, escapeUnprintable);
    2125             :             }
    2126             :         }
    2127             :     }
    2128             : 
    2129           0 :     for (int32_t i = 0; i<strings->size(); ++i) {
    2130           0 :         result.append(OPEN_BRACE);
    2131           0 :         _appendToPat(result,
    2132           0 :                      *(const UnicodeString*) strings->elementAt(i),
    2133           0 :                      escapeUnprintable);
    2134           0 :         result.append(CLOSE_BRACE);
    2135             :     }
    2136           0 :     return result.append(SET_CLOSE);
    2137             : }
    2138             : 
    2139             : /**
    2140             : * Release existing cached pattern
    2141             : */
    2142           0 : void UnicodeSet::releasePattern() {
    2143           0 :     if (pat) {
    2144           0 :         uprv_free(pat);
    2145           0 :         pat = NULL;
    2146           0 :         patLen = 0;
    2147             :     }
    2148           0 : }
    2149             : 
    2150             : /**
    2151             : * Set the new pattern to cache.
    2152             : */
    2153           0 : void UnicodeSet::setPattern(const UnicodeString& newPat) {
    2154           0 :     releasePattern();
    2155           0 :     int32_t newPatLen = newPat.length();
    2156           0 :     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
    2157           0 :     if (pat) {
    2158           0 :         patLen = newPatLen;
    2159           0 :         newPat.extractBetween(0, patLen, pat);
    2160           0 :         pat[patLen] = 0;
    2161             :     }
    2162             :     // else we don't care if malloc failed. This was just a nice cache.
    2163             :     // We can regenerate an equivalent pattern later when requested.
    2164           0 : }
    2165             : 
    2166           0 : UnicodeFunctor *UnicodeSet::freeze() {
    2167           0 :     if(!isFrozen() && !isBogus()) {
    2168             :         // Do most of what compact() does before freezing because
    2169             :         // compact() will not work when the set is frozen.
    2170             :         // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
    2171             : 
    2172             :         // Delete buffer first to defragment memory less.
    2173           0 :         if (buffer != NULL) {
    2174           0 :             uprv_free(buffer);
    2175           0 :             buffer = NULL;
    2176             :         }
    2177           0 :         if (capacity > (len + GROW_EXTRA)) {
    2178             :             // Make the capacity equal to len or 1.
    2179             :             // We don't want to realloc of 0 size.
    2180           0 :             capacity = len + (len == 0);
    2181           0 :             list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
    2182           0 :             if (list == NULL) { // Check for memory allocation error.
    2183           0 :                 setToBogus();
    2184           0 :                 return this;
    2185             :             }
    2186             :         }
    2187             : 
    2188             :         // Optimize contains() and span() and similar functions.
    2189           0 :         if (!strings->isEmpty()) {
    2190           0 :             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
    2191           0 :             if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
    2192             :                 // All strings are irrelevant for span() etc. because
    2193             :                 // all of each string's code points are contained in this set.
    2194             :                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
    2195             :                 // many relevant strings as UTF-16.
    2196             :                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
    2197           0 :                 delete stringSpan;
    2198           0 :                 stringSpan = NULL;
    2199             :             }
    2200             :         }
    2201           0 :         if (stringSpan == NULL) {
    2202             :             // No span-relevant strings: Optimize for code point spans.
    2203           0 :             bmpSet=new BMPSet(list, len);
    2204           0 :             if (bmpSet == NULL) { // Check for memory allocation error.
    2205           0 :                 setToBogus();
    2206             :             }
    2207             :         }
    2208             :     }
    2209           0 :     return this;
    2210             : }
    2211             : 
    2212           0 : int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    2213           0 :     if(length>0 && bmpSet!=NULL) {
    2214           0 :         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
    2215             :     }
    2216           0 :     if(length<0) {
    2217           0 :         length=u_strlen(s);
    2218             :     }
    2219           0 :     if(length==0) {
    2220           0 :         return 0;
    2221             :     }
    2222           0 :     if(stringSpan!=NULL) {
    2223           0 :         return stringSpan->span(s, length, spanCondition);
    2224           0 :     } else if(!strings->isEmpty()) {
    2225           0 :         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
    2226             :                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
    2227           0 :                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
    2228           0 :         UnicodeSetStringSpan strSpan(*this, *strings, which);
    2229           0 :         if(strSpan.needsStringSpanUTF16()) {
    2230           0 :             return strSpan.span(s, length, spanCondition);
    2231             :         }
    2232             :     }
    2233             : 
    2234           0 :     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    2235           0 :         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    2236             :     }
    2237             : 
    2238             :     UChar32 c;
    2239           0 :     int32_t start=0, prev=0;
    2240           0 :     do {
    2241           0 :         U16_NEXT(s, start, length, c);
    2242           0 :         if(spanCondition!=contains(c)) {
    2243           0 :             break;
    2244             :         }
    2245           0 :     } while((prev=start)<length);
    2246           0 :     return prev;
    2247             : }
    2248             : 
    2249           0 : int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
    2250           0 :     if(length>0 && bmpSet!=NULL) {
    2251           0 :         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
    2252             :     }
    2253           0 :     if(length<0) {
    2254           0 :         length=u_strlen(s);
    2255             :     }
    2256           0 :     if(length==0) {
    2257           0 :         return 0;
    2258             :     }
    2259           0 :     if(stringSpan!=NULL) {
    2260           0 :         return stringSpan->spanBack(s, length, spanCondition);
    2261           0 :     } else if(!strings->isEmpty()) {
    2262           0 :         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
    2263             :                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
    2264           0 :                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
    2265           0 :         UnicodeSetStringSpan strSpan(*this, *strings, which);
    2266           0 :         if(strSpan.needsStringSpanUTF16()) {
    2267           0 :             return strSpan.spanBack(s, length, spanCondition);
    2268             :         }
    2269             :     }
    2270             : 
    2271           0 :     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    2272           0 :         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    2273             :     }
    2274             : 
    2275             :     UChar32 c;
    2276           0 :     int32_t prev=length;
    2277           0 :     do {
    2278           0 :         U16_PREV(s, 0, length, c);
    2279           0 :         if(spanCondition!=contains(c)) {
    2280           0 :             break;
    2281             :         }
    2282           0 :     } while((prev=length)>0);
    2283           0 :     return prev;
    2284             : }
    2285             : 
    2286           0 : int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
    2287           0 :     if(length>0 && bmpSet!=NULL) {
    2288           0 :         const uint8_t *s0=(const uint8_t *)s;
    2289           0 :         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
    2290             :     }
    2291           0 :     if(length<0) {
    2292           0 :         length=(int32_t)uprv_strlen(s);
    2293             :     }
    2294           0 :     if(length==0) {
    2295           0 :         return 0;
    2296             :     }
    2297           0 :     if(stringSpan!=NULL) {
    2298           0 :         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
    2299           0 :     } else if(!strings->isEmpty()) {
    2300           0 :         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
    2301             :                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
    2302           0 :                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
    2303           0 :         UnicodeSetStringSpan strSpan(*this, *strings, which);
    2304           0 :         if(strSpan.needsStringSpanUTF8()) {
    2305           0 :             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
    2306             :         }
    2307             :     }
    2308             : 
    2309           0 :     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    2310           0 :         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    2311             :     }
    2312             : 
    2313             :     UChar32 c;
    2314           0 :     int32_t start=0, prev=0;
    2315           0 :     do {
    2316           0 :         U8_NEXT_OR_FFFD(s, start, length, c);
    2317           0 :         if(spanCondition!=contains(c)) {
    2318           0 :             break;
    2319             :         }
    2320           0 :     } while((prev=start)<length);
    2321           0 :     return prev;
    2322             : }
    2323             : 
    2324           0 : int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
    2325           0 :     if(length>0 && bmpSet!=NULL) {
    2326           0 :         const uint8_t *s0=(const uint8_t *)s;
    2327           0 :         return bmpSet->spanBackUTF8(s0, length, spanCondition);
    2328             :     }
    2329           0 :     if(length<0) {
    2330           0 :         length=(int32_t)uprv_strlen(s);
    2331             :     }
    2332           0 :     if(length==0) {
    2333           0 :         return 0;
    2334             :     }
    2335           0 :     if(stringSpan!=NULL) {
    2336           0 :         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
    2337           0 :     } else if(!strings->isEmpty()) {
    2338           0 :         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
    2339             :                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
    2340           0 :                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
    2341           0 :         UnicodeSetStringSpan strSpan(*this, *strings, which);
    2342           0 :         if(strSpan.needsStringSpanUTF8()) {
    2343           0 :             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
    2344             :         }
    2345             :     }
    2346             : 
    2347           0 :     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
    2348           0 :         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
    2349             :     }
    2350             : 
    2351             :     UChar32 c;
    2352           0 :     int32_t prev=length;
    2353           0 :     do {
    2354           0 :         U8_PREV_OR_FFFD(s, 0, length, c);
    2355           0 :         if(spanCondition!=contains(c)) {
    2356           0 :             break;
    2357             :         }
    2358           0 :     } while((prev=length)>0);
    2359           0 :     return prev;
    2360             : }
    2361             : 
    2362             : U_NAMESPACE_END

Generated by: LCOV version 1.13