LCOV - code coverage report
Current view: top level - intl/icu/source/common - uniset_props.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 561 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 33 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : *
       6             : *   Copyright (C) 1999-2014, International Business Machines
       7             : *   Corporation and others.  All Rights Reserved.
       8             : *
       9             : *******************************************************************************
      10             : *   file name:  uniset_props.cpp
      11             : *   encoding:   UTF-8
      12             : *   tab size:   8 (not used)
      13             : *   indentation:4
      14             : *
      15             : *   created on: 2004aug25
      16             : *   created by: Markus W. Scherer
      17             : *
      18             : *   Character property dependent functions moved here from uniset.cpp
      19             : */
      20             : 
      21             : #include "unicode/utypes.h"
      22             : #include "unicode/uniset.h"
      23             : #include "unicode/parsepos.h"
      24             : #include "unicode/uchar.h"
      25             : #include "unicode/uscript.h"
      26             : #include "unicode/symtable.h"
      27             : #include "unicode/uset.h"
      28             : #include "unicode/locid.h"
      29             : #include "unicode/brkiter.h"
      30             : #include "uset_imp.h"
      31             : #include "ruleiter.h"
      32             : #include "cmemory.h"
      33             : #include "ucln_cmn.h"
      34             : #include "util.h"
      35             : #include "uvector.h"
      36             : #include "uprops.h"
      37             : #include "propname.h"
      38             : #include "normalizer2impl.h"
      39             : #include "ucase.h"
      40             : #include "ubidi_props.h"
      41             : #include "uinvchar.h"
      42             : #include "uprops.h"
      43             : #include "charstr.h"
      44             : #include "cstring.h"
      45             : #include "mutex.h"
      46             : #include "umutex.h"
      47             : #include "uassert.h"
      48             : #include "hash.h"
      49             : 
      50             : U_NAMESPACE_USE
      51             : 
      52             : // Initial capacity of a set's list array, counted in UChar32 entries (used by the pattern constructor below). Must be >= 0
      53             : // *** same as in uniset.cpp ! ***
      54             : #define START_EXTRA 16
      55             : 
      56             : // Define UChar constants by numeric code (hex; OPEN_BRACE, CLOSE_BRACE and UPPER_N use the decimal value) for EBCDIC compatibility
      57             : // Uses #define to reduce private static exports and memory access time.
      58             : #define SET_OPEN        ((UChar)0x005B) /*[*/
      59             : #define SET_CLOSE       ((UChar)0x005D) /*]*/
      60             : #define HYPHEN          ((UChar)0x002D) /*-*/
      61             : #define COMPLEMENT      ((UChar)0x005E) /*^*/
      62             : #define COLON           ((UChar)0x003A) /*:*/
      63             : #define BACKSLASH       ((UChar)0x005C) /*\*/
      64             : #define INTERSECTION    ((UChar)0x0026) /*&*/
      65             : #define UPPER_U         ((UChar)0x0055) /*U*/
      66             : #define LOWER_U         ((UChar)0x0075) /*u*/
      67             : #define OPEN_BRACE      ((UChar)123)    /*{*/
      68             : #define CLOSE_BRACE     ((UChar)125)    /*}*/
      69             : #define UPPER_P         ((UChar)0x0050) /*P*/
      70             : #define LOWER_P         ((UChar)0x0070) /*p*/
      71             : #define UPPER_N         ((UChar)78)     /*N*/
      72             : #define EQUALS          ((UChar)0x003D) /*=*/
      73             : // NUL-terminated delimiter strings built from the constants above; the commented-out ones are kept for reference only.
      74             : //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
      75             : static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
      76             : //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
      77             : //static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
      78             : //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
      79             : static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
      80             : 
      81             : // Special property set IDs (the trailing comment on each shows the equivalent set)
      82             : static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
      83             : static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
      84             : static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
      85             : 
      86             : // Unicode name property alias, and its length in chars (must match the NAME_PROP literal)
      87             : #define NAME_PROP "na"
      88             : #define NAME_PROP_LENGTH 2
      89             : 
      90             : /**
      91             :  * Delimiter string used in patterns to close a category reference:
      92             :  * ":]".  Example: "[:Lu:]".
      93             :  */
      94             : //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
      95             : 
      96             : // Cached sets ------------------------------------------------------------- ***
      97             : 
      98             : U_CDECL_BEGIN
      99             : static UBool U_CALLCONV uset_cleanup(); // forward declaration; defined further down in this file
     100             : // One cache slot: a lazily built set plus the init-once guard that protects its creation.
     101             : struct Inclusion {
     102             :     UnicodeSet  *fSet; // allocated by UnicodeSet_initInclusion(), deleted by uset_cleanup()
     103             :     UInitOnce    fInitOnce; // passed to umtx_initOnce() in UnicodeSet::getInclusions()
     104             : };
     105             : static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() results, indexed by property source (src)
     106             : // Cached set created by createUni32Set() and returned by uniset_getUnicode32Instance().
     107             : static UnicodeSet *uni32Singleton;
     108             : static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
     109             : 
     110             : //----------------------------------------------------------------
     111             : // Inclusions list
     112             : //----------------------------------------------------------------
     113             : 
     114             : // USetAdder implementation: each callback casts the USet* handle back to UnicodeSet* and forwards to it.
     115             : // Does not use uset.h to reduce code dependencies
     116             : static void U_CALLCONV
     117           0 : _set_add(USet *set, UChar32 c) { // single-code-point callback placed in the USetAdder built by UnicodeSet_initInclusion()
     118           0 :     ((UnicodeSet *)set)->add(c); // set was created from a UnicodeSet* (see the (USet *) cast in UnicodeSet_initInclusion)
     119           0 : }
     120             : 
     121             : static void U_CALLCONV
     122           0 : _set_addRange(USet *set, UChar32 start, UChar32 end) { // range callback placed in the USetAdder built by UnicodeSet_initInclusion()
     123           0 :     ((UnicodeSet *)set)->add(start, end); // forwards both bounds unchanged to UnicodeSet::add(start, end)
     124           0 : }
     125             : 
     126             : static void U_CALLCONV
     127           0 : _set_addString(USet *set, const UChar *str, int32_t length) { // string callback placed in the USetAdder built by UnicodeSet_initInclusion()
     128           0 :     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); // NOTE(review): first ctor argument is TRUE only when length<0; presumably the "isTerminated" flag of UnicodeString's aliasing constructor - confirm in unistr.h
     129           0 : }
     130             : 
     131             : /**
     132             :  * Cleanup function for UnicodeSet: frees every cached inclusion set and the uni32 singleton, and resets their init-once guards. Always returns TRUE.
     133             :  */
     134           0 : static UBool U_CALLCONV uset_cleanup(void) { // registered through ucln_common_registerCleanup(UCLN_COMMON_USET, ...) by the init functions in this file
     135           0 :     for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { // NOTE(review): assumes UPROPS_SRC_NONE is the first valid index (0) - confirm in uprops.h
     136           0 :         Inclusion &in = gInclusions[i];
     137           0 :         delete in.fSet;
     138           0 :         in.fSet = NULL; // leave the slot empty so UnicodeSet_initInclusion()'s U_ASSERT(incl == NULL) holds on a later re-init
     139           0 :         in.fInitOnce.reset();
     140             :     }
     141             : 
     142           0 :     delete uni32Singleton;
     143           0 :     uni32Singleton = NULL; // matches the U_ASSERT(uni32Singleton == NULL) precondition in createUni32Set()
     144           0 :     uni32InitOnce.reset();
     145           0 :     return TRUE;
     146             : }
     147             : 
     148             : U_CDECL_END
     149             : 
     150             : U_NAMESPACE_BEGIN
     151             : 
     152             : /*
     153             : Reduce excessive reallocation, and make it easier to detect initialization problems.
     154             : Usually you don't see smaller sets than this for Unicode 5.0.
     155             : */
     156             : #define DEFAULT_INCLUSION_CAPACITY 3072
     157             : 
     158           0 : void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
     159             :     // This function is invoked only via umtx_initOnce().
     160             :     // This function is a friend of class UnicodeSet.
     161             :     // It builds the set cached in gInclusions[src].fSet; on any failure status is set and the slot is left NULL.
     162           0 :     U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
     163           0 :     UnicodeSet * &incl = gInclusions[src].fSet; // reference to the cache slot itself, so the assignments below update the cache
     164           0 :     U_ASSERT(incl == NULL);
     165             : 
     166           0 :     incl = new UnicodeSet();
     167           0 :     if (incl == NULL) {
     168           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     169           0 :         return;
     170             :     }
     171             :     USetAdder sa = {
     172           0 :         (USet *)incl, // handle that the _set_* callbacks cast back to UnicodeSet*
     173             :         _set_add,
     174             :         _set_addRange,
     175             :         _set_addString,
     176             :         NULL, // don't need remove()
     177             :         NULL // don't need removeRange()
     178           0 :     };
     179             : 
     180           0 :     incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); // pre-size the set; a failure here is reported through status and caught after the switch
     181           0 :     switch(src) { // each case asks the data source(s) behind src to add their property starts through sa
     182             :     case UPROPS_SRC_CHAR:
     183           0 :         uchar_addPropertyStarts(&sa, &status);
     184           0 :         break;
     185             :     case UPROPS_SRC_PROPSVEC:
     186           0 :         upropsvec_addPropertyStarts(&sa, &status);
     187           0 :         break;
     188             :     case UPROPS_SRC_CHAR_AND_PROPSVEC:
     189           0 :         uchar_addPropertyStarts(&sa, &status);
     190           0 :         upropsvec_addPropertyStarts(&sa, &status);
     191           0 :         break;
     192             : #if !UCONFIG_NO_NORMALIZATION
     193             :     case UPROPS_SRC_CASE_AND_NORM: {
     194           0 :         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
     195           0 :         if(U_SUCCESS(status)) { // impl is only dereferenced when the factory call succeeded
     196           0 :             impl->addPropertyStarts(&sa, status);
     197             :         }
     198           0 :         ucase_addPropertyStarts(&sa, &status); // called unconditionally, with the same status
     199           0 :         break;
     200             :     }
     201             :     case UPROPS_SRC_NFC: {
     202           0 :         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
     203           0 :         if(U_SUCCESS(status)) {
     204           0 :             impl->addPropertyStarts(&sa, status);
     205             :         }
     206           0 :         break;
     207             :     }
     208             :     case UPROPS_SRC_NFKC: {
     209           0 :         const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
     210           0 :         if(U_SUCCESS(status)) {
     211           0 :             impl->addPropertyStarts(&sa, status);
     212             :         }
     213           0 :         break;
     214             :     }
     215             :     case UPROPS_SRC_NFKC_CF: {
     216           0 :         const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
     217           0 :         if(U_SUCCESS(status)) {
     218           0 :             impl->addPropertyStarts(&sa, status);
     219             :         }
     220           0 :         break;
     221             :     }
     222             :     case UPROPS_SRC_NFC_CANON_ITER: {
     223           0 :         const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
     224           0 :         if(U_SUCCESS(status)) {
     225           0 :             impl->addCanonIterPropertyStarts(&sa, status); // differs from UPROPS_SRC_NFC only in which Normalizer2Impl method is called
     226             :         }
     227           0 :         break;
     228             :     }
     229             : #endif
     230             :     case UPROPS_SRC_CASE:
     231           0 :         ucase_addPropertyStarts(&sa, &status);
     232           0 :         break;
     233             :     case UPROPS_SRC_BIDI:
     234           0 :         ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
     235           0 :         break;
     236             :     default: // any src without a case above (including the normalization sources when UCONFIG_NO_NORMALIZATION is set)
     237           0 :         status = U_INTERNAL_PROGRAM_ERROR;
     238           0 :         break;
     239             :     }
     240             : 
     241           0 :     if (U_FAILURE(status)) { // discard the partially built set; the cache slot goes back to NULL
     242           0 :         delete incl;
     243           0 :         incl = NULL;
     244           0 :         return;
     245             :     }
     246             :     // Compact for caching
     247           0 :     incl->compact();
     248           0 :     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); // only reached on success; uset_cleanup() frees all cached sets
     249             : }
     250             : 
     251             : 
     252             : 
     253           0 : const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { // returns the cached inclusion set for src, building it on first use
     254           0 :     U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT); // src is validated by this assertion only
     255           0 :     Inclusion &i = gInclusions[src];
     256           0 :     umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status); // runs UnicodeSet_initInclusion(src, status) under the slot's init-once guard
     257           0 :     return i.fSet; // pointer to the cached set itself, not a copy; NULL if UnicodeSet_initInclusion() left the slot empty
     258             : }
     259             : 
     260             : 
     261             : // Cache some sets for other services -------------------------------------- ***
     262           0 : void U_CALLCONV createUni32Set(UErrorCode &errorCode) { // init-once callback used by uniset_getUnicode32Instance()
     263           0 :     U_ASSERT(uni32Singleton == NULL);
     264           0 :     uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); // pattern-parsing failures are reported through errorCode
     265           0 :     if(uni32Singleton==NULL) {
     266           0 :         errorCode=U_MEMORY_ALLOCATION_ERROR;
     267             :     } else {
     268           0 :         uni32Singleton->freeze(); // runs whenever the allocation succeeded; errorCode is not consulted here
     269             :     }
     270           0 :     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); // registered unconditionally, even when creation failed
     271           0 : }
     272             : 
     273             : 
     274             : U_CFUNC UnicodeSet *
     275           0 : uniset_getUnicode32Instance(UErrorCode &errorCode) { // returns the cached "[:age=3.2:]" set, creating it on first use
     276           0 :     umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); // runs createUni32Set(errorCode) under the init-once guard
     277           0 :     return uni32Singleton; // NULL if createUni32Set() could not allocate the set
     278             : }
     279             : 
     280             : // helper functions for matching of pattern syntax pieces ------------------ ***
     281             : // these functions are parallel to the PERL_OPEN etc. strings above
     282             : 
     283             : // using these functions is not only faster than UnicodeString::compare() and
     284             : // caseCompare(), but they also make UnicodeSet work for simple patterns when
     285             : // no Unicode properties data is available - when caseCompare() fails
     286             : 
     287             : static inline UBool
     288           0 : isPerlOpen(const UnicodeString &pattern, int32_t pos) { // true if pattern has a backslash followed by 'p' or 'P' at pos
     289             :     UChar c; // holds the char after the backslash; assigned inside the condition below
     290           0 :     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
     291             : }
     292             : 
     293             : /*static inline UBool
     294             : isPerlClose(const UnicodeString &pattern, int32_t pos) {
     295             :     return pattern.charAt(pos)==CLOSE_BRACE;
     296             : }*/
     297             : 
     298             : static inline UBool
     299           0 : isNameOpen(const UnicodeString &pattern, int32_t pos) { // true if pattern has a backslash followed by upper-case 'N' at pos
     300           0 :     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
     301             : }
     302             : 
     303             : static inline UBool
     304           0 : isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { // true if pattern has "[:" at pos
     305           0 :     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
     306             : }
     307             : 
     308             : /*static inline UBool
     309             : isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
     310             :     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
     311             : }*/
     312             : 
     313             : // TODO memory debugging provided inside uniset.cpp
     314             : // could be made available here but probably obsolete with use of modern
     315             : // memory leak checker tools
     316             : #define _dbgct(me)
     317             : 
     318             : //----------------------------------------------------------------
     319             : // Constructors &c
     320             : //----------------------------------------------------------------
     321             : 
     322             : /**
     323             :  * Constructs a set by parsing the given pattern with applyPattern(pattern,
     324             :  * status), which ignores white space.  See the class description for the syntax of the
     325             :  * pattern language.  Nothing is allocated or parsed if status already indicates a failure.
     326             :  * @param pattern a string specifying what characters are in the set
     327             :  */
     328           0 : UnicodeSet::UnicodeSet(const UnicodeString& pattern,
     329           0 :                        UErrorCode& status) : // status: in/out error code; set to U_MEMORY_ALLOCATION_ERROR if the list allocation fails
     330             :     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
     331             :     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
     332           0 :     fFlags(0)
     333             : {
     334           0 :     if(U_SUCCESS(status)){
     335           0 :         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); // capacity is START_EXTRA at this point
     336             :         /* test for NULL */
     337           0 :         if(list == NULL) {
     338           0 :             status = U_MEMORY_ALLOCATION_ERROR;  
     339             :         }else{
     340           0 :             allocateStrings(status);
     341           0 :             applyPattern(pattern, status); // still called if allocateStrings() failed; applyPatternIgnoreSpace() returns early on a failure status
     342             :         }
     343             :     }
     344             :     _dbgct(this); // expands to nothing: _dbgct is defined as an empty macro in this file
     345           0 : }
     346             : 
     347             : //----------------------------------------------------------------
     348             : // Public API
     349             : //----------------------------------------------------------------
     350             : 
     351           0 : UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
     352             :                                      UErrorCode& status) {
     353             :     // Equivalent to
     354             :     //   return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
     355             :     // but without dependency on closeOver().
     356           0 :     ParsePosition pos(0); // parsing starts at the beginning of pattern
     357           0 :     applyPatternIgnoreSpace(pattern, pos, NULL, status); // NULL: no symbol table
     358           0 :     if (U_FAILURE(status)) return *this;
     359             : 
     360           0 :     int32_t i = pos.getIndex(); // index where the parser stopped
     361             :     // Skip over trailing whitespace
     362           0 :     ICU_Utility::skipWhitespace(pattern, i, TRUE);
     363           0 :     if (i != pattern.length()) { // something other than white space follows the parsed set
     364           0 :         status = U_ILLEGAL_ARGUMENT_ERROR; // only status is changed here; nothing in this function undoes the parse
     365             :     }
     366           0 :     return *this;
     367             : }
     368             : 
     369             : void
     370           0 : UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, // parses pattern from pos with USET_IGNORE_SPACE and stores the rebuilt pattern on success
     371             :                                     ParsePosition& pos,
     372             :                                     const SymbolTable* symbols,
     373             :                                     UErrorCode& status) {
     374           0 :     if (U_FAILURE(status)) {
     375           0 :         return;
     376             :     }
     377           0 :     if (isFrozen()) { // a frozen set is not modified; report the error instead
     378           0 :         status = U_NO_WRITE_PERMISSION;
     379           0 :         return;
     380             :     }
     381             :     // Need to build the pattern in a temporary string because
     382             :     // _applyPattern calls add() etc., which set pat to empty.
     383           0 :     UnicodeString rebuiltPat;
     384           0 :     RuleCharacterIterator chars(pattern, symbols, pos);
     385           0 :     applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); // NULL: no caseClosure member function
     386           0 :     if (U_FAILURE(status)) return;
     387           0 :     if (chars.inVariable()) {
     388             :         // syntaxError(chars, "Extra chars in variable value");
     389           0 :         status = U_MALFORMED_SET;
     390           0 :         return;
     391             :     }
     392           0 :     setPattern(rebuiltPat); // reached only when parsing succeeded and the iterator is not inside a variable
     393             : }
     394             : 
     395             : /**
     396             :  * Return true if the given position, in the given pattern, appears
     397             :  * to be the start of a UnicodeSet pattern: either a '[' with at least one more character after it, or anything resemblesPropertyPattern() accepts.
     398             :  */
     399           0 : UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
     400           0 :     return ((pos+1) < pattern.length() && // at least one character must follow pos
     401           0 :             pattern.charAt(pos) == (UChar)91/*[*/) ||
     402           0 :         resemblesPropertyPattern(pattern, pos);
     403             : }
     404             : 
     405             : //----------------------------------------------------------------
     406             : // Implementation: Pattern parsing
     407             : //----------------------------------------------------------------
     408             : 
     409             : /**
     410             :  * A small all-inline class to manage a UnicodeSet pointer.  Add
     411             :  * operator->() etc. as needed.  The held set is created on demand by allocate() and deleted by the destructor.
     412             :  */
     413             : class UnicodeSetPointer {
     414             :     UnicodeSet* p; // owned; 0 until allocate() succeeds
     415             : public:
     416           0 :     inline UnicodeSetPointer() : p(0) {}
     417           0 :     inline ~UnicodeSetPointer() { delete p; }
     418           0 :     inline UnicodeSet* pointer() { return p; } // may return 0 if allocate() has not been called or failed
     419           0 :     inline UBool allocate() { // creates the set on first call; later calls keep the existing one
     420           0 :         if (p == 0) {
     421           0 :             p = new UnicodeSet();
     422             :         }
     423           0 :         return p != 0; // false means the allocation failed
     424             :     }
     425             : };
     426             : 
     427             : /**
     428             :  * Parse the pattern from the given RuleCharacterIterator.  The
     429             :  * iterator is advanced over the parsed pattern.
     430             :  * @param chars iterator over the pattern characters.  Upon return
     431             :  * it will be advanced to the first character after the parsed
     432             :  * pattern, or the end of the iteration if all characters are
     433             :  * parsed.
     434             :  * @param symbols symbol table to use to parse and dereference
     435             :  * variables, or null if none.
     436             :  * @param rebuiltPat the pattern that was parsed, rebuilt or
     437             :  * copied from the input pattern, as appropriate.
     438             :  * @param options a bit mask of zero or more of the following:
     439             :  * IGNORE_SPACE, CASE.
     440             :  */
     441           0 : void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
     442             :                               const SymbolTable* symbols,
     443             :                               UnicodeString& rebuiltPat,
     444             :                               uint32_t options,
     445             :                               UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
     446             :                               UErrorCode& ec) {
     447           0 :     if (U_FAILURE(ec)) return;
     448             : 
     449             :     // Syntax characters: [ ] ^ - & { }
     450             : 
     451             :     // Recognized special forms for chars, sets: c-c s-s s&s
     452             : 
     453             :     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
     454           0 :                    RuleCharacterIterator::PARSE_ESCAPES;
     455           0 :     if ((options & USET_IGNORE_SPACE) != 0) {
     456           0 :         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
     457             :     }
     458             : 
     459           0 :     UnicodeString patLocal, buf;
     460           0 :     UBool usePat = FALSE;
     461           0 :     UnicodeSetPointer scratch;
     462             :     RuleCharacterIterator::Pos backup;
     463             : 
     464             :     // mode: 0=before [, 1=between [...], 2=after ]
     465             :     // lastItem: 0=none, 1=char, 2=set
     466           0 :     int8_t lastItem = 0, mode = 0;
     467           0 :     UChar32 lastChar = 0;
     468           0 :     UChar op = 0;
     469             : 
     470           0 :     UBool invert = FALSE;
     471             : 
     472           0 :     clear();
     473             : 
     474           0 :     while (mode != 2 && !chars.atEnd()) {
     475           0 :         U_ASSERT((lastItem == 0 && op == 0) ||
     476             :                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
     477             :                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
     478           0 :                                     op == INTERSECTION /*'&'*/)));
     479             : 
     480           0 :         UChar32 c = 0;
     481           0 :         UBool literal = FALSE;
     482           0 :         UnicodeSet* nested = 0; // alias - do not delete
     483             : 
     484             :         // -------- Check for property pattern
     485             : 
     486             :         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
     487           0 :         int8_t setMode = 0;
     488           0 :         if (resemblesPropertyPattern(chars, opts)) {
     489           0 :             setMode = 2;
     490             :         }
     491             : 
     492             :         // -------- Parse '[' of opening delimiter OR nested set.
     493             :         // If there is a nested set, use `setMode' to define how
     494             :         // the set should be parsed.  If the '[' is part of the
     495             :         // opening delimiter for this pattern, parse special
     496             :         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
     497             :         // characters representing a nested set in the symbol
     498             :         // table.
     499             : 
     500             :         else {
     501             :             // Prepare to backup if necessary
     502           0 :             chars.getPos(backup);
     503           0 :             c = chars.next(opts, literal, ec);
     504           0 :             if (U_FAILURE(ec)) return;
     505             : 
     506           0 :             if (c == 0x5B /*'['*/ && !literal) {
     507           0 :                 if (mode == 1) {
     508           0 :                     chars.setPos(backup); // backup
     509           0 :                     setMode = 1;
     510             :                 } else {
     511             :                     // Handle opening '[' delimiter
     512           0 :                     mode = 1;
     513           0 :                     patLocal.append((UChar) 0x5B /*'['*/);
     514           0 :                     chars.getPos(backup); // prepare to backup
     515           0 :                     c = chars.next(opts, literal, ec); 
     516           0 :                     if (U_FAILURE(ec)) return;
     517           0 :                     if (c == 0x5E /*'^'*/ && !literal) {
     518           0 :                         invert = TRUE;
     519           0 :                         patLocal.append((UChar) 0x5E /*'^'*/);
     520           0 :                         chars.getPos(backup); // prepare to backup
     521           0 :                         c = chars.next(opts, literal, ec);
     522           0 :                         if (U_FAILURE(ec)) return;
     523             :                     }
     524             :                     // Fall through to handle special leading '-';
     525             :                     // otherwise restart loop for nested [], \p{}, etc.
     526           0 :                     if (c == HYPHEN /*'-'*/) {
     527           0 :                         literal = TRUE;
     528             :                         // Fall through to handle literal '-' below
     529             :                     } else {
     530           0 :                         chars.setPos(backup); // backup
     531           0 :                         continue;
     532             :                     }
     533             :                 }
     534           0 :             } else if (symbols != 0) {
     535           0 :                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
     536           0 :                 if (m != 0) {
     537           0 :                     const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
     538           0 :                     if (ms == NULL) {
     539           0 :                         ec = U_MALFORMED_SET;
     540           0 :                         return;
     541             :                     }
     542             :                     // casting away const, but `nested' won't be modified
     543             :                     // (important not to modify stored set)
     544           0 :                     nested = const_cast<UnicodeSet*>(ms);
     545           0 :                     setMode = 3;
     546             :                 }
     547             :             }
     548             :         }
     549             : 
     550             :         // -------- Handle a nested set.  This either is inline in
     551             :         // the pattern or represented by a stand-in that has
     552             :         // previously been parsed and was looked up in the symbol
     553             :         // table.
     554             : 
     555           0 :         if (setMode != 0) {
     556           0 :             if (lastItem == 1) {
     557           0 :                 if (op != 0) {
     558             :                     // syntaxError(chars, "Char expected after operator");
     559           0 :                     ec = U_MALFORMED_SET;
     560           0 :                     return;
     561             :                 }
     562           0 :                 add(lastChar, lastChar);
     563           0 :                 _appendToPat(patLocal, lastChar, FALSE);
     564           0 :                 lastItem = 0;
     565           0 :                 op = 0;
     566             :             }
     567             : 
     568           0 :             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
     569           0 :                 patLocal.append(op);
     570             :             }
     571             : 
     572           0 :             if (nested == 0) {
     573             :                 // lazy allocation
     574           0 :                 if (!scratch.allocate()) {
     575           0 :                     ec = U_MEMORY_ALLOCATION_ERROR;
     576           0 :                     return;
     577             :                 }
     578           0 :                 nested = scratch.pointer();
     579             :             }
     580           0 :             switch (setMode) {
     581             :             case 1:
     582           0 :                 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
     583           0 :                 break;
     584             :             case 2:
     585           0 :                 chars.skipIgnored(opts);
     586           0 :                 nested->applyPropertyPattern(chars, patLocal, ec);
     587           0 :                 if (U_FAILURE(ec)) return;
     588           0 :                 break;
     589             :             case 3: // `nested' already parsed
     590           0 :                 nested->_toPattern(patLocal, FALSE);
     591           0 :                 break;
     592             :             }
     593             : 
     594           0 :             usePat = TRUE;
     595             : 
     596           0 :             if (mode == 0) {
     597             :                 // Entire pattern is a category; leave parse loop
     598           0 :                 *this = *nested;
     599           0 :                 mode = 2;
     600           0 :                 break;
     601             :             }
     602             : 
     603           0 :             switch (op) {
     604             :             case HYPHEN: /*'-'*/
     605           0 :                 removeAll(*nested);
     606           0 :                 break;
     607             :             case INTERSECTION: /*'&'*/
     608           0 :                 retainAll(*nested);
     609           0 :                 break;
     610             :             case 0:
     611           0 :                 addAll(*nested);
     612           0 :                 break;
     613             :             }
     614             : 
     615           0 :             op = 0;
     616           0 :             lastItem = 2;
     617             : 
     618           0 :             continue;
     619             :         }
     620             : 
     621           0 :         if (mode == 0) {
     622             :             // syntaxError(chars, "Missing '['");
     623           0 :             ec = U_MALFORMED_SET;
     624           0 :             return;
     625             :         }
     626             : 
     627             :         // -------- Parse special (syntax) characters.  If the
     628             :         // current character is not special, or if it is escaped,
     629             :         // then fall through and handle it below.
     630             : 
     631           0 :         if (!literal) {
     632           0 :             switch (c) {
     633             :             case 0x5D /*']'*/:
     634           0 :                 if (lastItem == 1) {
     635           0 :                     add(lastChar, lastChar);
     636           0 :                     _appendToPat(patLocal, lastChar, FALSE);
     637             :                 }
     638             :                 // Treat final trailing '-' as a literal
     639           0 :                 if (op == HYPHEN /*'-'*/) {
     640           0 :                     add(op, op);
     641           0 :                     patLocal.append(op);
     642           0 :                 } else if (op == INTERSECTION /*'&'*/) {
     643             :                     // syntaxError(chars, "Trailing '&'");
     644           0 :                     ec = U_MALFORMED_SET;
     645           0 :                     return;
     646             :                 }
     647           0 :                 patLocal.append((UChar) 0x5D /*']'*/);
     648           0 :                 mode = 2;
     649           0 :                 continue;
     650             :             case HYPHEN /*'-'*/:
     651           0 :                 if (op == 0) {
     652           0 :                     if (lastItem != 0) {
     653           0 :                         op = (UChar) c;
     654           0 :                         continue;
     655             :                     } else {
     656             :                         // Treat final trailing '-' as a literal
     657           0 :                         add(c, c);
     658           0 :                         c = chars.next(opts, literal, ec);
     659           0 :                         if (U_FAILURE(ec)) return;
     660           0 :                         if (c == 0x5D /*']'*/ && !literal) {
     661           0 :                             patLocal.append(HYPHEN_RIGHT_BRACE, 2);
     662           0 :                             mode = 2;
     663           0 :                             continue;
     664             :                         }
     665             :                     }
     666             :                 }
     667             :                 // syntaxError(chars, "'-' not after char or set");
     668           0 :                 ec = U_MALFORMED_SET;
     669           0 :                 return;
     670             :             case INTERSECTION /*'&'*/:
     671           0 :                 if (lastItem == 2 && op == 0) {
     672           0 :                     op = (UChar) c;
     673           0 :                     continue;
     674             :                 }
     675             :                 // syntaxError(chars, "'&' not after set");
     676           0 :                 ec = U_MALFORMED_SET;
     677           0 :                 return;
     678             :             case 0x5E /*'^'*/:
     679             :                 // syntaxError(chars, "'^' not after '['");
     680           0 :                 ec = U_MALFORMED_SET;
     681           0 :                 return;
     682             :             case 0x7B /*'{'*/:
     683           0 :                 if (op != 0) {
     684             :                     // syntaxError(chars, "Missing operand after operator");
     685           0 :                     ec = U_MALFORMED_SET;
     686           0 :                     return;
     687             :                 }
     688           0 :                 if (lastItem == 1) {
     689           0 :                     add(lastChar, lastChar);
     690           0 :                     _appendToPat(patLocal, lastChar, FALSE);
     691             :                 }
     692           0 :                 lastItem = 0;
     693           0 :                 buf.truncate(0);
     694             :                 {
     695           0 :                     UBool ok = FALSE;
     696           0 :                     while (!chars.atEnd()) {
     697           0 :                         c = chars.next(opts, literal, ec);
     698           0 :                         if (U_FAILURE(ec)) return;
     699           0 :                         if (c == 0x7D /*'}'*/ && !literal) {
     700           0 :                             ok = TRUE;
     701           0 :                             break;
     702             :                         }
     703           0 :                         buf.append(c);
     704             :                     }
     705           0 :                     if (buf.length() < 1 || !ok) {
     706             :                         // syntaxError(chars, "Invalid multicharacter string");
     707           0 :                         ec = U_MALFORMED_SET;
     708           0 :                         return;
     709             :                     }
     710             :                 }
     711             :                 // We have new string. Add it to set and continue;
     712             :                 // we don't need to drop through to the further
     713             :                 // processing
     714           0 :                 add(buf);
     715           0 :                 patLocal.append((UChar) 0x7B /*'{'*/);
     716           0 :                 _appendToPat(patLocal, buf, FALSE);
     717           0 :                 patLocal.append((UChar) 0x7D /*'}'*/);
     718           0 :                 continue;
     719             :             case SymbolTable::SYMBOL_REF:
     720             :                 //         symbols  nosymbols
     721             :                 // [a-$]   error    error (ambiguous)
     722             :                 // [a$]    anchor   anchor
     723             :                 // [a-$x]  var "x"* literal '$'
     724             :                 // [a-$.]  error    literal '$'
     725             :                 // *We won't get here in the case of var "x"
     726             :                 {
     727           0 :                     chars.getPos(backup);
     728           0 :                     c = chars.next(opts, literal, ec);
     729           0 :                     if (U_FAILURE(ec)) return;
     730           0 :                     UBool anchor = (c == 0x5D /*']'*/ && !literal);
     731           0 :                     if (symbols == 0 && !anchor) {
     732           0 :                         c = SymbolTable::SYMBOL_REF;
     733           0 :                         chars.setPos(backup);
     734           0 :                         break; // literal '$'
     735             :                     }
     736           0 :                     if (anchor && op == 0) {
     737           0 :                         if (lastItem == 1) {
     738           0 :                             add(lastChar, lastChar);
     739           0 :                             _appendToPat(patLocal, lastChar, FALSE);
     740             :                         }
     741           0 :                         add(U_ETHER);
     742           0 :                         usePat = TRUE;
     743           0 :                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
     744           0 :                         patLocal.append((UChar) 0x5D /*']'*/);
     745           0 :                         mode = 2;
     746           0 :                         continue;
     747             :                     }
     748             :                     // syntaxError(chars, "Unquoted '$'");
     749           0 :                     ec = U_MALFORMED_SET;
     750           0 :                     return;
     751             :                 }
     752             :             default:
     753           0 :                 break;
     754             :             }
     755             :         }
     756             : 
     757             :         // -------- Parse literal characters.  This includes both
     758             :         // escaped chars ("\u4E01") and non-syntax characters
     759             :         // ("a").
     760             : 
     761           0 :         switch (lastItem) {
     762             :         case 0:
     763           0 :             lastItem = 1;
     764           0 :             lastChar = c;
     765           0 :             break;
     766             :         case 1:
     767           0 :             if (op == HYPHEN /*'-'*/) {
     768           0 :                 if (lastChar >= c) {
     769             :                     // Don't allow redundant (a-a) or empty (b-a) ranges;
     770             :                     // these are most likely typos.
     771             :                     // syntaxError(chars, "Invalid range");
     772           0 :                     ec = U_MALFORMED_SET;
     773           0 :                     return;
     774             :                 }
     775           0 :                 add(lastChar, c);
     776           0 :                 _appendToPat(patLocal, lastChar, FALSE);
     777           0 :                 patLocal.append(op);
     778           0 :                 _appendToPat(patLocal, c, FALSE);
     779           0 :                 lastItem = 0;
     780           0 :                 op = 0;
     781             :             } else {
     782           0 :                 add(lastChar, lastChar);
     783           0 :                 _appendToPat(patLocal, lastChar, FALSE);
     784           0 :                 lastChar = c;
     785             :             }
     786           0 :             break;
     787             :         case 2:
     788           0 :             if (op != 0) {
     789             :                 // syntaxError(chars, "Set expected after operator");
     790           0 :                 ec = U_MALFORMED_SET;
     791           0 :                 return;
     792             :             }
     793           0 :             lastChar = c;
     794           0 :             lastItem = 1;
     795           0 :             break;
     796             :         }
     797             :     }
     798             : 
     799           0 :     if (mode != 2) {
     800             :         // syntaxError(chars, "Missing ']'");
     801           0 :         ec = U_MALFORMED_SET;
     802           0 :         return;
     803             :     }
     804             : 
     805           0 :     chars.skipIgnored(opts);
     806             : 
     807             :     /**
     808             :      * Handle global flags (invert, case insensitivity).  If this
     809             :      * pattern should be compiled case-insensitive, then we need
     810             :      * to close over case BEFORE COMPLEMENTING.  This makes
     811             :      * patterns like /[^abc]/i work.
     812             :      */
     813           0 :     if ((options & USET_CASE_INSENSITIVE) != 0) {
     814           0 :         (this->*caseClosure)(USET_CASE_INSENSITIVE);
     815             :     }
     816           0 :     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
     817           0 :         (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
     818             :     }
     819           0 :     if (invert) {
     820           0 :         complement();
     821             :     }
     822             : 
     823             :     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
     824             :     // generated pattern.
     825           0 :     if (usePat) {
     826           0 :         rebuiltPat.append(patLocal);
     827             :     } else {
     828           0 :         _generatePattern(rebuiltPat, FALSE);
     829             :     }
     830           0 :     if (isBogus() && U_SUCCESS(ec)) {
     831             :         // We likely ran out of memory. AHHH!
     832           0 :         ec = U_MEMORY_ALLOCATION_ERROR;
     833             :     }
     834             : }
     835             : 
     836             : //----------------------------------------------------------------
     837             : // Property set implementation
     838             : //----------------------------------------------------------------
     839             : 
     840           0 : static UBool numericValueFilter(UChar32 ch, void* context) {
     841           0 :     return u_getNumericValue(ch) == *(double*)context;
     842             : }
     843             : 
     844           0 : static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
     845           0 :     int32_t value = *(int32_t*)context;
     846           0 :     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
     847             : }
     848             : 
     849           0 : static UBool versionFilter(UChar32 ch, void* context) {
     850             :     static const UVersionInfo none = { 0, 0, 0, 0 };
     851             :     UVersionInfo v;
     852           0 :     u_charAge(ch, v);
     853           0 :     UVersionInfo* version = (UVersionInfo*)context;
     854           0 :     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
     855             : }
     856             : 
     857             : typedef struct {
     858             :     UProperty prop;
     859             :     int32_t value;
     860             : } IntPropertyContext;
     861             : 
     862           0 : static UBool intPropertyFilter(UChar32 ch, void* context) {
     863           0 :     IntPropertyContext* c = (IntPropertyContext*)context;
     864           0 :     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
     865             : }
     866             : 
     867           0 : static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
     868           0 :     return uscript_hasScript(ch, *(UScriptCode*)context);
     869             : }
     870             : 
     871             : /**
     872             :  * Generic filter-based scanning code for UCD property UnicodeSets.
     873             :  */
     874           0 : void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
     875             :                              void* context,
     876             :                              int32_t src,
     877             :                              UErrorCode &status) {
     878           0 :     if (U_FAILURE(status)) return;
     879             : 
     880             :     // Logically, walk through all Unicode characters, noting the start
     881             :     // and end of each range for which filter.contain(c) is
     882             :     // true.  Add each range to a set.
     883             :     //
     884             :     // To improve performance, use an inclusions set which
     885             :     // encodes information about character ranges that are known
     886             :     // to have identical properties.
     887             :     // getInclusions(src) contains exactly the first characters of
     888             :     // same-value ranges for the given properties "source".
     889           0 :     const UnicodeSet* inclusions = getInclusions(src, status);
     890           0 :     if (U_FAILURE(status)) {
     891           0 :         return;
     892             :     }
     893             : 
     894           0 :     clear();
     895             : 
     896           0 :     UChar32 startHasProperty = -1;
     897           0 :     int32_t limitRange = inclusions->getRangeCount();
     898             : 
     899           0 :     for (int j=0; j<limitRange; ++j) {
     900             :         // get current range
     901           0 :         UChar32 start = inclusions->getRangeStart(j);
     902           0 :         UChar32 end = inclusions->getRangeEnd(j);
     903             : 
     904             :         // for all the code points in the range, process
     905           0 :         for (UChar32 ch = start; ch <= end; ++ch) {
     906             :             // only add to this UnicodeSet on inflection points --
     907             :             // where the hasProperty value changes to false
     908           0 :             if ((*filter)(ch, context)) {
     909           0 :                 if (startHasProperty < 0) {
     910           0 :                     startHasProperty = ch;
     911             :                 }
     912           0 :             } else if (startHasProperty >= 0) {
     913           0 :                 add(startHasProperty, ch-1);
     914           0 :                 startHasProperty = -1;
     915             :             }
     916             :         }
     917             :     }
     918           0 :     if (startHasProperty >= 0) {
     919           0 :         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
     920             :     }
     921           0 :     if (isBogus() && U_SUCCESS(status)) {
     922             :         // We likely ran out of memory. AHHH!
     923           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     924             :     }
     925             : }
     926             : 
     927           0 : static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
     928             :     /* Note: we use ' ' in compiler code page */
     929           0 :     int32_t j = 0;
     930             :     char ch;
     931           0 :     --dstCapacity; /* make room for term. zero */
     932           0 :     while ((ch = *src++) != 0) {
     933           0 :         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
     934           0 :             continue;
     935             :         }
     936           0 :         if (j >= dstCapacity) return FALSE;
     937           0 :         dst[j++] = ch;
     938             :     }
     939           0 :     if (j > 0 && dst[j-1] == ' ') --j;
     940           0 :     dst[j] = 0;
     941           0 :     return TRUE;
     942             : }
     943             : 
     944             : //----------------------------------------------------------------
     945             : // Property set API
     946             : //----------------------------------------------------------------
     947             : 
     // Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the
     // enclosing member function.  Wrapped in do { } while (0) so that
     // "FAIL(ec);" is exactly one statement and stays correct inside an
     // unbraced if/else.  Every use must be followed by a semicolon.
     #define FAIL(ec) do { (ec)=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (0)
     949             : 
     950             : UnicodeSet&
     951           0 : UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
     952           0 :     if (U_FAILURE(ec) || isFrozen()) return *this;
     953             : 
     954           0 :     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
     955           0 :         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
     956           0 :     } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
     957           0 :         UScriptCode script = (UScriptCode)value;
     958           0 :         applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
     959             :     } else {
     960           0 :         IntPropertyContext c = {prop, value};
     961           0 :         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
     962             :     }
     963           0 :     return *this;
     964             : }
     965             : 
     966             : UnicodeSet&
     967           0 : UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
     968             :                                const UnicodeString& value,
     969             :                                UErrorCode& ec) {
     970           0 :     if (U_FAILURE(ec) || isFrozen()) return *this;
     971             : 
     972             :     // prop and value used to be converted to char * using the default
     973             :     // converter instead of the invariant conversion.
     974             :     // This should not be necessary because all Unicode property and value
     975             :     // names use only invariant characters.
     976             :     // If there are any variant characters, then we won't find them anyway.
     977             :     // Checking first avoids assertion failures in the conversion.
     978           0 :     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
     979           0 :         !uprv_isInvariantUString(value.getBuffer(), value.length())
     980             :     ) {
     981           0 :         FAIL(ec);
     982             :     }
     983           0 :     CharString pname, vname;
     984           0 :     pname.appendInvariantChars(prop, ec);
     985           0 :     vname.appendInvariantChars(value, ec);
     986           0 :     if (U_FAILURE(ec)) return *this;
     987             : 
     988             :     UProperty p;
     989             :     int32_t v;
     990           0 :     UBool mustNotBeEmpty = FALSE, invert = FALSE;
     991             : 
     992           0 :     if (value.length() > 0) {
     993           0 :         p = u_getPropertyEnum(pname.data());
     994           0 :         if (p == UCHAR_INVALID_CODE) FAIL(ec);
     995             : 
     996             :         // Treat gc as gcm
     997           0 :         if (p == UCHAR_GENERAL_CATEGORY) {
     998           0 :             p = UCHAR_GENERAL_CATEGORY_MASK;
     999             :         }
    1000             : 
    1001           0 :         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
    1002           0 :             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
    1003           0 :             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
    1004           0 :             v = u_getPropertyValueEnum(p, vname.data());
    1005           0 :             if (v == UCHAR_INVALID_CODE) {
    1006             :                 // Handle numeric CCC
    1007           0 :                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
    1008           0 :                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
    1009             :                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
    1010             :                     char* end;
    1011           0 :                     double value = uprv_strtod(vname.data(), &end);
    1012           0 :                     v = (int32_t) value;
    1013           0 :                     if (v != value || v < 0 || *end != 0) {
    1014             :                         // non-integral or negative value, or trailing junk
    1015           0 :                         FAIL(ec);
    1016             :                     }
    1017             :                     // If the resultant set is empty then the numeric value
    1018             :                     // was invalid.
    1019           0 :                     mustNotBeEmpty = TRUE;
    1020             :                 } else {
    1021           0 :                     FAIL(ec);
    1022             :                 }
    1023           0 :             }
    1024             :         }
    1025             : 
    1026             :         else {
    1027             : 
    1028           0 :             switch (p) {
    1029             :             case UCHAR_NUMERIC_VALUE:
    1030             :                 {
    1031             :                     char* end;
    1032           0 :                     double value = uprv_strtod(vname.data(), &end);
    1033           0 :                     if (*end != 0) {
    1034           0 :                         FAIL(ec);
    1035             :                     }
    1036           0 :                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
    1037           0 :                     return *this;
    1038             :                 }
    1039             :             case UCHAR_NAME:
    1040             :                 {
    1041             :                     // Must munge name, since u_charFromName() does not do
    1042             :                     // 'loose' matching.
    1043             :                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
    1044           0 :                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
    1045           0 :                     UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
    1046           0 :                     if (U_SUCCESS(ec)) {
    1047           0 :                         clear();
    1048           0 :                         add(ch);
    1049           0 :                         return *this;
    1050             :                     } else {
    1051           0 :                         FAIL(ec);
    1052             :                     }
    1053             :                 }
    1054             :             case UCHAR_UNICODE_1_NAME:
    1055             :                 // ICU 49 deprecates the Unicode_1_Name property APIs.
    1056           0 :                 FAIL(ec);
    1057             :             case UCHAR_AGE:
    1058             :                 {
    1059             :                     // Must munge name, since u_versionFromString() does not do
    1060             :                     // 'loose' matching.
    1061             :                     char buf[128];
    1062           0 :                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
    1063             :                     UVersionInfo version;
    1064           0 :                     u_versionFromString(version, buf);
    1065           0 :                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
    1066           0 :                     return *this;
    1067             :                 }
    1068             :             case UCHAR_SCRIPT_EXTENSIONS:
    1069           0 :                 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
    1070           0 :                 if (v == UCHAR_INVALID_CODE) {
    1071           0 :                     FAIL(ec);
    1072             :                 }
    1073             :                 // fall through to calling applyIntPropertyValue()
    1074           0 :                 break;
    1075             :             default:
    1076             :                 // p is a non-binary, non-enumerated property that we
    1077             :                 // don't support (yet).
    1078           0 :                 FAIL(ec);
    1079             :             }
    1080             :         }
    1081             :     }
    1082             : 
    1083             :     else {
    1084             :         // value is empty.  Interpret as General Category, Script, or
    1085             :         // Binary property.
    1086           0 :         p = UCHAR_GENERAL_CATEGORY_MASK;
    1087           0 :         v = u_getPropertyValueEnum(p, pname.data());
    1088           0 :         if (v == UCHAR_INVALID_CODE) {
    1089           0 :             p = UCHAR_SCRIPT;
    1090           0 :             v = u_getPropertyValueEnum(p, pname.data());
    1091           0 :             if (v == UCHAR_INVALID_CODE) {
    1092           0 :                 p = u_getPropertyEnum(pname.data());
    1093           0 :                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
    1094           0 :                     v = 1;
    1095           0 :                 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
    1096           0 :                     set(MIN_VALUE, MAX_VALUE);
    1097           0 :                     return *this;
    1098           0 :                 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
    1099           0 :                     set(0, 0x7F);
    1100           0 :                     return *this;
    1101           0 :                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
    1102             :                     // [:Assigned:]=[:^Cn:]
    1103           0 :                     p = UCHAR_GENERAL_CATEGORY_MASK;
    1104           0 :                     v = U_GC_CN_MASK;
    1105           0 :                     invert = TRUE;
    1106             :                 } else {
    1107           0 :                     FAIL(ec);
    1108             :                 }
    1109             :             }
    1110             :         }
    1111             :     }
    1112             : 
    1113           0 :     applyIntPropertyValue(p, v, ec);
    1114           0 :     if(invert) {
    1115           0 :         complement();
    1116             :     }
    1117             : 
    1118           0 :     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
    1119             :         // mustNotBeEmpty is set to true if an empty set indicates
    1120             :         // invalid input.
    1121           0 :         ec = U_ILLEGAL_ARGUMENT_ERROR;
    1122             :     }
    1123             : 
    1124           0 :     if (isBogus() && U_SUCCESS(ec)) {
    1125             :         // We likely ran out of memory. AHHH!
    1126           0 :         ec = U_MEMORY_ALLOCATION_ERROR;
    1127             :     }
    1128           0 :     return *this;
    1129             : }
    1130             : 
    1131             : //----------------------------------------------------------------
    1132             : // Property set patterns
    1133             : //----------------------------------------------------------------
    1134             : 
    1135             : /**
    1136             :  * Return true if the given position, in the given pattern, appears
    1137             :  * to be the start of a property set pattern (opener check only; nothing is parsed).
    1138             :  */
    1139           0 : UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
    1140             :                                            int32_t pos) {
    1141             :     // Patterns are at least 5 characters long (e.g. \p{L}), so fewer remaining characters cannot match
    1142           0 :     if ((pos+5) > pattern.length()) {
    1143           0 :         return FALSE;
    1144             :     }
    1145             : 
    1146             :     // Look for an opening [:, [:^, \p, \P, or (via isNameOpen) \N
    1147           0 :     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
    1148             : }
    1149             : 
    1150             : /**
    1151             :  * Return true if the given iterator appears to point at a
    1152             :  * property pattern.  Regardless of the result, return with the
    1153             :  * iterator unchanged (its position is saved on entry and restored before returning).
    1154             :  * @param chars iterator over the pattern characters.  Upon return
    1155             :  * it will be unchanged.
    1156             :  * @param iterOpts RuleCharacterIterator options; PARSE_ESCAPES is always masked off locally
    1157             :  */
    1158           0 : UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
    1159             :                                            int32_t iterOpts) {
    1160             :     // NOTE: literal will always be FALSE, because we don't parse escapes.
    1161           0 :     UBool result = FALSE, literal;
    1162           0 :     UErrorCode ec = U_ZERO_ERROR; // local status only; errors are folded into the return value
    1163           0 :     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; // read raw characters so a backslash is seen as itself
    1164             :     RuleCharacterIterator::Pos pos;
    1165           0 :     chars.getPos(pos); // remember the starting position for the restore below
    1166           0 :     UChar32 c = chars.next(iterOpts, literal, ec);
    1167           0 :     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
    1168           0 :         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
    1169           0 :                                literal, ec); // second character is read with SKIP_WHITESPACE cleared
    1170           0 :         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
    1171           0 :                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); // '[' needs ':'; a backslash needs 'N', 'p' or 'P'
    1172             :     }
    1173           0 :     chars.setPos(pos); // rewind: this function only peeks
    1174           0 :     return result && U_SUCCESS(ec); // an iterator error counts as "does not resemble"
    1175             : }
    1176             : 
    1177             : /**
    1178             :  * Parse the given property pattern at the given parse position and apply it via applyPropertyAlias; ppos is advanced past the close delimiter only on success.
    1179             :  */
    1180           0 : UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, // text containing the property pattern
    1181             :                                              ParsePosition& ppos, // in: start index; out: index after the pattern (unchanged on failure)
    1182             :                                              UErrorCode &ec) { // in/out status; the function does nothing if it is already a failure
    1183           0 :     int32_t pos = ppos.getIndex();
    1184             : 
    1185           0 :     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
    1186           0 :     UBool isName = FALSE; // true for \N{pat}, o/w false
    1187           0 :     UBool invert = FALSE; // true when the parsed set must be complemented ([:^ or \P)
    1188             : 
    1189           0 :     if (U_FAILURE(ec)) return *this;
    1190             : 
    1191             :     // Minimum length is 5 characters, e.g. \p{L}
    1192           0 :     if ((pos+5) > pattern.length()) {
    1193           0 :         FAIL(ec); // FAIL is a macro defined outside this view; presumably it sets ec and returns *this -- confirm
    1194             :     }
    1195             : 
    1196             :     // On entry, ppos should point at one of the opening delimiters:
    1197             :     // [:, [:^, \p, \P, or \N
    1198           0 :     if (isPOSIXOpen(pattern, pos)) {
    1199           0 :         posix = TRUE;
    1200           0 :         pos += 2; // step over the two-character opener
    1201           0 :         pos = ICU_Utility::skipWhitespace(pattern, pos);
    1202           0 :         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
    1203           0 :             ++pos;
    1204           0 :             invert = TRUE; // the [:^...:] form
    1205             :         }
    1206           0 :     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
    1207           0 :         UChar c = pattern.charAt(pos+1); // the letter following the backslash
    1208           0 :         invert = (c == UPPER_P);
    1209           0 :         isName = (c == UPPER_N);
    1210           0 :         pos += 2;
    1211           0 :         pos = ICU_Utility::skipWhitespace(pattern, pos);
    1212           0 :         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { // pos++ also consumes the brace when present
    1213             :             // Syntax error; "\p", "\P", or "\N" not followed by "{"
    1214           0 :             FAIL(ec);
    1215             :         }
    1216             :     } else {
    1217             :         // Open delimiter not seen
    1218           0 :         FAIL(ec);
    1219             :     }
    1220             : 
    1221             :     // Look for the matching close delimiter, either :] or }
    1222             :     int32_t close;
    1223           0 :     if (posix) {
    1224           0 :       close = pattern.indexOf(POSIX_CLOSE, 2, pos); // search for the 2-unit close string starting at pos
    1225             :     } else {
    1226           0 :       close = pattern.indexOf(CLOSE_BRACE, pos);
    1227             :     }
    1228           0 :     if (close < 0) {
    1229             :         // Syntax error; close delimiter missing
    1230           0 :         FAIL(ec);
    1231             :     }
    1232             : 
    1233             :     // Look for an '=' sign.  If this is present, we will parse a
    1234             :     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
    1235             :     // pattern.
    1236           0 :     int32_t equals = pattern.indexOf(EQUALS, pos);
    1237           0 :     UnicodeString propName, valueName;
    1238           0 :     if (equals >= 0 && equals < close && !isName) { // an '=' past the close delimiter, or inside \N{...}, is not a separator
    1239             :         // Equals seen; parse medium/long pattern
    1240           0 :         pattern.extractBetween(pos, equals, propName);
    1241           0 :         pattern.extractBetween(equals+1, close, valueName);
    1242             :     }
    1243             : 
    1244             :     else {
    1245             :         // Handle case where no '=' is seen, and \N{}
    1246           0 :         pattern.extractBetween(pos, close, propName); // valueName stays empty unless this is \N{...}
    1247             :             
    1248             :         // Handle \N{name}
    1249           0 :         if (isName) {
    1250             :             // This is a little inefficient since it means we have to
    1251             :             // parse NAME_PROP back to UCHAR_NAME even though we already
    1252             :             // know it's UCHAR_NAME.  If we refactor the API to
    1253             :             // support args of (UProperty, char*) then we can remove
    1254             :             // NAME_PROP and make this a little more efficient.
    1255           0 :             valueName = propName;
    1256           0 :             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
    1257             :         }
    1258             :     }
    1259             : 
    1260           0 :     applyPropertyAlias(propName, valueName, ec); // reports failures through ec
    1261             : 
    1262           0 :     if (U_SUCCESS(ec)) {
    1263           0 :         if (invert) {
    1264           0 :             complement();
    1265             :         }
    1266             :             
    1267             :         // Move to the limit position after the close delimiter if the
    1268             :         // parse succeeded.
    1269           0 :         ppos.setIndex(close + (posix ? 2 : 1)); // the POSIX close is two units long, the brace is one
    1270             :     }
    1271             : 
    1272           0 :     return *this;
    1273             : }
    1274             : 
    1275             : /**
    1276             :  * Parse a property pattern by running the string-based overload on the iterator's lookahead text.
    1277             :  * @param chars iterator over the pattern characters.  Upon return
    1278             :  * it will be advanced to the first character after the parsed
    1279             :  * pattern, or the end of the iteration if all characters are
    1280             :  * parsed.
    1281             :  * @param rebuiltPat the pattern that was parsed, rebuilt or
    1282             :  * copied from the input pattern, as appropriate.
    1283             :  */
    1284           0 : void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
    1285             :                                       UnicodeString& rebuiltPat,
    1286             :                                       UErrorCode& ec) { // in/out status; the function does nothing if it is already a failure
    1287           0 :     if (U_FAILURE(ec)) return;
    1288           0 :     UnicodeString pattern;
    1289           0 :     chars.lookahead(pattern); // fetch the iterator's lookahead text into a plain string
    1290           0 :     ParsePosition pos(0); // parse from the start of that lookahead text
    1291           0 :     applyPropertyPattern(pattern, pos, ec);
    1292           0 :     if (U_FAILURE(ec)) return; // on failure neither chars nor rebuiltPat is modified here
    1293           0 :     if (pos.getIndex() == 0) { // success reported but nothing consumed: treat as malformed
    1294             :         // syntaxError(chars, "Invalid property pattern");
    1295           0 :         ec = U_MALFORMED_SET;
    1296           0 :         return;
    1297             :     }
    1298           0 :     chars.jumpahead(pos.getIndex()); // advance the iterator by the number of units consumed
    1299           0 :     rebuiltPat.append(pattern, 0, pos.getIndex()); // append exactly the consumed text
    1300             : }
    1301             : 
    1302             : U_NAMESPACE_END

Generated by: LCOV version 1.13