Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : *
6 : * Copyright (C) 1999-2014, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *******************************************************************************
10 : * file name: uniset_props.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2004aug25
16 : * created by: Markus W. Scherer
17 : *
18 : * Character property dependent functions moved here from uniset.cpp
19 : */
20 :
21 : #include "unicode/utypes.h"
22 : #include "unicode/uniset.h"
23 : #include "unicode/parsepos.h"
24 : #include "unicode/uchar.h"
25 : #include "unicode/uscript.h"
26 : #include "unicode/symtable.h"
27 : #include "unicode/uset.h"
28 : #include "unicode/locid.h"
29 : #include "unicode/brkiter.h"
30 : #include "uset_imp.h"
31 : #include "ruleiter.h"
32 : #include "cmemory.h"
33 : #include "ucln_cmn.h"
34 : #include "util.h"
35 : #include "uvector.h"
36 : #include "uprops.h"
37 : #include "propname.h"
38 : #include "normalizer2impl.h"
39 : #include "ucase.h"
40 : #include "ubidi_props.h"
41 : #include "uinvchar.h"
42 : #include "uprops.h"
43 : #include "charstr.h"
44 : #include "cstring.h"
45 : #include "mutex.h"
46 : #include "umutex.h"
47 : #include "uassert.h"
48 : #include "hash.h"
49 :
50 : U_NAMESPACE_USE
51 :
52 : // initial storage. Must be >= 0
53 : // *** same as in uniset.cpp ! ***
54 : #define START_EXTRA 16
55 :
56 : // Define UChar constants using hex for EBCDIC compatibility
57 : // Used #define to reduce private static exports and memory access time.
58 : #define SET_OPEN ((UChar)0x005B) /*[*/
59 : #define SET_CLOSE ((UChar)0x005D) /*]*/
60 : #define HYPHEN ((UChar)0x002D) /*-*/
61 : #define COMPLEMENT ((UChar)0x005E) /*^*/
62 : #define COLON ((UChar)0x003A) /*:*/
63 : #define BACKSLASH ((UChar)0x005C) /*\*/
64 : #define INTERSECTION ((UChar)0x0026) /*&*/
65 : #define UPPER_U ((UChar)0x0055) /*U*/
66 : #define LOWER_U ((UChar)0x0075) /*u*/
67 : #define OPEN_BRACE ((UChar)123) /*{*/
68 : #define CLOSE_BRACE ((UChar)125) /*}*/
69 : #define UPPER_P ((UChar)0x0050) /*P*/
70 : #define LOWER_P ((UChar)0x0070) /*p*/
71 : #define UPPER_N ((UChar)78) /*N*/
72 : #define EQUALS ((UChar)0x003D) /*=*/
73 :
74 : //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
75 : static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
76 : //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
77 : //static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
78 : //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
79 : static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
80 :
81 : // Special property set IDs
82 : static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
83 : static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
84 : static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
85 :
86 : // Unicode name property alias
87 : #define NAME_PROP "na"
88 : #define NAME_PROP_LENGTH 2
89 :
90 : /**
91 : * Delimiter string used in patterns to close a category reference:
92 : * ":]". Example: "[:Lu:]".
93 : */
94 : //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
95 :
96 : // Cached sets ------------------------------------------------------------- ***
97 :
98 : U_CDECL_BEGIN
99 : static UBool U_CALLCONV uset_cleanup();
100 :
101 : struct Inclusion {
102 : UnicodeSet *fSet;
103 : UInitOnce fInitOnce;
104 : };
105 : static Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions()
106 :
107 : static UnicodeSet *uni32Singleton;
108 : static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
109 :
110 : //----------------------------------------------------------------
111 : // Inclusions list
112 : //----------------------------------------------------------------
113 :
114 : // USetAdder implementation
115 : // Does not use uset.h to reduce code dependencies
116 : static void U_CALLCONV
117 0 : _set_add(USet *set, UChar32 c) {
118 0 : ((UnicodeSet *)set)->add(c);
119 0 : }
120 :
121 : static void U_CALLCONV
122 0 : _set_addRange(USet *set, UChar32 start, UChar32 end) {
123 0 : ((UnicodeSet *)set)->add(start, end);
124 0 : }
125 :
126 : static void U_CALLCONV
127 0 : _set_addString(USet *set, const UChar *str, int32_t length) {
128 0 : ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
129 0 : }
130 :
131 : /**
132 : * Cleanup function for UnicodeSet
133 : */
134 0 : static UBool U_CALLCONV uset_cleanup(void) {
135 0 : for(int32_t i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
136 0 : Inclusion &in = gInclusions[i];
137 0 : delete in.fSet;
138 0 : in.fSet = NULL;
139 0 : in.fInitOnce.reset();
140 : }
141 :
142 0 : delete uni32Singleton;
143 0 : uni32Singleton = NULL;
144 0 : uni32InitOnce.reset();
145 0 : return TRUE;
146 : }
147 :
148 : U_CDECL_END
149 :
150 : U_NAMESPACE_BEGIN
151 :
152 : /*
153 : Reduce excessive reallocation, and make it easier to detect initialization problems.
154 : Usually you don't see smaller sets than this for Unicode 5.0.
155 : */
156 : #define DEFAULT_INCLUSION_CAPACITY 3072
157 :
158 0 : void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
159 : // This function is invoked only via umtx_initOnce().
160 : // This function is a friend of class UnicodeSet.
161 :
162 0 : U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
163 0 : UnicodeSet * &incl = gInclusions[src].fSet;
164 0 : U_ASSERT(incl == NULL);
165 :
166 0 : incl = new UnicodeSet();
167 0 : if (incl == NULL) {
168 0 : status = U_MEMORY_ALLOCATION_ERROR;
169 0 : return;
170 : }
171 : USetAdder sa = {
172 0 : (USet *)incl,
173 : _set_add,
174 : _set_addRange,
175 : _set_addString,
176 : NULL, // don't need remove()
177 : NULL // don't need removeRange()
178 0 : };
179 :
180 0 : incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
181 0 : switch(src) {
182 : case UPROPS_SRC_CHAR:
183 0 : uchar_addPropertyStarts(&sa, &status);
184 0 : break;
185 : case UPROPS_SRC_PROPSVEC:
186 0 : upropsvec_addPropertyStarts(&sa, &status);
187 0 : break;
188 : case UPROPS_SRC_CHAR_AND_PROPSVEC:
189 0 : uchar_addPropertyStarts(&sa, &status);
190 0 : upropsvec_addPropertyStarts(&sa, &status);
191 0 : break;
192 : #if !UCONFIG_NO_NORMALIZATION
193 : case UPROPS_SRC_CASE_AND_NORM: {
194 0 : const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
195 0 : if(U_SUCCESS(status)) {
196 0 : impl->addPropertyStarts(&sa, status);
197 : }
198 0 : ucase_addPropertyStarts(&sa, &status);
199 0 : break;
200 : }
201 : case UPROPS_SRC_NFC: {
202 0 : const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
203 0 : if(U_SUCCESS(status)) {
204 0 : impl->addPropertyStarts(&sa, status);
205 : }
206 0 : break;
207 : }
208 : case UPROPS_SRC_NFKC: {
209 0 : const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
210 0 : if(U_SUCCESS(status)) {
211 0 : impl->addPropertyStarts(&sa, status);
212 : }
213 0 : break;
214 : }
215 : case UPROPS_SRC_NFKC_CF: {
216 0 : const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
217 0 : if(U_SUCCESS(status)) {
218 0 : impl->addPropertyStarts(&sa, status);
219 : }
220 0 : break;
221 : }
222 : case UPROPS_SRC_NFC_CANON_ITER: {
223 0 : const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
224 0 : if(U_SUCCESS(status)) {
225 0 : impl->addCanonIterPropertyStarts(&sa, status);
226 : }
227 0 : break;
228 : }
229 : #endif
230 : case UPROPS_SRC_CASE:
231 0 : ucase_addPropertyStarts(&sa, &status);
232 0 : break;
233 : case UPROPS_SRC_BIDI:
234 0 : ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
235 0 : break;
236 : default:
237 0 : status = U_INTERNAL_PROGRAM_ERROR;
238 0 : break;
239 : }
240 :
241 0 : if (U_FAILURE(status)) {
242 0 : delete incl;
243 0 : incl = NULL;
244 0 : return;
245 : }
246 : // Compact for caching
247 0 : incl->compact();
248 0 : ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
249 : }
250 :
251 :
252 :
253 0 : const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
254 0 : U_ASSERT(src >=0 && src<UPROPS_SRC_COUNT);
255 0 : Inclusion &i = gInclusions[src];
256 0 : umtx_initOnce(i.fInitOnce, &UnicodeSet_initInclusion, src, status);
257 0 : return i.fSet;
258 : }
259 :
260 :
261 : // Cache some sets for other services -------------------------------------- ***
262 0 : void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
263 0 : U_ASSERT(uni32Singleton == NULL);
264 0 : uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
265 0 : if(uni32Singleton==NULL) {
266 0 : errorCode=U_MEMORY_ALLOCATION_ERROR;
267 : } else {
268 0 : uni32Singleton->freeze();
269 : }
270 0 : ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
271 0 : }
272 :
273 :
274 : U_CFUNC UnicodeSet *
275 0 : uniset_getUnicode32Instance(UErrorCode &errorCode) {
276 0 : umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
277 0 : return uni32Singleton;
278 : }
279 :
280 : // helper functions for matching of pattern syntax pieces ------------------ ***
281 : // these functions are parallel to the PERL_OPEN etc. strings above
282 :
283 : // using these functions is not only faster than UnicodeString::compare() and
284 : // caseCompare(), but they also make UnicodeSet work for simple patterns when
285 : // no Unicode properties data is available - when caseCompare() fails
286 :
287 : static inline UBool
288 0 : isPerlOpen(const UnicodeString &pattern, int32_t pos) {
289 : UChar c;
290 0 : return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
291 : }
292 :
293 : /*static inline UBool
294 : isPerlClose(const UnicodeString &pattern, int32_t pos) {
295 : return pattern.charAt(pos)==CLOSE_BRACE;
296 : }*/
297 :
298 : static inline UBool
299 0 : isNameOpen(const UnicodeString &pattern, int32_t pos) {
300 0 : return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
301 : }
302 :
303 : static inline UBool
304 0 : isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
305 0 : return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
306 : }
307 :
308 : /*static inline UBool
309 : isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
310 : return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
311 : }*/
312 :
313 : // TODO memory debugging provided inside uniset.cpp
314 : // could be made available here but probably obsolete with use of modern
315 : // memory leak checker tools
316 : #define _dbgct(me)
317 :
318 : //----------------------------------------------------------------
319 : // Constructors &c
320 : //----------------------------------------------------------------
321 :
322 : /**
323 : * Constructs a set from the given pattern, optionally ignoring
324 : * white space. See the class description for the syntax of the
325 : * pattern language.
326 : * @param pattern a string specifying what characters are in the set
327 : */
328 0 : UnicodeSet::UnicodeSet(const UnicodeString& pattern,
329 0 : UErrorCode& status) :
330 : len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
331 : bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
332 0 : fFlags(0)
333 : {
334 0 : if(U_SUCCESS(status)){
335 0 : list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
336 : /* test for NULL */
337 0 : if(list == NULL) {
338 0 : status = U_MEMORY_ALLOCATION_ERROR;
339 : }else{
340 0 : allocateStrings(status);
341 0 : applyPattern(pattern, status);
342 : }
343 : }
344 : _dbgct(this);
345 0 : }
346 :
347 : //----------------------------------------------------------------
348 : // Public API
349 : //----------------------------------------------------------------
350 :
351 0 : UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
352 : UErrorCode& status) {
353 : // Equivalent to
354 : // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
355 : // but without dependency on closeOver().
356 0 : ParsePosition pos(0);
357 0 : applyPatternIgnoreSpace(pattern, pos, NULL, status);
358 0 : if (U_FAILURE(status)) return *this;
359 :
360 0 : int32_t i = pos.getIndex();
361 : // Skip over trailing whitespace
362 0 : ICU_Utility::skipWhitespace(pattern, i, TRUE);
363 0 : if (i != pattern.length()) {
364 0 : status = U_ILLEGAL_ARGUMENT_ERROR;
365 : }
366 0 : return *this;
367 : }
368 :
369 : void
370 0 : UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
371 : ParsePosition& pos,
372 : const SymbolTable* symbols,
373 : UErrorCode& status) {
374 0 : if (U_FAILURE(status)) {
375 0 : return;
376 : }
377 0 : if (isFrozen()) {
378 0 : status = U_NO_WRITE_PERMISSION;
379 0 : return;
380 : }
381 : // Need to build the pattern in a temporary string because
382 : // _applyPattern calls add() etc., which set pat to empty.
383 0 : UnicodeString rebuiltPat;
384 0 : RuleCharacterIterator chars(pattern, symbols, pos);
385 0 : applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status);
386 0 : if (U_FAILURE(status)) return;
387 0 : if (chars.inVariable()) {
388 : // syntaxError(chars, "Extra chars in variable value");
389 0 : status = U_MALFORMED_SET;
390 0 : return;
391 : }
392 0 : setPattern(rebuiltPat);
393 : }
394 :
395 : /**
396 : * Return true if the given position, in the given pattern, appears
397 : * to be the start of a UnicodeSet pattern.
398 : */
399 0 : UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
400 0 : return ((pos+1) < pattern.length() &&
401 0 : pattern.charAt(pos) == (UChar)91/*[*/) ||
402 0 : resemblesPropertyPattern(pattern, pos);
403 : }
404 :
405 : //----------------------------------------------------------------
406 : // Implementation: Pattern parsing
407 : //----------------------------------------------------------------
408 :
409 : /**
410 : * A small all-inline class to manage a UnicodeSet pointer. Add
411 : * operator->() etc. as needed.
412 : */
413 : class UnicodeSetPointer {
414 : UnicodeSet* p;
415 : public:
416 0 : inline UnicodeSetPointer() : p(0) {}
417 0 : inline ~UnicodeSetPointer() { delete p; }
418 0 : inline UnicodeSet* pointer() { return p; }
419 0 : inline UBool allocate() {
420 0 : if (p == 0) {
421 0 : p = new UnicodeSet();
422 : }
423 0 : return p != 0;
424 : }
425 : };
426 :
427 : /**
428 : * Parse the pattern from the given RuleCharacterIterator. The
429 : * iterator is advanced over the parsed pattern.
430 : * @param chars iterator over the pattern characters. Upon return
431 : * it will be advanced to the first character after the parsed
432 : * pattern, or the end of the iteration if all characters are
433 : * parsed.
434 : * @param symbols symbol table to use to parse and dereference
435 : * variables, or null if none.
436 : * @param rebuiltPat the pattern that was parsed, rebuilt or
437 : * copied from the input pattern, as appropriate.
438 : * @param options a bit mask of zero or more of the following:
439 : * IGNORE_SPACE, CASE.
440 : */
441 0 : void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
442 : const SymbolTable* symbols,
443 : UnicodeString& rebuiltPat,
444 : uint32_t options,
445 : UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
446 : UErrorCode& ec) {
447 0 : if (U_FAILURE(ec)) return;
448 :
449 : // Syntax characters: [ ] ^ - & { }
450 :
451 : // Recognized special forms for chars, sets: c-c s-s s&s
452 :
453 : int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
454 0 : RuleCharacterIterator::PARSE_ESCAPES;
455 0 : if ((options & USET_IGNORE_SPACE) != 0) {
456 0 : opts |= RuleCharacterIterator::SKIP_WHITESPACE;
457 : }
458 :
459 0 : UnicodeString patLocal, buf;
460 0 : UBool usePat = FALSE;
461 0 : UnicodeSetPointer scratch;
462 : RuleCharacterIterator::Pos backup;
463 :
464 : // mode: 0=before [, 1=between [...], 2=after ]
465 : // lastItem: 0=none, 1=char, 2=set
466 0 : int8_t lastItem = 0, mode = 0;
467 0 : UChar32 lastChar = 0;
468 0 : UChar op = 0;
469 :
470 0 : UBool invert = FALSE;
471 :
472 0 : clear();
473 :
474 0 : while (mode != 2 && !chars.atEnd()) {
475 0 : U_ASSERT((lastItem == 0 && op == 0) ||
476 : (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
477 : (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
478 0 : op == INTERSECTION /*'&'*/)));
479 :
480 0 : UChar32 c = 0;
481 0 : UBool literal = FALSE;
482 0 : UnicodeSet* nested = 0; // alias - do not delete
483 :
484 : // -------- Check for property pattern
485 :
486 : // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
487 0 : int8_t setMode = 0;
488 0 : if (resemblesPropertyPattern(chars, opts)) {
489 0 : setMode = 2;
490 : }
491 :
492 : // -------- Parse '[' of opening delimiter OR nested set.
493 : // If there is a nested set, use `setMode' to define how
494 : // the set should be parsed. If the '[' is part of the
495 : // opening delimiter for this pattern, parse special
496 : // strings "[", "[^", "[-", and "[^-". Check for stand-in
497 : // characters representing a nested set in the symbol
498 : // table.
499 :
500 : else {
501 : // Prepare to backup if necessary
502 0 : chars.getPos(backup);
503 0 : c = chars.next(opts, literal, ec);
504 0 : if (U_FAILURE(ec)) return;
505 :
506 0 : if (c == 0x5B /*'['*/ && !literal) {
507 0 : if (mode == 1) {
508 0 : chars.setPos(backup); // backup
509 0 : setMode = 1;
510 : } else {
511 : // Handle opening '[' delimiter
512 0 : mode = 1;
513 0 : patLocal.append((UChar) 0x5B /*'['*/);
514 0 : chars.getPos(backup); // prepare to backup
515 0 : c = chars.next(opts, literal, ec);
516 0 : if (U_FAILURE(ec)) return;
517 0 : if (c == 0x5E /*'^'*/ && !literal) {
518 0 : invert = TRUE;
519 0 : patLocal.append((UChar) 0x5E /*'^'*/);
520 0 : chars.getPos(backup); // prepare to backup
521 0 : c = chars.next(opts, literal, ec);
522 0 : if (U_FAILURE(ec)) return;
523 : }
524 : // Fall through to handle special leading '-';
525 : // otherwise restart loop for nested [], \p{}, etc.
526 0 : if (c == HYPHEN /*'-'*/) {
527 0 : literal = TRUE;
528 : // Fall through to handle literal '-' below
529 : } else {
530 0 : chars.setPos(backup); // backup
531 0 : continue;
532 : }
533 : }
534 0 : } else if (symbols != 0) {
535 0 : const UnicodeFunctor *m = symbols->lookupMatcher(c);
536 0 : if (m != 0) {
537 0 : const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
538 0 : if (ms == NULL) {
539 0 : ec = U_MALFORMED_SET;
540 0 : return;
541 : }
542 : // casting away const, but `nested' won't be modified
543 : // (important not to modify stored set)
544 0 : nested = const_cast<UnicodeSet*>(ms);
545 0 : setMode = 3;
546 : }
547 : }
548 : }
549 :
550 : // -------- Handle a nested set. This either is inline in
551 : // the pattern or represented by a stand-in that has
552 : // previously been parsed and was looked up in the symbol
553 : // table.
554 :
555 0 : if (setMode != 0) {
556 0 : if (lastItem == 1) {
557 0 : if (op != 0) {
558 : // syntaxError(chars, "Char expected after operator");
559 0 : ec = U_MALFORMED_SET;
560 0 : return;
561 : }
562 0 : add(lastChar, lastChar);
563 0 : _appendToPat(patLocal, lastChar, FALSE);
564 0 : lastItem = 0;
565 0 : op = 0;
566 : }
567 :
568 0 : if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
569 0 : patLocal.append(op);
570 : }
571 :
572 0 : if (nested == 0) {
573 : // lazy allocation
574 0 : if (!scratch.allocate()) {
575 0 : ec = U_MEMORY_ALLOCATION_ERROR;
576 0 : return;
577 : }
578 0 : nested = scratch.pointer();
579 : }
580 0 : switch (setMode) {
581 : case 1:
582 0 : nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec);
583 0 : break;
584 : case 2:
585 0 : chars.skipIgnored(opts);
586 0 : nested->applyPropertyPattern(chars, patLocal, ec);
587 0 : if (U_FAILURE(ec)) return;
588 0 : break;
589 : case 3: // `nested' already parsed
590 0 : nested->_toPattern(patLocal, FALSE);
591 0 : break;
592 : }
593 :
594 0 : usePat = TRUE;
595 :
596 0 : if (mode == 0) {
597 : // Entire pattern is a category; leave parse loop
598 0 : *this = *nested;
599 0 : mode = 2;
600 0 : break;
601 : }
602 :
603 0 : switch (op) {
604 : case HYPHEN: /*'-'*/
605 0 : removeAll(*nested);
606 0 : break;
607 : case INTERSECTION: /*'&'*/
608 0 : retainAll(*nested);
609 0 : break;
610 : case 0:
611 0 : addAll(*nested);
612 0 : break;
613 : }
614 :
615 0 : op = 0;
616 0 : lastItem = 2;
617 :
618 0 : continue;
619 : }
620 :
621 0 : if (mode == 0) {
622 : // syntaxError(chars, "Missing '['");
623 0 : ec = U_MALFORMED_SET;
624 0 : return;
625 : }
626 :
627 : // -------- Parse special (syntax) characters. If the
628 : // current character is not special, or if it is escaped,
629 : // then fall through and handle it below.
630 :
631 0 : if (!literal) {
632 0 : switch (c) {
633 : case 0x5D /*']'*/:
634 0 : if (lastItem == 1) {
635 0 : add(lastChar, lastChar);
636 0 : _appendToPat(patLocal, lastChar, FALSE);
637 : }
638 : // Treat final trailing '-' as a literal
639 0 : if (op == HYPHEN /*'-'*/) {
640 0 : add(op, op);
641 0 : patLocal.append(op);
642 0 : } else if (op == INTERSECTION /*'&'*/) {
643 : // syntaxError(chars, "Trailing '&'");
644 0 : ec = U_MALFORMED_SET;
645 0 : return;
646 : }
647 0 : patLocal.append((UChar) 0x5D /*']'*/);
648 0 : mode = 2;
649 0 : continue;
650 : case HYPHEN /*'-'*/:
651 0 : if (op == 0) {
652 0 : if (lastItem != 0) {
653 0 : op = (UChar) c;
654 0 : continue;
655 : } else {
656 : // Treat final trailing '-' as a literal
657 0 : add(c, c);
658 0 : c = chars.next(opts, literal, ec);
659 0 : if (U_FAILURE(ec)) return;
660 0 : if (c == 0x5D /*']'*/ && !literal) {
661 0 : patLocal.append(HYPHEN_RIGHT_BRACE, 2);
662 0 : mode = 2;
663 0 : continue;
664 : }
665 : }
666 : }
667 : // syntaxError(chars, "'-' not after char or set");
668 0 : ec = U_MALFORMED_SET;
669 0 : return;
670 : case INTERSECTION /*'&'*/:
671 0 : if (lastItem == 2 && op == 0) {
672 0 : op = (UChar) c;
673 0 : continue;
674 : }
675 : // syntaxError(chars, "'&' not after set");
676 0 : ec = U_MALFORMED_SET;
677 0 : return;
678 : case 0x5E /*'^'*/:
679 : // syntaxError(chars, "'^' not after '['");
680 0 : ec = U_MALFORMED_SET;
681 0 : return;
682 : case 0x7B /*'{'*/:
683 0 : if (op != 0) {
684 : // syntaxError(chars, "Missing operand after operator");
685 0 : ec = U_MALFORMED_SET;
686 0 : return;
687 : }
688 0 : if (lastItem == 1) {
689 0 : add(lastChar, lastChar);
690 0 : _appendToPat(patLocal, lastChar, FALSE);
691 : }
692 0 : lastItem = 0;
693 0 : buf.truncate(0);
694 : {
695 0 : UBool ok = FALSE;
696 0 : while (!chars.atEnd()) {
697 0 : c = chars.next(opts, literal, ec);
698 0 : if (U_FAILURE(ec)) return;
699 0 : if (c == 0x7D /*'}'*/ && !literal) {
700 0 : ok = TRUE;
701 0 : break;
702 : }
703 0 : buf.append(c);
704 : }
705 0 : if (buf.length() < 1 || !ok) {
706 : // syntaxError(chars, "Invalid multicharacter string");
707 0 : ec = U_MALFORMED_SET;
708 0 : return;
709 : }
710 : }
711 : // We have new string. Add it to set and continue;
712 : // we don't need to drop through to the further
713 : // processing
714 0 : add(buf);
715 0 : patLocal.append((UChar) 0x7B /*'{'*/);
716 0 : _appendToPat(patLocal, buf, FALSE);
717 0 : patLocal.append((UChar) 0x7D /*'}'*/);
718 0 : continue;
719 : case SymbolTable::SYMBOL_REF:
720 : // symbols nosymbols
721 : // [a-$] error error (ambiguous)
722 : // [a$] anchor anchor
723 : // [a-$x] var "x"* literal '$'
724 : // [a-$.] error literal '$'
725 : // *We won't get here in the case of var "x"
726 : {
727 0 : chars.getPos(backup);
728 0 : c = chars.next(opts, literal, ec);
729 0 : if (U_FAILURE(ec)) return;
730 0 : UBool anchor = (c == 0x5D /*']'*/ && !literal);
731 0 : if (symbols == 0 && !anchor) {
732 0 : c = SymbolTable::SYMBOL_REF;
733 0 : chars.setPos(backup);
734 0 : break; // literal '$'
735 : }
736 0 : if (anchor && op == 0) {
737 0 : if (lastItem == 1) {
738 0 : add(lastChar, lastChar);
739 0 : _appendToPat(patLocal, lastChar, FALSE);
740 : }
741 0 : add(U_ETHER);
742 0 : usePat = TRUE;
743 0 : patLocal.append((UChar) SymbolTable::SYMBOL_REF);
744 0 : patLocal.append((UChar) 0x5D /*']'*/);
745 0 : mode = 2;
746 0 : continue;
747 : }
748 : // syntaxError(chars, "Unquoted '$'");
749 0 : ec = U_MALFORMED_SET;
750 0 : return;
751 : }
752 : default:
753 0 : break;
754 : }
755 : }
756 :
757 : // -------- Parse literal characters. This includes both
758 : // escaped chars ("\u4E01") and non-syntax characters
759 : // ("a").
760 :
761 0 : switch (lastItem) {
762 : case 0:
763 0 : lastItem = 1;
764 0 : lastChar = c;
765 0 : break;
766 : case 1:
767 0 : if (op == HYPHEN /*'-'*/) {
768 0 : if (lastChar >= c) {
769 : // Don't allow redundant (a-a) or empty (b-a) ranges;
770 : // these are most likely typos.
771 : // syntaxError(chars, "Invalid range");
772 0 : ec = U_MALFORMED_SET;
773 0 : return;
774 : }
775 0 : add(lastChar, c);
776 0 : _appendToPat(patLocal, lastChar, FALSE);
777 0 : patLocal.append(op);
778 0 : _appendToPat(patLocal, c, FALSE);
779 0 : lastItem = 0;
780 0 : op = 0;
781 : } else {
782 0 : add(lastChar, lastChar);
783 0 : _appendToPat(patLocal, lastChar, FALSE);
784 0 : lastChar = c;
785 : }
786 0 : break;
787 : case 2:
788 0 : if (op != 0) {
789 : // syntaxError(chars, "Set expected after operator");
790 0 : ec = U_MALFORMED_SET;
791 0 : return;
792 : }
793 0 : lastChar = c;
794 0 : lastItem = 1;
795 0 : break;
796 : }
797 : }
798 :
799 0 : if (mode != 2) {
800 : // syntaxError(chars, "Missing ']'");
801 0 : ec = U_MALFORMED_SET;
802 0 : return;
803 : }
804 :
805 0 : chars.skipIgnored(opts);
806 :
807 : /**
808 : * Handle global flags (invert, case insensitivity). If this
809 : * pattern should be compiled case-insensitive, then we need
810 : * to close over case BEFORE COMPLEMENTING. This makes
811 : * patterns like /[^abc]/i work.
812 : */
813 0 : if ((options & USET_CASE_INSENSITIVE) != 0) {
814 0 : (this->*caseClosure)(USET_CASE_INSENSITIVE);
815 : }
816 0 : else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
817 0 : (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
818 : }
819 0 : if (invert) {
820 0 : complement();
821 : }
822 :
823 : // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
824 : // generated pattern.
825 0 : if (usePat) {
826 0 : rebuiltPat.append(patLocal);
827 : } else {
828 0 : _generatePattern(rebuiltPat, FALSE);
829 : }
830 0 : if (isBogus() && U_SUCCESS(ec)) {
831 : // We likely ran out of memory. AHHH!
832 0 : ec = U_MEMORY_ALLOCATION_ERROR;
833 : }
834 : }
835 :
836 : //----------------------------------------------------------------
837 : // Property set implementation
838 : //----------------------------------------------------------------
839 :
840 0 : static UBool numericValueFilter(UChar32 ch, void* context) {
841 0 : return u_getNumericValue(ch) == *(double*)context;
842 : }
843 :
844 0 : static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
845 0 : int32_t value = *(int32_t*)context;
846 0 : return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
847 : }
848 :
849 0 : static UBool versionFilter(UChar32 ch, void* context) {
850 : static const UVersionInfo none = { 0, 0, 0, 0 };
851 : UVersionInfo v;
852 0 : u_charAge(ch, v);
853 0 : UVersionInfo* version = (UVersionInfo*)context;
854 0 : return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
855 : }
856 :
857 : typedef struct {
858 : UProperty prop;
859 : int32_t value;
860 : } IntPropertyContext;
861 :
862 0 : static UBool intPropertyFilter(UChar32 ch, void* context) {
863 0 : IntPropertyContext* c = (IntPropertyContext*)context;
864 0 : return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
865 : }
866 :
867 0 : static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
868 0 : return uscript_hasScript(ch, *(UScriptCode*)context);
869 : }
870 :
871 : /**
872 : * Generic filter-based scanning code for UCD property UnicodeSets.
873 : */
874 0 : void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
875 : void* context,
876 : int32_t src,
877 : UErrorCode &status) {
878 0 : if (U_FAILURE(status)) return;
879 :
880 : // Logically, walk through all Unicode characters, noting the start
881 : // and end of each range for which filter.contain(c) is
882 : // true. Add each range to a set.
883 : //
884 : // To improve performance, use an inclusions set which
885 : // encodes information about character ranges that are known
886 : // to have identical properties.
887 : // getInclusions(src) contains exactly the first characters of
888 : // same-value ranges for the given properties "source".
889 0 : const UnicodeSet* inclusions = getInclusions(src, status);
890 0 : if (U_FAILURE(status)) {
891 0 : return;
892 : }
893 :
894 0 : clear();
895 :
896 0 : UChar32 startHasProperty = -1;
897 0 : int32_t limitRange = inclusions->getRangeCount();
898 :
899 0 : for (int j=0; j<limitRange; ++j) {
900 : // get current range
901 0 : UChar32 start = inclusions->getRangeStart(j);
902 0 : UChar32 end = inclusions->getRangeEnd(j);
903 :
904 : // for all the code points in the range, process
905 0 : for (UChar32 ch = start; ch <= end; ++ch) {
906 : // only add to this UnicodeSet on inflection points --
907 : // where the hasProperty value changes to false
908 0 : if ((*filter)(ch, context)) {
909 0 : if (startHasProperty < 0) {
910 0 : startHasProperty = ch;
911 : }
912 0 : } else if (startHasProperty >= 0) {
913 0 : add(startHasProperty, ch-1);
914 0 : startHasProperty = -1;
915 : }
916 : }
917 : }
918 0 : if (startHasProperty >= 0) {
919 0 : add((UChar32)startHasProperty, (UChar32)0x10FFFF);
920 : }
921 0 : if (isBogus() && U_SUCCESS(status)) {
922 : // We likely ran out of memory. AHHH!
923 0 : status = U_MEMORY_ALLOCATION_ERROR;
924 : }
925 : }
926 :
927 0 : static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
928 : /* Note: we use ' ' in compiler code page */
929 0 : int32_t j = 0;
930 : char ch;
931 0 : --dstCapacity; /* make room for term. zero */
932 0 : while ((ch = *src++) != 0) {
933 0 : if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
934 0 : continue;
935 : }
936 0 : if (j >= dstCapacity) return FALSE;
937 0 : dst[j++] = ch;
938 : }
939 0 : if (j > 0 && dst[j-1] == ' ') --j;
940 0 : dst[j] = 0;
941 0 : return TRUE;
942 : }
943 :
944 : //----------------------------------------------------------------
945 : // Property set API
946 : //----------------------------------------------------------------
947 :
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// UnicodeSet member function. Wrapped in do/while(0) so that "FAIL(ec);"
// behaves as a single statement, including in front of an else.
#define FAIL(ec) do { (ec)=U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (0)
949 :
950 : UnicodeSet&
951 0 : UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
952 0 : if (U_FAILURE(ec) || isFrozen()) return *this;
953 :
954 0 : if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
955 0 : applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
956 0 : } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
957 0 : UScriptCode script = (UScriptCode)value;
958 0 : applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec);
959 : } else {
960 0 : IntPropertyContext c = {prop, value};
961 0 : applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
962 : }
963 0 : return *this;
964 : }
965 :
966 : UnicodeSet&
967 0 : UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
968 : const UnicodeString& value,
969 : UErrorCode& ec) {
970 0 : if (U_FAILURE(ec) || isFrozen()) return *this;
971 :
972 : // prop and value used to be converted to char * using the default
973 : // converter instead of the invariant conversion.
974 : // This should not be necessary because all Unicode property and value
975 : // names use only invariant characters.
976 : // If there are any variant characters, then we won't find them anyway.
977 : // Checking first avoids assertion failures in the conversion.
978 0 : if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
979 0 : !uprv_isInvariantUString(value.getBuffer(), value.length())
980 : ) {
981 0 : FAIL(ec);
982 : }
983 0 : CharString pname, vname;
984 0 : pname.appendInvariantChars(prop, ec);
985 0 : vname.appendInvariantChars(value, ec);
986 0 : if (U_FAILURE(ec)) return *this;
987 :
988 : UProperty p;
989 : int32_t v;
990 0 : UBool mustNotBeEmpty = FALSE, invert = FALSE;
991 :
992 0 : if (value.length() > 0) {
993 0 : p = u_getPropertyEnum(pname.data());
994 0 : if (p == UCHAR_INVALID_CODE) FAIL(ec);
995 :
996 : // Treat gc as gcm
997 0 : if (p == UCHAR_GENERAL_CATEGORY) {
998 0 : p = UCHAR_GENERAL_CATEGORY_MASK;
999 : }
1000 :
1001 0 : if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
1002 0 : (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
1003 0 : (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
1004 0 : v = u_getPropertyValueEnum(p, vname.data());
1005 0 : if (v == UCHAR_INVALID_CODE) {
1006 : // Handle numeric CCC
1007 0 : if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
1008 0 : p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
1009 : p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
1010 : char* end;
1011 0 : double value = uprv_strtod(vname.data(), &end);
1012 0 : v = (int32_t) value;
1013 0 : if (v != value || v < 0 || *end != 0) {
1014 : // non-integral or negative value, or trailing junk
1015 0 : FAIL(ec);
1016 : }
1017 : // If the resultant set is empty then the numeric value
1018 : // was invalid.
1019 0 : mustNotBeEmpty = TRUE;
1020 : } else {
1021 0 : FAIL(ec);
1022 : }
1023 0 : }
1024 : }
1025 :
1026 : else {
1027 :
1028 0 : switch (p) {
1029 : case UCHAR_NUMERIC_VALUE:
1030 : {
1031 : char* end;
1032 0 : double value = uprv_strtod(vname.data(), &end);
1033 0 : if (*end != 0) {
1034 0 : FAIL(ec);
1035 : }
1036 0 : applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
1037 0 : return *this;
1038 : }
1039 : case UCHAR_NAME:
1040 : {
1041 : // Must munge name, since u_charFromName() does not do
1042 : // 'loose' matching.
1043 : char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1044 0 : if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
1045 0 : UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
1046 0 : if (U_SUCCESS(ec)) {
1047 0 : clear();
1048 0 : add(ch);
1049 0 : return *this;
1050 : } else {
1051 0 : FAIL(ec);
1052 : }
1053 : }
1054 : case UCHAR_UNICODE_1_NAME:
1055 : // ICU 49 deprecates the Unicode_1_Name property APIs.
1056 0 : FAIL(ec);
1057 : case UCHAR_AGE:
1058 : {
1059 : // Must munge name, since u_versionFromString() does not do
1060 : // 'loose' matching.
1061 : char buf[128];
1062 0 : if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
1063 : UVersionInfo version;
1064 0 : u_versionFromString(version, buf);
1065 0 : applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
1066 0 : return *this;
1067 : }
1068 : case UCHAR_SCRIPT_EXTENSIONS:
1069 0 : v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
1070 0 : if (v == UCHAR_INVALID_CODE) {
1071 0 : FAIL(ec);
1072 : }
1073 : // fall through to calling applyIntPropertyValue()
1074 0 : break;
1075 : default:
1076 : // p is a non-binary, non-enumerated property that we
1077 : // don't support (yet).
1078 0 : FAIL(ec);
1079 : }
1080 : }
1081 : }
1082 :
1083 : else {
1084 : // value is empty. Interpret as General Category, Script, or
1085 : // Binary property.
1086 0 : p = UCHAR_GENERAL_CATEGORY_MASK;
1087 0 : v = u_getPropertyValueEnum(p, pname.data());
1088 0 : if (v == UCHAR_INVALID_CODE) {
1089 0 : p = UCHAR_SCRIPT;
1090 0 : v = u_getPropertyValueEnum(p, pname.data());
1091 0 : if (v == UCHAR_INVALID_CODE) {
1092 0 : p = u_getPropertyEnum(pname.data());
1093 0 : if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1094 0 : v = 1;
1095 0 : } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
1096 0 : set(MIN_VALUE, MAX_VALUE);
1097 0 : return *this;
1098 0 : } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
1099 0 : set(0, 0x7F);
1100 0 : return *this;
1101 0 : } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
1102 : // [:Assigned:]=[:^Cn:]
1103 0 : p = UCHAR_GENERAL_CATEGORY_MASK;
1104 0 : v = U_GC_CN_MASK;
1105 0 : invert = TRUE;
1106 : } else {
1107 0 : FAIL(ec);
1108 : }
1109 : }
1110 : }
1111 : }
1112 :
1113 0 : applyIntPropertyValue(p, v, ec);
1114 0 : if(invert) {
1115 0 : complement();
1116 : }
1117 :
1118 0 : if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
1119 : // mustNotBeEmpty is set to true if an empty set indicates
1120 : // invalid input.
1121 0 : ec = U_ILLEGAL_ARGUMENT_ERROR;
1122 : }
1123 :
1124 0 : if (isBogus() && U_SUCCESS(ec)) {
1125 : // We likely ran out of memory. AHHH!
1126 0 : ec = U_MEMORY_ALLOCATION_ERROR;
1127 : }
1128 0 : return *this;
1129 : }
1130 :
1131 : //----------------------------------------------------------------
1132 : // Property set patterns
1133 : //----------------------------------------------------------------
1134 :
1135 : /**
1136 : * Return true if the given position, in the given pattern, appears
1137 : * to be the start of a property set pattern.
1138 : */
1139 0 : UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1140 : int32_t pos) {
1141 : // Patterns are at least 5 characters long
1142 0 : if ((pos+5) > pattern.length()) {
1143 0 : return FALSE;
1144 : }
1145 :
1146 : // Look for an opening [:, [:^, \p, or \P
1147 0 : return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1148 : }
1149 :
1150 : /**
1151 : * Return true if the given iterator appears to point at a
1152 : * property pattern. Regardless of the result, return with the
1153 : * iterator unchanged.
1154 : * @param chars iterator over the pattern characters. Upon return
1155 : * it will be unchanged.
1156 : * @param iterOpts RuleCharacterIterator options
1157 : */
1158 0 : UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1159 : int32_t iterOpts) {
1160 : // NOTE: literal will always be FALSE, because we don't parse escapes.
1161 0 : UBool result = FALSE, literal;
1162 0 : UErrorCode ec = U_ZERO_ERROR;
1163 0 : iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1164 : RuleCharacterIterator::Pos pos;
1165 0 : chars.getPos(pos);
1166 0 : UChar32 c = chars.next(iterOpts, literal, ec);
1167 0 : if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1168 0 : UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1169 0 : literal, ec);
1170 0 : result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1171 0 : (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1172 : }
1173 0 : chars.setPos(pos);
1174 0 : return result && U_SUCCESS(ec);
1175 : }
1176 :
1177 : /**
1178 : * Parse the given property pattern at the given parse position.
1179 : */
1180 0 : UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1181 : ParsePosition& ppos,
1182 : UErrorCode &ec) {
1183 0 : int32_t pos = ppos.getIndex();
1184 :
1185 0 : UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1186 0 : UBool isName = FALSE; // true for \N{pat}, o/w false
1187 0 : UBool invert = FALSE;
1188 :
1189 0 : if (U_FAILURE(ec)) return *this;
1190 :
1191 : // Minimum length is 5 characters, e.g. \p{L}
1192 0 : if ((pos+5) > pattern.length()) {
1193 0 : FAIL(ec);
1194 : }
1195 :
1196 : // On entry, ppos should point to one of the following locations:
1197 : // Look for an opening [:, [:^, \p, or \P
1198 0 : if (isPOSIXOpen(pattern, pos)) {
1199 0 : posix = TRUE;
1200 0 : pos += 2;
1201 0 : pos = ICU_Utility::skipWhitespace(pattern, pos);
1202 0 : if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1203 0 : ++pos;
1204 0 : invert = TRUE;
1205 : }
1206 0 : } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1207 0 : UChar c = pattern.charAt(pos+1);
1208 0 : invert = (c == UPPER_P);
1209 0 : isName = (c == UPPER_N);
1210 0 : pos += 2;
1211 0 : pos = ICU_Utility::skipWhitespace(pattern, pos);
1212 0 : if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1213 : // Syntax error; "\p" or "\P" not followed by "{"
1214 0 : FAIL(ec);
1215 : }
1216 : } else {
1217 : // Open delimiter not seen
1218 0 : FAIL(ec);
1219 : }
1220 :
1221 : // Look for the matching close delimiter, either :] or }
1222 : int32_t close;
1223 0 : if (posix) {
1224 0 : close = pattern.indexOf(POSIX_CLOSE, 2, pos);
1225 : } else {
1226 0 : close = pattern.indexOf(CLOSE_BRACE, pos);
1227 : }
1228 0 : if (close < 0) {
1229 : // Syntax error; close delimiter missing
1230 0 : FAIL(ec);
1231 : }
1232 :
1233 : // Look for an '=' sign. If this is present, we will parse a
1234 : // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1235 : // pattern.
1236 0 : int32_t equals = pattern.indexOf(EQUALS, pos);
1237 0 : UnicodeString propName, valueName;
1238 0 : if (equals >= 0 && equals < close && !isName) {
1239 : // Equals seen; parse medium/long pattern
1240 0 : pattern.extractBetween(pos, equals, propName);
1241 0 : pattern.extractBetween(equals+1, close, valueName);
1242 : }
1243 :
1244 : else {
1245 : // Handle case where no '=' is seen, and \N{}
1246 0 : pattern.extractBetween(pos, close, propName);
1247 :
1248 : // Handle \N{name}
1249 0 : if (isName) {
1250 : // This is a little inefficient since it means we have to
1251 : // parse NAME_PROP back to UCHAR_NAME even though we already
1252 : // know it's UCHAR_NAME. If we refactor the API to
1253 : // support args of (UProperty, char*) then we can remove
1254 : // NAME_PROP and make this a little more efficient.
1255 0 : valueName = propName;
1256 0 : propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1257 : }
1258 : }
1259 :
1260 0 : applyPropertyAlias(propName, valueName, ec);
1261 :
1262 0 : if (U_SUCCESS(ec)) {
1263 0 : if (invert) {
1264 0 : complement();
1265 : }
1266 :
1267 : // Move to the limit position after the close delimiter if the
1268 : // parse succeeded.
1269 0 : ppos.setIndex(close + (posix ? 2 : 1));
1270 : }
1271 :
1272 0 : return *this;
1273 : }
1274 :
1275 : /**
1276 : * Parse a property pattern.
1277 : * @param chars iterator over the pattern characters. Upon return
1278 : * it will be advanced to the first character after the parsed
1279 : * pattern, or the end of the iteration if all characters are
1280 : * parsed.
1281 : * @param rebuiltPat the pattern that was parsed, rebuilt or
1282 : * copied from the input pattern, as appropriate.
1283 : */
1284 0 : void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1285 : UnicodeString& rebuiltPat,
1286 : UErrorCode& ec) {
1287 0 : if (U_FAILURE(ec)) return;
1288 0 : UnicodeString pattern;
1289 0 : chars.lookahead(pattern);
1290 0 : ParsePosition pos(0);
1291 0 : applyPropertyPattern(pattern, pos, ec);
1292 0 : if (U_FAILURE(ec)) return;
1293 0 : if (pos.getIndex() == 0) {
1294 : // syntaxError(chars, "Invalid property pattern");
1295 0 : ec = U_MALFORMED_SET;
1296 0 : return;
1297 : }
1298 0 : chars.jumpahead(pos.getIndex());
1299 0 : rebuiltPat.append(pattern, 0, pos.getIndex());
1300 : }
1301 :
1302 : U_NAMESPACE_END
|