Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (C) 1999-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : * Date Name Description
9 : * 10/20/99 alan Creation.
10 : **********************************************************************
11 : */
12 :
13 : #include "unicode/utypes.h"
14 : #include "unicode/parsepos.h"
15 : #include "unicode/symtable.h"
16 : #include "unicode/uniset.h"
17 : #include "unicode/utf8.h"
18 : #include "unicode/utf16.h"
19 : #include "ruleiter.h"
20 : #include "cmemory.h"
21 : #include "cstring.h"
22 : #include "patternprops.h"
23 : #include "uelement.h"
24 : #include "util.h"
25 : #include "uvector.h"
26 : #include "charstr.h"
27 : #include "ustrfmt.h"
28 : #include "uassert.h"
29 : #include "bmpset.h"
30 : #include "unisetspan.h"
31 :
32 : // Define UChar constants using hex for EBCDIC compatibility
33 : // Used #define to reduce private static exports and memory access time.
34 : #define SET_OPEN ((UChar)0x005B) /*[*/
35 : #define SET_CLOSE ((UChar)0x005D) /*]*/
36 : #define HYPHEN ((UChar)0x002D) /*-*/
37 : #define COMPLEMENT ((UChar)0x005E) /*^*/
38 : #define COLON ((UChar)0x003A) /*:*/
39 : #define BACKSLASH ((UChar)0x005C) /*\*/
40 : #define INTERSECTION ((UChar)0x0026) /*&*/
41 : #define UPPER_U ((UChar)0x0055) /*U*/
42 : #define LOWER_U ((UChar)0x0075) /*u*/
43 : #define OPEN_BRACE ((UChar)123) /*{*/
44 : #define CLOSE_BRACE ((UChar)125) /*}*/
45 : #define UPPER_P ((UChar)0x0050) /*P*/
46 : #define LOWER_P ((UChar)0x0070) /*p*/
47 : #define UPPER_N ((UChar)78) /*N*/
48 : #define EQUALS ((UChar)0x003D) /*=*/
49 :
50 : // HIGH_VALUE > all valid values. 110000 for codepoints
51 : #define UNICODESET_HIGH 0x0110000
52 :
53 : // LOW <= all valid values. ZERO for codepoints
54 : #define UNICODESET_LOW 0x000000
55 :
56 : // initial storage. Must be >= 0
57 : #define START_EXTRA 16
58 :
59 : // extra amount for growth. Must be >= 0
60 : #define GROW_EXTRA START_EXTRA
61 :
62 : U_NAMESPACE_BEGIN
63 :
64 0 : SymbolTable::~SymbolTable() {}
65 :
66 0 : UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
67 :
68 : /**
69 : * Modify the given UChar32 variable so that it is in range, by
70 : * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
71 : * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
72 : * It modifies its argument in-place and also returns it.
73 : */
74 0 : static inline UChar32 pinCodePoint(UChar32& c) {
75 0 : if (c < UNICODESET_LOW) {
76 0 : c = UNICODESET_LOW;
77 0 : } else if (c > (UNICODESET_HIGH-1)) {
78 0 : c = (UNICODESET_HIGH-1);
79 : }
80 0 : return c;
81 : }
82 :
83 : //----------------------------------------------------------------
84 : // Debugging
85 : //----------------------------------------------------------------
86 :
87 : // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
88 : // To enable the debugging, define the symbol DEBUG_MEM in the line
89 : // below. This will result in text being sent to stdout that looks
90 : // like this:
91 : // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
92 : // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
93 : // Each line lists a construction (ct) or destruction (dt) event, the
94 : // object address, the number of outstanding objects after the event,
95 : // and the pattern of the object in question.
96 :
97 : // #define DEBUG_MEM
98 :
99 : #ifdef DEBUG_MEM
100 : #include <stdio.h>
101 : static int32_t _dbgCount = 0;
102 :
103 : static inline void _dbgct(UnicodeSet* set) {
104 : UnicodeString str;
105 : set->toPattern(str, TRUE);
106 : char buf[40];
107 : str.extract(0, 39, buf, "");
108 : printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
109 : }
110 :
111 : static inline void _dbgdt(UnicodeSet* set) {
112 : UnicodeString str;
113 : set->toPattern(str, TRUE);
114 : char buf[40];
115 : str.extract(0, 39, buf, "");
116 : printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
117 : }
118 :
119 : #else
120 :
121 : #define _dbgct(set)
122 : #define _dbgdt(set)
123 :
124 : #endif
125 :
126 : //----------------------------------------------------------------
127 : // UnicodeString in UVector support
128 : //----------------------------------------------------------------
129 :
130 0 : static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
131 0 : dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
132 0 : }
133 :
134 0 : static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
135 0 : const UnicodeString &a = *(const UnicodeString*)t1.pointer;
136 0 : const UnicodeString &b = *(const UnicodeString*)t2.pointer;
137 0 : return a.compare(b);
138 : }
139 :
140 : //----------------------------------------------------------------
141 : // Constructors &c
142 : //----------------------------------------------------------------
143 :
144 : /**
145 : * Constructs an empty set.
146 : */
147 0 : UnicodeSet::UnicodeSet() :
148 : len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
149 : bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
150 0 : fFlags(0)
151 : {
152 0 : UErrorCode status = U_ZERO_ERROR;
153 0 : allocateStrings(status);
154 0 : if (U_FAILURE(status)) {
155 0 : return;
156 : }
157 0 : list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
158 0 : if(list!=NULL){
159 0 : list[0] = UNICODESET_HIGH;
160 : } else { // If memory allocation failed, set to bogus state.
161 0 : setToBogus();
162 0 : return;
163 : }
164 : _dbgct(this);
165 : }
166 :
167 : /**
168 : * Constructs a set containing the given range. If <code>end >
169 : * start</code> then an empty set is created.
170 : *
171 : * @param start first character, inclusive, of range
172 : * @param end last character, inclusive, of range
173 : */
174 0 : UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
175 : len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
176 : bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
177 0 : fFlags(0)
178 : {
179 0 : UErrorCode status = U_ZERO_ERROR;
180 0 : allocateStrings(status);
181 0 : if (U_FAILURE(status)) {
182 0 : return;
183 : }
184 0 : list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
185 0 : if(list!=NULL){
186 0 : list[0] = UNICODESET_HIGH;
187 0 : complement(start, end);
188 : } else { // If memory allocation failed, set to bogus state.
189 0 : setToBogus();
190 0 : return;
191 : }
192 : _dbgct(this);
193 : }
194 :
195 : /**
196 : * Constructs a set that is identical to the given UnicodeSet.
197 : */
198 0 : UnicodeSet::UnicodeSet(const UnicodeSet& o) :
199 : UnicodeFilter(o),
200 0 : len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
201 : bmpSet(0),
202 : buffer(0), bufferCapacity(0),
203 : patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
204 0 : fFlags(0)
205 : {
206 0 : UErrorCode status = U_ZERO_ERROR;
207 0 : allocateStrings(status);
208 0 : if (U_FAILURE(status)) {
209 0 : return;
210 : }
211 0 : list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
212 0 : if(list!=NULL){
213 0 : *this = o;
214 : } else { // If memory allocation failed, set to bogus state.
215 0 : setToBogus();
216 0 : return;
217 : }
218 : _dbgct(this);
219 : }
220 :
221 : // Copy-construct as thawed.
222 0 : UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
223 : UnicodeFilter(o),
224 0 : len(0), capacity(o.len + GROW_EXTRA), list(0),
225 : bmpSet(0),
226 : buffer(0), bufferCapacity(0),
227 : patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
228 0 : fFlags(0)
229 : {
230 0 : UErrorCode status = U_ZERO_ERROR;
231 0 : allocateStrings(status);
232 0 : if (U_FAILURE(status)) {
233 0 : return;
234 : }
235 0 : list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
236 0 : if(list!=NULL){
237 : // *this = o except for bmpSet and stringSpan
238 0 : len = o.len;
239 0 : uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
240 0 : if (strings != NULL && o.strings != NULL) {
241 0 : strings->assign(*o.strings, cloneUnicodeString, status);
242 : } else { // Invalid strings.
243 0 : setToBogus();
244 0 : return;
245 : }
246 0 : if (o.pat) {
247 0 : setPattern(UnicodeString(o.pat, o.patLen));
248 : }
249 : } else { // If memory allocation failed, set to bogus state.
250 0 : setToBogus();
251 0 : return;
252 : }
253 : _dbgct(this);
254 : }
255 :
256 : /**
257 : * Destructs the set.
258 : */
259 0 : UnicodeSet::~UnicodeSet() {
260 : _dbgdt(this); // first!
261 0 : uprv_free(list);
262 0 : delete bmpSet;
263 0 : if (buffer) {
264 0 : uprv_free(buffer);
265 : }
266 0 : delete strings;
267 0 : delete stringSpan;
268 0 : releasePattern();
269 0 : }
270 :
271 : /**
272 : * Assigns this object to be a copy of another.
273 : */
274 0 : UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
275 0 : if (this == &o) {
276 0 : return *this;
277 : }
278 0 : if (isFrozen()) {
279 0 : return *this;
280 : }
281 0 : if (o.isBogus()) {
282 0 : setToBogus();
283 0 : return *this;
284 : }
285 0 : UErrorCode ec = U_ZERO_ERROR;
286 0 : ensureCapacity(o.len, ec);
287 0 : if (U_FAILURE(ec)) {
288 0 : return *this; // There is no way to report this error :-(
289 : }
290 0 : len = o.len;
291 0 : uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
292 0 : if (o.bmpSet == NULL) {
293 0 : bmpSet = NULL;
294 : } else {
295 0 : bmpSet = new BMPSet(*o.bmpSet, list, len);
296 0 : if (bmpSet == NULL) { // Check for memory allocation error.
297 0 : setToBogus();
298 0 : return *this;
299 : }
300 : }
301 0 : if (strings != NULL && o.strings != NULL) {
302 0 : strings->assign(*o.strings, cloneUnicodeString, ec);
303 : } else { // Invalid strings.
304 0 : setToBogus();
305 0 : return *this;
306 : }
307 0 : if (o.stringSpan == NULL) {
308 0 : stringSpan = NULL;
309 : } else {
310 0 : stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
311 0 : if (stringSpan == NULL) { // Check for memory allocation error.
312 0 : setToBogus();
313 0 : return *this;
314 : }
315 : }
316 0 : releasePattern();
317 0 : if (o.pat) {
318 0 : setPattern(UnicodeString(o.pat, o.patLen));
319 : }
320 0 : return *this;
321 : }
322 :
323 : /**
324 : * Returns a copy of this object. All UnicodeMatcher objects have
325 : * to support cloning in order to allow classes using
326 : * UnicodeMatchers, such as Transliterator, to implement cloning.
327 : */
328 0 : UnicodeFunctor* UnicodeSet::clone() const {
329 0 : return new UnicodeSet(*this);
330 : }
331 :
332 0 : UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
333 0 : return new UnicodeSet(*this, TRUE);
334 : }
335 :
336 : /**
337 : * Compares the specified object with this set for equality. Returns
338 : * <tt>true</tt> if the two sets
339 : * have the same size, and every member of the specified set is
340 : * contained in this set (or equivalently, every member of this set is
341 : * contained in the specified set).
342 : *
343 : * @param o set to be compared for equality with this set.
344 : * @return <tt>true</tt> if the specified set is equal to this set.
345 : */
346 0 : UBool UnicodeSet::operator==(const UnicodeSet& o) const {
347 0 : if (len != o.len) return FALSE;
348 0 : for (int32_t i = 0; i < len; ++i) {
349 0 : if (list[i] != o.list[i]) return FALSE;
350 : }
351 0 : if (*strings != *o.strings) return FALSE;
352 0 : return TRUE;
353 : }
354 :
355 : /**
356 : * Returns the hash code value for this set.
357 : *
358 : * @return the hash code value for this set.
359 : * @see Object#hashCode()
360 : */
361 0 : int32_t UnicodeSet::hashCode(void) const {
362 0 : int32_t result = len;
363 0 : for (int32_t i = 0; i < len; ++i) {
364 0 : result *= 1000003;
365 0 : result += list[i];
366 : }
367 0 : return result;
368 : }
369 :
370 : //----------------------------------------------------------------
371 : // Public API
372 : //----------------------------------------------------------------
373 :
374 : /**
375 : * Returns the number of elements in this set (its cardinality),
376 : * Note than the elements of a set may include both individual
377 : * codepoints and strings.
378 : *
379 : * @return the number of elements in this set (its cardinality).
380 : */
381 0 : int32_t UnicodeSet::size(void) const {
382 0 : int32_t n = 0;
383 0 : int32_t count = getRangeCount();
384 0 : for (int32_t i = 0; i < count; ++i) {
385 0 : n += getRangeEnd(i) - getRangeStart(i) + 1;
386 : }
387 0 : return n + strings->size();
388 : }
389 :
390 : /**
391 : * Returns <tt>true</tt> if this set contains no elements.
392 : *
393 : * @return <tt>true</tt> if this set contains no elements.
394 : */
395 0 : UBool UnicodeSet::isEmpty(void) const {
396 0 : return len == 1 && strings->size() == 0;
397 : }
398 :
399 : /**
400 : * Returns true if this set contains the given character.
401 : * @param c character to be checked for containment
402 : * @return true if the test condition is met
403 : */
404 0 : UBool UnicodeSet::contains(UChar32 c) const {
405 : // Set i to the index of the start item greater than ch
406 : // We know we will terminate without length test!
407 : // LATER: for large sets, add binary search
408 : //int32_t i = -1;
409 : //for (;;) {
410 : // if (c < list[++i]) break;
411 : //}
412 0 : if (bmpSet != NULL) {
413 0 : return bmpSet->contains(c);
414 : }
415 0 : if (stringSpan != NULL) {
416 0 : return stringSpan->contains(c);
417 : }
418 0 : if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
419 0 : return FALSE;
420 : }
421 0 : int32_t i = findCodePoint(c);
422 0 : return (UBool)(i & 1); // return true if odd
423 : }
424 :
425 : /**
426 : * Returns the smallest value i such that c < list[i]. Caller
427 : * must ensure that c is a legal value or this method will enter
428 : * an infinite loop. This method performs a binary search.
429 : * @param c a character in the range MIN_VALUE..MAX_VALUE
430 : * inclusive
431 : * @return the smallest integer i in the range 0..len-1,
432 : * inclusive, such that c < list[i]
433 : */
434 0 : int32_t UnicodeSet::findCodePoint(UChar32 c) const {
435 : /* Examples:
436 : findCodePoint(c)
437 : set list[] c=0 1 3 4 7 8
438 : === ============== ===========
439 : [] [110000] 0 0 0 0 0 0
440 : [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
441 : [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
442 : [:Any:] [0, 110000] 1 1 1 1 1 1
443 : */
444 :
445 : // Return the smallest i such that c < list[i]. Assume
446 : // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
447 0 : if (c < list[0])
448 0 : return 0;
449 : // High runner test. c is often after the last range, so an
450 : // initial check for this condition pays off.
451 0 : int32_t lo = 0;
452 0 : int32_t hi = len - 1;
453 0 : if (lo >= hi || c >= list[hi-1])
454 0 : return hi;
455 : // invariant: c >= list[lo]
456 : // invariant: c < list[hi]
457 : for (;;) {
458 0 : int32_t i = (lo + hi) >> 1;
459 0 : if (i == lo) {
460 0 : break; // Found!
461 0 : } else if (c < list[i]) {
462 0 : hi = i;
463 : } else {
464 0 : lo = i;
465 : }
466 0 : }
467 0 : return hi;
468 : }
469 :
470 : /**
471 : * Returns true if this set contains every character
472 : * of the given range.
473 : * @param start first character, inclusive, of the range
474 : * @param end last character, inclusive, of the range
475 : * @return true if the test condition is met
476 : */
477 0 : UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
478 : //int32_t i = -1;
479 : //for (;;) {
480 : // if (start < list[++i]) break;
481 : //}
482 0 : int32_t i = findCodePoint(start);
483 0 : return ((i & 1) != 0 && end < list[i]);
484 : }
485 :
486 : /**
487 : * Returns <tt>true</tt> if this set contains the given
488 : * multicharacter string.
489 : * @param s string to be checked for containment
490 : * @return <tt>true</tt> if this set contains the specified string
491 : */
492 0 : UBool UnicodeSet::contains(const UnicodeString& s) const {
493 0 : if (s.length() == 0) return FALSE;
494 0 : int32_t cp = getSingleCP(s);
495 0 : if (cp < 0) {
496 0 : return strings->contains((void*) &s);
497 : } else {
498 0 : return contains((UChar32) cp);
499 : }
500 : }
501 :
502 : /**
503 : * Returns true if this set contains all the characters and strings
504 : * of the given set.
505 : * @param c set to be checked for containment
506 : * @return true if the test condition is met
507 : */
508 0 : UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
509 : // The specified set is a subset if all of its pairs are contained in
510 : // this set. It's possible to code this more efficiently in terms of
511 : // direct manipulation of the inversion lists if the need arises.
512 0 : int32_t n = c.getRangeCount();
513 0 : for (int i=0; i<n; ++i) {
514 0 : if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
515 0 : return FALSE;
516 : }
517 : }
518 0 : if (!strings->containsAll(*c.strings)) return FALSE;
519 0 : return TRUE;
520 : }
521 :
522 : /**
523 : * Returns true if this set contains all the characters
524 : * of the given string.
525 : * @param s string containing characters to be checked for containment
526 : * @return true if the test condition is met
527 : */
528 0 : UBool UnicodeSet::containsAll(const UnicodeString& s) const {
529 0 : return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
530 0 : s.length());
531 : }
532 :
533 : /**
534 : * Returns true if this set contains none of the characters
535 : * of the given range.
536 : * @param start first character, inclusive, of the range
537 : * @param end last character, inclusive, of the range
538 : * @return true if the test condition is met
539 : */
540 0 : UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
541 : //int32_t i = -1;
542 : //for (;;) {
543 : // if (start < list[++i]) break;
544 : //}
545 0 : int32_t i = findCodePoint(start);
546 0 : return ((i & 1) == 0 && end < list[i]);
547 : }
548 :
549 : /**
550 : * Returns true if this set contains none of the characters and strings
551 : * of the given set.
552 : * @param c set to be checked for containment
553 : * @return true if the test condition is met
554 : */
555 0 : UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
556 : // The specified set is a subset if all of its pairs are contained in
557 : // this set. It's possible to code this more efficiently in terms of
558 : // direct manipulation of the inversion lists if the need arises.
559 0 : int32_t n = c.getRangeCount();
560 0 : for (int32_t i=0; i<n; ++i) {
561 0 : if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
562 0 : return FALSE;
563 : }
564 : }
565 0 : if (!strings->containsNone(*c.strings)) return FALSE;
566 0 : return TRUE;
567 : }
568 :
569 : /**
570 : * Returns true if this set contains none of the characters
571 : * of the given string.
572 : * @param s string containing characters to be checked for containment
573 : * @return true if the test condition is met
574 : */
575 0 : UBool UnicodeSet::containsNone(const UnicodeString& s) const {
576 0 : return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
577 0 : s.length());
578 : }
579 :
580 : /**
581 : * Returns <tt>true</tt> if this set contains any character whose low byte
582 : * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
583 : * indexing.
584 : */
585 0 : UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
586 : /* The index value v, in the range [0,255], is contained in this set if
587 : * it is contained in any pair of this set. Pairs either have the high
588 : * bytes equal, or unequal. If the high bytes are equal, then we have
589 : * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
590 : * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
591 : * Then v is contained if xx <= v || v <= yy. (This is identical to the
592 : * time zone month containment logic.)
593 : */
594 : int32_t i;
595 0 : int32_t rangeCount=getRangeCount();
596 0 : for (i=0; i<rangeCount; ++i) {
597 0 : UChar32 low = getRangeStart(i);
598 0 : UChar32 high = getRangeEnd(i);
599 0 : if ((low & ~0xFF) == (high & ~0xFF)) {
600 0 : if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
601 0 : return TRUE;
602 : }
603 0 : } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
604 0 : return TRUE;
605 : }
606 : }
607 0 : if (strings->size() != 0) {
608 0 : for (i=0; i<strings->size(); ++i) {
609 0 : const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
610 : //if (s.length() == 0) {
611 : // // Empty strings match everything
612 : // return TRUE;
613 : //}
614 : // assert(s.length() != 0); // We enforce this elsewhere
615 0 : UChar32 c = s.char32At(0);
616 0 : if ((c & 0xFF) == v) {
617 0 : return TRUE;
618 : }
619 : }
620 : }
621 0 : return FALSE;
622 : }
623 :
624 : /**
625 : * Implementation of UnicodeMatcher::matches(). Always matches the
626 : * longest possible multichar string.
627 : */
628 0 : UMatchDegree UnicodeSet::matches(const Replaceable& text,
629 : int32_t& offset,
630 : int32_t limit,
631 : UBool incremental) {
632 0 : if (offset == limit) {
633 : // Strings, if any, have length != 0, so we don't worry
634 : // about them here. If we ever allow zero-length strings
635 : // we much check for them here.
636 0 : if (contains(U_ETHER)) {
637 0 : return incremental ? U_PARTIAL_MATCH : U_MATCH;
638 : } else {
639 0 : return U_MISMATCH;
640 : }
641 : } else {
642 0 : if (strings->size() != 0) { // try strings first
643 :
644 : // might separate forward and backward loops later
645 : // for now they are combined
646 :
647 : // TODO Improve efficiency of this, at least in the forward
648 : // direction, if not in both. In the forward direction we
649 : // can assume the strings are sorted.
650 :
651 : int32_t i;
652 0 : UBool forward = offset < limit;
653 :
654 : // firstChar is the leftmost char to match in the
655 : // forward direction or the rightmost char to match in
656 : // the reverse direction.
657 0 : UChar firstChar = text.charAt(offset);
658 :
659 : // If there are multiple strings that can match we
660 : // return the longest match.
661 0 : int32_t highWaterLength = 0;
662 :
663 0 : for (i=0; i<strings->size(); ++i) {
664 0 : const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
665 :
666 : //if (trial.length() == 0) {
667 : // return U_MATCH; // null-string always matches
668 : //}
669 : // assert(trial.length() != 0); // We ensure this elsewhere
670 :
671 0 : UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
672 :
673 : // Strings are sorted, so we can optimize in the
674 : // forward direction.
675 0 : if (forward && c > firstChar) break;
676 0 : if (c != firstChar) continue;
677 :
678 0 : int32_t matchLen = matchRest(text, offset, limit, trial);
679 :
680 0 : if (incremental) {
681 0 : int32_t maxLen = forward ? limit-offset : offset-limit;
682 0 : if (matchLen == maxLen) {
683 : // We have successfully matched but only up to limit.
684 0 : return U_PARTIAL_MATCH;
685 : }
686 : }
687 :
688 0 : if (matchLen == trial.length()) {
689 : // We have successfully matched the whole string.
690 0 : if (matchLen > highWaterLength) {
691 0 : highWaterLength = matchLen;
692 : }
693 : // In the forward direction we know strings
694 : // are sorted so we can bail early.
695 0 : if (forward && matchLen < highWaterLength) {
696 0 : break;
697 : }
698 0 : continue;
699 : }
700 : }
701 :
702 : // We've checked all strings without a partial match.
703 : // If we have full matches, return the longest one.
704 0 : if (highWaterLength != 0) {
705 0 : offset += forward ? highWaterLength : -highWaterLength;
706 0 : return U_MATCH;
707 : }
708 : }
709 0 : return UnicodeFilter::matches(text, offset, limit, incremental);
710 : }
711 : }
712 :
713 : /**
714 : * Returns the longest match for s in text at the given position.
715 : * If limit > start then match forward from start+1 to limit
716 : * matching all characters except s.charAt(0). If limit < start,
717 : * go backward starting from start-1 matching all characters
718 : * except s.charAt(s.length()-1). This method assumes that the
719 : * first character, text.charAt(start), matches s, so it does not
720 : * check it.
721 : * @param text the text to match
722 : * @param start the first character to match. In the forward
723 : * direction, text.charAt(start) is matched against s.charAt(0).
724 : * In the reverse direction, it is matched against
725 : * s.charAt(s.length()-1).
726 : * @param limit the limit offset for matching, either last+1 in
727 : * the forward direction, or last-1 in the reverse direction,
728 : * where last is the index of the last character to match.
729 : * @return If part of s matches up to the limit, return |limit -
730 : * start|. If all of s matches before reaching the limit, return
731 : * s.length(). If there is a mismatch between s and text, return
732 : * 0
733 : */
734 0 : int32_t UnicodeSet::matchRest(const Replaceable& text,
735 : int32_t start, int32_t limit,
736 : const UnicodeString& s) {
737 : int32_t i;
738 : int32_t maxLen;
739 0 : int32_t slen = s.length();
740 0 : if (start < limit) {
741 0 : maxLen = limit - start;
742 0 : if (maxLen > slen) maxLen = slen;
743 0 : for (i = 1; i < maxLen; ++i) {
744 0 : if (text.charAt(start + i) != s.charAt(i)) return 0;
745 : }
746 : } else {
747 0 : maxLen = start - limit;
748 0 : if (maxLen > slen) maxLen = slen;
749 0 : --slen; // <=> slen = s.length() - 1;
750 0 : for (i = 1; i < maxLen; ++i) {
751 0 : if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
752 : }
753 : }
754 0 : return maxLen;
755 : }
756 :
757 : /**
758 : * Implement of UnicodeMatcher
759 : */
760 0 : void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
761 0 : toUnionTo.addAll(*this);
762 0 : }
763 :
764 : /**
765 : * Returns the index of the given character within this set, where
766 : * the set is ordered by ascending code point. If the character
767 : * is not in this set, return -1. The inverse of this method is
768 : * <code>charAt()</code>.
769 : * @return an index from 0..size()-1, or -1
770 : */
771 0 : int32_t UnicodeSet::indexOf(UChar32 c) const {
772 0 : if (c < MIN_VALUE || c > MAX_VALUE) {
773 0 : return -1;
774 : }
775 0 : int32_t i = 0;
776 0 : int32_t n = 0;
777 : for (;;) {
778 0 : UChar32 start = list[i++];
779 0 : if (c < start) {
780 0 : return -1;
781 : }
782 0 : UChar32 limit = list[i++];
783 0 : if (c < limit) {
784 0 : return n + c - start;
785 : }
786 0 : n += limit - start;
787 0 : }
788 : }
789 :
790 : /**
791 : * Returns the character at the given index within this set, where
792 : * the set is ordered by ascending code point. If the index is
793 : * out of range, return (UChar32)-1. The inverse of this method is
794 : * <code>indexOf()</code>.
795 : * @param index an index from 0..size()-1
796 : * @return the character at the given index, or (UChar32)-1.
797 : */
798 0 : UChar32 UnicodeSet::charAt(int32_t index) const {
799 0 : if (index >= 0) {
800 : // len2 is the largest even integer <= len, that is, it is len
801 : // for even values and len-1 for odd values. With odd values
802 : // the last entry is UNICODESET_HIGH.
803 0 : int32_t len2 = len & ~1;
804 0 : for (int32_t i=0; i < len2;) {
805 0 : UChar32 start = list[i++];
806 0 : int32_t count = list[i++] - start;
807 0 : if (index < count) {
808 0 : return (UChar32)(start + index);
809 : }
810 0 : index -= count;
811 : }
812 : }
813 0 : return (UChar32)-1;
814 : }
815 :
816 : /**
817 : * Make this object represent the range <code>start - end</code>.
818 : * If <code>end > start</code> then this object is set to an
819 : * an empty range.
820 : *
821 : * @param start first character in the set, inclusive
822 : * @rparam end last character in the set, inclusive
823 : */
824 0 : UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
825 0 : clear();
826 0 : complement(start, end);
827 0 : return *this;
828 : }
829 :
830 : /**
831 : * Adds the specified range to this set if it is not already
832 : * present. If this set already contains the specified range,
833 : * the call leaves this set unchanged. If <code>end > start</code>
834 : * then an empty range is added, leaving the set unchanged.
835 : *
836 : * @param start first character, inclusive, of range to be added
837 : * to this set.
838 : * @param end last character, inclusive, of range to be added
839 : * to this set.
840 : */
841 0 : UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
842 0 : if (pinCodePoint(start) < pinCodePoint(end)) {
843 0 : UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
844 0 : add(range, 2, 0);
845 0 : } else if (start == end) {
846 0 : add(start);
847 : }
848 0 : return *this;
849 : }
850 :
851 : // #define DEBUG_US_ADD
852 :
853 : #ifdef DEBUG_US_ADD
854 : #include <stdio.h>
855 : void dump(UChar32 c) {
856 : if (c <= 0xFF) {
857 : printf("%c", (char)c);
858 : } else {
859 : printf("U+%04X", c);
860 : }
861 : }
862 : void dump(const UChar32* list, int32_t len) {
863 : printf("[");
864 : for (int32_t i=0; i<len; ++i) {
865 : if (i != 0) printf(", ");
866 : dump(list[i]);
867 : }
868 : printf("]");
869 : }
870 : #endif
871 :
872 : /**
873 : * Adds the specified character to this set if it is not already
874 : * present. If this set already contains the specified character,
875 : * the call leaves this set unchanged.
876 : */
877 0 : UnicodeSet& UnicodeSet::add(UChar32 c) {
878 : // find smallest i such that c < list[i]
879 : // if odd, then it is IN the set
880 : // if even, then it is OUT of the set
881 0 : int32_t i = findCodePoint(pinCodePoint(c));
882 :
883 : // already in set?
884 0 : if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
885 :
886 : // HIGH is 0x110000
887 : // assert(list[len-1] == HIGH);
888 :
889 : // empty = [HIGH]
890 : // [start_0, limit_0, start_1, limit_1, HIGH]
891 :
892 : // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
893 : // ^
894 : // list[i]
895 :
896 : // i == 0 means c is before the first range
897 :
898 : #ifdef DEBUG_US_ADD
899 : printf("Add of ");
900 : dump(c);
901 : printf(" found at %d", i);
902 : printf(": ");
903 : dump(list, len);
904 : printf(" => ");
905 : #endif
906 :
907 0 : if (c == list[i]-1) {
908 : // c is before start of next range
909 0 : list[i] = c;
910 : // if we touched the HIGH mark, then add a new one
911 0 : if (c == (UNICODESET_HIGH - 1)) {
912 0 : UErrorCode status = U_ZERO_ERROR;
913 0 : ensureCapacity(len+1, status);
914 0 : if (U_FAILURE(status)) {
915 0 : return *this; // There is no way to report this error :-(
916 : }
917 0 : list[len++] = UNICODESET_HIGH;
918 : }
919 0 : if (i > 0 && c == list[i-1]) {
920 : // collapse adjacent ranges
921 :
922 : // [..., start_k-1, c, c, limit_k, ..., HIGH]
923 : // ^
924 : // list[i]
925 :
926 : //for (int32_t k=i-1; k<len-2; ++k) {
927 : // list[k] = list[k+2];
928 : //}
929 0 : UChar32* dst = list + i - 1;
930 0 : UChar32* src = dst + 2;
931 0 : UChar32* srclimit = list + len;
932 0 : while (src < srclimit) *(dst++) = *(src++);
933 :
934 0 : len -= 2;
935 : }
936 : }
937 :
938 0 : else if (i > 0 && c == list[i-1]) {
939 : // c is after end of prior range
940 0 : list[i-1]++;
941 : // no need to check for collapse here
942 : }
943 :
944 : else {
945 : // At this point we know the new char is not adjacent to
946 : // any existing ranges, and it is not 10FFFF.
947 :
948 :
949 : // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
950 : // ^
951 : // list[i]
952 :
953 : // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
954 : // ^
955 : // list[i]
956 :
957 0 : UErrorCode status = U_ZERO_ERROR;
958 0 : ensureCapacity(len+2, status);
959 0 : if (U_FAILURE(status)) {
960 0 : return *this; // There is no way to report this error :-(
961 : }
962 :
963 : //for (int32_t k=len-1; k>=i; --k) {
964 : // list[k+2] = list[k];
965 : //}
966 0 : UChar32* src = list + len;
967 0 : UChar32* dst = src + 2;
968 0 : UChar32* srclimit = list + i;
969 0 : while (src > srclimit) *(--dst) = *(--src);
970 :
971 0 : list[i] = c;
972 0 : list[i+1] = c+1;
973 0 : len += 2;
974 : }
975 :
976 : #ifdef DEBUG_US_ADD
977 : dump(list, len);
978 : printf("\n");
979 :
980 : for (i=1; i<len; ++i) {
981 : if (list[i] <= list[i-1]) {
982 : // Corrupt array!
983 : printf("ERROR: list has been corrupted\n");
984 : exit(1);
985 : }
986 : }
987 : #endif
988 :
989 0 : releasePattern();
990 0 : return *this;
991 : }
992 :
993 : /**
994 : * Adds the specified multicharacter to this set if it is not already
995 : * present. If this set already contains the multicharacter,
996 : * the call leaves this set unchanged.
997 : * Thus "ch" => {"ch"}
998 : * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
999 : * @param s the source string
1000 : * @return the modified set, for chaining
1001 : */
1002 0 : UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1003 0 : if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1004 0 : int32_t cp = getSingleCP(s);
1005 0 : if (cp < 0) {
1006 0 : if (!strings->contains((void*) &s)) {
1007 0 : _add(s);
1008 0 : releasePattern();
1009 : }
1010 : } else {
1011 0 : add((UChar32)cp);
1012 : }
1013 0 : return *this;
1014 : }
1015 :
1016 : /**
1017 : * Adds the given string, in order, to 'strings'. The given string
1018 : * must have been checked by the caller to not be empty and to not
1019 : * already be in 'strings'.
1020 : */
1021 0 : void UnicodeSet::_add(const UnicodeString& s) {
1022 0 : if (isFrozen() || isBogus()) {
1023 0 : return;
1024 : }
1025 0 : UnicodeString* t = new UnicodeString(s);
1026 0 : if (t == NULL) { // Check for memory allocation error.
1027 0 : setToBogus();
1028 0 : return;
1029 : }
1030 0 : UErrorCode ec = U_ZERO_ERROR;
1031 0 : strings->sortedInsert(t, compareUnicodeString, ec);
1032 0 : if (U_FAILURE(ec)) {
1033 0 : setToBogus();
1034 0 : delete t;
1035 : }
1036 : }
1037 :
1038 : /**
1039 : * @return a code point IF the string consists of a single one.
1040 : * otherwise returns -1.
1041 : * @param string to test
1042 : */
1043 0 : int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1044 : //if (s.length() < 1) {
1045 : // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1046 : //}
1047 0 : if (s.length() > 2) return -1;
1048 0 : if (s.length() == 1) return s.charAt(0);
1049 :
1050 : // at this point, len = 2
1051 0 : UChar32 cp = s.char32At(0);
1052 0 : if (cp > 0xFFFF) { // is surrogate pair
1053 0 : return cp;
1054 : }
1055 0 : return -1;
1056 : }
1057 :
1058 : /**
1059 : * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1060 : * If this set already any particular character, it has no effect on that character.
1061 : * @param the source string
1062 : * @return the modified set, for chaining
1063 : */
1064 0 : UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1065 : UChar32 cp;
1066 0 : for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1067 0 : cp = s.char32At(i);
1068 0 : add(cp);
1069 : }
1070 0 : return *this;
1071 : }
1072 :
1073 : /**
1074 : * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1075 : * If this set already any particular character, it has no effect on that character.
1076 : * @param the source string
1077 : * @return the modified set, for chaining
1078 : */
1079 0 : UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1080 0 : UnicodeSet set;
1081 0 : set.addAll(s);
1082 0 : retainAll(set);
1083 0 : return *this;
1084 : }
1085 :
1086 : /**
1087 : * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1088 : * If this set already any particular character, it has no effect on that character.
1089 : * @param the source string
1090 : * @return the modified set, for chaining
1091 : */
1092 0 : UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1093 0 : UnicodeSet set;
1094 0 : set.addAll(s);
1095 0 : complementAll(set);
1096 0 : return *this;
1097 : }
1098 :
1099 : /**
1100 : * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1101 : * If this set already any particular character, it has no effect on that character.
1102 : * @param the source string
1103 : * @return the modified set, for chaining
1104 : */
1105 0 : UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1106 0 : UnicodeSet set;
1107 0 : set.addAll(s);
1108 0 : removeAll(set);
1109 0 : return *this;
1110 : }
1111 :
1112 0 : UnicodeSet& UnicodeSet::removeAllStrings() {
1113 0 : strings->removeAllElements();
1114 0 : return *this;
1115 : }
1116 :
1117 :
1118 : /**
1119 : * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1120 : * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1121 : * @param the source string
1122 : * @return a newly created set containing the given string
1123 : */
1124 0 : UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1125 0 : UnicodeSet *set = new UnicodeSet();
1126 0 : if (set != NULL) { // Check for memory allocation error.
1127 0 : set->add(s);
1128 : }
1129 0 : return set;
1130 : }
1131 :
1132 :
1133 : /**
1134 : * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1135 : * @param the source string
1136 : * @return a newly created set containing the given characters
1137 : */
1138 0 : UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1139 0 : UnicodeSet *set = new UnicodeSet();
1140 0 : if (set != NULL) { // Check for memory allocation error.
1141 0 : set->addAll(s);
1142 : }
1143 0 : return set;
1144 : }
1145 :
1146 : /**
1147 : * Retain only the elements in this set that are contained in the
1148 : * specified range. If <code>end > start</code> then an empty range is
1149 : * retained, leaving the set empty.
1150 : *
1151 : * @param start first character, inclusive, of range to be retained
1152 : * to this set.
1153 : * @param end last character, inclusive, of range to be retained
1154 : * to this set.
1155 : */
1156 0 : UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1157 0 : if (pinCodePoint(start) <= pinCodePoint(end)) {
1158 0 : UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1159 0 : retain(range, 2, 0);
1160 : } else {
1161 0 : clear();
1162 : }
1163 0 : return *this;
1164 : }
1165 :
1166 0 : UnicodeSet& UnicodeSet::retain(UChar32 c) {
1167 0 : return retain(c, c);
1168 : }
1169 :
1170 : /**
1171 : * Removes the specified range from this set if it is present.
1172 : * The set will not contain the specified range once the call
1173 : * returns. If <code>end > start</code> then an empty range is
1174 : * removed, leaving the set unchanged.
1175 : *
1176 : * @param start first character, inclusive, of range to be removed
1177 : * from this set.
1178 : * @param end last character, inclusive, of range to be removed
1179 : * from this set.
1180 : */
1181 0 : UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1182 0 : if (pinCodePoint(start) <= pinCodePoint(end)) {
1183 0 : UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1184 0 : retain(range, 2, 2);
1185 : }
1186 0 : return *this;
1187 : }
1188 :
1189 : /**
1190 : * Removes the specified character from this set if it is present.
1191 : * The set will not contain the specified range once the call
1192 : * returns.
1193 : */
1194 0 : UnicodeSet& UnicodeSet::remove(UChar32 c) {
1195 0 : return remove(c, c);
1196 : }
1197 :
1198 : /**
1199 : * Removes the specified string from this set if it is present.
1200 : * The set will not contain the specified character once the call
1201 : * returns.
1202 : * @param the source string
1203 : * @return the modified set, for chaining
1204 : */
1205 0 : UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1206 0 : if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1207 0 : int32_t cp = getSingleCP(s);
1208 0 : if (cp < 0) {
1209 0 : strings->removeElement((void*) &s);
1210 0 : releasePattern();
1211 : } else {
1212 0 : remove((UChar32)cp, (UChar32)cp);
1213 : }
1214 0 : return *this;
1215 : }
1216 :
1217 : /**
1218 : * Complements the specified range in this set. Any character in
1219 : * the range will be removed if it is in this set, or will be
1220 : * added if it is not in this set. If <code>end > start</code>
1221 : * then an empty range is xor'ed, leaving the set unchanged.
1222 : *
1223 : * @param start first character, inclusive, of range to be removed
1224 : * from this set.
1225 : * @param end last character, inclusive, of range to be removed
1226 : * from this set.
1227 : */
1228 0 : UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1229 0 : if (isFrozen() || isBogus()) {
1230 0 : return *this;
1231 : }
1232 0 : if (pinCodePoint(start) <= pinCodePoint(end)) {
1233 0 : UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1234 0 : exclusiveOr(range, 2, 0);
1235 : }
1236 0 : releasePattern();
1237 0 : return *this;
1238 : }
1239 :
1240 0 : UnicodeSet& UnicodeSet::complement(UChar32 c) {
1241 0 : return complement(c, c);
1242 : }
1243 :
1244 : /**
1245 : * This is equivalent to
1246 : * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1247 : */
1248 0 : UnicodeSet& UnicodeSet::complement(void) {
1249 0 : if (isFrozen() || isBogus()) {
1250 0 : return *this;
1251 : }
1252 0 : UErrorCode status = U_ZERO_ERROR;
1253 0 : if (list[0] == UNICODESET_LOW) {
1254 0 : ensureBufferCapacity(len-1, status);
1255 0 : if (U_FAILURE(status)) {
1256 0 : return *this;
1257 : }
1258 0 : uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32));
1259 0 : --len;
1260 : } else {
1261 0 : ensureBufferCapacity(len+1, status);
1262 0 : if (U_FAILURE(status)) {
1263 0 : return *this;
1264 : }
1265 0 : uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32));
1266 0 : buffer[0] = UNICODESET_LOW;
1267 0 : ++len;
1268 : }
1269 0 : swapBuffers();
1270 0 : releasePattern();
1271 0 : return *this;
1272 : }
1273 :
1274 : /**
1275 : * Complement the specified string in this set.
1276 : * The set will not contain the specified string once the call
1277 : * returns.
1278 : * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1279 : * @param s the string to complement
1280 : * @return this object, for chaining
1281 : */
1282 0 : UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1283 0 : if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1284 0 : int32_t cp = getSingleCP(s);
1285 0 : if (cp < 0) {
1286 0 : if (strings->contains((void*) &s)) {
1287 0 : strings->removeElement((void*) &s);
1288 : } else {
1289 0 : _add(s);
1290 : }
1291 0 : releasePattern();
1292 : } else {
1293 0 : complement((UChar32)cp, (UChar32)cp);
1294 : }
1295 0 : return *this;
1296 : }
1297 :
1298 : /**
1299 : * Adds all of the elements in the specified set to this set if
1300 : * they're not already present. This operation effectively
1301 : * modifies this set so that its value is the <i>union</i> of the two
1302 : * sets. The behavior of this operation is unspecified if the specified
1303 : * collection is modified while the operation is in progress.
1304 : *
1305 : * @param c set whose elements are to be added to this set.
1306 : * @see #add(char, char)
1307 : */
1308 0 : UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1309 0 : if ( c.len>0 && c.list!=NULL ) {
1310 0 : add(c.list, c.len, 0);
1311 : }
1312 :
1313 : // Add strings in order
1314 0 : if ( c.strings!=NULL ) {
1315 0 : for (int32_t i=0; i<c.strings->size(); ++i) {
1316 0 : const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1317 0 : if (!strings->contains((void*) s)) {
1318 0 : _add(*s);
1319 : }
1320 : }
1321 : }
1322 0 : return *this;
1323 : }
1324 :
1325 : /**
1326 : * Retains only the elements in this set that are contained in the
1327 : * specified set. In other words, removes from this set all of
1328 : * its elements that are not contained in the specified set. This
1329 : * operation effectively modifies this set so that its value is
1330 : * the <i>intersection</i> of the two sets.
1331 : *
1332 : * @param c set that defines which elements this set will retain.
1333 : */
1334 0 : UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1335 0 : if (isFrozen() || isBogus()) {
1336 0 : return *this;
1337 : }
1338 0 : retain(c.list, c.len, 0);
1339 0 : strings->retainAll(*c.strings);
1340 0 : return *this;
1341 : }
1342 :
1343 : /**
1344 : * Removes from this set all of its elements that are contained in the
1345 : * specified set. This operation effectively modifies this
1346 : * set so that its value is the <i>asymmetric set difference</i> of
1347 : * the two sets.
1348 : *
1349 : * @param c set that defines which elements will be removed from
1350 : * this set.
1351 : */
1352 0 : UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1353 0 : if (isFrozen() || isBogus()) {
1354 0 : return *this;
1355 : }
1356 0 : retain(c.list, c.len, 2);
1357 0 : strings->removeAll(*c.strings);
1358 0 : return *this;
1359 : }
1360 :
1361 : /**
1362 : * Complements in this set all elements contained in the specified
1363 : * set. Any character in the other set will be removed if it is
1364 : * in this set, or will be added if it is not in this set.
1365 : *
1366 : * @param c set that defines which elements will be xor'ed from
1367 : * this set.
1368 : */
1369 0 : UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1370 0 : if (isFrozen() || isBogus()) {
1371 0 : return *this;
1372 : }
1373 0 : exclusiveOr(c.list, c.len, 0);
1374 :
1375 0 : for (int32_t i=0; i<c.strings->size(); ++i) {
1376 0 : void* e = c.strings->elementAt(i);
1377 0 : if (!strings->removeElement(e)) {
1378 0 : _add(*(const UnicodeString*)e);
1379 : }
1380 : }
1381 0 : return *this;
1382 : }
1383 :
1384 : /**
1385 : * Removes all of the elements from this set. This set will be
1386 : * empty after this call returns.
1387 : */
1388 0 : UnicodeSet& UnicodeSet::clear(void) {
1389 0 : if (isFrozen()) {
1390 0 : return *this;
1391 : }
1392 0 : if (list != NULL) {
1393 0 : list[0] = UNICODESET_HIGH;
1394 : }
1395 0 : len = 1;
1396 0 : releasePattern();
1397 0 : if (strings != NULL) {
1398 0 : strings->removeAllElements();
1399 : }
1400 0 : if (list != NULL && strings != NULL) {
1401 : // Remove bogus
1402 0 : fFlags = 0;
1403 : }
1404 0 : return *this;
1405 : }
1406 :
1407 : /**
1408 : * Iteration method that returns the number of ranges contained in
1409 : * this set.
1410 : * @see #getRangeStart
1411 : * @see #getRangeEnd
1412 : */
1413 0 : int32_t UnicodeSet::getRangeCount() const {
1414 0 : return len/2;
1415 : }
1416 :
1417 : /**
1418 : * Iteration method that returns the first character in the
1419 : * specified range of this set.
1420 : * @see #getRangeCount
1421 : * @see #getRangeEnd
1422 : */
1423 0 : UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1424 0 : return list[index*2];
1425 : }
1426 :
1427 : /**
1428 : * Iteration method that returns the last character in the
1429 : * specified range of this set.
1430 : * @see #getRangeStart
1431 : * @see #getRangeEnd
1432 : */
1433 0 : UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1434 0 : return list[index*2 + 1] - 1;
1435 : }
1436 :
1437 0 : int32_t UnicodeSet::getStringCount() const {
1438 0 : return strings->size();
1439 : }
1440 :
1441 0 : const UnicodeString* UnicodeSet::getString(int32_t index) const {
1442 0 : return (const UnicodeString*) strings->elementAt(index);
1443 : }
1444 :
1445 : /**
1446 : * Reallocate this objects internal structures to take up the least
1447 : * possible space, without changing this object's value.
1448 : */
1449 0 : UnicodeSet& UnicodeSet::compact() {
1450 0 : if (isFrozen() || isBogus()) {
1451 0 : return *this;
1452 : }
1453 : // Delete buffer first to defragment memory less.
1454 0 : if (buffer != NULL) {
1455 0 : uprv_free(buffer);
1456 0 : buffer = NULL;
1457 : }
1458 0 : if (len < capacity) {
1459 : // Make the capacity equal to len or 1.
1460 : // We don't want to realloc of 0 size.
1461 0 : int32_t newCapacity = len + (len == 0);
1462 0 : UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1463 0 : if (temp) {
1464 0 : list = temp;
1465 0 : capacity = newCapacity;
1466 : }
1467 : // else what the heck happened?! We allocated less memory!
1468 : // Oh well. We'll keep our original array.
1469 : }
1470 0 : return *this;
1471 : }
1472 :
1473 : #ifdef DEBUG_SERIALIZE
1474 : #include <stdio.h>
1475 : #endif
1476 :
1477 : /**
1478 : * Deserialize constructor.
1479 : */
1480 0 : UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
1481 : : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
1482 : bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
1483 0 : fFlags(0) {
1484 :
1485 0 : if(U_FAILURE(ec)) {
1486 0 : setToBogus();
1487 0 : return;
1488 : }
1489 :
1490 0 : if( (serialization != kSerialized)
1491 0 : || (data==NULL)
1492 0 : || (dataLen < 1)) {
1493 0 : ec = U_ILLEGAL_ARGUMENT_ERROR;
1494 0 : setToBogus();
1495 0 : return;
1496 : }
1497 :
1498 0 : allocateStrings(ec);
1499 0 : if (U_FAILURE(ec)) {
1500 0 : setToBogus();
1501 0 : return;
1502 : }
1503 :
1504 : // bmp?
1505 0 : int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1506 0 : int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1507 :
1508 0 : len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1509 : #ifdef DEBUG_SERIALIZE
1510 : printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
1511 : #endif
1512 0 : capacity = len+1;
1513 0 : list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1514 0 : if(!list || U_FAILURE(ec)) {
1515 0 : setToBogus();
1516 0 : return;
1517 : }
1518 : // copy bmp
1519 : int32_t i;
1520 0 : for(i = 0; i< bmpLength;i++) {
1521 0 : list[i] = data[i+headerSize];
1522 : #ifdef DEBUG_SERIALIZE
1523 : printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1524 : #endif
1525 : }
1526 : // copy smp
1527 0 : for(i=bmpLength;i<len;i++) {
1528 0 : list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1529 0 : ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1530 : #ifdef DEBUG_SERIALIZE
1531 : printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1532 : #endif
1533 : }
1534 : // terminator
1535 0 : list[len++]=UNICODESET_HIGH;
1536 : }
1537 :
1538 :
1539 0 : int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1540 : int32_t bmpLength, length, destLength;
1541 :
1542 0 : if (U_FAILURE(ec)) {
1543 0 : return 0;
1544 : }
1545 :
1546 0 : if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1547 0 : ec=U_ILLEGAL_ARGUMENT_ERROR;
1548 0 : return 0;
1549 : }
1550 :
1551 : /* count necessary 16-bit units */
1552 0 : length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1553 : // assert(length>=0);
1554 0 : if (length==0) {
1555 : /* empty set */
1556 0 : if (destCapacity>0) {
1557 0 : *dest=0;
1558 : } else {
1559 0 : ec=U_BUFFER_OVERFLOW_ERROR;
1560 : }
1561 0 : return 1;
1562 : }
1563 : /* now length>0 */
1564 :
1565 0 : if (this->list[length-1]<=0xffff) {
1566 : /* all BMP */
1567 0 : bmpLength=length;
1568 0 : } else if (this->list[0]>=0x10000) {
1569 : /* all supplementary */
1570 0 : bmpLength=0;
1571 0 : length*=2;
1572 : } else {
1573 : /* some BMP, some supplementary */
1574 0 : for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1575 0 : length=bmpLength+2*(length-bmpLength);
1576 : }
1577 : #ifdef DEBUG_SERIALIZE
1578 : printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1579 : #endif
1580 : /* length: number of 16-bit array units */
1581 0 : if (length>0x7fff) {
1582 : /* there are only 15 bits for the length in the first serialized word */
1583 0 : ec=U_INDEX_OUTOFBOUNDS_ERROR;
1584 0 : return 0;
1585 : }
1586 :
1587 : /*
1588 : * total serialized length:
1589 : * number of 16-bit array units (length) +
1590 : * 1 length unit (always) +
1591 : * 1 bmpLength unit (if there are supplementary values)
1592 : */
1593 0 : destLength=length+((length>bmpLength)?2:1);
1594 0 : if (destLength<=destCapacity) {
1595 : const UChar32 *p;
1596 : int32_t i;
1597 :
1598 : #ifdef DEBUG_SERIALIZE
1599 : printf("writeHdr\n");
1600 : #endif
1601 0 : *dest=(uint16_t)length;
1602 0 : if (length>bmpLength) {
1603 0 : *dest|=0x8000;
1604 0 : *++dest=(uint16_t)bmpLength;
1605 : }
1606 0 : ++dest;
1607 :
1608 : /* write the BMP part of the array */
1609 0 : p=this->list;
1610 0 : for (i=0; i<bmpLength; ++i) {
1611 : #ifdef DEBUG_SERIALIZE
1612 : printf("writebmp: %x\n", (int)*p);
1613 : #endif
1614 0 : *dest++=(uint16_t)*p++;
1615 : }
1616 :
1617 : /* write the supplementary part of the array */
1618 0 : for (; i<length; i+=2) {
1619 : #ifdef DEBUG_SERIALIZE
1620 : printf("write32: %x\n", (int)*p);
1621 : #endif
1622 0 : *dest++=(uint16_t)(*p>>16);
1623 0 : *dest++=(uint16_t)*p++;
1624 : }
1625 : } else {
1626 0 : ec=U_BUFFER_OVERFLOW_ERROR;
1627 : }
1628 0 : return destLength;
1629 : }
1630 :
1631 : //----------------------------------------------------------------
1632 : // Implementation: Utility methods
1633 : //----------------------------------------------------------------
1634 :
1635 : /**
1636 : * Allocate our strings vector and return TRUE if successful.
1637 : */
1638 0 : UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1639 0 : if (U_FAILURE(status)) {
1640 0 : return FALSE;
1641 : }
1642 0 : strings = new UVector(uprv_deleteUObject,
1643 0 : uhash_compareUnicodeString, 1, status);
1644 0 : if (strings == NULL) { // Check for memory allocation error.
1645 0 : status = U_MEMORY_ALLOCATION_ERROR;
1646 0 : return FALSE;
1647 : }
1648 0 : if (U_FAILURE(status)) {
1649 0 : delete strings;
1650 0 : strings = NULL;
1651 0 : return FALSE;
1652 : }
1653 0 : return TRUE;
1654 : }
1655 :
1656 0 : void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1657 0 : if (newLen <= capacity)
1658 0 : return;
1659 0 : UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1660 0 : if (temp == NULL) {
1661 0 : ec = U_MEMORY_ALLOCATION_ERROR;
1662 0 : setToBogus();
1663 0 : return;
1664 : }
1665 0 : list = temp;
1666 0 : capacity = newLen + GROW_EXTRA;
1667 : // else we keep the original contents on the memory failure.
1668 : }
1669 :
1670 0 : void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1671 0 : if (buffer != NULL && newLen <= bufferCapacity)
1672 0 : return;
1673 0 : UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1674 0 : if (temp == NULL) {
1675 0 : ec = U_MEMORY_ALLOCATION_ERROR;
1676 0 : setToBogus();
1677 0 : return;
1678 : }
1679 0 : buffer = temp;
1680 0 : bufferCapacity = newLen + GROW_EXTRA;
1681 : // else we keep the original contents on the memory failure.
1682 : }
1683 :
1684 : /**
1685 : * Swap list and buffer.
1686 : */
1687 0 : void UnicodeSet::swapBuffers(void) {
1688 : // swap list and buffer
1689 0 : UChar32* temp = list;
1690 0 : list = buffer;
1691 0 : buffer = temp;
1692 :
1693 0 : int32_t c = capacity;
1694 0 : capacity = bufferCapacity;
1695 0 : bufferCapacity = c;
1696 0 : }
1697 :
1698 0 : void UnicodeSet::setToBogus() {
1699 0 : clear(); // Remove everything in the set.
1700 0 : fFlags = kIsBogus;
1701 0 : }
1702 :
1703 : //----------------------------------------------------------------
1704 : // Implementation: Fundamental operators
1705 : //----------------------------------------------------------------
1706 :
1707 0 : static inline UChar32 max(UChar32 a, UChar32 b) {
1708 0 : return (a > b) ? a : b;
1709 : }
1710 :
1711 : // polarity = 0, 3 is normal: x xor y
1712 : // polarity = 1, 2: x xor ~y == x === y
1713 :
1714 0 : void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1715 0 : if (isFrozen() || isBogus()) {
1716 0 : return;
1717 : }
1718 0 : UErrorCode status = U_ZERO_ERROR;
1719 0 : ensureBufferCapacity(len + otherLen, status);
1720 0 : if (U_FAILURE(status)) {
1721 0 : return;
1722 : }
1723 :
1724 0 : int32_t i = 0, j = 0, k = 0;
1725 0 : UChar32 a = list[i++];
1726 : UChar32 b;
1727 0 : if (polarity == 1 || polarity == 2) {
1728 0 : b = UNICODESET_LOW;
1729 0 : if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1730 0 : ++j;
1731 0 : b = other[j];
1732 : }
1733 : } else {
1734 0 : b = other[j++];
1735 : }
1736 : // simplest of all the routines
1737 : // sort the values, discarding identicals!
1738 : for (;;) {
1739 0 : if (a < b) {
1740 0 : buffer[k++] = a;
1741 0 : a = list[i++];
1742 0 : } else if (b < a) {
1743 0 : buffer[k++] = b;
1744 0 : b = other[j++];
1745 0 : } else if (a != UNICODESET_HIGH) { // at this point, a == b
1746 : // discard both values!
1747 0 : a = list[i++];
1748 0 : b = other[j++];
1749 : } else { // DONE!
1750 0 : buffer[k++] = UNICODESET_HIGH;
1751 0 : len = k;
1752 0 : break;
1753 : }
1754 : }
1755 0 : swapBuffers();
1756 0 : releasePattern();
1757 : }
1758 :
1759 : // polarity = 0 is normal: x union y
1760 : // polarity = 2: x union ~y
1761 : // polarity = 1: ~x union y
1762 : // polarity = 3: ~x union ~y
1763 :
1764 0 : void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1765 0 : if (isFrozen() || isBogus() || other==NULL) {
1766 0 : return;
1767 : }
1768 0 : UErrorCode status = U_ZERO_ERROR;
1769 0 : ensureBufferCapacity(len + otherLen, status);
1770 0 : if (U_FAILURE(status)) {
1771 0 : return;
1772 : }
1773 :
1774 0 : int32_t i = 0, j = 0, k = 0;
1775 0 : UChar32 a = list[i++];
1776 0 : UChar32 b = other[j++];
1777 : // change from xor is that we have to check overlapping pairs
1778 : // polarity bit 1 means a is second, bit 2 means b is.
1779 : for (;;) {
1780 0 : switch (polarity) {
1781 : case 0: // both first; take lower if unequal
1782 0 : if (a < b) { // take a
1783 : // Back up over overlapping ranges in buffer[]
1784 0 : if (k > 0 && a <= buffer[k-1]) {
1785 : // Pick latter end value in buffer[] vs. list[]
1786 0 : a = max(list[i], buffer[--k]);
1787 : } else {
1788 : // No overlap
1789 0 : buffer[k++] = a;
1790 0 : a = list[i];
1791 : }
1792 0 : i++; // Common if/else code factored out
1793 0 : polarity ^= 1;
1794 0 : } else if (b < a) { // take b
1795 0 : if (k > 0 && b <= buffer[k-1]) {
1796 0 : b = max(other[j], buffer[--k]);
1797 : } else {
1798 0 : buffer[k++] = b;
1799 0 : b = other[j];
1800 : }
1801 0 : j++;
1802 0 : polarity ^= 2;
1803 : } else { // a == b, take a, drop b
1804 0 : if (a == UNICODESET_HIGH) goto loop_end;
1805 : // This is symmetrical; it doesn't matter if
1806 : // we backtrack with a or b. - liu
1807 0 : if (k > 0 && a <= buffer[k-1]) {
1808 0 : a = max(list[i], buffer[--k]);
1809 : } else {
1810 : // No overlap
1811 0 : buffer[k++] = a;
1812 0 : a = list[i];
1813 : }
1814 0 : i++;
1815 0 : polarity ^= 1;
1816 0 : b = other[j++];
1817 0 : polarity ^= 2;
1818 : }
1819 0 : break;
1820 : case 3: // both second; take higher if unequal, and drop other
1821 0 : if (b <= a) { // take a
1822 0 : if (a == UNICODESET_HIGH) goto loop_end;
1823 0 : buffer[k++] = a;
1824 : } else { // take b
1825 0 : if (b == UNICODESET_HIGH) goto loop_end;
1826 0 : buffer[k++] = b;
1827 : }
1828 0 : a = list[i++];
1829 0 : polarity ^= 1; // factored common code
1830 0 : b = other[j++];
1831 0 : polarity ^= 2;
1832 0 : break;
1833 : case 1: // a second, b first; if b < a, overlap
1834 0 : if (a < b) { // no overlap, take a
1835 0 : buffer[k++] = a; a = list[i++]; polarity ^= 1;
1836 0 : } else if (b < a) { // OVERLAP, drop b
1837 0 : b = other[j++];
1838 0 : polarity ^= 2;
1839 : } else { // a == b, drop both!
1840 0 : if (a == UNICODESET_HIGH) goto loop_end;
1841 0 : a = list[i++];
1842 0 : polarity ^= 1;
1843 0 : b = other[j++];
1844 0 : polarity ^= 2;
1845 : }
1846 0 : break;
1847 : case 2: // a first, b second; if a < b, overlap
1848 0 : if (b < a) { // no overlap, take b
1849 0 : buffer[k++] = b;
1850 0 : b = other[j++];
1851 0 : polarity ^= 2;
1852 0 : } else if (a < b) { // OVERLAP, drop a
1853 0 : a = list[i++];
1854 0 : polarity ^= 1;
1855 : } else { // a == b, drop both!
1856 0 : if (a == UNICODESET_HIGH) goto loop_end;
1857 0 : a = list[i++];
1858 0 : polarity ^= 1;
1859 0 : b = other[j++];
1860 0 : polarity ^= 2;
1861 : }
1862 0 : break;
1863 : }
1864 : }
1865 : loop_end:
1866 0 : buffer[k++] = UNICODESET_HIGH; // terminate
1867 0 : len = k;
1868 0 : swapBuffers();
1869 0 : releasePattern();
1870 : }
1871 :
1872 : // polarity = 0 is normal: x intersect y
1873 : // polarity = 2: x intersect ~y == set-minus
1874 : // polarity = 1: ~x intersect y
1875 : // polarity = 3: ~x intersect ~y
1876 :
1877 0 : void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1878 0 : if (isFrozen() || isBogus()) {
1879 0 : return;
1880 : }
1881 0 : UErrorCode status = U_ZERO_ERROR;
1882 0 : ensureBufferCapacity(len + otherLen, status);
1883 0 : if (U_FAILURE(status)) {
1884 0 : return;
1885 : }
1886 :
1887 0 : int32_t i = 0, j = 0, k = 0;
1888 0 : UChar32 a = list[i++];
1889 0 : UChar32 b = other[j++];
1890 : // change from xor is that we have to check overlapping pairs
1891 : // polarity bit 1 means a is second, bit 2 means b is.
1892 : for (;;) {
1893 0 : switch (polarity) {
1894 : case 0: // both first; drop the smaller
1895 0 : if (a < b) { // drop a
1896 0 : a = list[i++];
1897 0 : polarity ^= 1;
1898 0 : } else if (b < a) { // drop b
1899 0 : b = other[j++];
1900 0 : polarity ^= 2;
1901 : } else { // a == b, take one, drop other
1902 0 : if (a == UNICODESET_HIGH) goto loop_end;
1903 0 : buffer[k++] = a;
1904 0 : a = list[i++];
1905 0 : polarity ^= 1;
1906 0 : b = other[j++];
1907 0 : polarity ^= 2;
1908 : }
1909 0 : break;
1910 : case 3: // both second; take lower if unequal
1911 0 : if (a < b) { // take a
1912 0 : buffer[k++] = a;
1913 0 : a = list[i++];
1914 0 : polarity ^= 1;
1915 0 : } else if (b < a) { // take b
1916 0 : buffer[k++] = b;
1917 0 : b = other[j++];
1918 0 : polarity ^= 2;
1919 : } else { // a == b, take one, drop other
1920 0 : if (a == UNICODESET_HIGH) goto loop_end;
1921 0 : buffer[k++] = a;
1922 0 : a = list[i++];
1923 0 : polarity ^= 1;
1924 0 : b = other[j++];
1925 0 : polarity ^= 2;
1926 : }
1927 0 : break;
1928 : case 1: // a second, b first;
1929 0 : if (a < b) { // NO OVERLAP, drop a
1930 0 : a = list[i++];
1931 0 : polarity ^= 1;
1932 0 : } else if (b < a) { // OVERLAP, take b
1933 0 : buffer[k++] = b;
1934 0 : b = other[j++];
1935 0 : polarity ^= 2;
1936 : } else { // a == b, drop both!
1937 0 : if (a == UNICODESET_HIGH) goto loop_end;
1938 0 : a = list[i++];
1939 0 : polarity ^= 1;
1940 0 : b = other[j++];
1941 0 : polarity ^= 2;
1942 : }
1943 0 : break;
1944 : case 2: // a first, b second; if a < b, overlap
1945 0 : if (b < a) { // no overlap, drop b
1946 0 : b = other[j++];
1947 0 : polarity ^= 2;
1948 0 : } else if (a < b) { // OVERLAP, take a
1949 0 : buffer[k++] = a;
1950 0 : a = list[i++];
1951 0 : polarity ^= 1;
1952 : } else { // a == b, drop both!
1953 0 : if (a == UNICODESET_HIGH) goto loop_end;
1954 0 : a = list[i++];
1955 0 : polarity ^= 1;
1956 0 : b = other[j++];
1957 0 : polarity ^= 2;
1958 : }
1959 0 : break;
1960 : }
1961 : }
1962 : loop_end:
1963 0 : buffer[k++] = UNICODESET_HIGH; // terminate
1964 0 : len = k;
1965 0 : swapBuffers();
1966 0 : releasePattern();
1967 : }
1968 :
1969 : /**
1970 : * Append the <code>toPattern()</code> representation of a
1971 : * string to the given <code>StringBuffer</code>.
1972 : */
1973 0 : void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1974 : escapeUnprintable) {
1975 : UChar32 cp;
1976 0 : for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1977 0 : _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1978 : }
1979 0 : }
1980 :
1981 : /**
1982 : * Append the <code>toPattern()</code> representation of a
1983 : * character to the given <code>StringBuffer</code>.
1984 : */
1985 0 : void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1986 : escapeUnprintable) {
1987 0 : if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1988 : // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1989 : // unprintable
1990 0 : if (ICU_Utility::escapeUnprintable(buf, c)) {
1991 0 : return;
1992 : }
1993 : }
1994 : // Okay to let ':' pass through
1995 0 : switch (c) {
1996 : case SET_OPEN:
1997 : case SET_CLOSE:
1998 : case HYPHEN:
1999 : case COMPLEMENT:
2000 : case INTERSECTION:
2001 : case BACKSLASH:
2002 : case OPEN_BRACE:
2003 : case CLOSE_BRACE:
2004 : case COLON:
2005 : case SymbolTable::SYMBOL_REF:
2006 0 : buf.append(BACKSLASH);
2007 0 : break;
2008 : default:
2009 : // Escape whitespace
2010 0 : if (PatternProps::isWhiteSpace(c)) {
2011 0 : buf.append(BACKSLASH);
2012 : }
2013 0 : break;
2014 : }
2015 0 : buf.append(c);
2016 : }
2017 :
2018 : /**
2019 : * Append a string representation of this set to result. This will be
2020 : * a cleaned version of the string passed to applyPattern(), if there
2021 : * is one. Otherwise it will be generated.
2022 : */
2023 0 : UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2024 : UBool escapeUnprintable) const
2025 : {
2026 0 : if (pat != NULL) {
2027 : int32_t i;
2028 0 : int32_t backslashCount = 0;
2029 0 : for (i=0; i<patLen; ) {
2030 : UChar32 c;
2031 0 : U16_NEXT(pat, i, patLen, c);
2032 0 : if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2033 : // If the unprintable character is preceded by an odd
2034 : // number of backslashes, then it has been escaped.
2035 : // Before unescaping it, we delete the final
2036 : // backslash.
2037 0 : if ((backslashCount % 2) == 1) {
2038 0 : result.truncate(result.length() - 1);
2039 : }
2040 0 : ICU_Utility::escapeUnprintable(result, c);
2041 0 : backslashCount = 0;
2042 : } else {
2043 0 : result.append(c);
2044 0 : if (c == BACKSLASH) {
2045 0 : ++backslashCount;
2046 : } else {
2047 0 : backslashCount = 0;
2048 : }
2049 : }
2050 : }
2051 0 : return result;
2052 : }
2053 :
2054 0 : return _generatePattern(result, escapeUnprintable);
2055 : }
2056 :
2057 : /**
2058 : * Returns a string representation of this set. If the result of
2059 : * calling this function is passed to a UnicodeSet constructor, it
2060 : * will produce another set that is equal to this one.
2061 : */
2062 0 : UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2063 : UBool escapeUnprintable) const
2064 : {
2065 0 : result.truncate(0);
2066 0 : return _toPattern(result, escapeUnprintable);
2067 : }
2068 :
2069 : /**
2070 : * Generate and append a string representation of this set to result.
2071 : * This does not use this.pat, the cleaned up copy of the string
2072 : * passed to applyPattern().
2073 : */
2074 0 : UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2075 : UBool escapeUnprintable) const
2076 : {
2077 0 : result.append(SET_OPEN);
2078 :
2079 : // // Check against the predefined categories. We implicitly build
2080 : // // up ALL category sets the first time toPattern() is called.
2081 : // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2082 : // if (*this == getCategorySet(cat)) {
2083 : // result.append(COLON);
2084 : // result.append(CATEGORY_NAMES, cat*2, 2);
2085 : // return result.append(CATEGORY_CLOSE);
2086 : // }
2087 : // }
2088 :
2089 0 : int32_t count = getRangeCount();
2090 :
2091 : // If the set contains at least 2 intervals and includes both
2092 : // MIN_VALUE and MAX_VALUE, then the inverse representation will
2093 : // be more economical.
2094 0 : if (count > 1 &&
2095 0 : getRangeStart(0) == MIN_VALUE &&
2096 0 : getRangeEnd(count-1) == MAX_VALUE) {
2097 :
2098 : // Emit the inverse
2099 0 : result.append(COMPLEMENT);
2100 :
2101 0 : for (int32_t i = 1; i < count; ++i) {
2102 0 : UChar32 start = getRangeEnd(i-1)+1;
2103 0 : UChar32 end = getRangeStart(i)-1;
2104 0 : _appendToPat(result, start, escapeUnprintable);
2105 0 : if (start != end) {
2106 0 : if ((start+1) != end) {
2107 0 : result.append(HYPHEN);
2108 : }
2109 0 : _appendToPat(result, end, escapeUnprintable);
2110 : }
2111 : }
2112 : }
2113 :
2114 : // Default; emit the ranges as pairs
2115 : else {
2116 0 : for (int32_t i = 0; i < count; ++i) {
2117 0 : UChar32 start = getRangeStart(i);
2118 0 : UChar32 end = getRangeEnd(i);
2119 0 : _appendToPat(result, start, escapeUnprintable);
2120 0 : if (start != end) {
2121 0 : if ((start+1) != end) {
2122 0 : result.append(HYPHEN);
2123 : }
2124 0 : _appendToPat(result, end, escapeUnprintable);
2125 : }
2126 : }
2127 : }
2128 :
2129 0 : for (int32_t i = 0; i<strings->size(); ++i) {
2130 0 : result.append(OPEN_BRACE);
2131 0 : _appendToPat(result,
2132 0 : *(const UnicodeString*) strings->elementAt(i),
2133 0 : escapeUnprintable);
2134 0 : result.append(CLOSE_BRACE);
2135 : }
2136 0 : return result.append(SET_CLOSE);
2137 : }
2138 :
2139 : /**
2140 : * Release existing cached pattern
2141 : */
2142 0 : void UnicodeSet::releasePattern() {
2143 0 : if (pat) {
2144 0 : uprv_free(pat);
2145 0 : pat = NULL;
2146 0 : patLen = 0;
2147 : }
2148 0 : }
2149 :
2150 : /**
2151 : * Set the new pattern to cache.
2152 : */
2153 0 : void UnicodeSet::setPattern(const UnicodeString& newPat) {
2154 0 : releasePattern();
2155 0 : int32_t newPatLen = newPat.length();
2156 0 : pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2157 0 : if (pat) {
2158 0 : patLen = newPatLen;
2159 0 : newPat.extractBetween(0, patLen, pat);
2160 0 : pat[patLen] = 0;
2161 : }
2162 : // else we don't care if malloc failed. This was just a nice cache.
2163 : // We can regenerate an equivalent pattern later when requested.
2164 0 : }
2165 :
2166 0 : UnicodeFunctor *UnicodeSet::freeze() {
2167 0 : if(!isFrozen() && !isBogus()) {
2168 : // Do most of what compact() does before freezing because
2169 : // compact() will not work when the set is frozen.
2170 : // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2171 :
2172 : // Delete buffer first to defragment memory less.
2173 0 : if (buffer != NULL) {
2174 0 : uprv_free(buffer);
2175 0 : buffer = NULL;
2176 : }
2177 0 : if (capacity > (len + GROW_EXTRA)) {
2178 : // Make the capacity equal to len or 1.
2179 : // We don't want to realloc of 0 size.
2180 0 : capacity = len + (len == 0);
2181 0 : list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2182 0 : if (list == NULL) { // Check for memory allocation error.
2183 0 : setToBogus();
2184 0 : return this;
2185 : }
2186 : }
2187 :
2188 : // Optimize contains() and span() and similar functions.
2189 0 : if (!strings->isEmpty()) {
2190 0 : stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2191 0 : if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2192 : // All strings are irrelevant for span() etc. because
2193 : // all of each string's code points are contained in this set.
2194 : // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2195 : // many relevant strings as UTF-16.
2196 : // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2197 0 : delete stringSpan;
2198 0 : stringSpan = NULL;
2199 : }
2200 : }
2201 0 : if (stringSpan == NULL) {
2202 : // No span-relevant strings: Optimize for code point spans.
2203 0 : bmpSet=new BMPSet(list, len);
2204 0 : if (bmpSet == NULL) { // Check for memory allocation error.
2205 0 : setToBogus();
2206 : }
2207 : }
2208 : }
2209 0 : return this;
2210 : }
2211 :
2212 0 : int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2213 0 : if(length>0 && bmpSet!=NULL) {
2214 0 : return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2215 : }
2216 0 : if(length<0) {
2217 0 : length=u_strlen(s);
2218 : }
2219 0 : if(length==0) {
2220 0 : return 0;
2221 : }
2222 0 : if(stringSpan!=NULL) {
2223 0 : return stringSpan->span(s, length, spanCondition);
2224 0 : } else if(!strings->isEmpty()) {
2225 0 : uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2226 : UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2227 0 : UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2228 0 : UnicodeSetStringSpan strSpan(*this, *strings, which);
2229 0 : if(strSpan.needsStringSpanUTF16()) {
2230 0 : return strSpan.span(s, length, spanCondition);
2231 : }
2232 : }
2233 :
2234 0 : if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2235 0 : spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2236 : }
2237 :
2238 : UChar32 c;
2239 0 : int32_t start=0, prev=0;
2240 0 : do {
2241 0 : U16_NEXT(s, start, length, c);
2242 0 : if(spanCondition!=contains(c)) {
2243 0 : break;
2244 : }
2245 0 : } while((prev=start)<length);
2246 0 : return prev;
2247 : }
2248 :
2249 0 : int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2250 0 : if(length>0 && bmpSet!=NULL) {
2251 0 : return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2252 : }
2253 0 : if(length<0) {
2254 0 : length=u_strlen(s);
2255 : }
2256 0 : if(length==0) {
2257 0 : return 0;
2258 : }
2259 0 : if(stringSpan!=NULL) {
2260 0 : return stringSpan->spanBack(s, length, spanCondition);
2261 0 : } else if(!strings->isEmpty()) {
2262 0 : uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2263 : UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2264 0 : UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2265 0 : UnicodeSetStringSpan strSpan(*this, *strings, which);
2266 0 : if(strSpan.needsStringSpanUTF16()) {
2267 0 : return strSpan.spanBack(s, length, spanCondition);
2268 : }
2269 : }
2270 :
2271 0 : if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2272 0 : spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2273 : }
2274 :
2275 : UChar32 c;
2276 0 : int32_t prev=length;
2277 0 : do {
2278 0 : U16_PREV(s, 0, length, c);
2279 0 : if(spanCondition!=contains(c)) {
2280 0 : break;
2281 : }
2282 0 : } while((prev=length)>0);
2283 0 : return prev;
2284 : }
2285 :
2286 0 : int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2287 0 : if(length>0 && bmpSet!=NULL) {
2288 0 : const uint8_t *s0=(const uint8_t *)s;
2289 0 : return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2290 : }
2291 0 : if(length<0) {
2292 0 : length=(int32_t)uprv_strlen(s);
2293 : }
2294 0 : if(length==0) {
2295 0 : return 0;
2296 : }
2297 0 : if(stringSpan!=NULL) {
2298 0 : return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2299 0 : } else if(!strings->isEmpty()) {
2300 0 : uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2301 : UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2302 0 : UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2303 0 : UnicodeSetStringSpan strSpan(*this, *strings, which);
2304 0 : if(strSpan.needsStringSpanUTF8()) {
2305 0 : return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2306 : }
2307 : }
2308 :
2309 0 : if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2310 0 : spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2311 : }
2312 :
2313 : UChar32 c;
2314 0 : int32_t start=0, prev=0;
2315 0 : do {
2316 0 : U8_NEXT_OR_FFFD(s, start, length, c);
2317 0 : if(spanCondition!=contains(c)) {
2318 0 : break;
2319 : }
2320 0 : } while((prev=start)<length);
2321 0 : return prev;
2322 : }
2323 :
2324 0 : int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2325 0 : if(length>0 && bmpSet!=NULL) {
2326 0 : const uint8_t *s0=(const uint8_t *)s;
2327 0 : return bmpSet->spanBackUTF8(s0, length, spanCondition);
2328 : }
2329 0 : if(length<0) {
2330 0 : length=(int32_t)uprv_strlen(s);
2331 : }
2332 0 : if(length==0) {
2333 0 : return 0;
2334 : }
2335 0 : if(stringSpan!=NULL) {
2336 0 : return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337 0 : } else if(!strings->isEmpty()) {
2338 0 : uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2339 : UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2340 0 : UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2341 0 : UnicodeSetStringSpan strSpan(*this, *strings, which);
2342 0 : if(strSpan.needsStringSpanUTF8()) {
2343 0 : return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2344 : }
2345 : }
2346 :
2347 0 : if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2348 0 : spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2349 : }
2350 :
2351 : UChar32 c;
2352 0 : int32_t prev=length;
2353 0 : do {
2354 0 : U8_PREV_OR_FFFD(s, 0, length, c);
2355 0 : if(spanCondition!=contains(c)) {
2356 0 : break;
2357 : }
2358 0 : } while((prev=length)>0);
2359 0 : return prev;
2360 : }
2361 :
2362 : U_NAMESPACE_END
|