Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 2007-2012, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : * file name: unisetspan.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2007mar01
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #include "unicode/utypes.h"
20 : #include "unicode/uniset.h"
21 : #include "unicode/ustring.h"
22 : #include "unicode/utf8.h"
23 : #include "unicode/utf16.h"
24 : #include "cmemory.h"
25 : #include "uvector.h"
26 : #include "unisetspan.h"
27 :
28 : U_NAMESPACE_BEGIN
29 :
30 : /*
31 : * List of offsets from the current position from where to try matching
32 : * a code point or a string.
33 : * Store offsets rather than indexes to simplify the code and use the same list
34 : * for both increments (in span()) and decrements (in spanBack()).
35 : *
36 : * Assumption: The maximum offset is limited, and the offsets that are stored
37 : * at any one time are relatively dense, that is, there are normally no gaps of
38 : * hundreds or thousands of offset values.
39 : *
40 : * The implementation uses a circular buffer of byte flags,
41 : * each indicating whether the corresponding offset is in the list.
42 : * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43 : * physically moving part of the list.
44 : *
45 : * Note: In principle, the caller should setMaxLength() to the maximum of the
46 : * max string length and U16_LENGTH/U8_LENGTH to account for
47 : * "long" single code points.
48 : * However, this implementation uses at least a staticList with more than
49 : * U8_LENGTH entries anyway.
50 : *
51 : * Note: If maxLength were guaranteed to be no more than 32 or 64,
52 : * the list could be stored as bit flags in a single integer.
53 : * Rather than handling a circular buffer with a start list index,
54 : * the integer would simply be shifted when lower offsets are removed.
55 : * UnicodeSet does not have a limit on the lengths of strings.
56 : */
57 : class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
58 : public:
59 0 : OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60 :
61 0 : ~OffsetList() {
62 0 : if(list!=staticList) {
63 0 : uprv_free(list);
64 : }
65 0 : }
66 :
67 : // Call exactly once if the list is to be used.
68 0 : void setMaxLength(int32_t maxLength) {
69 0 : if(maxLength<=(int32_t)sizeof(staticList)) {
70 0 : capacity=(int32_t)sizeof(staticList);
71 : } else {
72 0 : UBool *l=(UBool *)uprv_malloc(maxLength);
73 0 : if(l!=NULL) {
74 0 : list=l;
75 0 : capacity=maxLength;
76 : }
77 : }
78 0 : uprv_memset(list, 0, capacity);
79 0 : }
80 :
81 : void clear() {
82 : uprv_memset(list, 0, capacity);
83 : start=length=0;
84 : }
85 :
86 0 : UBool isEmpty() const {
87 0 : return (UBool)(length==0);
88 : }
89 :
90 : // Reduce all stored offsets by delta, used when the current position
91 : // moves by delta.
92 : // There must not be any offsets lower than delta.
93 : // If there is an offset equal to delta, it is removed.
94 : // delta=[1..maxLength]
95 0 : void shift(int32_t delta) {
96 0 : int32_t i=start+delta;
97 0 : if(i>=capacity) {
98 0 : i-=capacity;
99 : }
100 0 : if(list[i]) {
101 0 : list[i]=FALSE;
102 0 : --length;
103 : }
104 0 : start=i;
105 0 : }
106 :
107 : // Add an offset. The list must not contain it yet.
108 : // offset=[1..maxLength]
109 0 : void addOffset(int32_t offset) {
110 0 : int32_t i=start+offset;
111 0 : if(i>=capacity) {
112 0 : i-=capacity;
113 : }
114 0 : list[i]=TRUE;
115 0 : ++length;
116 0 : }
117 :
118 : // offset=[1..maxLength]
119 0 : UBool containsOffset(int32_t offset) const {
120 0 : int32_t i=start+offset;
121 0 : if(i>=capacity) {
122 0 : i-=capacity;
123 : }
124 0 : return list[i];
125 : }
126 :
127 : // Find the lowest stored offset from a non-empty list, remove it,
128 : // and reduce all other offsets by this minimum.
129 : // Returns [1..maxLength].
130 0 : int32_t popMinimum() {
131 : // Look for the next offset in list[start+1..capacity-1].
132 0 : int32_t i=start, result;
133 0 : while(++i<capacity) {
134 0 : if(list[i]) {
135 0 : list[i]=FALSE;
136 0 : --length;
137 0 : result=i-start;
138 0 : start=i;
139 0 : return result;
140 : }
141 : }
142 : // i==capacity
143 :
144 : // Wrap around and look for the next offset in list[0..start].
145 : // Since the list is not empty, there will be one.
146 0 : result=capacity-start;
147 0 : i=0;
148 0 : while(!list[i]) {
149 0 : ++i;
150 : }
151 0 : list[i]=FALSE;
152 0 : --length;
153 0 : start=i;
154 0 : return result+=i;
155 : }
156 :
157 : private:
158 : UBool *list;
159 : int32_t capacity;
160 : int32_t length;
161 : int32_t start;
162 :
163 : UBool staticList[16];
164 : };
165 :
166 : // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167 : static int32_t
168 0 : getUTF8Length(const UChar *s, int32_t length) {
169 0 : UErrorCode errorCode=U_ZERO_ERROR;
170 0 : int32_t length8=0;
171 0 : u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
172 0 : if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173 0 : return length8;
174 : } else {
175 : // The string contains an unpaired surrogate.
176 : // Ignore this string.
177 0 : return 0;
178 : }
179 : }
180 :
181 : // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182 : static int32_t
183 0 : appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184 0 : UErrorCode errorCode=U_ZERO_ERROR;
185 0 : int32_t length8=0;
186 0 : u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187 0 : if(U_SUCCESS(errorCode)) {
188 0 : return length8;
189 : } else {
190 : // The string contains an unpaired surrogate.
191 : // Ignore this string.
192 0 : return 0;
193 : }
194 : }
195 :
196 : static inline uint8_t
197 0 : makeSpanLengthByte(int32_t spanLength) {
198 : // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199 0 : return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200 : }
201 :
202 : // Construct for all variants of span(), or only for any one variant.
203 : // Initialize as little as possible, for single use.
204 0 : UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205 : const UVector &setStrings,
206 0 : uint32_t which)
207 : : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
208 : utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
209 : utf8Length(0),
210 : maxLength16(0), maxLength8(0),
211 0 : all((UBool)(which==ALL)) {
212 0 : spanSet.retainAll(set);
213 0 : if(which&NOT_CONTAINED) {
214 : // Default to the same sets.
215 : // addToSpanNotSet() will create a separate set if necessary.
216 0 : pSpanNotSet=&spanSet;
217 : }
218 :
219 : // Determine if the strings even need to be taken into account at all for span() etc.
220 : // If any string is relevant, then all strings need to be used for
221 : // span(longest match) but only the relevant ones for span(while contained).
222 : // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223 : // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224 : // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225 : // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226 0 : int32_t stringsLength=strings.size();
227 :
228 : int32_t i, spanLength;
229 0 : UBool someRelevant=FALSE;
230 0 : for(i=0; i<stringsLength; ++i) {
231 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232 0 : const UChar *s16=string.getBuffer();
233 0 : int32_t length16=string.length();
234 : UBool thisRelevant;
235 0 : spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
236 0 : if(spanLength<length16) { // Relevant string.
237 0 : someRelevant=thisRelevant=TRUE;
238 : } else {
239 0 : thisRelevant=FALSE;
240 : }
241 0 : if((which&UTF16) && length16>maxLength16) {
242 0 : maxLength16=length16;
243 : }
244 0 : if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
245 0 : int32_t length8=getUTF8Length(s16, length16);
246 0 : utf8Length+=length8;
247 0 : if(length8>maxLength8) {
248 0 : maxLength8=length8;
249 : }
250 : }
251 : }
252 0 : if(!someRelevant) {
253 0 : maxLength16=maxLength8=0;
254 0 : return;
255 : }
256 :
257 : // Freeze after checking for the need to use strings at all because freezing
258 : // a set takes some time and memory which are wasted if there are no relevant strings.
259 0 : if(all) {
260 0 : spanSet.freeze();
261 : }
262 :
263 : uint8_t *spanBackLengths;
264 : uint8_t *spanUTF8Lengths;
265 : uint8_t *spanBackUTF8Lengths;
266 :
267 : // Allocate a block of meta data.
268 : int32_t allocSize;
269 0 : if(all) {
270 : // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
271 0 : allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
272 : } else {
273 0 : allocSize=stringsLength; // One set of span lengths.
274 0 : if(which&UTF8) {
275 : // UTF-8 lengths and UTF-8 strings.
276 0 : allocSize+=stringsLength*4+utf8Length;
277 : }
278 : }
279 0 : if(allocSize<=(int32_t)sizeof(staticLengths)) {
280 0 : utf8Lengths=staticLengths;
281 : } else {
282 0 : utf8Lengths=(int32_t *)uprv_malloc(allocSize);
283 0 : if(utf8Lengths==NULL) {
284 0 : maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
285 0 : return; // Out of memory.
286 : }
287 : }
288 :
289 0 : if(all) {
290 : // Store span lengths for all span() variants.
291 0 : spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
292 0 : spanBackLengths=spanLengths+stringsLength;
293 0 : spanUTF8Lengths=spanBackLengths+stringsLength;
294 0 : spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
295 0 : utf8=spanBackUTF8Lengths+stringsLength;
296 : } else {
297 : // Store span lengths for only one span() variant.
298 0 : if(which&UTF8) {
299 0 : spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
300 0 : utf8=spanLengths+stringsLength;
301 : } else {
302 0 : spanLengths=(uint8_t *)utf8Lengths;
303 : }
304 0 : spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
305 : }
306 :
307 : // Set the meta data and pSpanNotSet and write the UTF-8 strings.
308 0 : int32_t utf8Count=0; // Count UTF-8 bytes written so far.
309 :
310 0 : for(i=0; i<stringsLength; ++i) {
311 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
312 0 : const UChar *s16=string.getBuffer();
313 0 : int32_t length16=string.length();
314 0 : spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
315 0 : if(spanLength<length16) { // Relevant string.
316 0 : if(which&UTF16) {
317 0 : if(which&CONTAINED) {
318 0 : if(which&FWD) {
319 0 : spanLengths[i]=makeSpanLengthByte(spanLength);
320 : }
321 0 : if(which&BACK) {
322 0 : spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
323 0 : spanBackLengths[i]=makeSpanLengthByte(spanLength);
324 : }
325 : } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
326 0 : spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
327 : }
328 : }
329 0 : if(which&UTF8) {
330 0 : uint8_t *s8=utf8+utf8Count;
331 0 : int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
332 0 : utf8Count+=utf8Lengths[i]=length8;
333 0 : if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
334 0 : spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
335 : } else { // Relevant for UTF-8.
336 0 : if(which&CONTAINED) {
337 0 : if(which&FWD) {
338 0 : spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339 0 : spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340 : }
341 0 : if(which&BACK) {
342 0 : spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
343 0 : spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
344 : }
345 : } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
346 0 : spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
347 : }
348 : }
349 : }
350 0 : if(which&NOT_CONTAINED) {
351 : // Add string start and end code points to the spanNotSet so that
352 : // a span(while not contained) stops before any string.
353 : UChar32 c;
354 0 : if(which&FWD) {
355 0 : int32_t len=0;
356 0 : U16_NEXT(s16, len, length16, c);
357 0 : addToSpanNotSet(c);
358 : }
359 0 : if(which&BACK) {
360 0 : int32_t len=length16;
361 0 : U16_PREV(s16, 0, len, c);
362 0 : addToSpanNotSet(c);
363 : }
364 : }
365 : } else { // Irrelevant string.
366 0 : if(which&UTF8) {
367 0 : if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
368 0 : uint8_t *s8=utf8+utf8Count;
369 0 : int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
370 0 : utf8Count+=utf8Lengths[i]=length8;
371 : } else {
372 0 : utf8Lengths[i]=0;
373 : }
374 : }
375 0 : if(all) {
376 0 : spanLengths[i]=spanBackLengths[i]=
377 0 : spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
378 : (uint8_t)ALL_CP_CONTAINED;
379 : } else {
380 : // All spanXYZLengths pointers contain the same address.
381 0 : spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
382 : }
383 : }
384 : }
385 :
386 : // Finish.
387 0 : if(all) {
388 0 : pSpanNotSet->freeze();
389 : }
390 : }
391 :
392 : // Copy constructor. Assumes which==ALL for a frozen set.
393 0 : UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
394 0 : const UVector &newParentSetStrings)
395 : : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
396 : utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
397 0 : utf8Length(otherStringSpan.utf8Length),
398 0 : maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
399 0 : all(TRUE) {
400 0 : if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
401 0 : pSpanNotSet=&spanSet;
402 : } else {
403 0 : pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
404 : }
405 :
406 : // Allocate a block of meta data.
407 : // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
408 0 : int32_t stringsLength=strings.size();
409 0 : int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
410 0 : if(allocSize<=(int32_t)sizeof(staticLengths)) {
411 0 : utf8Lengths=staticLengths;
412 : } else {
413 0 : utf8Lengths=(int32_t *)uprv_malloc(allocSize);
414 0 : if(utf8Lengths==NULL) {
415 0 : maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
416 0 : return; // Out of memory.
417 : }
418 : }
419 :
420 0 : spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
421 0 : utf8=spanLengths+stringsLength*4;
422 0 : uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
423 : }
424 :
425 0 : UnicodeSetStringSpan::~UnicodeSetStringSpan() {
426 0 : if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
427 0 : delete pSpanNotSet;
428 : }
429 0 : if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
430 0 : uprv_free(utf8Lengths);
431 : }
432 0 : }
433 :
434 0 : void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
435 0 : if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
436 0 : if(spanSet.contains(c)) {
437 0 : return; // Nothing to do.
438 : }
439 0 : UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
440 0 : if(newSet==NULL) {
441 0 : return; // Out of memory.
442 : } else {
443 0 : pSpanNotSet=newSet;
444 : }
445 : }
446 0 : pSpanNotSet->add(c);
447 : }
448 :
449 : // Compare strings without any argument checks. Requires length>0.
450 : static inline UBool
451 0 : matches16(const UChar *s, const UChar *t, int32_t length) {
452 0 : do {
453 0 : if(*s++!=*t++) {
454 0 : return FALSE;
455 : }
456 : } while(--length>0);
457 0 : return TRUE;
458 : }
459 :
460 : static inline UBool
461 0 : matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
462 0 : do {
463 0 : if(*s++!=*t++) {
464 0 : return FALSE;
465 : }
466 : } while(--length>0);
467 0 : return TRUE;
468 : }
469 :
470 : // Compare 16-bit Unicode strings (which may be malformed UTF-16)
471 : // at code point boundaries.
472 : // That is, each edge of a match must not be in the middle of a surrogate pair.
473 : static inline UBool
474 0 : matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
475 0 : s+=start;
476 0 : limit-=start;
477 0 : return matches16(s, t, length) &&
478 0 : !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
479 0 : !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
480 : }
481 :
482 : // Does the set contain the next code point?
483 : // If so, return its length; otherwise return its negative length.
484 : static inline int32_t
485 0 : spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
486 0 : UChar c=*s, c2;
487 0 : if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
488 0 : return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
489 : }
490 0 : return set.contains(c) ? 1 : -1;
491 : }
492 :
493 : static inline int32_t
494 0 : spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
495 0 : UChar c=s[length-1], c2;
496 0 : if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
497 0 : return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
498 : }
499 0 : return set.contains(c) ? 1 : -1;
500 : }
501 :
502 : static inline int32_t
503 0 : spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
504 0 : UChar32 c=*s;
505 0 : if((int8_t)c>=0) {
506 0 : return set.contains(c) ? 1 : -1;
507 : }
508 : // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
509 0 : int32_t i=0;
510 0 : U8_NEXT_OR_FFFD(s, i, length, c);
511 0 : return set.contains(c) ? i : -i;
512 : }
513 :
514 : static inline int32_t
515 0 : spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
516 0 : UChar32 c=s[length-1];
517 0 : if((int8_t)c>=0) {
518 0 : return set.contains(c) ? 1 : -1;
519 : }
520 0 : int32_t i=length-1;
521 0 : c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
522 0 : length-=i;
523 0 : return set.contains(c) ? length : -length;
524 : }
525 :
526 : /*
527 : * Note: In span() when spanLength==0 (after a string match, or at the beginning
528 : * after an empty code point span) and in spanNot() and spanNotUTF8(),
529 : * string matching could use a binary search
530 : * because all string matches are done from the same start index.
531 : *
532 : * For UTF-8, this would require a comparison function that returns UTF-16 order.
533 : *
534 : * This optimization should not be necessary for normal UnicodeSets because
535 : * most sets have no strings, and most sets with strings have
536 : * very few very short strings.
537 : * For cases with many strings, it might be better to use a different API
538 : * and implementation with a DFA (state machine).
539 : */
540 :
541 : /*
542 : * Algorithm for span(USET_SPAN_CONTAINED)
543 : *
544 : * Theoretical algorithm:
545 : * - Iterate through the string, and at each code point boundary:
546 : * + If the code point there is in the set, then remember to continue after it.
547 : * + If a set string matches at the current position, then remember to continue after it.
548 : * + Either recursively span for each code point or string match,
549 : * or recursively span for all but the shortest one and
550 : * iteratively continue the span with the shortest local match.
551 : * + Remember the longest recursive span (the farthest end point).
552 : * + If there is no match at the current position, neither for the code point there
553 : * nor for any set string, then stop and return the longest recursive span length.
554 : *
555 : * Optimized implementation:
556 : *
557 : * (We assume that most sets will have very few very short strings.
558 : * A span using a string-less set is extremely fast.)
559 : *
560 : * Create and cache a spanSet which contains all of the single code points
561 : * of the original set but none of its strings.
562 : *
563 : * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
564 : * - Loop:
565 : * + Try to match each set string at the end of the spanLength.
566 : * ~ Set strings that start with set-contained code points must be matched
567 : * with a partial overlap because the recursive algorithm would have tried
568 : * to match them at every position.
569 : * ~ Set strings that entirely consist of set-contained code points
570 : * are irrelevant for span(USET_SPAN_CONTAINED) because the
571 : * recursive algorithm would continue after them anyway
572 : * and find the longest recursive match from their end.
573 : * ~ Rather than recursing, note each end point of a set string match.
574 : * + If no set string matched after spanSet.span(), then return
575 : * with where the spanSet.span() ended.
576 : * + If at least one set string matched after spanSet.span(), then
577 : * pop the shortest string match end point and continue
578 : * the loop, trying to match all set strings from there.
579 : * + If at least one more set string matched after a previous string match,
580 : * then test if the code point after the previous string match is also
581 : * contained in the set.
582 : * Continue the loop with the shortest end point of either this code point
583 : * or a matching set string.
584 : * + If no more set string matched after a previous string match,
585 : * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
586 : * Stop if spanLength==0, otherwise continue the loop.
587 : *
588 : * By noting each end point of a set string match,
589 : * the function visits each string position at most once and finishes
590 : * in linear time.
591 : *
592 : * The recursive algorithm may visit the same string position many times
593 : * if multiple paths lead to it and finishes in exponential time.
594 : */
595 :
596 : /*
597 : * Algorithm for span(USET_SPAN_SIMPLE)
598 : *
599 : * Theoretical algorithm:
600 : * - Iterate through the string, and at each code point boundary:
601 : * + If the code point there is in the set, then remember to continue after it.
602 : * + If a set string matches at the current position, then remember to continue after it.
603 : * + Continue from the farthest match position and ignore all others.
604 : * + If there is no match at the current position,
605 : * then stop and return the current position.
606 : *
607 : * Optimized implementation:
608 : *
609 : * (Same assumption and spanSet as above.)
610 : *
611 : * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
612 : * - Loop:
613 : * + Try to match each set string at the end of the spanLength.
614 : * ~ Set strings that start with set-contained code points must be matched
615 : * with a partial overlap because the standard algorithm would have tried
616 : * to match them earlier.
617 : * ~ Set strings that entirely consist of set-contained code points
618 : * must be matched with a full overlap because the longest-match algorithm
619 : * would hide set string matches that end earlier.
620 : * Such set strings need not be matched earlier inside the code point span
621 : * because the standard algorithm would then have continued after
622 : * the set string match anyway.
623 : * ~ Remember the longest set string match (farthest end point) from the earliest
624 : * starting point.
625 : * + If no set string matched after spanSet.span(), then return
626 : * with where the spanSet.span() ended.
627 : * + If at least one set string matched, then continue the loop after the
628 : * longest match from the earliest position.
629 : * + If no more set string matched after a previous string match,
630 : * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
631 : * Stop if spanLength==0, otherwise continue the loop.
632 : */
633 :
634 0 : int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
635 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
636 0 : return spanNot(s, length);
637 : }
638 0 : int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
639 0 : if(spanLength==length) {
640 0 : return length;
641 : }
642 :
643 : // Consider strings; they may overlap with the span.
644 0 : OffsetList offsets;
645 0 : if(spanCondition==USET_SPAN_CONTAINED) {
646 : // Use offset list to try all possibilities.
647 0 : offsets.setMaxLength(maxLength16);
648 : }
649 0 : int32_t pos=spanLength, rest=length-pos;
650 0 : int32_t i, stringsLength=strings.size();
651 : for(;;) {
652 0 : if(spanCondition==USET_SPAN_CONTAINED) {
653 0 : for(i=0; i<stringsLength; ++i) {
654 0 : int32_t overlap=spanLengths[i];
655 0 : if(overlap==ALL_CP_CONTAINED) {
656 0 : continue; // Irrelevant string.
657 : }
658 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
659 0 : const UChar *s16=string.getBuffer();
660 0 : int32_t length16=string.length();
661 :
662 : // Try to match this string at pos-overlap..pos.
663 0 : if(overlap>=LONG_SPAN) {
664 0 : overlap=length16;
665 : // While contained: No point matching fully inside the code point span.
666 0 : U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
667 : }
668 0 : if(overlap>spanLength) {
669 0 : overlap=spanLength;
670 : }
671 0 : int32_t inc=length16-overlap; // Keep overlap+inc==length16.
672 : for(;;) {
673 0 : if(inc>rest) {
674 0 : break;
675 : }
676 : // Try to match if the increment is not listed already.
677 0 : if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
678 0 : if(inc==rest) {
679 0 : return length; // Reached the end of the string.
680 : }
681 0 : offsets.addOffset(inc);
682 : }
683 0 : if(overlap==0) {
684 0 : break;
685 : }
686 0 : --overlap;
687 0 : ++inc;
688 : }
689 : }
690 : } else /* USET_SPAN_SIMPLE */ {
691 0 : int32_t maxInc=0, maxOverlap=0;
692 0 : for(i=0; i<stringsLength; ++i) {
693 0 : int32_t overlap=spanLengths[i];
694 : // For longest match, we do need to try to match even an all-contained string
695 : // to find the match from the earliest start.
696 :
697 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
698 0 : const UChar *s16=string.getBuffer();
699 0 : int32_t length16=string.length();
700 :
701 : // Try to match this string at pos-overlap..pos.
702 0 : if(overlap>=LONG_SPAN) {
703 0 : overlap=length16;
704 : // Longest match: Need to match fully inside the code point span
705 : // to find the match from the earliest start.
706 : }
707 0 : if(overlap>spanLength) {
708 0 : overlap=spanLength;
709 : }
710 0 : int32_t inc=length16-overlap; // Keep overlap+inc==length16.
711 : for(;;) {
712 0 : if(inc>rest || overlap<maxOverlap) {
713 : break;
714 : }
715 : // Try to match if the string is longer or starts earlier.
716 0 : if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
717 0 : matches16CPB(s, pos-overlap, length, s16, length16)
718 : ) {
719 0 : maxInc=inc; // Longest match from earliest start.
720 0 : maxOverlap=overlap;
721 0 : break;
722 : }
723 0 : --overlap;
724 0 : ++inc;
725 : }
726 : }
727 :
728 0 : if(maxInc!=0 || maxOverlap!=0) {
729 : // Longest-match algorithm, and there was a string match.
730 : // Simply continue after it.
731 0 : pos+=maxInc;
732 0 : rest-=maxInc;
733 0 : if(rest==0) {
734 0 : return length; // Reached the end of the string.
735 : }
736 0 : spanLength=0; // Match strings from after a string match.
737 0 : continue;
738 : }
739 : }
740 : // Finished trying to match all strings at pos.
741 :
742 0 : if(spanLength!=0 || pos==0) {
743 : // The position is after an unlimited code point span (spanLength!=0),
744 : // not after a string match.
745 : // The only position where spanLength==0 after a span is pos==0.
746 : // Otherwise, an unlimited code point span is only tried again when no
747 : // strings match, and if such a non-initial span fails we stop.
748 0 : if(offsets.isEmpty()) {
749 0 : return pos; // No strings matched after a span.
750 : }
751 : // Match strings from after the next string match.
752 : } else {
753 : // The position is after a string match (or a single code point).
754 0 : if(offsets.isEmpty()) {
755 : // No more strings matched after a previous string match.
756 : // Try another code point span from after the last string match.
757 0 : spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
758 0 : if( spanLength==rest || // Reached the end of the string, or
759 : spanLength==0 // neither strings nor span progressed.
760 : ) {
761 0 : return pos+spanLength;
762 : }
763 0 : pos+=spanLength;
764 0 : rest-=spanLength;
765 0 : continue; // spanLength>0: Match strings from after a span.
766 : } else {
767 : // Try to match only one code point from after a string match if some
768 : // string matched beyond it, so that we try all possible positions
769 : // and don't overshoot.
770 0 : spanLength=spanOne(spanSet, s+pos, rest);
771 0 : if(spanLength>0) {
772 0 : if(spanLength==rest) {
773 0 : return length; // Reached the end of the string.
774 : }
775 : // Match strings after this code point.
776 : // There cannot be any increments below it because UnicodeSet strings
777 : // contain multiple code points.
778 0 : pos+=spanLength;
779 0 : rest-=spanLength;
780 0 : offsets.shift(spanLength);
781 0 : spanLength=0;
782 0 : continue; // Match strings from after a single code point.
783 : }
784 : // Match strings from after the next string match.
785 : }
786 : }
787 0 : int32_t minOffset=offsets.popMinimum();
788 0 : pos+=minOffset;
789 0 : rest-=minOffset;
790 0 : spanLength=0; // Match strings from after a string match.
791 0 : }
792 : }
793 :
794 0 : int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
795 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
796 0 : return spanNotBack(s, length);
797 : }
798 0 : int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
799 0 : if(pos==0) {
800 0 : return 0;
801 : }
802 0 : int32_t spanLength=length-pos;
803 :
804 : // Consider strings; they may overlap with the span.
805 0 : OffsetList offsets;
806 0 : if(spanCondition==USET_SPAN_CONTAINED) {
807 : // Use offset list to try all possibilities.
808 0 : offsets.setMaxLength(maxLength16);
809 : }
810 0 : int32_t i, stringsLength=strings.size();
811 0 : uint8_t *spanBackLengths=spanLengths;
812 0 : if(all) {
813 0 : spanBackLengths+=stringsLength;
814 : }
815 : for(;;) {
816 0 : if(spanCondition==USET_SPAN_CONTAINED) {
817 0 : for(i=0; i<stringsLength; ++i) {
818 0 : int32_t overlap=spanBackLengths[i];
819 0 : if(overlap==ALL_CP_CONTAINED) {
820 0 : continue; // Irrelevant string.
821 : }
822 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
823 0 : const UChar *s16=string.getBuffer();
824 0 : int32_t length16=string.length();
825 :
826 : // Try to match this string at pos-(length16-overlap)..pos-length16.
827 0 : if(overlap>=LONG_SPAN) {
828 0 : overlap=length16;
829 : // While contained: No point matching fully inside the code point span.
830 0 : int32_t len1=0;
831 0 : U16_FWD_1(s16, len1, overlap);
832 0 : overlap-=len1; // Length of the string minus the first code point.
833 : }
834 0 : if(overlap>spanLength) {
835 0 : overlap=spanLength;
836 : }
837 0 : int32_t dec=length16-overlap; // Keep dec+overlap==length16.
838 : for(;;) {
839 0 : if(dec>pos) {
840 0 : break;
841 : }
842 : // Try to match if the decrement is not listed already.
843 0 : if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
844 0 : if(dec==pos) {
845 0 : return 0; // Reached the start of the string.
846 : }
847 0 : offsets.addOffset(dec);
848 : }
849 0 : if(overlap==0) {
850 0 : break;
851 : }
852 0 : --overlap;
853 0 : ++dec;
854 : }
855 : }
856 : } else /* USET_SPAN_SIMPLE */ {
857 0 : int32_t maxDec=0, maxOverlap=0;
858 0 : for(i=0; i<stringsLength; ++i) {
859 0 : int32_t overlap=spanBackLengths[i];
860 : // For longest match, we do need to try to match even an all-contained string
861 : // to find the match from the latest end.
862 :
863 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
864 0 : const UChar *s16=string.getBuffer();
865 0 : int32_t length16=string.length();
866 :
867 : // Try to match this string at pos-(length16-overlap)..pos-length16.
868 0 : if(overlap>=LONG_SPAN) {
869 0 : overlap=length16;
870 : // Longest match: Need to match fully inside the code point span
871 : // to find the match from the latest end.
872 : }
873 0 : if(overlap>spanLength) {
874 0 : overlap=spanLength;
875 : }
876 0 : int32_t dec=length16-overlap; // Keep dec+overlap==length16.
877 : for(;;) {
878 0 : if(dec>pos || overlap<maxOverlap) {
879 : break;
880 : }
881 : // Try to match if the string is longer or ends later.
882 0 : if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
883 0 : matches16CPB(s, pos-dec, length, s16, length16)
884 : ) {
885 0 : maxDec=dec; // Longest match from latest end.
886 0 : maxOverlap=overlap;
887 0 : break;
888 : }
889 0 : --overlap;
890 0 : ++dec;
891 : }
892 : }
893 :
894 0 : if(maxDec!=0 || maxOverlap!=0) {
895 : // Longest-match algorithm, and there was a string match.
896 : // Simply continue before it.
897 0 : pos-=maxDec;
898 0 : if(pos==0) {
899 0 : return 0; // Reached the start of the string.
900 : }
901 0 : spanLength=0; // Match strings from before a string match.
902 0 : continue;
903 : }
904 : }
905 : // Finished trying to match all strings at pos.
906 :
907 0 : if(spanLength!=0 || pos==length) {
908 : // The position is before an unlimited code point span (spanLength!=0),
909 : // not before a string match.
910 : // The only position where spanLength==0 before a span is pos==length.
911 : // Otherwise, an unlimited code point span is only tried again when no
912 : // strings match, and if such a non-initial span fails we stop.
913 0 : if(offsets.isEmpty()) {
914 0 : return pos; // No strings matched before a span.
915 : }
916 : // Match strings from before the next string match.
917 : } else {
918 : // The position is before a string match (or a single code point).
919 0 : if(offsets.isEmpty()) {
920 : // No more strings matched before a previous string match.
921 : // Try another code point span from before the last string match.
922 0 : int32_t oldPos=pos;
923 0 : pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
924 0 : spanLength=oldPos-pos;
925 0 : if( pos==0 || // Reached the start of the string, or
926 : spanLength==0 // neither strings nor span progressed.
927 : ) {
928 0 : return pos;
929 : }
930 0 : continue; // spanLength>0: Match strings from before a span.
931 : } else {
932 : // Try to match only one code point from before a string match if some
933 : // string matched beyond it, so that we try all possible positions
934 : // and don't overshoot.
935 0 : spanLength=spanOneBack(spanSet, s, pos);
936 0 : if(spanLength>0) {
937 0 : if(spanLength==pos) {
938 0 : return 0; // Reached the start of the string.
939 : }
940 : // Match strings before this code point.
941 : // There cannot be any decrements below it because UnicodeSet strings
942 : // contain multiple code points.
943 0 : pos-=spanLength;
944 0 : offsets.shift(spanLength);
945 0 : spanLength=0;
946 0 : continue; // Match strings from before a single code point.
947 : }
948 : // Match strings from before the next string match.
949 : }
950 : }
951 0 : pos-=offsets.popMinimum();
952 0 : spanLength=0; // Match strings from before a string match.
953 0 : }
954 : }
955 :
956 0 : int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
957 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
958 0 : return spanNotUTF8(s, length);
959 : }
960 0 : int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
961 0 : if(spanLength==length) {
962 0 : return length;
963 : }
964 :
965 : // Consider strings; they may overlap with the span.
966 0 : OffsetList offsets;
967 0 : if(spanCondition==USET_SPAN_CONTAINED) {
968 : // Use offset list to try all possibilities.
969 0 : offsets.setMaxLength(maxLength8);
970 : }
971 0 : int32_t pos=spanLength, rest=length-pos;
972 0 : int32_t i, stringsLength=strings.size();
973 0 : uint8_t *spanUTF8Lengths=spanLengths;
974 0 : if(all) {
975 0 : spanUTF8Lengths+=2*stringsLength;
976 : }
977 : for(;;) {
978 0 : const uint8_t *s8=utf8;
979 : int32_t length8;
980 0 : if(spanCondition==USET_SPAN_CONTAINED) {
981 0 : for(i=0; i<stringsLength; ++i) {
982 0 : length8=utf8Lengths[i];
983 0 : if(length8==0) {
984 0 : continue; // String not representable in UTF-8.
985 : }
986 0 : int32_t overlap=spanUTF8Lengths[i];
987 0 : if(overlap==ALL_CP_CONTAINED) {
988 0 : s8+=length8;
989 0 : continue; // Irrelevant string.
990 : }
991 :
992 : // Try to match this string at pos-overlap..pos.
993 0 : if(overlap>=LONG_SPAN) {
994 0 : overlap=length8;
995 : // While contained: No point matching fully inside the code point span.
996 0 : U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
997 : }
998 0 : if(overlap>spanLength) {
999 0 : overlap=spanLength;
1000 : }
1001 0 : int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1002 : for(;;) {
1003 0 : if(inc>rest) {
1004 0 : break;
1005 : }
1006 : // Try to match if the increment is not listed already.
1007 : // Match at code point boundaries. (The UTF-8 strings were converted
1008 : // from UTF-16 and are guaranteed to be well-formed.)
1009 0 : if( !U8_IS_TRAIL(s[pos-overlap]) &&
1010 0 : !offsets.containsOffset(inc) &&
1011 0 : matches8(s+pos-overlap, s8, length8)
1012 :
1013 : ) {
1014 0 : if(inc==rest) {
1015 0 : return length; // Reached the end of the string.
1016 : }
1017 0 : offsets.addOffset(inc);
1018 : }
1019 0 : if(overlap==0) {
1020 0 : break;
1021 : }
1022 0 : --overlap;
1023 0 : ++inc;
1024 : }
1025 0 : s8+=length8;
1026 : }
1027 : } else /* USET_SPAN_SIMPLE */ {
1028 0 : int32_t maxInc=0, maxOverlap=0;
1029 0 : for(i=0; i<stringsLength; ++i) {
1030 0 : length8=utf8Lengths[i];
1031 0 : if(length8==0) {
1032 0 : continue; // String not representable in UTF-8.
1033 : }
1034 0 : int32_t overlap=spanUTF8Lengths[i];
1035 : // For longest match, we do need to try to match even an all-contained string
1036 : // to find the match from the earliest start.
1037 :
1038 : // Try to match this string at pos-overlap..pos.
1039 0 : if(overlap>=LONG_SPAN) {
1040 0 : overlap=length8;
1041 : // Longest match: Need to match fully inside the code point span
1042 : // to find the match from the earliest start.
1043 : }
1044 0 : if(overlap>spanLength) {
1045 0 : overlap=spanLength;
1046 : }
1047 0 : int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1048 : for(;;) {
1049 0 : if(inc>rest || overlap<maxOverlap) {
1050 : break;
1051 : }
1052 : // Try to match if the string is longer or starts earlier.
1053 : // Match at code point boundaries. (The UTF-8 strings were converted
1054 : // from UTF-16 and are guaranteed to be well-formed.)
1055 0 : if( !U8_IS_TRAIL(s[pos-overlap]) &&
1056 0 : (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1057 0 : matches8(s+pos-overlap, s8, length8)
1058 :
1059 : ) {
1060 0 : maxInc=inc; // Longest match from earliest start.
1061 0 : maxOverlap=overlap;
1062 0 : break;
1063 : }
1064 0 : --overlap;
1065 0 : ++inc;
1066 : }
1067 0 : s8+=length8;
1068 : }
1069 :
1070 0 : if(maxInc!=0 || maxOverlap!=0) {
1071 : // Longest-match algorithm, and there was a string match.
1072 : // Simply continue after it.
1073 0 : pos+=maxInc;
1074 0 : rest-=maxInc;
1075 0 : if(rest==0) {
1076 0 : return length; // Reached the end of the string.
1077 : }
1078 0 : spanLength=0; // Match strings from after a string match.
1079 0 : continue;
1080 : }
1081 : }
1082 : // Finished trying to match all strings at pos.
1083 :
1084 0 : if(spanLength!=0 || pos==0) {
1085 : // The position is after an unlimited code point span (spanLength!=0),
1086 : // not after a string match.
1087 : // The only position where spanLength==0 after a span is pos==0.
1088 : // Otherwise, an unlimited code point span is only tried again when no
1089 : // strings match, and if such a non-initial span fails we stop.
1090 0 : if(offsets.isEmpty()) {
1091 0 : return pos; // No strings matched after a span.
1092 : }
1093 : // Match strings from after the next string match.
1094 : } else {
1095 : // The position is after a string match (or a single code point).
1096 0 : if(offsets.isEmpty()) {
1097 : // No more strings matched after a previous string match.
1098 : // Try another code point span from after the last string match.
1099 0 : spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1100 0 : if( spanLength==rest || // Reached the end of the string, or
1101 : spanLength==0 // neither strings nor span progressed.
1102 : ) {
1103 0 : return pos+spanLength;
1104 : }
1105 0 : pos+=spanLength;
1106 0 : rest-=spanLength;
1107 0 : continue; // spanLength>0: Match strings from after a span.
1108 : } else {
1109 : // Try to match only one code point from after a string match if some
1110 : // string matched beyond it, so that we try all possible positions
1111 : // and don't overshoot.
1112 0 : spanLength=spanOneUTF8(spanSet, s+pos, rest);
1113 0 : if(spanLength>0) {
1114 0 : if(spanLength==rest) {
1115 0 : return length; // Reached the end of the string.
1116 : }
1117 : // Match strings after this code point.
1118 : // There cannot be any increments below it because UnicodeSet strings
1119 : // contain multiple code points.
1120 0 : pos+=spanLength;
1121 0 : rest-=spanLength;
1122 0 : offsets.shift(spanLength);
1123 0 : spanLength=0;
1124 0 : continue; // Match strings from after a single code point.
1125 : }
1126 : // Match strings from after the next string match.
1127 : }
1128 : }
1129 0 : int32_t minOffset=offsets.popMinimum();
1130 0 : pos+=minOffset;
1131 0 : rest-=minOffset;
1132 0 : spanLength=0; // Match strings from after a string match.
1133 0 : }
1134 : }
1135 :
1136 0 : int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1137 0 : if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1138 0 : return spanNotBackUTF8(s, length);
1139 : }
1140 0 : int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1141 0 : if(pos==0) {
1142 0 : return 0;
1143 : }
1144 0 : int32_t spanLength=length-pos;
1145 :
1146 : // Consider strings; they may overlap with the span.
1147 0 : OffsetList offsets;
1148 0 : if(spanCondition==USET_SPAN_CONTAINED) {
1149 : // Use offset list to try all possibilities.
1150 0 : offsets.setMaxLength(maxLength8);
1151 : }
1152 0 : int32_t i, stringsLength=strings.size();
1153 0 : uint8_t *spanBackUTF8Lengths=spanLengths;
1154 0 : if(all) {
1155 0 : spanBackUTF8Lengths+=3*stringsLength;
1156 : }
1157 : for(;;) {
1158 0 : const uint8_t *s8=utf8;
1159 : int32_t length8;
1160 0 : if(spanCondition==USET_SPAN_CONTAINED) {
1161 0 : for(i=0; i<stringsLength; ++i) {
1162 0 : length8=utf8Lengths[i];
1163 0 : if(length8==0) {
1164 0 : continue; // String not representable in UTF-8.
1165 : }
1166 0 : int32_t overlap=spanBackUTF8Lengths[i];
1167 0 : if(overlap==ALL_CP_CONTAINED) {
1168 0 : s8+=length8;
1169 0 : continue; // Irrelevant string.
1170 : }
1171 :
1172 : // Try to match this string at pos-(length8-overlap)..pos-length8.
1173 0 : if(overlap>=LONG_SPAN) {
1174 0 : overlap=length8;
1175 : // While contained: No point matching fully inside the code point span.
1176 0 : int32_t len1=0;
1177 0 : U8_FWD_1(s8, len1, overlap);
1178 0 : overlap-=len1; // Length of the string minus the first code point.
1179 : }
1180 0 : if(overlap>spanLength) {
1181 0 : overlap=spanLength;
1182 : }
1183 0 : int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1184 : for(;;) {
1185 0 : if(dec>pos) {
1186 0 : break;
1187 : }
1188 : // Try to match if the decrement is not listed already.
1189 : // Match at code point boundaries. (The UTF-8 strings were converted
1190 : // from UTF-16 and are guaranteed to be well-formed.)
1191 0 : if( !U8_IS_TRAIL(s[pos-dec]) &&
1192 0 : !offsets.containsOffset(dec) &&
1193 0 : matches8(s+pos-dec, s8, length8)
1194 : ) {
1195 0 : if(dec==pos) {
1196 0 : return 0; // Reached the start of the string.
1197 : }
1198 0 : offsets.addOffset(dec);
1199 : }
1200 0 : if(overlap==0) {
1201 0 : break;
1202 : }
1203 0 : --overlap;
1204 0 : ++dec;
1205 : }
1206 0 : s8+=length8;
1207 : }
1208 : } else /* USET_SPAN_SIMPLE */ {
1209 0 : int32_t maxDec=0, maxOverlap=0;
1210 0 : for(i=0; i<stringsLength; ++i) {
1211 0 : length8=utf8Lengths[i];
1212 0 : if(length8==0) {
1213 0 : continue; // String not representable in UTF-8.
1214 : }
1215 0 : int32_t overlap=spanBackUTF8Lengths[i];
1216 : // For longest match, we do need to try to match even an all-contained string
1217 : // to find the match from the latest end.
1218 :
1219 : // Try to match this string at pos-(length8-overlap)..pos-length8.
1220 0 : if(overlap>=LONG_SPAN) {
1221 0 : overlap=length8;
1222 : // Longest match: Need to match fully inside the code point span
1223 : // to find the match from the latest end.
1224 : }
1225 0 : if(overlap>spanLength) {
1226 0 : overlap=spanLength;
1227 : }
1228 0 : int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1229 : for(;;) {
1230 0 : if(dec>pos || overlap<maxOverlap) {
1231 : break;
1232 : }
1233 : // Try to match if the string is longer or ends later.
1234 : // Match at code point boundaries. (The UTF-8 strings were converted
1235 : // from UTF-16 and are guaranteed to be well-formed.)
1236 0 : if( !U8_IS_TRAIL(s[pos-dec]) &&
1237 0 : (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1238 0 : matches8(s+pos-dec, s8, length8)
1239 : ) {
1240 0 : maxDec=dec; // Longest match from latest end.
1241 0 : maxOverlap=overlap;
1242 0 : break;
1243 : }
1244 0 : --overlap;
1245 0 : ++dec;
1246 : }
1247 0 : s8+=length8;
1248 : }
1249 :
1250 0 : if(maxDec!=0 || maxOverlap!=0) {
1251 : // Longest-match algorithm, and there was a string match.
1252 : // Simply continue before it.
1253 0 : pos-=maxDec;
1254 0 : if(pos==0) {
1255 0 : return 0; // Reached the start of the string.
1256 : }
1257 0 : spanLength=0; // Match strings from before a string match.
1258 0 : continue;
1259 : }
1260 : }
1261 : // Finished trying to match all strings at pos.
1262 :
1263 0 : if(spanLength!=0 || pos==length) {
1264 : // The position is before an unlimited code point span (spanLength!=0),
1265 : // not before a string match.
1266 : // The only position where spanLength==0 before a span is pos==length.
1267 : // Otherwise, an unlimited code point span is only tried again when no
1268 : // strings match, and if such a non-initial span fails we stop.
1269 0 : if(offsets.isEmpty()) {
1270 0 : return pos; // No strings matched before a span.
1271 : }
1272 : // Match strings from before the next string match.
1273 : } else {
1274 : // The position is before a string match (or a single code point).
1275 0 : if(offsets.isEmpty()) {
1276 : // No more strings matched before a previous string match.
1277 : // Try another code point span from before the last string match.
1278 0 : int32_t oldPos=pos;
1279 0 : pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1280 0 : spanLength=oldPos-pos;
1281 0 : if( pos==0 || // Reached the start of the string, or
1282 : spanLength==0 // neither strings nor span progressed.
1283 : ) {
1284 0 : return pos;
1285 : }
1286 0 : continue; // spanLength>0: Match strings from before a span.
1287 : } else {
1288 : // Try to match only one code point from before a string match if some
1289 : // string matched beyond it, so that we try all possible positions
1290 : // and don't overshoot.
1291 0 : spanLength=spanOneBackUTF8(spanSet, s, pos);
1292 0 : if(spanLength>0) {
1293 0 : if(spanLength==pos) {
1294 0 : return 0; // Reached the start of the string.
1295 : }
1296 : // Match strings before this code point.
1297 : // There cannot be any decrements below it because UnicodeSet strings
1298 : // contain multiple code points.
1299 0 : pos-=spanLength;
1300 0 : offsets.shift(spanLength);
1301 0 : spanLength=0;
1302 0 : continue; // Match strings from before a single code point.
1303 : }
1304 : // Match strings from before the next string match.
1305 : }
1306 : }
1307 0 : pos-=offsets.popMinimum();
1308 0 : spanLength=0; // Match strings from before a string match.
1309 0 : }
1310 : }
1311 :
1312 : /*
1313 : * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1314 : *
1315 : * Theoretical algorithm:
1316 : * - Iterate through the string, and at each code point boundary:
1317 : * + If the code point there is in the set, then return with the current position.
1318 : * + If a set string matches at the current position, then return with the current position.
1319 : *
1320 : * Optimized implementation:
1321 : *
1322 : * (Same assumption as for span() above.)
1323 : *
1324 : * Create and cache a spanNotSet which contains all of the single code points
1325 : * of the original set but none of its strings.
1326 : * For each set string add its initial code point to the spanNotSet.
1327 : * (Also add its final code point for spanNotBack().)
1328 : *
1329 : * - Loop:
1330 : * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1331 : * + If the current code point is in the original set, then
1332 : * return the current position.
1333 : * + If any set string matches at the current position, then
1334 : * return the current position.
1335 : * + If there is no match at the current position, neither for the code point there
1336 : * nor for any set string, then skip this code point and continue the loop.
1337 : * This happens for set-string-initial code points that were added to spanNotSet
1338 : * when there is not actually a match for such a set string.
1339 : */
1340 :
1341 0 : int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1342 0 : int32_t pos=0, rest=length;
1343 0 : int32_t i, stringsLength=strings.size();
1344 0 : do {
1345 : // Span until we find a code point from the set,
1346 : // or a code point that starts or ends some string.
1347 0 : i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1348 0 : if(i==rest) {
1349 0 : return length; // Reached the end of the string.
1350 : }
1351 0 : pos+=i;
1352 0 : rest-=i;
1353 :
1354 : // Check whether the current code point is in the original set,
1355 : // without the string starts and ends.
1356 0 : int32_t cpLength=spanOne(spanSet, s+pos, rest);
1357 0 : if(cpLength>0) {
1358 0 : return pos; // There is a set element at pos.
1359 : }
1360 :
1361 : // Try to match the strings at pos.
1362 0 : for(i=0; i<stringsLength; ++i) {
1363 0 : if(spanLengths[i]==ALL_CP_CONTAINED) {
1364 0 : continue; // Irrelevant string.
1365 : }
1366 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1367 0 : const UChar *s16=string.getBuffer();
1368 0 : int32_t length16=string.length();
1369 0 : if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1370 0 : return pos; // There is a set element at pos.
1371 : }
1372 : }
1373 :
1374 : // The span(while not contained) ended on a string start/end which is
1375 : // not in the original set. Skip this code point and continue.
1376 : // cpLength<0
1377 0 : pos-=cpLength;
1378 0 : rest+=cpLength;
1379 0 : } while(rest!=0);
1380 0 : return length; // Reached the end of the string.
1381 : }
1382 :
1383 0 : int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1384 0 : int32_t pos=length;
1385 0 : int32_t i, stringsLength=strings.size();
1386 0 : do {
1387 : // Span until we find a code point from the set,
1388 : // or a code point that starts or ends some string.
1389 0 : pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1390 0 : if(pos==0) {
1391 0 : return 0; // Reached the start of the string.
1392 : }
1393 :
1394 : // Check whether the current code point is in the original set,
1395 : // without the string starts and ends.
1396 0 : int32_t cpLength=spanOneBack(spanSet, s, pos);
1397 0 : if(cpLength>0) {
1398 0 : return pos; // There is a set element at pos.
1399 : }
1400 :
1401 : // Try to match the strings at pos.
1402 0 : for(i=0; i<stringsLength; ++i) {
1403 : // Use spanLengths rather than a spanBackLengths pointer because
1404 : // it is easier and we only need to know whether the string is irrelevant
1405 : // which is the same in either array.
1406 0 : if(spanLengths[i]==ALL_CP_CONTAINED) {
1407 0 : continue; // Irrelevant string.
1408 : }
1409 0 : const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1410 0 : const UChar *s16=string.getBuffer();
1411 0 : int32_t length16=string.length();
1412 0 : if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1413 0 : return pos; // There is a set element at pos.
1414 : }
1415 : }
1416 :
1417 : // The span(while not contained) ended on a string start/end which is
1418 : // not in the original set. Skip this code point and continue.
1419 : // cpLength<0
1420 0 : pos+=cpLength;
1421 0 : } while(pos!=0);
1422 0 : return 0; // Reached the start of the string.
1423 : }
1424 :
1425 0 : int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1426 0 : int32_t pos=0, rest=length;
1427 0 : int32_t i, stringsLength=strings.size();
1428 0 : uint8_t *spanUTF8Lengths=spanLengths;
1429 0 : if(all) {
1430 0 : spanUTF8Lengths+=2*stringsLength;
1431 : }
1432 0 : do {
1433 : // Span until we find a code point from the set,
1434 : // or a code point that starts or ends some string.
1435 0 : i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1436 0 : if(i==rest) {
1437 0 : return length; // Reached the end of the string.
1438 : }
1439 0 : pos+=i;
1440 0 : rest-=i;
1441 :
1442 : // Check whether the current code point is in the original set,
1443 : // without the string starts and ends.
1444 0 : int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1445 0 : if(cpLength>0) {
1446 0 : return pos; // There is a set element at pos.
1447 : }
1448 :
1449 : // Try to match the strings at pos.
1450 0 : const uint8_t *s8=utf8;
1451 : int32_t length8;
1452 0 : for(i=0; i<stringsLength; ++i) {
1453 0 : length8=utf8Lengths[i];
1454 : // ALL_CP_CONTAINED: Irrelevant string.
1455 0 : if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1456 0 : return pos; // There is a set element at pos.
1457 : }
1458 0 : s8+=length8;
1459 : }
1460 :
1461 : // The span(while not contained) ended on a string start/end which is
1462 : // not in the original set. Skip this code point and continue.
1463 : // cpLength<0
1464 0 : pos-=cpLength;
1465 0 : rest+=cpLength;
1466 0 : } while(rest!=0);
1467 0 : return length; // Reached the end of the string.
1468 : }
1469 :
1470 0 : int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1471 0 : int32_t pos=length;
1472 0 : int32_t i, stringsLength=strings.size();
1473 0 : uint8_t *spanBackUTF8Lengths=spanLengths;
1474 0 : if(all) {
1475 0 : spanBackUTF8Lengths+=3*stringsLength;
1476 : }
1477 0 : do {
1478 : // Span until we find a code point from the set,
1479 : // or a code point that starts or ends some string.
1480 0 : pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1481 0 : if(pos==0) {
1482 0 : return 0; // Reached the start of the string.
1483 : }
1484 :
1485 : // Check whether the current code point is in the original set,
1486 : // without the string starts and ends.
1487 0 : int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1488 0 : if(cpLength>0) {
1489 0 : return pos; // There is a set element at pos.
1490 : }
1491 :
1492 : // Try to match the strings at pos.
1493 0 : const uint8_t *s8=utf8;
1494 : int32_t length8;
1495 0 : for(i=0; i<stringsLength; ++i) {
1496 0 : length8=utf8Lengths[i];
1497 : // ALL_CP_CONTAINED: Irrelevant string.
1498 0 : if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1499 0 : return pos; // There is a set element at pos.
1500 : }
1501 0 : s8+=length8;
1502 : }
1503 :
1504 : // The span(while not contained) ended on a string start/end which is
1505 : // not in the original set. Skip this code point and continue.
1506 : // cpLength<0
1507 0 : pos+=cpLength;
1508 0 : } while(pos!=0);
1509 0 : return 0; // Reached the start of the string.
1510 : }
1511 :
1512 : U_NAMESPACE_END
|