Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2014, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * utf16collationiterator.cpp
9 : *
10 : * created on: 2010oct27
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #include "unicode/utypes.h"
15 :
16 : #if !UCONFIG_NO_COLLATION
17 :
18 : #include "charstr.h"
19 : #include "cmemory.h"
20 : #include "collation.h"
21 : #include "collationdata.h"
22 : #include "collationfcd.h"
23 : #include "collationiterator.h"
24 : #include "normalizer2impl.h"
25 : #include "uassert.h"
26 : #include "utf16collationiterator.h"
27 :
28 : U_NAMESPACE_BEGIN
29 :
30 0 : UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
31 0 : const UChar *newText)
32 : : CollationIterator(other),
33 : start(newText),
34 0 : pos(newText + (other.pos - other.start)),
35 0 : limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) {
36 0 : }
37 :
38 0 : UTF16CollationIterator::~UTF16CollationIterator() {}
39 :
40 : UBool
41 0 : UTF16CollationIterator::operator==(const CollationIterator &other) const {
42 0 : if(!CollationIterator::operator==(other)) { return FALSE; }
43 0 : const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
44 : // Compare the iterator state but not the text: Assume that the caller does that.
45 0 : return (pos - start) == (o.pos - o.start);
46 : }
47 :
48 : void
49 0 : UTF16CollationIterator::resetToOffset(int32_t newOffset) {
50 0 : reset();
51 0 : pos = start + newOffset;
52 0 : }
53 :
54 : int32_t
55 0 : UTF16CollationIterator::getOffset() const {
56 0 : return (int32_t)(pos - start);
57 : }
58 :
59 : uint32_t
60 0 : UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
61 0 : if(pos == limit) {
62 0 : c = U_SENTINEL;
63 0 : return Collation::FALLBACK_CE32;
64 : }
65 0 : c = *pos++;
66 0 : return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
67 : }
68 :
69 : UChar
70 0 : UTF16CollationIterator::handleGetTrailSurrogate() {
71 0 : if(pos == limit) { return 0; }
72 : UChar trail;
73 0 : if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
74 0 : return trail;
75 : }
76 :
77 : UBool
78 0 : UTF16CollationIterator::foundNULTerminator() {
79 0 : if(limit == NULL) {
80 0 : limit = --pos;
81 0 : return TRUE;
82 : } else {
83 0 : return FALSE;
84 : }
85 : }
86 :
87 : UChar32
88 0 : UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
89 0 : if(pos == limit) {
90 0 : return U_SENTINEL;
91 : }
92 0 : UChar32 c = *pos;
93 0 : if(c == 0 && limit == NULL) {
94 0 : limit = pos;
95 0 : return U_SENTINEL;
96 : }
97 0 : ++pos;
98 : UChar trail;
99 0 : if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
100 0 : ++pos;
101 0 : return U16_GET_SUPPLEMENTARY(c, trail);
102 : } else {
103 0 : return c;
104 : }
105 : }
106 :
107 : UChar32
108 0 : UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
109 0 : if(pos == start) {
110 0 : return U_SENTINEL;
111 : }
112 0 : UChar32 c = *--pos;
113 : UChar lead;
114 0 : if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
115 0 : --pos;
116 0 : return U16_GET_SUPPLEMENTARY(lead, c);
117 : } else {
118 0 : return c;
119 : }
120 : }
121 :
122 : void
123 0 : UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
124 0 : while(num > 0 && pos != limit) {
125 0 : UChar32 c = *pos;
126 0 : if(c == 0 && limit == NULL) {
127 0 : limit = pos;
128 0 : break;
129 : }
130 0 : ++pos;
131 0 : --num;
132 0 : if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
133 0 : ++pos;
134 : }
135 : }
136 0 : }
137 :
138 : void
139 0 : UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
140 0 : while(num > 0 && pos != start) {
141 0 : UChar32 c = *--pos;
142 0 : --num;
143 0 : if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
144 0 : --pos;
145 : }
146 : }
147 0 : }
148 :
149 : // FCDUTF16CollationIterator ----------------------------------------------- ***
150 :
151 0 : FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
152 0 : const UChar *newText)
153 : : UTF16CollationIterator(other),
154 : rawStart(newText),
155 0 : segmentStart(newText + (other.segmentStart - other.rawStart)),
156 0 : segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)),
157 0 : rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)),
158 0 : nfcImpl(other.nfcImpl),
159 : normalized(other.normalized),
160 0 : checkDir(other.checkDir) {
161 0 : if(checkDir != 0 || other.start == other.segmentStart) {
162 0 : start = newText + (other.start - other.rawStart);
163 0 : pos = newText + (other.pos - other.rawStart);
164 0 : limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart);
165 : } else {
166 0 : start = normalized.getBuffer();
167 0 : pos = start + (other.pos - other.start);
168 0 : limit = start + normalized.length();
169 : }
170 0 : }
171 :
172 0 : FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}
173 :
174 : UBool
175 0 : FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
176 : // Skip the UTF16CollationIterator and call its parent.
177 0 : if(!CollationIterator::operator==(other)) { return FALSE; }
178 0 : const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
179 : // Compare the iterator state but not the text: Assume that the caller does that.
180 0 : if(checkDir != o.checkDir) { return FALSE; }
181 0 : if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
182 0 : if(checkDir != 0 || start == segmentStart) {
183 0 : return (pos - rawStart) == (o.pos - o.rawStart);
184 : } else {
185 0 : return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
186 0 : (pos - start) == (o.pos - o.start);
187 : }
188 : }
189 :
190 : void
191 0 : FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
192 0 : reset();
193 0 : start = segmentStart = pos = rawStart + newOffset;
194 0 : limit = rawLimit;
195 0 : checkDir = 1;
196 0 : }
197 :
198 : int32_t
199 0 : FCDUTF16CollationIterator::getOffset() const {
200 0 : if(checkDir != 0 || start == segmentStart) {
201 0 : return (int32_t)(pos - rawStart);
202 0 : } else if(pos == start) {
203 0 : return (int32_t)(segmentStart - rawStart);
204 : } else {
205 0 : return (int32_t)(segmentLimit - rawStart);
206 : }
207 : }
208 :
// Reads the next code unit into c, advances past it, and returns the trie
// value looked up for that unit.
// Sets c to U_SENTINEL and returns Collation::FALLBACK_CE32 when the limit is
// reached while checking forward, or when nextSegment() fails.
// The loop retries after switchToForward() has changed the iteration state.
209 : uint32_t
210 0 : FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
211 : for(;;) {
// checkDir > 0: reading the text while checking it in the forward direction.
212 0 : if(checkDir > 0) {
213 0 : if(pos == limit) {
214 0 : c = U_SENTINEL;
215 0 : return Collation::FALLBACK_CE32;
216 : }
217 0 : c = *pos++;
// NOTE(review): hasTccc/hasLccc presumably test for non-zero trailing/leading
// combining classes -- confirm in collationfcd.h.
218 0 : if(CollationFCD::hasTccc(c)) {
219 0 : if(CollationFCD::maybeTibetanCompositeVowel(c) ||
220 0 : (pos != limit && CollationFCD::hasLccc(*pos))) {
// Un-read the unit and let nextSegment() set up the segment starting at pos,
// then re-read the unit from wherever pos points afterwards.
221 0 : --pos;
222 0 : if(!nextSegment(errorCode)) {
223 0 : c = U_SENTINEL;
224 0 : return Collation::FALLBACK_CE32;
225 : }
226 0 : c = *pos++;
227 : }
228 : }
229 0 : break;
// checkDir == 0: inside an already-delimited segment; read without checks.
230 0 : } else if(checkDir == 0 && pos != limit) {
231 0 : c = *pos++;
232 0 : break;
233 : } else {
// Either checking backward, or at the end of the current segment.
234 0 : switchToForward();
235 : }
236 : }
237 0 : return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
238 : }
239 :
240 : UBool
241 0 : FCDUTF16CollationIterator::foundNULTerminator() {
242 0 : if(limit == NULL) {
243 0 : limit = rawLimit = --pos;
244 0 : return TRUE;
245 : } else {
246 0 : return FALSE;
247 : }
248 : }
249 :
// Returns the next code point and advances past it.
// Returns U_SENTINEL when the limit is reached while checking forward, when a
// zero unit is read while no limit is known, or when nextSegment() fails.
// A lead surrogate directly followed by a trail surrogate is combined into one
// supplementary code point.
250 : UChar32
251 0 : FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
252 : UChar32 c;
253 : for(;;) {
// checkDir > 0: reading the text while checking it in the forward direction.
254 0 : if(checkDir > 0) {
255 0 : if(pos == limit) {
256 0 : return U_SENTINEL;
257 : }
258 0 : c = *pos++;
259 0 : if(CollationFCD::hasTccc(c)) {
260 0 : if(CollationFCD::maybeTibetanCompositeVowel(c) ||
261 0 : (pos != limit && CollationFCD::hasLccc(*pos))) {
// Un-read the unit, let nextSegment() set up the segment, then re-read.
262 0 : --pos;
263 0 : if(!nextSegment(errorCode)) {
264 0 : return U_SENTINEL;
265 : }
266 0 : c = *pos++;
267 : }
268 0 : } else if(c == 0 && limit == NULL) {
// Zero unit while no limit is known: record it as the limit of the text
// and un-read it.
269 0 : limit = rawLimit = --pos;
270 0 : return U_SENTINEL;
271 : }
272 0 : break;
// checkDir == 0: inside an already-delimited segment; read without checks.
273 0 : } else if(checkDir == 0 && pos != limit) {
274 0 : c = *pos++;
275 0 : break;
276 : } else {
// Either checking backward, or at the end of the current segment.
277 0 : switchToForward();
278 : }
279 : }
// Combine a surrogate pair, if there is one, into a supplementary code point.
280 : UChar trail;
281 0 : if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
282 0 : ++pos;
283 0 : return U16_GET_SUPPLEMENTARY(c, trail);
284 : } else {
285 0 : return c;
286 : }
287 : }
288 :
// Steps back over the previous code point and returns it.
// Returns U_SENTINEL when the start is reached while checking backward, or
// when previousSegment() fails.
// A trail surrogate directly preceded by a lead surrogate is combined into one
// supplementary code point.
289 : UChar32
290 0 : FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
291 : UChar32 c;
292 : for(;;) {
// checkDir < 0: reading the text while checking it in the backward direction.
293 0 : if(checkDir < 0) {
294 0 : if(pos == start) {
295 0 : return U_SENTINEL;
296 : }
297 0 : c = *--pos;
298 0 : if(CollationFCD::hasLccc(c)) {
299 0 : if(CollationFCD::maybeTibetanCompositeVowel(c) ||
300 0 : (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
// Un-read the unit, let previousSegment() set up the segment ending at pos,
// then re-read the unit from wherever pos points afterwards.
301 0 : ++pos;
302 0 : if(!previousSegment(errorCode)) {
303 0 : return U_SENTINEL;
304 : }
305 0 : c = *--pos;
306 : }
307 : }
308 0 : break;
// checkDir == 0: inside an already-delimited segment; read without checks.
309 0 : } else if(checkDir == 0 && pos != start) {
310 0 : c = *--pos;
311 0 : break;
312 : } else {
// Either checking forward, or at the start of the current segment.
313 0 : switchToBackward();
314 : }
315 : }
// Combine a surrogate pair, if there is one, into a supplementary code point.
316 : UChar lead;
317 0 : if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
318 0 : --pos;
319 0 : return U16_GET_SUPPLEMENTARY(lead, c);
320 : } else {
321 0 : return c;
322 : }
323 : }
324 :
325 : void
326 0 : FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
327 : // Specify the class to avoid a virtual-function indirection.
328 : // In Java, we would declare this class final.
329 0 : while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
330 0 : --num;
331 : }
332 0 : }
333 :
334 : void
335 0 : FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
336 : // Specify the class to avoid a virtual-function indirection.
337 : // In Java, we would declare this class final.
338 0 : while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
339 0 : --num;
340 : }
341 0 : }
342 :
343 : void
344 0 : FCDUTF16CollationIterator::switchToForward() {
345 0 : U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
346 0 : if(checkDir < 0) {
347 : // Turn around from backward checking.
348 0 : start = segmentStart = pos;
349 0 : if(pos == segmentLimit) {
350 0 : limit = rawLimit;
351 0 : checkDir = 1; // Check forward.
352 : } else { // pos < segmentLimit
353 0 : checkDir = 0; // Stay in FCD segment.
354 : }
355 : } else {
356 : // Reached the end of the FCD segment.
357 0 : if(start == segmentStart) {
358 : // The input text segment is FCD, extend it forward.
359 : } else {
360 : // The input text segment needed to be normalized.
361 : // Switch to checking forward from it.
362 0 : pos = start = segmentStart = segmentLimit;
363 : // Note: If this segment is at the end of the input text,
364 : // then it might help to return FALSE to indicate that, so that
365 : // we do not have to re-check and normalize when we turn around and go backwards.
366 : // However, that would complicate the call sites for an optimization of an unusual case.
367 : }
368 0 : limit = rawLimit;
369 0 : checkDir = 1;
370 : }
371 0 : }
372 :
// Scans forward from pos to delimit the next segment and switches to reading
// inside it (checkDir = 0).
// If the scanned text passes the FCD check, limit/segmentLimit are set to the
// boundary found in the raw text. If it fails, the text from pos up to the
// next boundary is passed to normalize() and pos is moved to the start of the
// normalized buffer.
// Returns FALSE if errorCode already indicates failure or normalize() fails;
// TRUE otherwise.
373 : UBool
374 0 : FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
375 0 : if(U_FAILURE(errorCode)) { return FALSE; }
376 0 : U_ASSERT(checkDir > 0 && pos != limit);
377 : // The input text [segmentStart..pos[ passes the FCD check.
378 0 : const UChar *p = pos;
// Low byte of the previous character's fcd16 value; 0 before the first character.
379 0 : uint8_t prevCC = 0;
380 : for(;;) {
381 : // Fetch the next character's fcd16 value.
// q remembers where this character starts; nextFCD16() advances p past it.
382 0 : const UChar *q = p;
383 0 : uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
// The high byte of fcd16 is used as the leading value, the low byte as the trailing one.
384 0 : uint8_t leadCC = (uint8_t)(fcd16 >> 8);
385 0 : if(leadCC == 0 && q != pos) {
386 : // FCD boundary before the [q, p[ character.
387 0 : limit = segmentLimit = q;
388 0 : break;
389 : }
390 0 : if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
391 : // Fails FCD check. Find the next FCD boundary and normalize.
// Keep consuming characters while their fcd16 has a non-zero high byte (> 0xff);
// q ends up at the start of the first character that does not, or at rawLimit.
392 0 : do {
393 0 : q = p;
394 0 : } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
395 0 : if(!normalize(pos, q, errorCode)) { return FALSE; }
// normalize() pointed start/limit at the normalized buffer; read from its beginning.
396 0 : pos = start;
397 0 : break;
398 : }
399 0 : prevCC = (uint8_t)fcd16;
400 0 : if(p == rawLimit || prevCC == 0) {
401 : // FCD boundary after the last character.
402 0 : limit = segmentLimit = p;
403 0 : break;
404 : }
405 0 : }
406 0 : U_ASSERT(pos != limit);
407 0 : checkDir = 0;
408 0 : return TRUE;
409 : }
410 :
411 : void
412 0 : FCDUTF16CollationIterator::switchToBackward() {
413 0 : U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
414 0 : if(checkDir > 0) {
415 : // Turn around from forward checking.
416 0 : limit = segmentLimit = pos;
417 0 : if(pos == segmentStart) {
418 0 : start = rawStart;
419 0 : checkDir = -1; // Check backward.
420 : } else { // pos > segmentStart
421 0 : checkDir = 0; // Stay in FCD segment.
422 : }
423 : } else {
424 : // Reached the start of the FCD segment.
425 0 : if(start == segmentStart) {
426 : // The input text segment is FCD, extend it backward.
427 : } else {
428 : // The input text segment needed to be normalized.
429 : // Switch to checking backward from it.
430 0 : pos = limit = segmentLimit = segmentStart;
431 : }
432 0 : start = rawStart;
433 0 : checkDir = -1;
434 : }
435 0 : }
436 :
// Scans backward from pos to delimit the previous segment and switches to
// reading inside it (checkDir = 0).
// If the scanned text passes the FCD check, start/segmentStart are set to the
// boundary found in the raw text. If it fails, the text from the previous
// boundary up to pos is passed to normalize() and pos is moved to the end of
// the normalized buffer.
// Returns FALSE if errorCode already indicates failure or normalize() fails;
// TRUE otherwise.
437 : UBool
438 0 : FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
439 0 : if(U_FAILURE(errorCode)) { return FALSE; }
440 0 : U_ASSERT(checkDir < 0 && pos != start);
441 : // The input text [pos..segmentLimit[ passes the FCD check.
442 0 : const UChar *p = pos;
// High byte of the following character's fcd16 value; 0 before the first character.
443 0 : uint8_t nextCC = 0;
444 : for(;;) {
445 : // Fetch the previous character's fcd16 value.
// q remembers where this character ends; previousFCD16() moves p back before it.
446 0 : const UChar *q = p;
447 0 : uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
// The low byte of fcd16 is used as the trailing value, the high byte as the leading one.
448 0 : uint8_t trailCC = (uint8_t)fcd16;
449 0 : if(trailCC == 0 && q != pos) {
450 : // FCD boundary after the [p, q[ character.
451 0 : start = segmentStart = q;
452 0 : break;
453 : }
454 0 : if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
455 0 : CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
456 : // Fails FCD check. Find the previous FCD boundary and normalize.
// Keep stepping back while the character just read has a non-zero high byte
// (fcd16 > 0xff), the raw start has not been reached, and the next fcd16 read
// is non-zero. q tracks the start of the range to normalize.
457 0 : do {
458 0 : q = p;
459 0 : } while(fcd16 > 0xff && p != rawStart &&
460 0 : (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
461 0 : if(!normalize(q, pos, errorCode)) { return FALSE; }
// normalize() pointed start/limit at the normalized buffer; read backward from its end.
462 0 : pos = limit;
463 0 : break;
464 : }
465 0 : nextCC = (uint8_t)(fcd16 >> 8);
466 0 : if(p == rawStart || nextCC == 0) {
467 : // FCD boundary before the following character.
468 0 : start = segmentStart = p;
469 0 : break;
470 : }
471 0 : }
472 0 : U_ASSERT(pos != start);
473 0 : checkDir = 0;
474 0 : return TRUE;
475 : }
476 :
477 : UBool
478 0 : FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
479 : // NFD without argument checking.
480 0 : U_ASSERT(U_SUCCESS(errorCode));
481 0 : nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
482 0 : if(U_FAILURE(errorCode)) { return FALSE; }
483 : // Switch collation processing into the FCD buffer
484 : // with the result of normalizing [segmentStart, segmentLimit[.
485 0 : segmentStart = from;
486 0 : segmentLimit = to;
487 0 : start = normalized.getBuffer();
488 0 : limit = start + normalized.length();
489 0 : return TRUE;
490 : }
491 :
492 : U_NAMESPACE_END
493 :
494 : #endif // !UCONFIG_NO_COLLATION
|