Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2014, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationiterator.cpp
9 : *
10 : * created on: 2010oct27
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #include "utypeinfo.h" // for 'typeid' to work
15 :
16 : #include "unicode/utypes.h"
17 :
18 : #if !UCONFIG_NO_COLLATION
19 :
20 : #include "unicode/ucharstrie.h"
21 : #include "unicode/ustringtrie.h"
22 : #include "charstr.h"
23 : #include "cmemory.h"
24 : #include "collation.h"
25 : #include "collationdata.h"
26 : #include "collationfcd.h"
27 : #include "collationiterator.h"
28 : #include "normalizer2impl.h"
29 : #include "uassert.h"
30 : #include "uvectr32.h"
31 :
32 : U_NAMESPACE_BEGIN
33 :
34 0 : CollationIterator::CEBuffer::~CEBuffer() {}
35 :
36 : UBool
37 0 : CollationIterator::CEBuffer::ensureAppendCapacity(int32_t appCap, UErrorCode &errorCode) {
38 0 : int32_t capacity = buffer.getCapacity();
39 0 : if((length + appCap) <= capacity) { return TRUE; }
40 0 : if(U_FAILURE(errorCode)) { return FALSE; }
41 0 : do {
42 0 : if(capacity < 1000) {
43 0 : capacity *= 4;
44 : } else {
45 0 : capacity *= 2;
46 : }
47 0 : } while(capacity < (length + appCap));
48 0 : int64_t *p = buffer.resize(capacity, length);
49 0 : if(p == NULL) {
50 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
51 0 : return FALSE;
52 : }
53 0 : return TRUE;
54 : }
55 :
56 : // State of combining marks skipped in discontiguous contraction.
57 : // We create a state object on first use and keep it around deactivated between uses.
58 0 : class SkippedState : public UMemory {
59 : public:
60 : // Born active but empty.
61 0 : SkippedState() : pos(0), skipLengthAtMatch(0) {}
62 0 : void clear() {
63 0 : oldBuffer.remove();
64 0 : pos = 0;
65 : // The newBuffer is reset by setFirstSkipped().
66 0 : }
67 :
68 0 : UBool isEmpty() const { return oldBuffer.isEmpty(); }
69 :
70 0 : UBool hasNext() const { return pos < oldBuffer.length(); }
71 :
72 : // Requires hasNext().
73 0 : UChar32 next() {
74 0 : UChar32 c = oldBuffer.char32At(pos);
75 0 : pos += U16_LENGTH(c);
76 0 : return c;
77 : }
78 :
79 : // Accounts for one more input code point read beyond the end of the marks buffer.
80 0 : void incBeyond() {
81 0 : U_ASSERT(!hasNext());
82 0 : ++pos;
83 0 : }
84 :
85 : // Goes backward through the skipped-marks buffer.
86 : // Returns the number of code points read beyond the skipped marks
87 : // that need to be backtracked through normal input.
88 0 : int32_t backwardNumCodePoints(int32_t n) {
89 0 : int32_t length = oldBuffer.length();
90 0 : int32_t beyond = pos - length;
91 0 : if(beyond > 0) {
92 0 : if(beyond >= n) {
93 : // Not back far enough to re-enter the oldBuffer.
94 0 : pos -= n;
95 0 : return n;
96 : } else {
97 : // Back out all beyond-oldBuffer code points and re-enter the buffer.
98 0 : pos = oldBuffer.moveIndex32(length, beyond - n);
99 0 : return beyond;
100 : }
101 : } else {
102 : // Go backwards from inside the oldBuffer.
103 0 : pos = oldBuffer.moveIndex32(pos, -n);
104 0 : return 0;
105 : }
106 : }
107 :
108 0 : void setFirstSkipped(UChar32 c) {
109 0 : skipLengthAtMatch = 0;
110 0 : newBuffer.setTo(c);
111 0 : }
112 :
113 0 : void skip(UChar32 c) {
114 0 : newBuffer.append(c);
115 0 : }
116 :
117 0 : void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
118 :
119 : // Replaces the characters we consumed with the newly skipped ones.
120 0 : void replaceMatch() {
121 : // Note: UnicodeString.replace() pins pos to at most length().
122 0 : oldBuffer.replace(0, pos, newBuffer, 0, skipLengthAtMatch);
123 0 : pos = 0;
124 0 : }
125 :
126 0 : void saveTrieState(const UCharsTrie &trie) { trie.saveState(state); }
127 0 : void resetToTrieState(UCharsTrie &trie) const { trie.resetToState(state); }
128 :
129 : private:
130 : // Combining marks skipped in previous discontiguous-contraction matching.
131 : // After that discontiguous contraction was completed, we start reading them from here.
132 : UnicodeString oldBuffer;
133 : // Combining marks newly skipped in current discontiguous-contraction matching.
134 : // These might have been read from the normal text or from the oldBuffer.
135 : UnicodeString newBuffer;
136 : // Reading index in oldBuffer,
137 : // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
138 : int32_t pos;
139 : // newBuffer.length() at the time of the last matching character.
140 : // When a partial match fails, we back out skipped and partial-matching input characters.
141 : int32_t skipLengthAtMatch;
142 : // We save the trie state before we attempt to match a character,
143 : // so that we can skip it and try the next one.
144 : UCharsTrie::State state;
145 : };
146 :
147 0 : CollationIterator::CollationIterator(const CollationIterator &other)
148 : : UObject(other),
149 0 : trie(other.trie),
150 0 : data(other.data),
151 0 : cesIndex(other.cesIndex),
152 : skipped(NULL),
153 0 : numCpFwd(other.numCpFwd),
154 0 : isNumeric(other.isNumeric) {
155 0 : UErrorCode errorCode = U_ZERO_ERROR;
156 0 : int32_t length = other.ceBuffer.length;
157 0 : if(length > 0 && ceBuffer.ensureAppendCapacity(length, errorCode)) {
158 0 : for(int32_t i = 0; i < length; ++i) {
159 0 : ceBuffer.set(i, other.ceBuffer.get(i));
160 : }
161 0 : ceBuffer.length = length;
162 : } else {
163 0 : cesIndex = 0;
164 : }
165 0 : }
166 :
167 0 : CollationIterator::~CollationIterator() {
168 0 : delete skipped;
169 0 : }
170 :
171 : UBool
172 0 : CollationIterator::operator==(const CollationIterator &other) const {
173 : // Subclasses: Call this method and then add more specific checks.
174 : // Compare the iterator state but not the collation data (trie & data fields):
175 : // Assume that the caller compares the data.
176 : // Ignore skipped since that should be unused between calls to nextCE().
177 : // (It only stays around to avoid another memory allocation.)
178 0 : if(!(typeid(*this) == typeid(other) &&
179 0 : ceBuffer.length == other.ceBuffer.length &&
180 0 : cesIndex == other.cesIndex &&
181 0 : numCpFwd == other.numCpFwd &&
182 0 : isNumeric == other.isNumeric)) {
183 0 : return FALSE;
184 : }
185 0 : for(int32_t i = 0; i < ceBuffer.length; ++i) {
186 0 : if(ceBuffer.get(i) != other.ceBuffer.get(i)) { return FALSE; }
187 : }
188 0 : return TRUE;
189 : }
190 :
191 : void
192 0 : CollationIterator::reset() {
193 0 : cesIndex = ceBuffer.length = 0;
194 0 : if(skipped != NULL) { skipped->clear(); }
195 0 : }
196 :
197 : int32_t
198 0 : CollationIterator::fetchCEs(UErrorCode &errorCode) {
199 0 : while(U_SUCCESS(errorCode) && nextCE(errorCode) != Collation::NO_CE) {
200 : // No need to loop for each expansion CE.
201 0 : cesIndex = ceBuffer.length;
202 : }
203 0 : return ceBuffer.length;
204 : }
205 :
206 : uint32_t
207 0 : CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
208 0 : c = nextCodePoint(errorCode);
209 0 : return (c < 0) ? Collation::FALLBACK_CE32 : data->getCE32(c);
210 : }
211 :
212 : UChar
213 0 : CollationIterator::handleGetTrailSurrogate() {
214 0 : return 0;
215 : }
216 :
217 : UBool
218 0 : CollationIterator::foundNULTerminator() {
219 0 : return FALSE;
220 : }
221 :
222 : UBool
223 0 : CollationIterator::forbidSurrogateCodePoints() const {
224 0 : return FALSE;
225 : }
226 :
227 : uint32_t
228 0 : CollationIterator::getDataCE32(UChar32 c) const {
229 0 : return data->getCE32(c);
230 : }
231 :
232 : uint32_t
233 0 : CollationIterator::getCE32FromBuilderData(uint32_t /*ce32*/, UErrorCode &errorCode) {
234 0 : if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
235 0 : return 0;
236 : }
237 :
238 : int64_t
239 0 : CollationIterator::nextCEFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
240 : UErrorCode &errorCode) {
241 0 : --ceBuffer.length; // Undo ceBuffer.incLength().
242 0 : appendCEsFromCE32(d, c, ce32, TRUE, errorCode);
243 0 : if(U_SUCCESS(errorCode)) {
244 0 : return ceBuffer.get(cesIndex++);
245 : } else {
246 0 : return Collation::NO_CE_PRIMARY;
247 : }
248 : }
249 :
250 : void
251 0 : CollationIterator::appendCEsFromCE32(const CollationData *d, UChar32 c, uint32_t ce32,
252 : UBool forward, UErrorCode &errorCode) {
253 0 : while(Collation::isSpecialCE32(ce32)) {
254 0 : switch(Collation::tagFromCE32(ce32)) {
255 : case Collation::FALLBACK_TAG:
256 : case Collation::RESERVED_TAG_3:
257 0 : if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
258 0 : return;
259 : case Collation::LONG_PRIMARY_TAG:
260 0 : ceBuffer.append(Collation::ceFromLongPrimaryCE32(ce32), errorCode);
261 0 : return;
262 : case Collation::LONG_SECONDARY_TAG:
263 0 : ceBuffer.append(Collation::ceFromLongSecondaryCE32(ce32), errorCode);
264 0 : return;
265 : case Collation::LATIN_EXPANSION_TAG:
266 0 : if(ceBuffer.ensureAppendCapacity(2, errorCode)) {
267 0 : ceBuffer.set(ceBuffer.length, Collation::latinCE0FromCE32(ce32));
268 0 : ceBuffer.set(ceBuffer.length + 1, Collation::latinCE1FromCE32(ce32));
269 0 : ceBuffer.length += 2;
270 : }
271 0 : return;
272 : case Collation::EXPANSION32_TAG: {
273 0 : const uint32_t *ce32s = d->ce32s + Collation::indexFromCE32(ce32);
274 0 : int32_t length = Collation::lengthFromCE32(ce32);
275 0 : if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
276 0 : do {
277 0 : ceBuffer.appendUnsafe(Collation::ceFromCE32(*ce32s++));
278 : } while(--length > 0);
279 : }
280 0 : return;
281 : }
282 : case Collation::EXPANSION_TAG: {
283 0 : const int64_t *ces = d->ces + Collation::indexFromCE32(ce32);
284 0 : int32_t length = Collation::lengthFromCE32(ce32);
285 0 : if(ceBuffer.ensureAppendCapacity(length, errorCode)) {
286 0 : do {
287 0 : ceBuffer.appendUnsafe(*ces++);
288 : } while(--length > 0);
289 : }
290 0 : return;
291 : }
292 : case Collation::BUILDER_DATA_TAG:
293 0 : ce32 = getCE32FromBuilderData(ce32, errorCode);
294 0 : if(U_FAILURE(errorCode)) { return; }
295 0 : if(ce32 == Collation::FALLBACK_CE32) {
296 0 : d = data->base;
297 0 : ce32 = d->getCE32(c);
298 : }
299 0 : break;
300 : case Collation::PREFIX_TAG:
301 0 : if(forward) { backwardNumCodePoints(1, errorCode); }
302 0 : ce32 = getCE32FromPrefix(d, ce32, errorCode);
303 0 : if(forward) { forwardNumCodePoints(1, errorCode); }
304 0 : break;
305 : case Collation::CONTRACTION_TAG: {
306 0 : const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
307 0 : uint32_t defaultCE32 = CollationData::readCE32(p); // Default if no suffix match.
308 0 : if(!forward) {
309 : // Backward contractions are handled by previousCEUnsafe().
310 : // c has contractions but they were not found.
311 0 : ce32 = defaultCE32;
312 0 : break;
313 : }
314 : UChar32 nextCp;
315 0 : if(skipped == NULL && numCpFwd < 0) {
316 : // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
317 : // avoiding the function call and the nextSkippedCodePoint() overhead.
318 0 : nextCp = nextCodePoint(errorCode);
319 0 : if(nextCp < 0) {
320 : // No more text.
321 0 : ce32 = defaultCE32;
322 0 : break;
323 0 : } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
324 0 : !CollationFCD::mayHaveLccc(nextCp)) {
325 : // All contraction suffixes start with characters with lccc!=0
326 : // but the next code point has lccc==0.
327 0 : backwardNumCodePoints(1, errorCode);
328 0 : ce32 = defaultCE32;
329 0 : break;
330 : }
331 : } else {
332 0 : nextCp = nextSkippedCodePoint(errorCode);
333 0 : if(nextCp < 0) {
334 : // No more text.
335 0 : ce32 = defaultCE32;
336 0 : break;
337 0 : } else if((ce32 & Collation::CONTRACT_NEXT_CCC) != 0 &&
338 0 : !CollationFCD::mayHaveLccc(nextCp)) {
339 : // All contraction suffixes start with characters with lccc!=0
340 : // but the next code point has lccc==0.
341 0 : backwardNumSkipped(1, errorCode);
342 0 : ce32 = defaultCE32;
343 0 : break;
344 : }
345 : }
346 0 : ce32 = nextCE32FromContraction(d, ce32, p + 2, defaultCE32, nextCp, errorCode);
347 0 : if(ce32 == Collation::NO_CE32) {
348 : // CEs from a discontiguous contraction plus the skipped combining marks
349 : // have been appended already.
350 0 : return;
351 : }
352 0 : break;
353 : }
354 : case Collation::DIGIT_TAG:
355 0 : if(isNumeric) {
356 0 : appendNumericCEs(ce32, forward, errorCode);
357 0 : return;
358 : } else {
359 : // Fetch the non-numeric-collation CE32 and continue.
360 0 : ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
361 0 : break;
362 : }
363 : case Collation::U0000_TAG:
364 0 : U_ASSERT(c == 0);
365 0 : if(forward && foundNULTerminator()) {
366 : // Handle NUL-termination. (Not needed in Java.)
367 0 : ceBuffer.append(Collation::NO_CE, errorCode);
368 0 : return;
369 : } else {
370 : // Fetch the normal ce32 for U+0000 and continue.
371 0 : ce32 = d->ce32s[0];
372 0 : break;
373 : }
374 : case Collation::HANGUL_TAG: {
375 0 : const uint32_t *jamoCE32s = d->jamoCE32s;
376 0 : c -= Hangul::HANGUL_BASE;
377 0 : UChar32 t = c % Hangul::JAMO_T_COUNT;
378 0 : c /= Hangul::JAMO_T_COUNT;
379 0 : UChar32 v = c % Hangul::JAMO_V_COUNT;
380 0 : c /= Hangul::JAMO_V_COUNT;
381 0 : if((ce32 & Collation::HANGUL_NO_SPECIAL_JAMO) != 0) {
382 : // None of the Jamo CE32s are isSpecialCE32().
383 : // Avoid recursive function calls and per-Jamo tests.
384 0 : if(ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3, errorCode)) {
385 0 : ceBuffer.set(ceBuffer.length, Collation::ceFromCE32(jamoCE32s[c]));
386 0 : ceBuffer.set(ceBuffer.length + 1, Collation::ceFromCE32(jamoCE32s[19 + v]));
387 0 : ceBuffer.length += 2;
388 0 : if(t != 0) {
389 0 : ceBuffer.appendUnsafe(Collation::ceFromCE32(jamoCE32s[39 + t]));
390 : }
391 : }
392 0 : return;
393 : } else {
394 : // We should not need to compute each Jamo code point.
395 : // In particular, there should be no offset or implicit ce32.
396 0 : appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[c], forward, errorCode);
397 0 : appendCEsFromCE32(d, U_SENTINEL, jamoCE32s[19 + v], forward, errorCode);
398 0 : if(t == 0) { return; }
399 : // offset 39 = 19 + 21 - 1:
400 : // 19 = JAMO_L_COUNT
401 : // 21 = JAMO_T_COUNT
402 : // -1 = omit t==0
403 0 : ce32 = jamoCE32s[39 + t];
404 0 : c = U_SENTINEL;
405 0 : break;
406 : }
407 : }
408 : case Collation::LEAD_SURROGATE_TAG: {
409 0 : U_ASSERT(forward); // Backward iteration should never see lead surrogate code _unit_ data.
410 0 : U_ASSERT(U16_IS_LEAD(c));
411 : UChar trail;
412 0 : if(U16_IS_TRAIL(trail = handleGetTrailSurrogate())) {
413 0 : c = U16_GET_SUPPLEMENTARY(c, trail);
414 0 : ce32 &= Collation::LEAD_TYPE_MASK;
415 0 : if(ce32 == Collation::LEAD_ALL_UNASSIGNED) {
416 0 : ce32 = Collation::UNASSIGNED_CE32; // unassigned-implicit
417 0 : } else if(ce32 == Collation::LEAD_ALL_FALLBACK ||
418 : (ce32 = d->getCE32FromSupplementary(c)) == Collation::FALLBACK_CE32) {
419 : // fall back to the base data
420 0 : d = d->base;
421 0 : ce32 = d->getCE32FromSupplementary(c);
422 : }
423 : } else {
424 : // c is an unpaired surrogate.
425 0 : ce32 = Collation::UNASSIGNED_CE32;
426 : }
427 0 : break;
428 : }
429 : case Collation::OFFSET_TAG:
430 0 : U_ASSERT(c >= 0);
431 0 : ceBuffer.append(d->getCEFromOffsetCE32(c, ce32), errorCode);
432 0 : return;
433 : case Collation::IMPLICIT_TAG:
434 0 : U_ASSERT(c >= 0);
435 0 : if(U_IS_SURROGATE(c) && forbidSurrogateCodePoints()) {
436 0 : ce32 = Collation::FFFD_CE32;
437 0 : break;
438 : } else {
439 0 : ceBuffer.append(Collation::unassignedCEFromCodePoint(c), errorCode);
440 0 : return;
441 : }
442 : }
443 : }
444 0 : ceBuffer.append(Collation::ceFromSimpleCE32(ce32), errorCode);
445 : }
446 :
447 : uint32_t
448 0 : CollationIterator::getCE32FromPrefix(const CollationData *d, uint32_t ce32,
449 : UErrorCode &errorCode) {
450 0 : const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
451 0 : ce32 = CollationData::readCE32(p); // Default if no prefix match.
452 0 : p += 2;
453 : // Number of code points read before the original code point.
454 0 : int32_t lookBehind = 0;
455 0 : UCharsTrie prefixes(p);
456 : for(;;) {
457 0 : UChar32 c = previousCodePoint(errorCode);
458 0 : if(c < 0) { break; }
459 0 : ++lookBehind;
460 0 : UStringTrieResult match = prefixes.nextForCodePoint(c);
461 0 : if(USTRINGTRIE_HAS_VALUE(match)) {
462 0 : ce32 = (uint32_t)prefixes.getValue();
463 : }
464 0 : if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
465 0 : }
466 0 : forwardNumCodePoints(lookBehind, errorCode);
467 0 : return ce32;
468 : }
469 :
470 : UChar32
471 0 : CollationIterator::nextSkippedCodePoint(UErrorCode &errorCode) {
472 0 : if(skipped != NULL && skipped->hasNext()) { return skipped->next(); }
473 0 : if(numCpFwd == 0) { return U_SENTINEL; }
474 0 : UChar32 c = nextCodePoint(errorCode);
475 0 : if(skipped != NULL && !skipped->isEmpty() && c >= 0) { skipped->incBeyond(); }
476 0 : if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
477 0 : return c;
478 : }
479 :
480 : void
481 0 : CollationIterator::backwardNumSkipped(int32_t n, UErrorCode &errorCode) {
482 0 : if(skipped != NULL && !skipped->isEmpty()) {
483 0 : n = skipped->backwardNumCodePoints(n);
484 : }
485 0 : backwardNumCodePoints(n, errorCode);
486 0 : if(numCpFwd >= 0) { numCpFwd += n; }
487 0 : }
488 :
489 : uint32_t
490 0 : CollationIterator::nextCE32FromContraction(const CollationData *d, uint32_t contractionCE32,
491 : const UChar *p, uint32_t ce32, UChar32 c,
492 : UErrorCode &errorCode) {
493 : // c: next code point after the original one
494 :
495 : // Number of code points read beyond the original code point.
496 : // Needed for discontiguous contraction matching.
497 0 : int32_t lookAhead = 1;
498 : // Number of code points read since the last match (initially only c).
499 0 : int32_t sinceMatch = 1;
500 : // Normally we only need a contiguous match,
501 : // and therefore need not remember the suffixes state from before a mismatch for retrying.
502 : // If we are already processing skipped combining marks, then we do track the state.
503 0 : UCharsTrie suffixes(p);
504 0 : if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
505 0 : UStringTrieResult match = suffixes.firstForCodePoint(c);
506 : for(;;) {
507 : UChar32 nextCp;
508 0 : if(USTRINGTRIE_HAS_VALUE(match)) {
509 0 : ce32 = (uint32_t)suffixes.getValue();
510 0 : if(!USTRINGTRIE_HAS_NEXT(match) || (c = nextSkippedCodePoint(errorCode)) < 0) {
511 0 : return ce32;
512 : }
513 0 : if(skipped != NULL && !skipped->isEmpty()) { skipped->saveTrieState(suffixes); }
514 0 : sinceMatch = 1;
515 0 : } else if(match == USTRINGTRIE_NO_MATCH || (nextCp = nextSkippedCodePoint(errorCode)) < 0) {
516 : // No match for c, or partial match (USTRINGTRIE_NO_VALUE) and no further text.
517 : // Back up if necessary, and try a discontiguous contraction.
518 0 : if((contractionCE32 & Collation::CONTRACT_TRAILING_CCC) != 0 &&
519 : // Discontiguous contraction matching extends an existing match.
520 : // If there is no match yet, then there is nothing to do.
521 0 : ((contractionCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
522 : sinceMatch < lookAhead)) {
523 : // The last character of at least one suffix has lccc!=0,
524 : // allowing for discontiguous contractions.
525 : // UCA S2.1.1 only processes non-starters immediately following
526 : // "a match in the table" (sinceMatch=1).
527 0 : if(sinceMatch > 1) {
528 : // Return to the state after the last match.
529 : // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
530 0 : backwardNumSkipped(sinceMatch, errorCode);
531 0 : c = nextSkippedCodePoint(errorCode);
532 0 : lookAhead -= sinceMatch - 1;
533 0 : sinceMatch = 1;
534 : }
535 0 : if(d->getFCD16(c) > 0xff) {
536 : return nextCE32FromDiscontiguousContraction(
537 0 : d, suffixes, ce32, lookAhead, c, errorCode);
538 : }
539 : }
540 0 : break;
541 : } else {
542 : // Continue after partial match (USTRINGTRIE_NO_VALUE) for c.
543 : // It does not have a result value, therefore it is not itself "a match in the table".
544 : // If a partially-matched c has ccc!=0 then
545 : // it might be skipped in discontiguous contraction.
546 0 : c = nextCp;
547 0 : ++sinceMatch;
548 : }
549 0 : ++lookAhead;
550 0 : match = suffixes.nextForCodePoint(c);
551 0 : }
552 0 : backwardNumSkipped(sinceMatch, errorCode);
553 0 : return ce32;
554 : }
555 :
556 : uint32_t
557 0 : CollationIterator::nextCE32FromDiscontiguousContraction(
558 : const CollationData *d, UCharsTrie &suffixes, uint32_t ce32,
559 : int32_t lookAhead, UChar32 c,
560 : UErrorCode &errorCode) {
561 0 : if(U_FAILURE(errorCode)) { return 0; }
562 :
563 : // UCA section 3.3.2 Contractions:
564 : // Contractions that end with non-starter characters
565 : // are known as discontiguous contractions.
566 : // ... discontiguous contractions must be detected in input text
567 : // whenever the final sequence of non-starter characters could be rearranged
568 : // so as to make a contiguous matching sequence that is canonically equivalent.
569 :
570 : // UCA: http://www.unicode.org/reports/tr10/#S2.1
571 : // S2.1 Find the longest initial substring S at each point that has a match in the table.
572 : // S2.1.1 If there are any non-starters following S, process each non-starter C.
573 : // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
574 : // Note: A non-starter in a string is called blocked
575 : // if there is another non-starter of the same canonical combining class or zero
576 : // between it and the last character of canonical combining class 0.
577 : // S2.1.3 If there is a match, replace S by S + C, and remove C.
578 :
579 : // First: Is a discontiguous contraction even possible?
580 0 : uint16_t fcd16 = d->getFCD16(c);
581 0 : U_ASSERT(fcd16 > 0xff); // The caller checked this already, as a shortcut.
582 0 : UChar32 nextCp = nextSkippedCodePoint(errorCode);
583 0 : if(nextCp < 0) {
584 : // No further text.
585 0 : backwardNumSkipped(1, errorCode);
586 0 : return ce32;
587 : }
588 0 : ++lookAhead;
589 0 : uint8_t prevCC = (uint8_t)fcd16;
590 0 : fcd16 = d->getFCD16(nextCp);
591 0 : if(fcd16 <= 0xff) {
592 : // The next code point after c is a starter (S2.1.1 "process each non-starter").
593 0 : backwardNumSkipped(2, errorCode);
594 0 : return ce32;
595 : }
596 :
597 : // We have read and matched (lookAhead-2) code points,
598 : // read non-matching c and peeked ahead at nextCp.
599 : // Return to the state before the mismatch and continue matching with nextCp.
600 0 : if(skipped == NULL || skipped->isEmpty()) {
601 0 : if(skipped == NULL) {
602 0 : skipped = new SkippedState();
603 0 : if(skipped == NULL) {
604 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
605 0 : return 0;
606 : }
607 : }
608 0 : suffixes.reset();
609 0 : if(lookAhead > 2) {
610 : // Replay the partial match so far.
611 0 : backwardNumCodePoints(lookAhead, errorCode);
612 0 : suffixes.firstForCodePoint(nextCodePoint(errorCode));
613 0 : for(int32_t i = 3; i < lookAhead; ++i) {
614 0 : suffixes.nextForCodePoint(nextCodePoint(errorCode));
615 : }
616 : // Skip c (which did not match) and nextCp (which we will try now).
617 0 : forwardNumCodePoints(2, errorCode);
618 : }
619 0 : skipped->saveTrieState(suffixes);
620 : } else {
621 : // Reset to the trie state before the failed match of c.
622 0 : skipped->resetToTrieState(suffixes);
623 : }
624 :
625 0 : skipped->setFirstSkipped(c);
626 : // Number of code points read since the last match (at this point: c and nextCp).
627 0 : int32_t sinceMatch = 2;
628 0 : c = nextCp;
629 : for(;;) {
630 : UStringTrieResult match;
631 : // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
632 0 : if(prevCC < (fcd16 >> 8) && USTRINGTRIE_HAS_VALUE(match = suffixes.nextForCodePoint(c))) {
633 : // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
634 : // Keep prevCC unchanged.
635 0 : ce32 = (uint32_t)suffixes.getValue();
636 0 : sinceMatch = 0;
637 0 : skipped->recordMatch();
638 0 : if(!USTRINGTRIE_HAS_NEXT(match)) { break; }
639 0 : skipped->saveTrieState(suffixes);
640 : } else {
641 : // No match for "S + C", skip C.
642 0 : skipped->skip(c);
643 0 : skipped->resetToTrieState(suffixes);
644 0 : prevCC = (uint8_t)fcd16;
645 : }
646 0 : if((c = nextSkippedCodePoint(errorCode)) < 0) { break; }
647 0 : ++sinceMatch;
648 0 : fcd16 = d->getFCD16(c);
649 0 : if(fcd16 <= 0xff) {
650 : // The next code point after c is a starter (S2.1.1 "process each non-starter").
651 0 : break;
652 : }
653 0 : }
654 0 : backwardNumSkipped(sinceMatch, errorCode);
655 0 : UBool isTopDiscontiguous = skipped->isEmpty();
656 0 : skipped->replaceMatch();
657 0 : if(isTopDiscontiguous && !skipped->isEmpty()) {
658 : // We did get a match after skipping one or more combining marks,
659 : // and we are not in a recursive discontiguous contraction.
660 : // Append CEs from the contraction ce32
661 : // and then from the combining marks that we skipped before the match.
662 0 : c = U_SENTINEL;
663 : for(;;) {
664 0 : appendCEsFromCE32(d, c, ce32, TRUE, errorCode);
665 : // Fetch CE32s for skipped combining marks from the normal data, with fallback,
666 : // rather than from the CollationData where we found the contraction.
667 0 : if(!skipped->hasNext()) { break; }
668 0 : c = skipped->next();
669 0 : ce32 = getDataCE32(c);
670 0 : if(ce32 == Collation::FALLBACK_CE32) {
671 0 : d = data->base;
672 0 : ce32 = d->getCE32(c);
673 : } else {
674 0 : d = data;
675 : }
676 : // Note: A nested discontiguous-contraction match
677 : // replaces consumed combining marks with newly skipped ones
678 : // and resets the reading position to the beginning.
679 : }
680 0 : skipped->clear();
681 0 : ce32 = Collation::NO_CE32; // Signal to the caller that the result is in the ceBuffer.
682 : }
683 0 : return ce32;
684 : }
685 :
686 : void
687 0 : CollationIterator::appendNumericCEs(uint32_t ce32, UBool forward, UErrorCode &errorCode) {
688 : // Collect digits.
689 0 : CharString digits;
690 0 : if(forward) {
691 : for(;;) {
692 0 : char digit = Collation::digitFromCE32(ce32);
693 0 : digits.append(digit, errorCode);
694 0 : if(numCpFwd == 0) { break; }
695 0 : UChar32 c = nextCodePoint(errorCode);
696 0 : if(c < 0) { break; }
697 0 : ce32 = data->getCE32(c);
698 0 : if(ce32 == Collation::FALLBACK_CE32) {
699 0 : ce32 = data->base->getCE32(c);
700 : }
701 0 : if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
702 0 : backwardNumCodePoints(1, errorCode);
703 0 : break;
704 : }
705 0 : if(numCpFwd > 0) { --numCpFwd; }
706 0 : }
707 : } else {
708 : for(;;) {
709 0 : char digit = Collation::digitFromCE32(ce32);
710 0 : digits.append(digit, errorCode);
711 0 : UChar32 c = previousCodePoint(errorCode);
712 0 : if(c < 0) { break; }
713 0 : ce32 = data->getCE32(c);
714 0 : if(ce32 == Collation::FALLBACK_CE32) {
715 0 : ce32 = data->base->getCE32(c);
716 : }
717 0 : if(!Collation::hasCE32Tag(ce32, Collation::DIGIT_TAG)) {
718 0 : forwardNumCodePoints(1, errorCode);
719 0 : break;
720 : }
721 0 : }
722 : // Reverse the digit string.
723 0 : char *p = digits.data();
724 0 : char *q = p + digits.length() - 1;
725 0 : while(p < q) {
726 0 : char digit = *p;
727 0 : *p++ = *q;
728 0 : *q-- = digit;
729 : }
730 : }
731 0 : if(U_FAILURE(errorCode)) { return; }
732 0 : int32_t pos = 0;
733 0 : do {
734 : // Skip leading zeros.
735 0 : while(pos < (digits.length() - 1) && digits[pos] == 0) { ++pos; }
736 : // Write a sequence of CEs for at most 254 digits at a time.
737 0 : int32_t segmentLength = digits.length() - pos;
738 0 : if(segmentLength > 254) { segmentLength = 254; }
739 0 : appendNumericSegmentCEs(digits.data() + pos, segmentLength, errorCode);
740 0 : pos += segmentLength;
741 0 : } while(U_SUCCESS(errorCode) && pos < digits.length());
742 : }
743 :
744 : void
745 0 : CollationIterator::appendNumericSegmentCEs(const char *digits, int32_t length, UErrorCode &errorCode) {
746 0 : U_ASSERT(1 <= length && length <= 254);
747 0 : U_ASSERT(length == 1 || digits[0] != 0);
748 0 : uint32_t numericPrimary = data->numericPrimary;
749 : // Note: We use primary byte values 2..255: digits are not compressible.
750 0 : if(length <= 7) {
751 : // Very dense encoding for small numbers.
752 0 : int32_t value = digits[0];
753 0 : for(int32_t i = 1; i < length; ++i) {
754 0 : value = value * 10 + digits[i];
755 : }
756 : // Primary weight second byte values:
757 : // 74 byte values 2.. 75 for small numbers in two-byte primary weights.
758 : // 40 byte values 76..115 for medium numbers in three-byte primary weights.
759 : // 16 byte values 116..131 for large numbers in four-byte primary weights.
760 : // 124 byte values 132..255 for very large numbers with 4..127 digit pairs.
761 0 : int32_t firstByte = 2;
762 0 : int32_t numBytes = 74;
763 0 : if(value < numBytes) {
764 : // Two-byte primary for 0..73, good for day & month numbers etc.
765 0 : uint32_t primary = numericPrimary | ((firstByte + value) << 16);
766 0 : ceBuffer.append(Collation::makeCE(primary), errorCode);
767 0 : return;
768 : }
769 0 : value -= numBytes;
770 0 : firstByte += numBytes;
771 0 : numBytes = 40;
772 0 : if(value < numBytes * 254) {
773 : // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
774 0 : uint32_t primary = numericPrimary |
775 0 : ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
776 0 : ceBuffer.append(Collation::makeCE(primary), errorCode);
777 0 : return;
778 : }
779 0 : value -= numBytes * 254;
780 0 : firstByte += numBytes;
781 0 : numBytes = 16;
782 0 : if(value < numBytes * 254 * 254) {
783 : // Four-byte primary for 10234..1042489=10234+16*254*254-1.
784 0 : uint32_t primary = numericPrimary | (2 + value % 254);
785 0 : value /= 254;
786 0 : primary |= (2 + value % 254) << 8;
787 0 : value /= 254;
788 0 : primary |= (firstByte + value % 254) << 16;
789 0 : ceBuffer.append(Collation::makeCE(primary), errorCode);
790 0 : return;
791 : }
792 : // original value > 1042489
793 : }
794 0 : U_ASSERT(length >= 7);
795 :
796 : // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
797 : // then we generate primary bytes with those pairs.
798 : // Omit trailing 00 pairs.
799 : // Decrement the value for the last pair.
800 :
801 : // Set the exponent. 4 pairs->132, 5 pairs->133, ..., 127 pairs->255.
802 0 : int32_t numPairs = (length + 1) / 2;
803 0 : uint32_t primary = numericPrimary | ((132 - 4 + numPairs) << 16);
804 : // Find the length without trailing 00 pairs.
805 0 : while(digits[length - 1] == 0 && digits[length - 2] == 0) {
806 0 : length -= 2;
807 : }
808 : // Read the first pair.
809 : uint32_t pair;
810 : int32_t pos;
811 0 : if(length & 1) {
812 : // Only "half a pair" if we have an odd number of digits.
813 0 : pair = digits[0];
814 0 : pos = 1;
815 : } else {
816 0 : pair = digits[0] * 10 + digits[1];
817 0 : pos = 2;
818 : }
819 0 : pair = 11 + 2 * pair;
820 : // Add the pairs of digits between pos and length.
821 0 : int32_t shift = 8;
822 0 : while(pos < length) {
823 0 : if(shift == 0) {
824 : // Every three pairs/bytes we need to store a 4-byte-primary CE
825 : // and start with a new CE with the '0' primary lead byte.
826 0 : primary |= pair;
827 0 : ceBuffer.append(Collation::makeCE(primary), errorCode);
828 0 : primary = numericPrimary;
829 0 : shift = 16;
830 : } else {
831 0 : primary |= pair << shift;
832 0 : shift -= 8;
833 : }
834 0 : pair = 11 + 2 * (digits[pos] * 10 + digits[pos + 1]);
835 0 : pos += 2;
836 : }
837 0 : primary |= (pair - 1) << shift;
838 0 : ceBuffer.append(Collation::makeCE(primary), errorCode);
839 : }
840 :
841 : int64_t
842 0 : CollationIterator::previousCE(UVector32 &offsets, UErrorCode &errorCode) {
843 0 : if(ceBuffer.length > 0) {
844 : // Return the previous buffered CE.
845 0 : return ceBuffer.get(--ceBuffer.length);
846 : }
847 0 : offsets.removeAllElements();
848 0 : int32_t limitOffset = getOffset();
849 0 : UChar32 c = previousCodePoint(errorCode);
850 0 : if(c < 0) { return Collation::NO_CE; }
851 0 : if(data->isUnsafeBackward(c, isNumeric)) {
852 0 : return previousCEUnsafe(c, offsets, errorCode);
853 : }
854 : // Simple, safe-backwards iteration:
855 : // Get a CE going backwards, handle prefixes but no contractions.
856 0 : uint32_t ce32 = data->getCE32(c);
857 : const CollationData *d;
858 0 : if(ce32 == Collation::FALLBACK_CE32) {
859 0 : d = data->base;
860 0 : ce32 = d->getCE32(c);
861 : } else {
862 0 : d = data;
863 : }
864 0 : if(Collation::isSimpleOrLongCE32(ce32)) {
865 0 : return Collation::ceFromCE32(ce32);
866 : }
867 0 : appendCEsFromCE32(d, c, ce32, FALSE, errorCode);
868 0 : if(U_SUCCESS(errorCode)) {
869 0 : if(ceBuffer.length > 1) {
870 0 : offsets.addElement(getOffset(), errorCode);
871 : // For an expansion, the offset of each non-initial CE is the limit offset,
872 : // consistent with forward iteration.
873 0 : while(offsets.size() <= ceBuffer.length) {
874 0 : offsets.addElement(limitOffset, errorCode);
875 : };
876 : }
877 0 : return ceBuffer.get(--ceBuffer.length);
878 : } else {
879 0 : return Collation::NO_CE_PRIMARY;
880 : }
881 : }
882 :
883 : int64_t
884 0 : CollationIterator::previousCEUnsafe(UChar32 c, UVector32 &offsets, UErrorCode &errorCode) {
885 : // We just move through the input counting safe and unsafe code points
886 : // without collecting the unsafe-backward substring into a buffer and
887 : // switching to it.
888 : // This is to keep the logic simple. Otherwise we would have to handle
889 : // prefix matching going before the backward buffer, switching
890 : // to iteration and back, etc.
891 : // In the most important case of iterating over a normal string,
892 : // reading from the string itself is already maximally fast.
893 : // The only drawback there is that after getting the CEs we always
894 : // skip backward to the safe character rather than switching out
895 : // of a backwardBuffer.
896 : // But this should not be the common case for previousCE(),
897 : // and correctness and maintainability are more important than
898 : // complex optimizations.
899 : // Find the first safe character before c.
900 0 : int32_t numBackward = 1;
901 0 : while((c = previousCodePoint(errorCode)) >= 0) {
902 0 : ++numBackward;
903 0 : if(!data->isUnsafeBackward(c, isNumeric)) {
904 0 : break;
905 : }
906 : }
907 : // Set the forward iteration limit.
908 : // Note: This counts code points.
909 : // We cannot enforce a limit in the middle of a surrogate pair or similar.
910 0 : numCpFwd = numBackward;
911 : // Reset the forward iterator.
912 0 : cesIndex = 0;
913 0 : U_ASSERT(ceBuffer.length == 0);
914 : // Go forward and collect the CEs.
915 0 : int32_t offset = getOffset();
916 0 : while(numCpFwd > 0) {
917 : // nextCE() normally reads one code point.
918 : // Contraction matching and digit specials read more and check numCpFwd.
919 0 : --numCpFwd;
920 : // Append one or more CEs to the ceBuffer.
921 0 : (void)nextCE(errorCode);
922 0 : U_ASSERT(U_FAILURE(errorCode) || ceBuffer.get(ceBuffer.length - 1) != Collation::NO_CE);
923 : // No need to loop for getting each expansion CE from nextCE().
924 0 : cesIndex = ceBuffer.length;
925 : // However, we need to write an offset for each CE.
926 : // This is for CollationElementIterator::getOffset() to return
927 : // intermediate offsets from the unsafe-backwards segment.
928 0 : U_ASSERT(offsets.size() < ceBuffer.length);
929 0 : offsets.addElement(offset, errorCode);
930 : // For an expansion, the offset of each non-initial CE is the limit offset,
931 : // consistent with forward iteration.
932 0 : offset = getOffset();
933 0 : while(offsets.size() < ceBuffer.length) {
934 0 : offsets.addElement(offset, errorCode);
935 : };
936 : }
937 0 : U_ASSERT(offsets.size() == ceBuffer.length);
938 : // End offset corresponding to just after the unsafe-backwards segment.
939 0 : offsets.addElement(offset, errorCode);
940 : // Reset the forward iteration limit
941 : // and move backward to before the segment for which we fetched CEs.
942 0 : numCpFwd = -1;
943 0 : backwardNumCodePoints(numBackward, errorCode);
944 : // Use the collected CEs and return the last one.
945 0 : cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented.
946 0 : if(U_SUCCESS(errorCode)) {
947 0 : return ceBuffer.get(--ceBuffer.length);
948 : } else {
949 0 : return Collation::NO_CE_PRIMARY;
950 : }
951 : }
952 :
953 : U_NAMESPACE_END
954 :
955 : #endif // !UCONFIG_NO_COLLATION
|