Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2013-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationfastlatinbuilder.cpp
9 : *
10 : * created on: 2013aug09
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2
15 : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
16 : #include <stdio.h>
17 : #include <string>
18 : #endif
19 :
20 : #include "unicode/utypes.h"
21 :
22 : #if !UCONFIG_NO_COLLATION
23 :
24 : #include "unicode/ucol.h"
25 : #include "unicode/ucharstrie.h"
26 : #include "unicode/unistr.h"
27 : #include "unicode/uobject.h"
28 : #include "unicode/uscript.h"
29 : #include "cmemory.h"
30 : #include "collation.h"
31 : #include "collationdata.h"
32 : #include "collationfastlatin.h"
33 : #include "collationfastlatinbuilder.h"
34 : #include "uassert.h"
35 : #include "uvectr64.h"
36 :
37 : U_NAMESPACE_BEGIN
38 :
39 : struct CollationData;
40 :
41 : namespace {
42 :
43 : /**
44 : * Compare two signed int64_t values as if they were unsigned.
45 : */
46 : int32_t
47 0 : compareInt64AsUnsigned(int64_t a, int64_t b) {
48 0 : if((uint64_t)a < (uint64_t)b) {
49 0 : return -1;
50 0 : } else if((uint64_t)a > (uint64_t)b) {
51 0 : return 1;
52 : } else {
53 0 : return 0;
54 : }
55 : }
56 :
57 : // TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp
58 : /**
59 : * Like Java Collections.binarySearch(List, String, Comparator).
60 : *
61 : * @return the index>=0 where the item was found,
62 : * or the index<0 for inserting the string at ~index in sorted order
63 : */
64 : int32_t
65 0 : binarySearch(const int64_t list[], int32_t limit, int64_t ce) {
66 0 : if (limit == 0) { return ~0; }
67 0 : int32_t start = 0;
68 : for (;;) {
69 0 : int32_t i = (start + limit) / 2;
70 0 : int32_t cmp = compareInt64AsUnsigned(ce, list[i]);
71 0 : if (cmp == 0) {
72 0 : return i;
73 0 : } else if (cmp < 0) {
74 0 : if (i == start) {
75 0 : return ~start; // insert ce before i
76 : }
77 0 : limit = i;
78 : } else {
79 0 : if (i == start) {
80 0 : return ~(start + 1); // insert ce after i
81 : }
82 0 : start = i;
83 : }
84 0 : }
85 : }
86 :
87 : } // namespace
88 :
89 0 : CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode &errorCode)
90 : : ce0(0), ce1(0),
91 : contractionCEs(errorCode), uniqueCEs(errorCode),
92 : miniCEs(NULL),
93 : firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0),
94 : firstShortPrimary(0), shortPrimaryOverflow(FALSE),
95 0 : headerLength(0) {
96 0 : }
97 :
98 0 : CollationFastLatinBuilder::~CollationFastLatinBuilder() {
99 0 : uprv_free(miniCEs);
100 0 : }
101 :
102 : UBool
103 0 : CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorCode) {
104 0 : if(U_FAILURE(errorCode)) { return FALSE; }
105 0 : if(!result.isEmpty()) { // This builder is not reusable.
106 0 : errorCode = U_INVALID_STATE_ERROR;
107 0 : return FALSE;
108 : }
109 0 : if(!loadGroups(data, errorCode)) { return FALSE; }
110 :
111 : // Fast handling of digits.
112 0 : firstShortPrimary = firstDigitPrimary;
113 0 : getCEs(data, errorCode);
114 0 : if(!encodeUniqueCEs(errorCode)) { return FALSE; }
115 0 : if(shortPrimaryOverflow) {
116 : // Give digits long mini primaries,
117 : // so that there are more short primaries for letters.
118 0 : firstShortPrimary = firstLatinPrimary;
119 0 : resetCEs();
120 0 : getCEs(data, errorCode);
121 0 : if(!encodeUniqueCEs(errorCode)) { return FALSE; }
122 : }
123 : // Note: If we still have a short-primary overflow but not a long-primary overflow,
124 : // then we could calculate how many more long primaries would fit,
125 : // and set the firstShortPrimary to that many after the current firstShortPrimary,
126 : // and try again.
127 : // However, this might only benefit the en_US_POSIX tailoring,
128 : // and it is simpler to suppress building fast Latin data for it in genrb,
129 : // or by returning FALSE here if shortPrimaryOverflow.
130 :
131 0 : UBool ok = !shortPrimaryOverflow &&
132 0 : encodeCharCEs(errorCode) && encodeContractions(errorCode);
133 0 : contractionCEs.removeAllElements(); // might reduce heap memory usage
134 0 : uniqueCEs.removeAllElements();
135 0 : return ok;
136 : }
137 :
138 : UBool
139 0 : CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) {
140 0 : if(U_FAILURE(errorCode)) { return FALSE; }
141 0 : headerLength = 1 + NUM_SPECIAL_GROUPS;
142 0 : uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;
143 0 : result.append((UChar)r0);
144 : // The first few reordering groups should be special groups
145 : // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
146 0 : for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
147 0 : lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i);
148 0 : if(lastSpecialPrimaries[i] == 0) {
149 : // missing data
150 0 : return FALSE;
151 : }
152 0 : result.append((UChar)0); // reserve a slot for this group
153 : }
154 :
155 0 : firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT);
156 0 : firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN);
157 0 : lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN);
158 0 : if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
159 : // missing data
160 0 : return FALSE;
161 : }
162 0 : return TRUE;
163 : }
164 :
165 : UBool
166 0 : CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const {
167 : // Both or neither need to be encoded as short primaries,
168 : // so that we can test only one and use the same bit mask.
169 0 : if(p >= firstShortPrimary) {
170 0 : return q >= firstShortPrimary;
171 0 : } else if(q >= firstShortPrimary) {
172 0 : return FALSE;
173 : }
174 : // Both or neither must be potentially-variable,
175 : // so that we can test only one and determine if both are variable.
176 0 : uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
177 0 : if(p > lastVariablePrimary) {
178 0 : return q > lastVariablePrimary;
179 0 : } else if(q > lastVariablePrimary) {
180 0 : return FALSE;
181 : }
182 : // Both will be encoded with long mini primaries.
183 : // They must be in the same special reordering group,
184 : // so that we can test only one and determine if both are variable.
185 0 : U_ASSERT(p != 0 && q != 0);
186 0 : for(int32_t i = 0;; ++i) { // will terminate
187 0 : uint32_t lastPrimary = lastSpecialPrimaries[i];
188 0 : if(p <= lastPrimary) {
189 0 : return q <= lastPrimary;
190 0 : } else if(q <= lastPrimary) {
191 0 : return FALSE;
192 : }
193 0 : }
194 : }
195 :
196 : void
197 0 : CollationFastLatinBuilder::resetCEs() {
198 0 : contractionCEs.removeAllElements();
199 0 : uniqueCEs.removeAllElements();
200 0 : shortPrimaryOverflow = FALSE;
201 0 : result.truncate(headerLength);
202 0 : }
203 :
204 : void
205 0 : CollationFastLatinBuilder::getCEs(const CollationData &data, UErrorCode &errorCode) {
206 0 : if(U_FAILURE(errorCode)) { return; }
207 0 : int32_t i = 0;
208 0 : for(UChar c = 0;; ++i, ++c) {
209 0 : if(c == CollationFastLatin::LATIN_LIMIT) {
210 0 : c = CollationFastLatin::PUNCT_START;
211 0 : } else if(c == CollationFastLatin::PUNCT_LIMIT) {
212 0 : break;
213 : }
214 : const CollationData *d;
215 0 : uint32_t ce32 = data.getCE32(c);
216 0 : if(ce32 == Collation::FALLBACK_CE32) {
217 0 : d = data.base;
218 0 : ce32 = d->getCE32(c);
219 : } else {
220 0 : d = &data;
221 : }
222 0 : if(getCEsFromCE32(*d, c, ce32, errorCode)) {
223 0 : charCEs[i][0] = ce0;
224 0 : charCEs[i][1] = ce1;
225 0 : addUniqueCE(ce0, errorCode);
226 0 : addUniqueCE(ce1, errorCode);
227 : } else {
228 : // bail out for c
229 0 : charCEs[i][0] = ce0 = Collation::NO_CE;
230 0 : charCEs[i][1] = ce1 = 0;
231 : }
232 0 : if(c == 0 && !isContractionCharCE(ce0)) {
233 : // Always map U+0000 to a contraction.
234 : // Write a contraction list with only a default value if there is no real contraction.
235 0 : U_ASSERT(contractionCEs.isEmpty());
236 0 : addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
237 0 : charCEs[0][0] = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
238 0 : charCEs[0][1] = 0;
239 : }
240 0 : }
241 : // Terminate the last contraction list.
242 0 : contractionCEs.addElement(CollationFastLatin::CONTR_CHAR_MASK, errorCode);
243 : }
244 :
245 : UBool
246 0 : CollationFastLatinBuilder::getCEsFromCE32(const CollationData &data, UChar32 c, uint32_t ce32,
247 : UErrorCode &errorCode) {
248 0 : if(U_FAILURE(errorCode)) { return FALSE; }
249 0 : ce32 = data.getFinalCE32(ce32);
250 0 : ce1 = 0;
251 0 : if(Collation::isSimpleOrLongCE32(ce32)) {
252 0 : ce0 = Collation::ceFromCE32(ce32);
253 : } else {
254 0 : switch(Collation::tagFromCE32(ce32)) {
255 : case Collation::LATIN_EXPANSION_TAG:
256 0 : ce0 = Collation::latinCE0FromCE32(ce32);
257 0 : ce1 = Collation::latinCE1FromCE32(ce32);
258 0 : break;
259 : case Collation::EXPANSION32_TAG: {
260 0 : const uint32_t *ce32s = data.ce32s + Collation::indexFromCE32(ce32);
261 0 : int32_t length = Collation::lengthFromCE32(ce32);
262 0 : if(length <= 2) {
263 0 : ce0 = Collation::ceFromCE32(ce32s[0]);
264 0 : if(length == 2) {
265 0 : ce1 = Collation::ceFromCE32(ce32s[1]);
266 : }
267 0 : break;
268 : } else {
269 0 : return FALSE;
270 : }
271 : }
272 : case Collation::EXPANSION_TAG: {
273 0 : const int64_t *ces = data.ces + Collation::indexFromCE32(ce32);
274 0 : int32_t length = Collation::lengthFromCE32(ce32);
275 0 : if(length <= 2) {
276 0 : ce0 = ces[0];
277 0 : if(length == 2) {
278 0 : ce1 = ces[1];
279 : }
280 0 : break;
281 : } else {
282 0 : return FALSE;
283 : }
284 : }
285 : // Note: We could support PREFIX_TAG (assert c>=0)
286 : // by recursing on its default CE32 and checking that none of the prefixes starts
287 : // with a fast Latin character.
288 : // However, currently (2013) there are only the L-before-middle-dot
289 : // prefix mappings in the Latin range, and those would be rejected anyway.
290 : case Collation::CONTRACTION_TAG:
291 0 : U_ASSERT(c >= 0);
292 0 : return getCEsFromContractionCE32(data, ce32, errorCode);
293 : case Collation::OFFSET_TAG:
294 0 : U_ASSERT(c >= 0);
295 0 : ce0 = data.getCEFromOffsetCE32(c, ce32);
296 0 : break;
297 : default:
298 0 : return FALSE;
299 : }
300 : }
301 : // A mapping can be completely ignorable.
302 0 : if(ce0 == 0) { return ce1 == 0; }
303 : // We do not support an ignorable ce0 unless it is completely ignorable.
304 0 : uint32_t p0 = (uint32_t)(ce0 >> 32);
305 0 : if(p0 == 0) { return FALSE; }
306 : // We only support primaries up to the Latin script.
307 0 : if(p0 > lastLatinPrimary) { return FALSE; }
308 : // We support non-common secondary and case weights only together with short primaries.
309 0 : uint32_t lower32_0 = (uint32_t)ce0;
310 0 : if(p0 < firstShortPrimary) {
311 0 : uint32_t sc0 = lower32_0 & Collation::SECONDARY_AND_CASE_MASK;
312 0 : if(sc0 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
313 : }
314 : // No below-common tertiary weights.
315 0 : if((lower32_0 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
316 0 : if(ce1 != 0) {
317 : // Both primaries must be in the same group,
318 : // or both must get short mini primaries,
319 : // or a short-primary CE is followed by a secondary CE.
320 : // This is so that we can test the first primary and use the same mask for both,
321 : // and determine for both whether they are variable.
322 0 : uint32_t p1 = (uint32_t)(ce1 >> 32);
323 0 : if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return FALSE; }
324 0 : uint32_t lower32_1 = (uint32_t)ce1;
325 : // No tertiary CEs.
326 0 : if((lower32_1 >> 16) == 0) { return FALSE; }
327 : // We support non-common secondary and case weights
328 : // only for secondary CEs or together with short primaries.
329 0 : if(p1 != 0 && p1 < firstShortPrimary) {
330 0 : uint32_t sc1 = lower32_1 & Collation::SECONDARY_AND_CASE_MASK;
331 0 : if(sc1 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
332 : }
333 : // No below-common tertiary weights.
334 0 : if((lower32_1 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
335 : }
336 : // No quaternary weights.
337 0 : if(((ce0 | ce1) & Collation::QUATERNARY_MASK) != 0) { return FALSE; }
338 0 : return TRUE;
339 : }
340 :
341 : UBool
342 0 : CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData &data, uint32_t ce32,
343 : UErrorCode &errorCode) {
344 0 : if(U_FAILURE(errorCode)) { return FALSE; }
345 0 : const UChar *p = data.contexts + Collation::indexFromCE32(ce32);
346 0 : ce32 = CollationData::readCE32(p); // Default if no suffix match.
347 : // Since the original ce32 is not a prefix mapping,
348 : // the default ce32 must not be another contraction.
349 0 : U_ASSERT(!Collation::isContractionCE32(ce32));
350 0 : int32_t contractionIndex = contractionCEs.size();
351 0 : if(getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
352 0 : addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
353 : } else {
354 : // Bail out for c-without-contraction.
355 0 : addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, Collation::NO_CE, 0, errorCode);
356 : }
357 : // Handle an encodable contraction unless the next contraction is too long
358 : // and starts with the same character.
359 0 : int32_t prevX = -1;
360 0 : UBool addContraction = FALSE;
361 0 : UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
362 0 : while(suffixes.next(errorCode)) {
363 0 : const UnicodeString &suffix = suffixes.getString();
364 0 : int32_t x = CollationFastLatin::getCharIndex(suffix.charAt(0));
365 0 : if(x < 0) { continue; } // ignore anything but fast Latin text
366 0 : if(x == prevX) {
367 0 : if(addContraction) {
368 : // Bail out for all contractions starting with this character.
369 0 : addContractionEntry(x, Collation::NO_CE, 0, errorCode);
370 0 : addContraction = FALSE;
371 : }
372 0 : continue;
373 : }
374 0 : if(addContraction) {
375 0 : addContractionEntry(prevX, ce0, ce1, errorCode);
376 : }
377 0 : ce32 = (uint32_t)suffixes.getValue();
378 0 : if(suffix.length() == 1 && getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
379 0 : addContraction = TRUE;
380 : } else {
381 0 : addContractionEntry(x, Collation::NO_CE, 0, errorCode);
382 0 : addContraction = FALSE;
383 : }
384 0 : prevX = x;
385 : }
386 0 : if(addContraction) {
387 0 : addContractionEntry(prevX, ce0, ce1, errorCode);
388 : }
389 0 : if(U_FAILURE(errorCode)) { return FALSE; }
390 : // Note: There might not be any fast Latin contractions, but
391 : // we need to enter contraction handling anyway so that we can bail out
392 : // when there is a non-fast-Latin character following.
393 : // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
394 : // following umlaut and bail out, rather than return the difference of Y vs. u.
395 0 : ce0 = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;
396 0 : ce1 = 0;
397 0 : return TRUE;
398 : }
399 :
400 : void
401 0 : CollationFastLatinBuilder::addContractionEntry(int32_t x, int64_t cce0, int64_t cce1,
402 : UErrorCode &errorCode) {
403 0 : contractionCEs.addElement(x, errorCode);
404 0 : contractionCEs.addElement(cce0, errorCode);
405 0 : contractionCEs.addElement(cce1, errorCode);
406 0 : addUniqueCE(cce0, errorCode);
407 0 : addUniqueCE(cce1, errorCode);
408 0 : }
409 :
410 : void
411 0 : CollationFastLatinBuilder::addUniqueCE(int64_t ce, UErrorCode &errorCode) {
412 0 : if(U_FAILURE(errorCode)) { return; }
413 0 : if(ce == 0 || (uint32_t)(ce >> 32) == Collation::NO_CE_PRIMARY) { return; }
414 0 : ce &= ~(int64_t)Collation::CASE_MASK; // blank out case bits
415 0 : int32_t i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
416 0 : if(i < 0) {
417 0 : uniqueCEs.insertElementAt(ce, ~i, errorCode);
418 : }
419 : }
420 :
421 : uint32_t
422 0 : CollationFastLatinBuilder::getMiniCE(int64_t ce) const {
423 0 : ce &= ~(int64_t)Collation::CASE_MASK; // blank out case bits
424 0 : int32_t index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
425 0 : U_ASSERT(index >= 0);
426 0 : return miniCEs[index];
427 : }
428 :
429 : UBool
430 0 : CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
431 0 : if(U_FAILURE(errorCode)) { return FALSE; }
432 0 : uprv_free(miniCEs);
433 0 : miniCEs = (uint16_t *)uprv_malloc(uniqueCEs.size() * 2);
434 0 : if(miniCEs == NULL) {
435 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
436 0 : return FALSE;
437 : }
438 0 : int32_t group = 0;
439 0 : uint32_t lastGroupPrimary = lastSpecialPrimaries[group];
440 : // The lowest unique CE must be at least a secondary CE.
441 0 : U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0);
442 0 : uint32_t prevPrimary = 0;
443 0 : uint32_t prevSecondary = 0;
444 0 : uint32_t pri = 0;
445 0 : uint32_t sec = 0;
446 0 : uint32_t ter = CollationFastLatin::COMMON_TER;
447 0 : for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
448 0 : int64_t ce = uniqueCEs.elementAti(i);
449 : // Note: At least one of the p/s/t weights changes from one unique CE to the next.
450 : // (uniqueCEs does not store case bits.)
451 0 : uint32_t p = (uint32_t)(ce >> 32);
452 0 : if(p != prevPrimary) {
453 0 : while(p > lastGroupPrimary) {
454 0 : U_ASSERT(pri <= CollationFastLatin::MAX_LONG);
455 : // Set the group's header entry to the
456 : // last "long primary" in or before the group.
457 0 : result.setCharAt(1 + group, (UChar)pri);
458 0 : if(++group < NUM_SPECIAL_GROUPS) {
459 0 : lastGroupPrimary = lastSpecialPrimaries[group];
460 : } else {
461 0 : lastGroupPrimary = 0xffffffff;
462 0 : break;
463 : }
464 : }
465 0 : if(p < firstShortPrimary) {
466 0 : if(pri == 0) {
467 0 : pri = CollationFastLatin::MIN_LONG;
468 0 : } else if(pri < CollationFastLatin::MAX_LONG) {
469 0 : pri += CollationFastLatin::LONG_INC;
470 : } else {
471 : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
472 : printf("long-primary overflow for %08x\n", p);
473 : #endif
474 0 : miniCEs[i] = CollationFastLatin::BAIL_OUT;
475 0 : continue;
476 : }
477 : } else {
478 0 : if(pri < CollationFastLatin::MIN_SHORT) {
479 0 : pri = CollationFastLatin::MIN_SHORT;
480 0 : } else if(pri < (CollationFastLatin::MAX_SHORT - CollationFastLatin::SHORT_INC)) {
481 : // Reserve the highest primary weight for U+FFFF.
482 0 : pri += CollationFastLatin::SHORT_INC;
483 : } else {
484 : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
485 : printf("short-primary overflow for %08x\n", p);
486 : #endif
487 0 : shortPrimaryOverflow = TRUE;
488 0 : miniCEs[i] = CollationFastLatin::BAIL_OUT;
489 0 : continue;
490 : }
491 : }
492 0 : prevPrimary = p;
493 0 : prevSecondary = Collation::COMMON_WEIGHT16;
494 0 : sec = CollationFastLatin::COMMON_SEC;
495 0 : ter = CollationFastLatin::COMMON_TER;
496 : }
497 0 : uint32_t lower32 = (uint32_t)ce;
498 0 : uint32_t s = lower32 >> 16;
499 0 : if(s != prevSecondary) {
500 0 : if(pri == 0) {
501 0 : if(sec == 0) {
502 0 : sec = CollationFastLatin::MIN_SEC_HIGH;
503 0 : } else if(sec < CollationFastLatin::MAX_SEC_HIGH) {
504 0 : sec += CollationFastLatin::SEC_INC;
505 : } else {
506 0 : miniCEs[i] = CollationFastLatin::BAIL_OUT;
507 0 : continue;
508 : }
509 0 : prevSecondary = s;
510 0 : ter = CollationFastLatin::COMMON_TER;
511 0 : } else if(s < Collation::COMMON_WEIGHT16) {
512 0 : if(sec == CollationFastLatin::COMMON_SEC) {
513 0 : sec = CollationFastLatin::MIN_SEC_BEFORE;
514 0 : } else if(sec < CollationFastLatin::MAX_SEC_BEFORE) {
515 0 : sec += CollationFastLatin::SEC_INC;
516 : } else {
517 0 : miniCEs[i] = CollationFastLatin::BAIL_OUT;
518 0 : continue;
519 : }
520 0 : } else if(s == Collation::COMMON_WEIGHT16) {
521 0 : sec = CollationFastLatin::COMMON_SEC;
522 : } else {
523 0 : if(sec < CollationFastLatin::MIN_SEC_AFTER) {
524 0 : sec = CollationFastLatin::MIN_SEC_AFTER;
525 0 : } else if(sec < CollationFastLatin::MAX_SEC_AFTER) {
526 0 : sec += CollationFastLatin::SEC_INC;
527 : } else {
528 0 : miniCEs[i] = CollationFastLatin::BAIL_OUT;
529 0 : continue;
530 : }
531 : }
532 0 : prevSecondary = s;
533 0 : ter = CollationFastLatin::COMMON_TER;
534 : }
535 0 : U_ASSERT((lower32 & Collation::CASE_MASK) == 0); // blanked out in uniqueCEs
536 0 : uint32_t t = lower32 & Collation::ONLY_TERTIARY_MASK;
537 0 : if(t > Collation::COMMON_WEIGHT16) {
538 0 : if(ter < CollationFastLatin::MAX_TER_AFTER) {
539 0 : ++ter;
540 : } else {
541 0 : miniCEs[i] = CollationFastLatin::BAIL_OUT;
542 0 : continue;
543 : }
544 : }
545 0 : if(CollationFastLatin::MIN_LONG <= pri && pri <= CollationFastLatin::MAX_LONG) {
546 0 : U_ASSERT(sec == CollationFastLatin::COMMON_SEC);
547 0 : miniCEs[i] = (uint16_t)(pri | ter);
548 : } else {
549 0 : miniCEs[i] = (uint16_t)(pri | sec | ter);
550 : }
551 : }
552 : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
553 : printf("last mini primary: %04x\n", pri);
554 : #endif
555 : #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
556 : for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
557 : int64_t ce = uniqueCEs.elementAti(i);
558 : printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
559 : }
560 : #endif
561 0 : return U_SUCCESS(errorCode);
562 : }
563 :
564 : UBool
565 0 : CollationFastLatinBuilder::encodeCharCEs(UErrorCode &errorCode) {
566 0 : if(U_FAILURE(errorCode)) { return FALSE; }
567 0 : int32_t miniCEsStart = result.length();
568 0 : for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
569 0 : result.append((UChar)0); // initialize to completely ignorable
570 : }
571 0 : int32_t indexBase = result.length();
572 0 : for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
573 0 : int64_t ce = charCEs[i][0];
574 0 : if(isContractionCharCE(ce)) { continue; } // defer contraction
575 0 : uint32_t miniCE = encodeTwoCEs(ce, charCEs[i][1]);
576 0 : if(miniCE > 0xffff) {
577 : // Note: There is a chance that this new expansion is the same as a previous one,
578 : // and if so, then we could reuse the other expansion.
579 : // However, that seems unlikely.
580 0 : int32_t expansionIndex = result.length() - indexBase;
581 0 : if(expansionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
582 0 : miniCE = CollationFastLatin::BAIL_OUT;
583 : } else {
584 0 : result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
585 0 : miniCE = CollationFastLatin::EXPANSION | expansionIndex;
586 : }
587 : }
588 0 : result.setCharAt(miniCEsStart + i, (UChar)miniCE);
589 : }
590 0 : return U_SUCCESS(errorCode);
591 : }
592 :
593 : UBool
594 0 : CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) {
595 : // We encode all contraction lists so that the first word of a list
596 : // terminates the previous list, and we only need one additional terminator at the end.
597 0 : if(U_FAILURE(errorCode)) { return FALSE; }
598 0 : int32_t indexBase = headerLength + CollationFastLatin::NUM_FAST_CHARS;
599 0 : int32_t firstContractionIndex = result.length();
600 0 : for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
601 0 : int64_t ce = charCEs[i][0];
602 0 : if(!isContractionCharCE(ce)) { continue; }
603 0 : int32_t contractionIndex = result.length() - indexBase;
604 0 : if(contractionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
605 0 : result.setCharAt(headerLength + i, CollationFastLatin::BAIL_OUT);
606 0 : continue;
607 : }
608 0 : UBool firstTriple = TRUE;
609 0 : for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) {
610 0 : int32_t x = contractionCEs.elementAti(index);
611 0 : if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; }
612 0 : int64_t cce0 = contractionCEs.elementAti(index + 1);
613 0 : int64_t cce1 = contractionCEs.elementAti(index + 2);
614 0 : uint32_t miniCE = encodeTwoCEs(cce0, cce1);
615 0 : if(miniCE == CollationFastLatin::BAIL_OUT) {
616 0 : result.append((UChar)(x | (1 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
617 0 : } else if(miniCE <= 0xffff) {
618 0 : result.append((UChar)(x | (2 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
619 0 : result.append((UChar)miniCE);
620 : } else {
621 0 : result.append((UChar)(x | (3 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
622 0 : result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
623 : }
624 0 : firstTriple = FALSE;
625 0 : }
626 : // Note: There is a chance that this new contraction list is the same as a previous one,
627 : // and if so, then we could truncate the result and reuse the other list.
628 : // However, that seems unlikely.
629 0 : result.setCharAt(headerLength + i,
630 0 : (UChar)(CollationFastLatin::CONTRACTION | contractionIndex));
631 : }
632 0 : if(result.length() > firstContractionIndex) {
633 : // Terminate the last contraction list.
634 0 : result.append((UChar)CollationFastLatin::CONTR_CHAR_MASK);
635 : }
636 0 : if(result.isBogus()) {
637 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
638 0 : return FALSE;
639 : }
640 : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
641 : printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
642 : puts(" header & below-digit groups map");
643 : int32_t i = 0;
644 : for(; i < headerLength; ++i) {
645 : printf(" %04x", result[i]);
646 : }
647 : printf("\n char mini CEs");
648 : U_ASSERT(CollationFastLatin::NUM_FAST_CHARS % 16 == 0);
649 : for(; i < indexBase; i += 16) {
650 : UChar32 c = i - headerLength;
651 : if(c >= CollationFastLatin::LATIN_LIMIT) {
652 : c = CollationFastLatin::PUNCT_START + c - CollationFastLatin::LATIN_LIMIT;
653 : }
654 : printf("\n %04x:", c);
655 : for(int32_t j = 0; j < 16; ++j) {
656 : printf(" %04x", result[i + j]);
657 : }
658 : }
659 : printf("\n expansions & contractions");
660 : for(; i < result.length(); ++i) {
661 : if((i - indexBase) % 16 == 0) { puts(""); }
662 : printf(" %04x", result[i]);
663 : }
664 : puts("");
665 : #endif
666 0 : return TRUE;
667 : }
668 :
669 : uint32_t
670 0 : CollationFastLatinBuilder::encodeTwoCEs(int64_t first, int64_t second) const {
671 0 : if(first == 0) {
672 0 : return 0; // completely ignorable
673 : }
674 0 : if(first == Collation::NO_CE) {
675 0 : return CollationFastLatin::BAIL_OUT;
676 : }
677 0 : U_ASSERT((uint32_t)(first >> 32) != Collation::NO_CE_PRIMARY);
678 :
679 0 : uint32_t miniCE = getMiniCE(first);
680 0 : if(miniCE == CollationFastLatin::BAIL_OUT) { return miniCE; }
681 0 : if(miniCE >= CollationFastLatin::MIN_SHORT) {
682 : // Extract & copy the case bits.
683 : // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
684 0 : uint32_t c = (((uint32_t)first & Collation::CASE_MASK) >> (14 - 3));
685 : // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
686 0 : c += CollationFastLatin::LOWER_CASE;
687 0 : miniCE |= c;
688 : }
689 0 : if(second == 0) { return miniCE; }
690 :
691 0 : uint32_t miniCE1 = getMiniCE(second);
692 0 : if(miniCE1 == CollationFastLatin::BAIL_OUT) { return miniCE1; }
693 :
694 0 : uint32_t case1 = (uint32_t)second & Collation::CASE_MASK;
695 0 : if(miniCE >= CollationFastLatin::MIN_SHORT &&
696 0 : (miniCE & CollationFastLatin::SECONDARY_MASK) == CollationFastLatin::COMMON_SEC) {
697 : // Try to combine the two mini CEs into one.
698 0 : uint32_t sec1 = miniCE1 & CollationFastLatin::SECONDARY_MASK;
699 0 : uint32_t ter1 = miniCE1 & CollationFastLatin::TERTIARY_MASK;
700 0 : if(sec1 >= CollationFastLatin::MIN_SEC_HIGH && case1 == 0 &&
701 : ter1 == CollationFastLatin::COMMON_TER) {
702 : // sec1>=sec_high implies pri1==0.
703 0 : return (miniCE & ~CollationFastLatin::SECONDARY_MASK) | sec1;
704 : }
705 : }
706 :
707 0 : if(miniCE1 <= CollationFastLatin::SECONDARY_MASK || CollationFastLatin::MIN_SHORT <= miniCE1) {
708 : // Secondary CE, or a CE with a short primary, copy the case bits.
709 0 : case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE;
710 0 : miniCE1 |= case1;
711 : }
712 0 : return (miniCE << 16) | miniCE1;
713 : }
714 :
715 : U_NAMESPACE_END
716 :
717 : #endif // !UCONFIG_NO_COLLATION
|