Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collation.h
9 : *
10 : * created on: 2010oct27
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #ifndef __COLLATION_H__
15 : #define __COLLATION_H__
16 :
17 : #include "unicode/utypes.h"
18 :
19 : #if !UCONFIG_NO_COLLATION
20 :
21 : U_NAMESPACE_BEGIN
22 :
23 : /**
24 : * Collation v2 basic definitions and static helper functions.
25 : *
26 : * Data structures except for expansion tables store 32-bit CEs which are
27 : * either specials (see tags below) or are compact forms of 64-bit CEs.
28 : */
29 : class U_I18N_API Collation {
30 : public:
31 : // Special sort key bytes for all levels.
32 : static const uint8_t TERMINATOR_BYTE = 0;
33 : static const uint8_t LEVEL_SEPARATOR_BYTE = 1;
34 :
35 : /** The secondary/tertiary lower limit for tailoring before any root elements. */
36 : static const uint32_t BEFORE_WEIGHT16 = 0x0100;
37 :
38 : /**
39 : * Merge-sort-key separator.
40 : * Same as the unique primary and identical-level weights of U+FFFE.
41 : * Must not be used as primary compression low terminator.
42 : * Otherwise usable.
43 : */
44 : static const uint8_t MERGE_SEPARATOR_BYTE = 2;
45 : static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE
46 : static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE
47 :
48 : /**
49 : * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE.
50 : * Reserved value in primary second byte if the lead byte is compressible.
51 : * Otherwise usable in all CE weight bytes.
52 : */
53 : static const uint8_t PRIMARY_COMPRESSION_LOW_BYTE = 3;
54 : /**
55 : * Primary compression high terminator.
56 : * Reserved value in primary second byte if the lead byte is compressible.
57 : * Otherwise usable in all CE weight bytes.
58 : */
59 : static const uint8_t PRIMARY_COMPRESSION_HIGH_BYTE = 0xff;
60 :
61 : /** Default secondary/tertiary weight lead byte. */
62 : static const uint8_t COMMON_BYTE = 5;
63 : static const uint32_t COMMON_WEIGHT16 = 0x0500;
64 : /** Middle 16 bits of a CE with a common secondary weight. */
65 : static const uint32_t COMMON_SECONDARY_CE = 0x05000000;
66 : /** Lower 16 bits of a CE with a common tertiary weight. */
67 : static const uint32_t COMMON_TERTIARY_CE = 0x0500;
68 : /** Lower 32 bits of a CE with common secondary and tertiary weights. */
69 : static const uint32_t COMMON_SEC_AND_TER_CE = 0x05000500;
70 :
71 : static const uint32_t SECONDARY_MASK = 0xffff0000;
72 : static const uint32_t CASE_MASK = 0xc000;
73 : static const uint32_t SECONDARY_AND_CASE_MASK = SECONDARY_MASK | CASE_MASK;
74 : /** Only the 2*6 bits for the pure tertiary weight. */
75 : static const uint32_t ONLY_TERTIARY_MASK = 0x3f3f;
76 : /** Only the secondary & tertiary bits; no case, no quaternary. */
77 : static const uint32_t ONLY_SEC_TER_MASK = SECONDARY_MASK | ONLY_TERTIARY_MASK;
78 : /** Case bits and tertiary bits. */
79 : static const uint32_t CASE_AND_TERTIARY_MASK = CASE_MASK | ONLY_TERTIARY_MASK;
80 : static const uint32_t QUATERNARY_MASK = 0xc0;
81 : /** Case bits and quaternary bits. */
82 : static const uint32_t CASE_AND_QUATERNARY_MASK = CASE_MASK | QUATERNARY_MASK;
83 :
84 : static const uint8_t UNASSIGNED_IMPLICIT_BYTE = 0xfe; // compressible
85 : /**
86 : * First unassigned: AlphabeticIndex overflow boundary.
87 : * We want a 3-byte primary so that it fits into the root elements table.
88 : *
89 : * This 3-byte primary will not collide with
90 : * any unassigned-implicit 4-byte primaries because
91 : * the first few hundred Unicode code points all have real mappings.
92 : */
93 : static const uint32_t FIRST_UNASSIGNED_PRIMARY = 0xfe040200;
94 :
95 : static const uint8_t TRAIL_WEIGHT_BYTE = 0xff; // not compressible
96 : static const uint32_t FIRST_TRAILING_PRIMARY = 0xff020200; // [first trailing]
97 : static const uint32_t MAX_PRIMARY = 0xffff0000; // U+FFFF
98 : static const uint32_t MAX_REGULAR_CE32 = 0xffff0505; // U+FFFF
99 :
100 : // CE32 value for U+FFFD as well as illegal UTF-8 byte sequences (which behave like U+FFFD).
101 : // We use the third-highest primary weight for U+FFFD (as in UCA 6.3+).
102 : static const uint32_t FFFD_PRIMARY = MAX_PRIMARY - 0x20000;
103 : static const uint32_t FFFD_CE32 = MAX_REGULAR_CE32 - 0x20000;
104 :
105 : /**
106 : * A CE32 is special if its low byte is this or greater.
107 : * Impossible case bits 11 mark special CE32s.
108 : * This value itself is used to indicate a fallback to the base collator.
109 : */
110 : static const uint8_t SPECIAL_CE32_LOW_BYTE = 0xc0;
111 : static const uint32_t FALLBACK_CE32 = SPECIAL_CE32_LOW_BYTE;
112 : /**
113 : * Low byte of a long-primary special CE32.
114 : */
115 : static const uint8_t LONG_PRIMARY_CE32_LOW_BYTE = 0xc1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG
116 :
117 : static const uint32_t UNASSIGNED_CE32 = 0xffffffff; // Compute an unassigned-implicit CE.
118 :
119 : static const uint32_t NO_CE32 = 1;
120 :
121 : /** No CE: End of input. Only used in runtime code, not stored in data. */
122 : static const uint32_t NO_CE_PRIMARY = 1; // not a left-adjusted weight
123 : static const uint32_t NO_CE_WEIGHT16 = 0x0100; // weight of LEVEL_SEPARATOR_BYTE
124 : static const int64_t NO_CE = INT64_C(0x101000100); // NO_CE_PRIMARY, NO_CE_WEIGHT16, NO_CE_WEIGHT16
125 :
126 : /** Sort key levels. */
127 : enum Level {
128 : /** Unspecified level. */
129 : NO_LEVEL,
130 : PRIMARY_LEVEL,
131 : SECONDARY_LEVEL,
132 : CASE_LEVEL,
133 : TERTIARY_LEVEL,
134 : QUATERNARY_LEVEL,
135 : IDENTICAL_LEVEL,
136 : /** Beyond sort key bytes. */
137 : ZERO_LEVEL
138 : };
139 :
140 : /**
141 : * Sort key level flags: xx_FLAG = 1 << xx_LEVEL.
142 : * In Java, use enum Level with flag() getters, or use EnumSet rather than hand-made bit sets.
143 : */
144 : static const uint32_t NO_LEVEL_FLAG = 1;
145 : static const uint32_t PRIMARY_LEVEL_FLAG = 2;
146 : static const uint32_t SECONDARY_LEVEL_FLAG = 4;
147 : static const uint32_t CASE_LEVEL_FLAG = 8;
148 : static const uint32_t TERTIARY_LEVEL_FLAG = 0x10;
149 : static const uint32_t QUATERNARY_LEVEL_FLAG = 0x20;
150 : static const uint32_t IDENTICAL_LEVEL_FLAG = 0x40;
151 : static const uint32_t ZERO_LEVEL_FLAG = 0x80;
152 :
153 : /**
154 : * Special-CE32 tags, from bits 3..0 of a special 32-bit CE.
155 : * Bits 31..8 are available for tag-specific data.
156 : * Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0.
157 : */
158 : enum {
159 : /**
160 : * Fall back to the base collator.
161 : * This is the tag value in SPECIAL_CE32_LOW_BYTE and FALLBACK_CE32.
162 : * Bits 31..8: Unused, 0.
163 : */
164 : FALLBACK_TAG = 0,
165 : /**
166 : * Long-primary CE with COMMON_SEC_AND_TER_CE.
167 : * Bits 31..8: Three-byte primary.
168 : */
169 : LONG_PRIMARY_TAG = 1,
170 : /**
171 : * Long-secondary CE with zero primary.
172 : * Bits 31..16: Secondary weight.
173 : * Bits 15.. 8: Tertiary weight.
174 : */
175 : LONG_SECONDARY_TAG = 2,
176 : /**
177 : * Unused.
178 : * May be used in the future for single-byte secondary CEs (SHORT_SECONDARY_TAG),
179 : * storing the secondary in bits 31..24, the ccc in bits 23..16,
180 : * and the tertiary in bits 15..8.
181 : */
182 : RESERVED_TAG_3 = 3,
183 : /**
184 : * Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05].
185 : * Bits 31..24: Single-byte primary weight pp of the first CE.
186 : * Bits 23..16: Tertiary weight tt of the first CE.
187 : * Bits 15.. 8: Secondary weight ss of the second CE.
188 : */
189 : LATIN_EXPANSION_TAG = 4,
190 : /**
191 : * Points to one or more simple/long-primary/long-secondary 32-bit CE32s.
192 : * Bits 31..13: Index into uint32_t table.
193 : * Bits 12.. 8: Length=1..31.
194 : */
195 : EXPANSION32_TAG = 5,
196 : /**
197 : * Points to one or more 64-bit CEs.
198 : * Bits 31..13: Index into CE table.
199 : * Bits 12.. 8: Length=1..31.
200 : */
201 : EXPANSION_TAG = 6,
202 : /**
203 : * Builder data, used only in the CollationDataBuilder, not in runtime data.
204 : *
205 : * If bit 8 is 0: Builder context, points to a list of context-sensitive mappings.
206 : * Bits 31..13: Index to the builder's list of ConditionalCE32 for this character.
207 : * Bits 12.. 9: Unused, 0.
208 : *
209 : * If bit 8 is 1 (IS_BUILDER_JAMO_CE32): Builder-only jamoCE32 value.
210 : * The builder fetches the Jamo CE32 from the trie.
211 : * Bits 31..13: Jamo code point.
212 : * Bits 12.. 9: Unused, 0.
213 : */
214 : BUILDER_DATA_TAG = 7,
215 : /**
216 : * Points to prefix trie.
217 : * Bits 31..13: Index into prefix/contraction data.
218 : * Bits 12.. 8: Unused, 0.
219 : */
220 : PREFIX_TAG = 8,
221 : /**
222 : * Points to contraction data.
223 : * Bits 31..13: Index into prefix/contraction data.
224 : * Bits 12..11: Unused, 0.
225 : * Bit 10: CONTRACT_TRAILING_CCC flag.
226 : * Bit 9: CONTRACT_NEXT_CCC flag.
227 : * Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag.
228 : */
229 : CONTRACTION_TAG = 9,
230 : /**
231 : * Decimal digit.
232 : * Bits 31..13: Index into uint32_t table for non-numeric-collation CE32.
233 : * Bit 12: Unused, 0.
234 : * Bits 11.. 8: Digit value 0..9.
235 : */
236 : DIGIT_TAG = 10,
237 : /**
238 : * Tag for U+0000, for moving the NUL-termination handling
239 : * from the regular fastpath into specials-handling code.
240 : * Bits 31..8: Unused, 0.
241 : */
242 : U0000_TAG = 11,
243 : /**
244 : * Tag for a Hangul syllable.
245 : * Bits 31..9: Unused, 0.
246 : * Bit 8: HANGUL_NO_SPECIAL_JAMO flag.
247 : */
248 : HANGUL_TAG = 12,
249 : /**
250 : * Tag for a lead surrogate code unit.
251 : * Optional optimization for UTF-16 string processing.
252 : * Bits 31..10: Unused, 0.
253 : * 9.. 8: =0: All associated supplementary code points are unassigned-implict.
254 : * =1: All associated supplementary code points fall back to the base data.
255 : * else: (Normally 2) Look up the data for the supplementary code point.
256 : */
257 : LEAD_SURROGATE_TAG = 13,
258 : /**
259 : * Tag for CEs with primary weights in code point order.
260 : * Bits 31..13: Index into CE table, for one data "CE".
261 : * Bits 12.. 8: Unused, 0.
262 : *
263 : * This data "CE" has the following bit fields:
264 : * Bits 63..32: Three-byte primary pppppp00.
265 : * 31.. 8: Start/base code point of the in-order range.
266 : * 7: Flag isCompressible primary.
267 : * 6.. 0: Per-code point primary-weight increment.
268 : */
269 : OFFSET_TAG = 14,
270 : /**
271 : * Implicit CE tag. Compute an unassigned-implicit CE.
272 : * All bits are set (UNASSIGNED_CE32=0xffffffff).
273 : */
274 : IMPLICIT_TAG = 15
275 : };
276 :
277 0 : static UBool isAssignedCE32(uint32_t ce32) {
278 0 : return ce32 != FALLBACK_CE32 && ce32 != UNASSIGNED_CE32;
279 : }
280 :
281 : /**
282 : * We limit the number of CEs in an expansion
283 : * so that we can use a small number of length bits in the data structure,
284 : * and so that an implementation can copy CEs at runtime without growing a destination buffer.
285 : */
286 : static const int32_t MAX_EXPANSION_LENGTH = 31;
287 : static const int32_t MAX_INDEX = 0x7ffff;
288 :
289 : /**
290 : * Set if there is no match for the single (no-suffix) character itself.
291 : * This is only possible if there is a prefix.
292 : * In this case, discontiguous contraction matching cannot add combining marks
293 : * starting from an empty suffix.
294 : * The default CE32 is used anyway if there is no suffix match.
295 : */
296 : static const uint32_t CONTRACT_SINGLE_CP_NO_MATCH = 0x100;
297 : /** Set if the first character of every contraction suffix has lccc!=0. */
298 : static const uint32_t CONTRACT_NEXT_CCC = 0x200;
299 : /** Set if any contraction suffix ends with lccc!=0. */
300 : static const uint32_t CONTRACT_TRAILING_CCC = 0x400;
301 :
302 : /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */
303 : static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100;
304 :
305 : static const uint32_t LEAD_ALL_UNASSIGNED = 0;
306 : static const uint32_t LEAD_ALL_FALLBACK = 0x100;
307 : static const uint32_t LEAD_MIXED = 0x200;
308 : static const uint32_t LEAD_TYPE_MASK = 0x300;
309 :
310 0 : static uint32_t makeLongPrimaryCE32(uint32_t p) { return p | LONG_PRIMARY_CE32_LOW_BYTE; }
311 :
312 : /** Turns the long-primary CE32 into a primary weight pppppp00. */
313 0 : static inline uint32_t primaryFromLongPrimaryCE32(uint32_t ce32) {
314 0 : return ce32 & 0xffffff00;
315 : }
316 0 : static inline int64_t ceFromLongPrimaryCE32(uint32_t ce32) {
317 0 : return ((int64_t)(ce32 & 0xffffff00) << 32) | COMMON_SEC_AND_TER_CE;
318 : }
319 :
320 0 : static uint32_t makeLongSecondaryCE32(uint32_t lower32) {
321 0 : return lower32 | SPECIAL_CE32_LOW_BYTE | LONG_SECONDARY_TAG;
322 : }
323 0 : static inline int64_t ceFromLongSecondaryCE32(uint32_t ce32) {
324 0 : return ce32 & 0xffffff00;
325 : }
326 :
327 : /** Makes a special CE32 with tag, index and length. */
328 0 : static uint32_t makeCE32FromTagIndexAndLength(int32_t tag, int32_t index, int32_t length) {
329 0 : return (index << 13) | (length << 8) | SPECIAL_CE32_LOW_BYTE | tag;
330 : }
331 : /** Makes a special CE32 with only tag and index. */
332 0 : static uint32_t makeCE32FromTagAndIndex(int32_t tag, int32_t index) {
333 0 : return (index << 13) | SPECIAL_CE32_LOW_BYTE | tag;
334 : }
335 :
336 0 : static inline UBool isSpecialCE32(uint32_t ce32) {
337 0 : return (ce32 & 0xff) >= SPECIAL_CE32_LOW_BYTE;
338 : }
339 :
340 0 : static inline int32_t tagFromCE32(uint32_t ce32) {
341 0 : return (int32_t)(ce32 & 0xf);
342 : }
343 :
344 0 : static inline UBool hasCE32Tag(uint32_t ce32, int32_t tag) {
345 0 : return isSpecialCE32(ce32) && tagFromCE32(ce32) == tag;
346 : }
347 :
348 0 : static inline UBool isLongPrimaryCE32(uint32_t ce32) {
349 0 : return hasCE32Tag(ce32, LONG_PRIMARY_TAG);
350 : }
351 :
352 0 : static UBool isSimpleOrLongCE32(uint32_t ce32) {
353 0 : return !isSpecialCE32(ce32) ||
354 0 : tagFromCE32(ce32) == LONG_PRIMARY_TAG ||
355 0 : tagFromCE32(ce32) == LONG_SECONDARY_TAG;
356 : }
357 :
358 : /**
359 : * @return TRUE if the ce32 yields one or more CEs without further data lookups
360 : */
361 0 : static UBool isSelfContainedCE32(uint32_t ce32) {
362 0 : return !isSpecialCE32(ce32) ||
363 0 : tagFromCE32(ce32) == LONG_PRIMARY_TAG ||
364 0 : tagFromCE32(ce32) == LONG_SECONDARY_TAG ||
365 0 : tagFromCE32(ce32) == LATIN_EXPANSION_TAG;
366 : }
367 :
368 0 : static inline UBool isPrefixCE32(uint32_t ce32) {
369 0 : return hasCE32Tag(ce32, PREFIX_TAG);
370 : }
371 :
372 0 : static inline UBool isContractionCE32(uint32_t ce32) {
373 0 : return hasCE32Tag(ce32, CONTRACTION_TAG);
374 : }
375 :
376 0 : static inline UBool ce32HasContext(uint32_t ce32) {
377 0 : return isSpecialCE32(ce32) &&
378 0 : (tagFromCE32(ce32) == PREFIX_TAG ||
379 0 : tagFromCE32(ce32) == CONTRACTION_TAG);
380 : }
381 :
382 : /**
383 : * Get the first of the two Latin-expansion CEs encoded in ce32.
384 : * @see LATIN_EXPANSION_TAG
385 : */
386 0 : static inline int64_t latinCE0FromCE32(uint32_t ce32) {
387 0 : return ((int64_t)(ce32 & 0xff000000) << 32) | COMMON_SECONDARY_CE | ((ce32 & 0xff0000) >> 8);
388 : }
389 :
390 : /**
391 : * Get the second of the two Latin-expansion CEs encoded in ce32.
392 : * @see LATIN_EXPANSION_TAG
393 : */
394 0 : static inline int64_t latinCE1FromCE32(uint32_t ce32) {
395 0 : return ((ce32 & 0xff00) << 16) | COMMON_TERTIARY_CE;
396 : }
397 :
398 : /**
399 : * Returns the data index from a special CE32.
400 : */
401 0 : static inline int32_t indexFromCE32(uint32_t ce32) {
402 0 : return (int32_t)(ce32 >> 13);
403 : }
404 :
405 : /**
406 : * Returns the data length from a ce32.
407 : */
408 0 : static inline int32_t lengthFromCE32(uint32_t ce32) {
409 0 : return (ce32 >> 8) & 31;
410 : }
411 :
412 : /**
413 : * Returns the digit value from a DIGIT_TAG ce32.
414 : */
415 0 : static inline char digitFromCE32(uint32_t ce32) {
416 0 : return (char)((ce32 >> 8) & 0xf);
417 : }
418 :
419 : /** Returns a 64-bit CE from a simple CE32 (not special). */
420 0 : static inline int64_t ceFromSimpleCE32(uint32_t ce32) {
421 : // normal form ppppsstt -> pppp0000ss00tt00
422 : // assert (ce32 & 0xff) < SPECIAL_CE32_LOW_BYTE
423 0 : return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | ((ce32 & 0xff) << 8);
424 : }
425 :
426 : /** Returns a 64-bit CE from a simple/long-primary/long-secondary CE32. */
427 0 : static inline int64_t ceFromCE32(uint32_t ce32) {
428 0 : uint32_t tertiary = ce32 & 0xff;
429 0 : if(tertiary < SPECIAL_CE32_LOW_BYTE) {
430 : // normal form ppppsstt -> pppp0000ss00tt00
431 0 : return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (tertiary << 8);
432 : } else {
433 0 : ce32 -= tertiary;
434 0 : if((tertiary & 0xf) == LONG_PRIMARY_TAG) {
435 : // long-primary form ppppppC1 -> pppppp00050000500
436 0 : return ((int64_t)ce32 << 32) | COMMON_SEC_AND_TER_CE;
437 : } else {
438 : // long-secondary form ssssttC2 -> 00000000sssstt00
439 : // assert (tertiary & 0xf) == LONG_SECONDARY_TAG
440 0 : return ce32;
441 : }
442 : }
443 : }
444 :
445 : /** Creates a CE from a primary weight. */
446 0 : static inline int64_t makeCE(uint32_t p) {
447 0 : return ((int64_t)p << 32) | COMMON_SEC_AND_TER_CE;
448 : }
449 : /**
450 : * Creates a CE from a primary weight,
451 : * 16-bit secondary/tertiary weights, and a 2-bit quaternary.
452 : */
453 0 : static inline int64_t makeCE(uint32_t p, uint32_t s, uint32_t t, uint32_t q) {
454 0 : return ((int64_t)p << 32) | (s << 16) | t | (q << 6);
455 : }
456 :
457 : /**
458 : * Increments a 2-byte primary by a code point offset.
459 : */
460 : static uint32_t incTwoBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible,
461 : int32_t offset);
462 :
463 : /**
464 : * Increments a 3-byte primary by a code point offset.
465 : */
466 : static uint32_t incThreeBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible,
467 : int32_t offset);
468 :
469 : /**
470 : * Decrements a 2-byte primary by one range step (1..0x7f).
471 : */
472 : static uint32_t decTwoBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step);
473 :
474 : /**
475 : * Decrements a 3-byte primary by one range step (1..0x7f).
476 : */
477 : static uint32_t decThreeBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step);
478 :
479 : /**
480 : * Computes a 3-byte primary for c's OFFSET_TAG data "CE".
481 : */
482 : static uint32_t getThreeBytePrimaryForOffsetData(UChar32 c, int64_t dataCE);
483 :
484 : /**
485 : * Returns the unassigned-character implicit primary weight for any valid code point c.
486 : */
487 : static uint32_t unassignedPrimaryFromCodePoint(UChar32 c);
488 :
489 0 : static inline int64_t unassignedCEFromCodePoint(UChar32 c) {
490 0 : return makeCE(unassignedPrimaryFromCodePoint(c));
491 : }
492 :
493 : private:
494 : Collation(); // No instantiation.
495 : };
496 :
497 : U_NAMESPACE_END
498 :
499 : #endif // !UCONFIG_NO_COLLATION
500 : #endif // __COLLATION_H__
|