Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2012-2014, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationfcd.h
9 : *
10 : * created on: 2012aug18
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #ifndef __COLLATIONFCD_H__
15 : #define __COLLATIONFCD_H__
16 :
17 : #include "unicode/utypes.h"
18 :
19 : #if !UCONFIG_NO_COLLATION
20 :
21 : #include "unicode/utf16.h"
22 :
23 : U_NAMESPACE_BEGIN
24 :
25 : /**
26 : * Data and functions for the FCD check fast path.
27 : *
28 : * The fast path looks at a pair of 16-bit code units and checks
29 : * whether there is an FCD boundary between them;
30 : * there is if the first unit has a trailing ccc=0 (!hasTccc(first))
31 : * or the second unit has a leading ccc=0 (!hasLccc(second)),
32 : * or both.
33 : * When the fast path finds a possible non-boundary,
34 : * then the FCD check slow path looks at the actual sequence of FCD values.
35 : *
36 : * This is a pure optimization.
37 : * The fast path must at least find all possible non-boundaries.
38 : * If the fast path is too pessimistic, it costs performance.
39 : *
40 : * For a pair of BMP characters, the fast path tests are precise (1 bit per character).
41 : *
42 : * For a supplementary code point, the two units are its lead and trail surrogates.
43 : * We set hasTccc(lead)=true if any of its 1024 associated supplementary code points
44 : * has lccc!=0 or tccc!=0.
45 : * We set hasLccc(trail)=true for all trail surrogates.
46 : * As a result, we leave the fast path if the lead surrogate might start a
47 : * supplementary code point that is not FCD-inert.
48 : * (So the fast path need not detect that there is a surrogate pair,
49 : * nor look ahead to the next full code point.)
50 : *
51 : * hasLccc(lead)=true if any of its 1024 associated supplementary code points
52 : * has lccc!=0, for fast boundary checking between BMP & supplementary.
53 : *
54 : * hasTccc(trail)=false:
55 : * It should only be tested for unpaired trail surrogates which are FCD-inert.
56 : */
57 : class U_I18N_API CollationFCD {
58 : public:
59 0 : static inline UBool hasLccc(UChar32 c) {
60 : // assert c <= 0xffff
61 : // c can be negative, e.g., U_SENTINEL from UCharIterator;
62 : // that is handled in the first test.
63 : int32_t i;
64 : return
65 : // U+0300 is the first character with lccc!=0.
66 0 : c >= 0x300 &&
67 0 : (i = lcccIndex[c >> 5]) != 0 &&
68 0 : (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;
69 : }
70 :
71 0 : static inline UBool hasTccc(UChar32 c) {
72 : // assert c <= 0xffff
73 : // c can be negative, e.g., U_SENTINEL from UCharIterator;
74 : // that is handled in the first test.
75 : int32_t i;
76 : return
77 : // U+00C0 is the first character with tccc!=0.
78 0 : c >= 0xc0 &&
79 0 : (i = tcccIndex[c >> 5]) != 0 &&
80 0 : (tcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;
81 : }
82 :
83 0 : static inline UBool mayHaveLccc(UChar32 c) {
84 : // Handles all of Unicode 0..10FFFF.
85 : // c can be negative, e.g., U_SENTINEL.
86 : // U+0300 is the first character with lccc!=0.
87 0 : if(c < 0x300) { return FALSE; }
88 0 : if(c > 0xffff) { c = U16_LEAD(c); }
89 : int32_t i;
90 : return
91 0 : (i = lcccIndex[c >> 5]) != 0 &&
92 0 : (lcccBits[i] & ((uint32_t)1 << (c & 0x1f))) != 0;
93 : }
94 :
95 : /**
96 : * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
97 : * must be decomposed before reaching the core collation code,
98 : * or else some sequences including them, even ones passing the FCD check,
99 : * do not yield canonically equivalent results.
100 : *
101 : * This is a fast and imprecise test.
102 : *
103 : * @param c a code point
104 : * @return TRUE if c is U+0F73, U+0F75 or U+0F81 or one of several other Tibetan characters
105 : */
106 0 : static inline UBool maybeTibetanCompositeVowel(UChar32 c) {
107 0 : return (c & 0x1fff01) == 0xf01;
108 : }
109 :
110 : /**
111 : * Tibetan composite vowel signs (U+0F73, U+0F75, U+0F81)
112 : * must be decomposed before reaching the core collation code,
113 : * or else some sequences including them, even ones passing the FCD check,
114 : * do not yield canonically equivalent results.
115 : *
116 : * They have distinct lccc/tccc combinations: 129/130 or 129/132.
117 : *
118 : * @param fcd16 the FCD value (lccc/tccc combination) of a code point
119 : * @return TRUE if fcd16 is from U+0F73, U+0F75 or U+0F81
120 : */
121 0 : static inline UBool isFCD16OfTibetanCompositeVowel(uint16_t fcd16) {
122 0 : return fcd16 == 0x8182 || fcd16 == 0x8184;
123 : }
124 :
125 : private:
126 : CollationFCD(); // No instantiation.
127 :
128 : static const uint8_t lcccIndex[2048];
129 : static const uint8_t tcccIndex[2048];
130 : static const uint32_t lcccBits[];
131 : static const uint32_t tcccBits[];
132 : };
133 :
134 : U_NAMESPACE_END
135 :
136 : #endif // !UCONFIG_NO_COLLATION
137 : #endif // __COLLATIONFCD_H__
|