Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2010-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationdata.h
9 : *
10 : * created on: 2010oct27
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #ifndef __COLLATIONDATA_H__
15 : #define __COLLATIONDATA_H__
16 :
17 : #include "unicode/utypes.h"
18 :
19 : #if !UCONFIG_NO_COLLATION
20 :
21 : #include "unicode/ucol.h"
22 : #include "unicode/uniset.h"
23 : #include "collation.h"
24 : #include "normalizer2impl.h"
25 : #include "utrie2.h"
26 :
27 : struct UDataMemory;
28 :
29 : U_NAMESPACE_BEGIN
30 :
31 : class UVector32;
32 :
33 : /**
34 : * Collation data container.
35 : * Immutable data created by a CollationDataBuilder, or loaded from a file,
36 : * or deserialized from API-provided binary data.
37 : * All data members are public; the inline accessors below read them without NULL or range checks.
38 : * Includes data for the collation base (root/default), aliased if this is not the base.
39 : */
40 : struct U_I18N_API CollationData : public UMemory {
41 : // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
42 : // parallel with the ranges, and resetting ranges that are indexed.
43 : // The reordering builder code could clone the resulting template array.
44 : enum { // reorder codes for the reserved ranges around Latin (per the enumerator names)
45 : REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,
46 : REORDER_RESERVED_AFTER_LATIN // implicitly UCOL_REORDER_CODE_FIRST + 15
47 : };
48 :
49 : enum {
50 : MAX_NUM_SPECIAL_REORDER_CODES = 8, // see the scriptsIndex comment for how this is used
51 : /** C++ only: used by the data reader to check scriptStartsLength (presumably as an upper bound). */
52 : MAX_NUM_SCRIPT_RANGES = 256
53 : };
54 :
55 0 : CollationData(const Normalizer2Impl &nfc) // binds nfc by reference (nfcImpl); it is not copied
56 0 : : trie(NULL), // every pointer member starts as NULL and every length/count as 0
57 : ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
58 : jamoCE32s(NULL),
59 : nfcImpl(nfc),
60 : numericPrimary(0x12000000), // the only non-zero default; see the numericPrimary member
61 : ce32sLength(0), cesLength(0), contextsLength(0),
62 : compressibleBytes(NULL),
63 : unsafeBackwardSet(NULL),
64 : fastLatinTable(NULL), fastLatinTableLength(0),
65 : numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),
66 0 : rootElements(NULL), rootElementsLength(0) {}
67 :
68 0 : uint32_t getCE32(UChar32 c) const { // looks up c's CE32 in the main trie
69 0 : return UTRIE2_GET32(trie, c);
70 : }
71 :
72 0 : uint32_t getCE32FromSupplementary(UChar32 c) const { // NOTE(review): presumably requires a supplementary c -- confirm in utrie2.h
73 0 : return UTRIE2_GET32_FROM_SUPP(trie, c);
74 : }
75 :
76 0 : UBool isDigit(UChar32 c) const { // below U+0660 only 0x30..0x39 count as digits
77 0 : return c < 0x660 ? c <= 0x39 && 0x30 <= c :
78 0 : Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG); // otherwise test c's CE32 for DIGIT_TAG
79 : }
80 :
81 0 : UBool isUnsafeBackward(UChar32 c, UBool numeric) const { // dereferences unsafeBackwardSet, so it must be non-NULL
82 0 : return unsafeBackwardSet->contains(c) || (numeric && isDigit(c)); // digits are only considered when numeric is set
83 : }
84 :
85 0 : UBool isCompressibleLeadByte(uint32_t b) const { // no range check on b; compressibleBytes is documented as 256 flags
86 0 : return compressibleBytes[b];
87 : }
88 :
89 0 : inline UBool isCompressiblePrimary(uint32_t p) const { // tests the top byte (bits 31..24) of p
90 0 : return isCompressibleLeadByte(p >> 24);
91 : }
92 :
93 : /**
94 : * Returns the CE32 assembled from two contexts words: (p[0] << 16) | p[1].
95 : * Access to the defaultCE32 for contraction and prefix matching.
96 : */
97 0 : static uint32_t readCE32(const UChar *p) { // reads exactly p[0] and p[1]
98 0 : return ((uint32_t)p[0] << 16) | p[1];
99 : }
100 :
101 : /**
102 : * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
103 : * Requires that ce32 is special.
104 : */
105 : uint32_t getIndirectCE32(uint32_t ce32) const;
106 : /**
107 : * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
108 : * if ce32 is special.
109 : */
110 : uint32_t getFinalCE32(uint32_t ce32) const;
111 :
112 : /**
113 : * Computes a CE from c's ce32 which has the OFFSET_TAG.
114 : */
115 0 : int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
116 0 : int64_t dataCE = ces[Collation::indexFromCE32(ce32)]; // entry of ces[] at the index taken from ce32
117 0 : return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE)); // primary derived from c and that entry
118 : }
119 :
120 : /**
121 : * Returns the single CE that c maps to.
122 : * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
123 : */
124 : int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
125 :
126 : /**
127 : * Returns the FCD16 value for code point c. c must be >= 0.
128 : */
129 0 : uint16_t getFCD16(UChar32 c) const {
130 0 : return nfcImpl.getFCD16(c); // delegates to the Normalizer2Impl given to the constructor
131 : }
132 :
133 : /**
134 : * Returns the first primary for the script's reordering group.
135 : * @return the primary with only the first primary lead byte of the group
136 : * (not necessarily an actual root collator primary weight),
137 : * or 0 if the script is unknown
138 : */
139 : uint32_t getFirstPrimaryForGroup(int32_t script) const;
140 :
141 : /**
142 : * Returns the last primary for the script's reordering group.
143 : * @return the last primary of the group
144 : * (not an actual root collator primary weight),
145 : * or 0 if the script is unknown
146 : */
147 : uint32_t getLastPrimaryForGroup(int32_t script) const;
148 :
149 : /**
150 : * Finds the reordering group which contains the primary weight.
151 : * @return the first script of the group, or -1 if the weight is beyond the last group
152 : */
153 : int32_t getGroupForPrimary(uint32_t p) const;
154 :
155 : int32_t getEquivalentScripts(int32_t script, // NOTE(review): undocumented here; presumably fills dest with scripts sharing script's range -- confirm in the .cpp
156 : int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
157 :
158 : /**
159 : * Writes the permutation of primary-weight ranges
160 : * for the given reordering of scripts and groups.
161 : * The caller checks for illegal arguments and
162 : * takes care of [DEFAULT] and memory allocation.
163 : *
164 : * Each list element will be a (limit, offset) pair as described
165 : * for the CollationSettings::reorderRanges.
166 : * The list will be empty if no ranges are reordered.
167 : */
168 : void makeReorderRanges(const int32_t *reorder, int32_t length,
169 : UVector32 &ranges, UErrorCode &errorCode) const;
170 :
171 : /** Number of entries in jamoCE32s. @see jamoCE32s */
172 : static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27; // = 67, one per canonical Jamo L/V/T
173 :
174 : /** Main lookup trie. */
175 : const UTrie2 *trie;
176 : /**
177 : * Array of CE32 values.
178 : * At index 0 there must be CE32(U+0000)
179 : * to support U+0000's special-tag for NUL-termination handling.
180 : */
181 : const uint32_t *ce32s;
182 : /** Array of CE values for expansions and OFFSET_TAG. */
183 : const int64_t *ces;
184 : /** Array of prefix and contraction-suffix matching data. */
185 : const UChar *contexts;
186 : /** Base collation data, or NULL if this data itself is a base. */
187 : const CollationData *base;
188 : /**
189 : * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
190 : * They are normally simple CE32s, rarely expansions.
191 : * For fast handling of HANGUL_TAG.
192 : */
193 : const uint32_t *jamoCE32s;
194 : const Normalizer2Impl &nfcImpl; // bound in the constructor; used by getFCD16()
195 : /** The single-byte primary weight (xx000000) for numeric collation. */
196 : uint32_t numericPrimary;
197 :
198 : int32_t ce32sLength; // presumably the number of entries in ce32s -- confirm with the data reader/builder
199 : int32_t cesLength; // presumably the number of entries in ces
200 : int32_t contextsLength; // presumably the number of UChars in contexts
201 :
202 : /** 256 flags for which primary-weight lead bytes are compressible. */
203 : const UBool *compressibleBytes;
204 : /**
205 : * Set of code points that are unsafe for starting string comparison after an identical prefix,
206 : * or in backwards CE iteration.
207 : */
208 : const UnicodeSet *unsafeBackwardSet;
209 :
210 : /**
211 : * Fast Latin table for common-Latin-text string comparisons.
212 : * Data structure see class CollationFastLatin.
213 : */
214 : const uint16_t *fastLatinTable;
215 : int32_t fastLatinTableLength; // presumably the number of uint16_t units in fastLatinTable
216 :
217 : /**
218 : * Data for scripts and reordering groups.
219 : * Uses include building a reordering permutation table and
220 : * providing script boundaries to AlphabeticIndex.
221 : */
222 : int32_t numScripts;
223 : /**
224 : * The length of scriptsIndex is numScripts+16.
225 : * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
226 : * 16 special reorder codes (not all used) are mapped starting at numScripts.
227 : * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
228 : * There are special codes at the end for reorder-reserved primary ranges.
229 : *
230 : * Multiple scripts may share a range and index, for example Hira & Kana.
231 : */
232 : const uint16_t *scriptsIndex;
233 : /**
234 : * Start primary weight (top 16 bits only) for a group/script/reserved range
235 : * indexed by scriptsIndex.
236 : * The first range (separators & terminators) and the last range (trailing weights)
237 : * are not reorderable, and no scriptsIndex entry points to them.
238 : */
239 : const uint16_t *scriptStarts;
240 : int32_t scriptStartsLength; // see MAX_NUM_SCRIPT_RANGES for the data reader's check on this value
241 :
242 : /**
243 : * Collation elements in the root collator.
244 : * Used by the CollationRootElements class. The data structure is described there.
245 : * NULL in a tailoring.
246 : */
247 : const uint32_t *rootElements;
248 : int32_t rootElementsLength; // presumably the number of entries in rootElements
249 :
250 : private:
251 : int32_t getScriptIndex(int32_t script) const; // NOTE(review): presumably maps script to its scriptsIndex value -- confirm in the .cpp
252 : void makeReorderRanges(const int32_t *reorder, int32_t length, // private overload of the public function, with an extra latinMustMove flag
253 : UBool latinMustMove,
254 : UVector32 &ranges, UErrorCode &errorCode) const;
255 : int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const; // NOTE(review): helper defined in the .cpp; semantics not visible here
256 : int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const; // NOTE(review): helper defined in the .cpp; semantics not visible here
257 : };
258 :
259 : U_NAMESPACE_END
260 :
261 : #endif // !UCONFIG_NO_COLLATION
262 : #endif // __COLLATIONDATA_H__
|