Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2013-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationsettings.h
9 : *
10 : * created on: 2013feb07
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #ifndef __COLLATIONSETTINGS_H__
15 : #define __COLLATIONSETTINGS_H__
16 :
17 : #include "unicode/utypes.h"
18 :
19 : #if !UCONFIG_NO_COLLATION
20 :
21 : #include "unicode/ucol.h"
22 : #include "collation.h"
23 : #include "sharedobject.h"
24 : #include "umutex.h"
25 :
26 : U_NAMESPACE_BEGIN
27 :
28 : struct CollationData;
29 :
30 : /**
31 : * Collation settings/options/attributes.
32 : * These are the values that can be changed via API.
33 : */
34 : struct U_I18N_API CollationSettings : public SharedObject {
35 : /**
36 : * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
37 : */
38 : static const int32_t CHECK_FCD = 1;
39 : /**
40 : * Options bit 1: Numeric collation.
41 : * Also known as CODAN = COllate Digits As Numbers.
42 : *
43 : * Treat digit sequences as numbers with CE sequences in numeric order,
44 : * rather than returning a normal CE for each digit.
45 : */
46 : static const int32_t NUMERIC = 2;
47 : /**
48 : * "Shifted" alternate handling, see ALTERNATE_MASK.
49 : */
50 : static const int32_t SHIFTED = 4;
51 : /**
52 : * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
53 : * Reserve values 8 and 0xc for shift-trimmed and blanked.
54 : */
55 : static const int32_t ALTERNATE_MASK = 0xc;
56 : /**
57 : * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
58 : */
59 : static const int32_t MAX_VARIABLE_SHIFT = 4;
60 : /** maxVariable options bit mask before shifting. */
61 : static const int32_t MAX_VARIABLE_MASK = 0x70;
62 : /** Options bit 7: Reserved/unused/0. */
63 : /**
64 : * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
65 : */
66 : static const int32_t UPPER_FIRST = 0x100;
67 : /**
68 : * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
69 : * unless case level is on (when they are *moved* into the separate case level).
70 : * By default, the case bits are removed from the tertiary weight (ignored).
71 : *
72 : * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
73 : * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
74 : */
75 : static const int32_t CASE_FIRST = 0x200;
76 : /**
77 : * Options bit mask for caseFirst and upperFirst, before shifting.
78 : * Same value as caseFirst==upperFirst.
79 : */
80 : static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
81 : /**
82 : * Options bit 10: Insert the case level between the secondary and tertiary levels.
83 : */
84 : static const int32_t CASE_LEVEL = 0x400;
85 : /**
86 : * Options bit 11: Compare secondary weights backwards. ("French secondary")
87 : */
88 : static const int32_t BACKWARD_SECONDARY = 0x800;
89 : /**
90 : * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
91 : * It is the top used bit field in the options. (No need to mask after shifting.)
92 : */
93 : static const int32_t STRENGTH_SHIFT = 12;
94 : /** Strength options bit mask before shifting. */
95 : static const int32_t STRENGTH_MASK = 0xf000;
96 :
97 : /** maxVariable values */
98 : enum MaxVariable {
99 : MAX_VAR_SPACE,
100 : MAX_VAR_PUNCT,
101 : MAX_VAR_SYMBOL,
102 : MAX_VAR_CURRENCY
103 : };
104 :
105 0 : CollationSettings()
106 0 : : options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
107 : (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
108 : variableTop(0),
109 : reorderTable(NULL),
110 : minHighNoReorder(0),
111 : reorderRanges(NULL), reorderRangesLength(0),
112 : reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
113 0 : fastLatinOptions(-1) {}
114 :
115 : CollationSettings(const CollationSettings &other);
116 : virtual ~CollationSettings();
117 :
118 : UBool operator==(const CollationSettings &other) const;
119 :
120 0 : inline UBool operator!=(const CollationSettings &other) const {
121 0 : return !operator==(other);
122 : }
123 :
124 : int32_t hashCode() const;
125 :
126 : void resetReordering();
127 : void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
128 : const uint32_t *ranges, int32_t rangesLength,
129 : const uint8_t *table, UErrorCode &errorCode);
130 : void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
131 : UErrorCode &errorCode);
132 : void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
133 :
134 0 : inline UBool hasReordering() const { return reorderTable != NULL; }
135 : static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
136 0 : inline uint32_t reorder(uint32_t p) const {
137 0 : uint8_t b = reorderTable[p >> 24];
138 0 : if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
139 0 : return ((uint32_t)b << 24) | (p & 0xffffff);
140 : } else {
141 0 : return reorderEx(p);
142 : }
143 : }
144 :
145 : void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
146 :
147 0 : static int32_t getStrength(int32_t options) {
148 0 : return options >> STRENGTH_SHIFT;
149 : }
150 :
151 0 : int32_t getStrength() const {
152 0 : return getStrength(options);
153 : }
154 :
155 : /** Sets the options bit for an on/off attribute. */
156 : void setFlag(int32_t bit, UColAttributeValue value,
157 : int32_t defaultOptions, UErrorCode &errorCode);
158 :
159 : UColAttributeValue getFlag(int32_t bit) const {
160 : return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
161 : }
162 :
163 : void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);
164 :
165 0 : UColAttributeValue getCaseFirst() const {
166 0 : int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
167 0 : return (option == 0) ? UCOL_OFF :
168 0 : (option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
169 : }
170 :
171 : void setAlternateHandling(UColAttributeValue value,
172 : int32_t defaultOptions, UErrorCode &errorCode);
173 :
174 0 : UColAttributeValue getAlternateHandling() const {
175 0 : return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
176 : }
177 :
178 : void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
179 :
180 0 : MaxVariable getMaxVariable() const {
181 0 : return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
182 : }
183 :
184 : /**
185 : * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
186 : */
187 0 : static inline UBool isTertiaryWithCaseBits(int32_t options) {
188 0 : return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
189 : }
190 0 : static uint32_t getTertiaryMask(int32_t options) {
191 : // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
192 0 : return isTertiaryWithCaseBits(options) ?
193 0 : Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
194 : }
195 :
196 0 : static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
197 : // On tertiary level, consider case bits and sort uppercase first
198 : // if caseLevel is off and caseFirst==upperFirst.
199 0 : return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
200 : }
201 :
202 0 : inline UBool dontCheckFCD() const {
203 0 : return (options & CHECK_FCD) == 0;
204 : }
205 :
206 : inline UBool hasBackwardSecondary() const {
207 : return (options & BACKWARD_SECONDARY) != 0;
208 : }
209 :
210 0 : inline UBool isNumeric() const {
211 0 : return (options & NUMERIC) != 0;
212 : }
213 :
214 : /** CHECK_FCD etc. */
215 : int32_t options;
216 : /** Variable-top primary weight. */
217 : uint32_t variableTop;
218 : /**
219 : * 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
220 : * A 0 entry at a non-zero index means that the primary lead byte is "split"
221 : * (there are different offsets for primaries that share that lead byte)
222 : * and the reordering offset must be determined via the reorderRanges.
223 : */
224 : const uint8_t *reorderTable;
225 : /** Limit of last reordered range. 0 if no reordering or no split bytes. */
226 : uint32_t minHighNoReorder;
227 : /**
228 : * Primary-weight ranges for script reordering,
229 : * to be used by reorder(p) for split-reordered primary lead bytes.
230 : *
231 : * Each entry is a (limit, offset) pair.
232 : * The upper 16 bits of the entry are the upper 16 bits of the
233 : * exclusive primary limit of a range.
234 : * Primaries between the previous limit and this one have their lead bytes
235 : * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
236 : *
237 : * CollationData::makeReorderRanges() writes a full list where the first range
238 : * (at least for terminators and separators) has a 0 offset.
239 : * The last range has a non-zero offset.
240 : * minHighNoReorder is set to the limit of that last range.
241 : *
242 : * In the settings object, the initial ranges before the first split lead byte
243 : * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
244 : * If there are no split-reordered lead bytes, then no ranges are needed.
245 : */
246 : const uint32_t *reorderRanges;
247 : int32_t reorderRangesLength;
248 : /** Array of reorder codes; ignored if reorderCodesLength == 0. */
249 : const int32_t *reorderCodes;
250 : /** Number of reorder codes; 0 if no reordering. */
251 : int32_t reorderCodesLength;
252 : /**
253 : * Capacity of reorderCodes.
254 : * If 0, then the codes, the ranges, and the table are aliases.
255 : * Otherwise, this object owns the memory via the reorderCodes pointer;
256 : * the codes, the ranges, and the table are in the same memory block, in that order.
257 : */
258 : int32_t reorderCodesCapacity;
259 :
260 : /** Options for CollationFastLatin. Negative if disabled. */
261 : int32_t fastLatinOptions;
262 : uint16_t fastLatinPrimaries[0x180];
263 :
264 : private:
265 : void setReorderArrays(const int32_t *codes, int32_t codesLength,
266 : const uint32_t *ranges, int32_t rangesLength,
267 : const uint8_t *table, UErrorCode &errorCode);
268 : uint32_t reorderEx(uint32_t p) const;
269 : };
270 :
271 : U_NAMESPACE_END
272 :
273 : #endif // !UCONFIG_NO_COLLATION
274 : #endif // __COLLATIONSETTINGS_H__
|