Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2012-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationdata.cpp
9 : *
10 : * created on: 2012jul28
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #include "unicode/utypes.h"
15 :
16 : #if !UCONFIG_NO_COLLATION
17 :
18 : #include "unicode/ucol.h"
19 : #include "unicode/udata.h"
20 : #include "unicode/uscript.h"
21 : #include "cmemory.h"
22 : #include "collation.h"
23 : #include "collationdata.h"
24 : #include "uassert.h"
25 : #include "utrie2.h"
26 : #include "uvectr32.h"
27 :
28 : U_NAMESPACE_BEGIN
29 :
30 : uint32_t
31 0 : CollationData::getIndirectCE32(uint32_t ce32) const {
32 0 : U_ASSERT(Collation::isSpecialCE32(ce32));
33 0 : int32_t tag = Collation::tagFromCE32(ce32);
34 0 : if(tag == Collation::DIGIT_TAG) {
35 : // Fetch the non-numeric-collation CE32.
36 0 : ce32 = ce32s[Collation::indexFromCE32(ce32)];
37 0 : } else if(tag == Collation::LEAD_SURROGATE_TAG) {
38 0 : ce32 = Collation::UNASSIGNED_CE32;
39 0 : } else if(tag == Collation::U0000_TAG) {
40 : // Fetch the normal ce32 for U+0000.
41 0 : ce32 = ce32s[0];
42 : }
43 0 : return ce32;
44 : }
45 :
46 : uint32_t
47 0 : CollationData::getFinalCE32(uint32_t ce32) const {
48 0 : if(Collation::isSpecialCE32(ce32)) {
49 0 : ce32 = getIndirectCE32(ce32);
50 : }
51 0 : return ce32;
52 : }
53 :
54 : int64_t
55 0 : CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
56 0 : if(U_FAILURE(errorCode)) { return 0; }
57 : // Keep parallel with CollationDataBuilder::getSingleCE().
58 : const CollationData *d;
59 0 : uint32_t ce32 = getCE32(c);
60 0 : if(ce32 == Collation::FALLBACK_CE32) {
61 0 : d = base;
62 0 : ce32 = base->getCE32(c);
63 : } else {
64 0 : d = this;
65 : }
66 0 : while(Collation::isSpecialCE32(ce32)) {
67 0 : switch(Collation::tagFromCE32(ce32)) {
68 : case Collation::LATIN_EXPANSION_TAG:
69 : case Collation::BUILDER_DATA_TAG:
70 : case Collation::PREFIX_TAG:
71 : case Collation::CONTRACTION_TAG:
72 : case Collation::HANGUL_TAG:
73 : case Collation::LEAD_SURROGATE_TAG:
74 0 : errorCode = U_UNSUPPORTED_ERROR;
75 0 : return 0;
76 : case Collation::FALLBACK_TAG:
77 : case Collation::RESERVED_TAG_3:
78 0 : errorCode = U_INTERNAL_PROGRAM_ERROR;
79 0 : return 0;
80 : case Collation::LONG_PRIMARY_TAG:
81 0 : return Collation::ceFromLongPrimaryCE32(ce32);
82 : case Collation::LONG_SECONDARY_TAG:
83 0 : return Collation::ceFromLongSecondaryCE32(ce32);
84 : case Collation::EXPANSION32_TAG:
85 0 : if(Collation::lengthFromCE32(ce32) == 1) {
86 0 : ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
87 0 : break;
88 : } else {
89 0 : errorCode = U_UNSUPPORTED_ERROR;
90 0 : return 0;
91 : }
92 : case Collation::EXPANSION_TAG: {
93 0 : if(Collation::lengthFromCE32(ce32) == 1) {
94 0 : return d->ces[Collation::indexFromCE32(ce32)];
95 : } else {
96 0 : errorCode = U_UNSUPPORTED_ERROR;
97 0 : return 0;
98 : }
99 : }
100 : case Collation::DIGIT_TAG:
101 : // Fetch the non-numeric-collation CE32 and continue.
102 0 : ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
103 0 : break;
104 : case Collation::U0000_TAG:
105 0 : U_ASSERT(c == 0);
106 : // Fetch the normal ce32 for U+0000 and continue.
107 0 : ce32 = d->ce32s[0];
108 0 : break;
109 : case Collation::OFFSET_TAG:
110 0 : return d->getCEFromOffsetCE32(c, ce32);
111 : case Collation::IMPLICIT_TAG:
112 0 : return Collation::unassignedCEFromCodePoint(c);
113 : }
114 : }
115 0 : return Collation::ceFromSimpleCE32(ce32);
116 : }
117 :
118 : uint32_t
119 0 : CollationData::getFirstPrimaryForGroup(int32_t script) const {
120 0 : int32_t index = getScriptIndex(script);
121 0 : return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
122 : }
123 :
124 : uint32_t
125 0 : CollationData::getLastPrimaryForGroup(int32_t script) const {
126 0 : int32_t index = getScriptIndex(script);
127 0 : if(index == 0) {
128 0 : return 0;
129 : }
130 0 : uint32_t limit = scriptStarts[index + 1];
131 0 : return (limit << 16) - 1;
132 : }
133 :
134 : int32_t
135 0 : CollationData::getGroupForPrimary(uint32_t p) const {
136 0 : p >>= 16;
137 0 : if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
138 0 : return -1;
139 : }
140 0 : int32_t index = 1;
141 0 : while(p >= scriptStarts[index + 1]) { ++index; }
142 0 : for(int32_t i = 0; i < numScripts; ++i) {
143 0 : if(scriptsIndex[i] == index) {
144 0 : return i;
145 : }
146 : }
147 0 : for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
148 0 : if(scriptsIndex[numScripts + i] == index) {
149 0 : return UCOL_REORDER_CODE_FIRST + i;
150 : }
151 : }
152 0 : return -1;
153 : }
154 :
155 : int32_t
156 0 : CollationData::getScriptIndex(int32_t script) const {
157 0 : if(script < 0) {
158 0 : return 0;
159 0 : } else if(script < numScripts) {
160 0 : return scriptsIndex[script];
161 0 : } else if(script < UCOL_REORDER_CODE_FIRST) {
162 0 : return 0;
163 : } else {
164 0 : script -= UCOL_REORDER_CODE_FIRST;
165 0 : if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
166 0 : return scriptsIndex[numScripts + script];
167 : } else {
168 0 : return 0;
169 : }
170 : }
171 : }
172 :
173 : int32_t
174 0 : CollationData::getEquivalentScripts(int32_t script,
175 : int32_t dest[], int32_t capacity,
176 : UErrorCode &errorCode) const {
177 0 : if(U_FAILURE(errorCode)) { return 0; }
178 0 : int32_t index = getScriptIndex(script);
179 0 : if(index == 0) { return 0; }
180 0 : if(script >= UCOL_REORDER_CODE_FIRST) {
181 : // Special groups have no aliases.
182 0 : if(capacity > 0) {
183 0 : dest[0] = script;
184 : } else {
185 0 : errorCode = U_BUFFER_OVERFLOW_ERROR;
186 : }
187 0 : return 1;
188 : }
189 :
190 0 : int32_t length = 0;
191 0 : for(int32_t i = 0; i < numScripts; ++i) {
192 0 : if(scriptsIndex[i] == index) {
193 0 : if(length < capacity) {
194 0 : dest[length] = i;
195 : }
196 0 : ++length;
197 : }
198 : }
199 0 : if(length > capacity) {
200 0 : errorCode = U_BUFFER_OVERFLOW_ERROR;
201 : }
202 0 : return length;
203 : }
204 :
205 : void
206 0 : CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
207 : UVector32 &ranges, UErrorCode &errorCode) const {
208 0 : makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
209 0 : }
210 :
211 : void
212 0 : CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
213 : UBool latinMustMove,
214 : UVector32 &ranges, UErrorCode &errorCode) const {
215 0 : if(U_FAILURE(errorCode)) { return; }
216 0 : ranges.removeAllElements();
217 0 : if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
218 0 : return;
219 : }
220 :
221 : // Maps each script-or-group range to a new lead byte.
222 : uint8_t table[MAX_NUM_SCRIPT_RANGES];
223 0 : uprv_memset(table, 0, sizeof(table));
224 :
225 : {
226 : // Set "don't care" values for reserved ranges.
227 0 : int32_t index = scriptsIndex[
228 0 : numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
229 0 : if(index != 0) {
230 0 : table[index] = 0xff;
231 : }
232 0 : index = scriptsIndex[
233 0 : numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
234 0 : if(index != 0) {
235 0 : table[index] = 0xff;
236 : }
237 : }
238 :
239 : // Never reorder special low and high primary lead bytes.
240 0 : U_ASSERT(scriptStartsLength >= 2);
241 0 : U_ASSERT(scriptStarts[0] == 0);
242 0 : int32_t lowStart = scriptStarts[1];
243 0 : U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
244 0 : int32_t highLimit = scriptStarts[scriptStartsLength - 1];
245 0 : U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
246 :
247 : // Get the set of special reorder codes in the input list.
248 : // This supports a fixed number of special reorder codes;
249 : // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
250 0 : uint32_t specials = 0;
251 0 : for(int32_t i = 0; i < length; ++i) {
252 0 : int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
253 0 : if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
254 0 : specials |= (uint32_t)1 << reorderCode;
255 : }
256 : }
257 :
258 : // Start the reordering with the special low reorder codes that do not occur in the input.
259 0 : for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
260 0 : int32_t index = scriptsIndex[numScripts + i];
261 0 : if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
262 0 : lowStart = addLowScriptRange(table, index, lowStart);
263 : }
264 : }
265 :
266 : // Skip the reserved range before Latin if Latin is the first script,
267 : // so that we do not move it unnecessarily.
268 0 : int32_t skippedReserved = 0;
269 0 : if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
270 0 : int32_t index = scriptsIndex[USCRIPT_LATIN];
271 0 : U_ASSERT(index != 0);
272 0 : int32_t start = scriptStarts[index];
273 0 : U_ASSERT(lowStart <= start);
274 0 : skippedReserved = start - lowStart;
275 0 : lowStart = start;
276 : }
277 :
278 : // Reorder according to the input scripts, continuing from the bottom of the primary range.
279 0 : int32_t originalLength = length; // length will be decremented if "others" is in the list.
280 0 : UBool hasReorderToEnd = FALSE;
281 0 : for(int32_t i = 0; i < length;) {
282 0 : int32_t script = reorder[i++];
283 0 : if(script == USCRIPT_UNKNOWN) {
284 : // Put the remaining scripts at the top.
285 0 : hasReorderToEnd = TRUE;
286 0 : while(i < length) {
287 0 : script = reorder[--length];
288 0 : if(script == USCRIPT_UNKNOWN || // Must occur at most once.
289 : script == UCOL_REORDER_CODE_DEFAULT) {
290 0 : errorCode = U_ILLEGAL_ARGUMENT_ERROR;
291 0 : return;
292 : }
293 0 : int32_t index = getScriptIndex(script);
294 0 : if(index == 0) { continue; }
295 0 : if(table[index] != 0) { // Duplicate or equivalent script.
296 0 : errorCode = U_ILLEGAL_ARGUMENT_ERROR;
297 0 : return;
298 : }
299 0 : highLimit = addHighScriptRange(table, index, highLimit);
300 : }
301 0 : break;
302 : }
303 0 : if(script == UCOL_REORDER_CODE_DEFAULT) {
304 : // The default code must be the only one in the list, and that is handled by the caller.
305 : // Otherwise it must not be used.
306 0 : errorCode = U_ILLEGAL_ARGUMENT_ERROR;
307 0 : return;
308 : }
309 0 : int32_t index = getScriptIndex(script);
310 0 : if(index == 0) { continue; }
311 0 : if(table[index] != 0) { // Duplicate or equivalent script.
312 0 : errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313 0 : return;
314 : }
315 0 : lowStart = addLowScriptRange(table, index, lowStart);
316 : }
317 :
318 : // Put all remaining scripts into the middle.
319 0 : for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
320 0 : int32_t leadByte = table[i];
321 0 : if(leadByte != 0) { continue; }
322 0 : int32_t start = scriptStarts[i];
323 0 : if(!hasReorderToEnd && start > lowStart) {
324 : // No need to move this script.
325 0 : lowStart = start;
326 : }
327 0 : lowStart = addLowScriptRange(table, i, lowStart);
328 : }
329 0 : if(lowStart > highLimit) {
330 0 : if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
331 : // Try not skipping the before-Latin reserved range.
332 0 : makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
333 0 : return;
334 : }
335 : // We need more primary lead bytes than available, despite the reserved ranges.
336 0 : errorCode = U_BUFFER_OVERFLOW_ERROR;
337 0 : return;
338 : }
339 :
340 : // Turn lead bytes into a list of (limit, offset) pairs.
341 : // Encode each pair in one list element:
342 : // Upper 16 bits = limit, lower 16 = signed lead byte offset.
343 0 : int32_t offset = 0;
344 0 : for(int32_t i = 1;; ++i) {
345 0 : int32_t nextOffset = offset;
346 0 : while(i < scriptStartsLength - 1) {
347 0 : int32_t newLeadByte = table[i];
348 0 : if(newLeadByte == 0xff) {
349 : // "Don't care" lead byte for reserved range, continue with current offset.
350 : } else {
351 0 : nextOffset = newLeadByte - (scriptStarts[i] >> 8);
352 0 : if(nextOffset != offset) { break; }
353 : }
354 0 : ++i;
355 : }
356 0 : if(offset != 0 || i < scriptStartsLength - 1) {
357 0 : ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
358 : }
359 0 : if(i == scriptStartsLength - 1) { break; }
360 0 : offset = nextOffset;
361 0 : }
362 : }
363 :
364 : int32_t
365 0 : CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
366 0 : int32_t start = scriptStarts[index];
367 0 : if((start & 0xff) < (lowStart & 0xff)) {
368 0 : lowStart += 0x100;
369 : }
370 0 : table[index] = (uint8_t)(lowStart >> 8);
371 0 : int32_t limit = scriptStarts[index + 1];
372 0 : lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
373 0 : return lowStart;
374 : }
375 :
376 : int32_t
377 0 : CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
378 0 : int32_t limit = scriptStarts[index + 1];
379 0 : if((limit & 0xff) > (highLimit & 0xff)) {
380 0 : highLimit -= 0x100;
381 : }
382 0 : int32_t start = scriptStarts[index];
383 0 : highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
384 0 : table[index] = (uint8_t)(highLimit >> 8);
385 0 : return highLimit;
386 : }
387 :
388 : U_NAMESPACE_END
389 :
390 : #endif // !UCONFIG_NO_COLLATION
|