Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2013-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationdatareader.cpp
9 : *
10 : * created on: 2013feb07
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #include "unicode/utypes.h"
15 :
16 : #if !UCONFIG_NO_COLLATION
17 :
18 : #include "unicode/ucol.h"
19 : #include "unicode/udata.h"
20 : #include "unicode/uscript.h"
21 : #include "cmemory.h"
22 : #include "collation.h"
23 : #include "collationdata.h"
24 : #include "collationdatareader.h"
25 : #include "collationfastlatin.h"
26 : #include "collationkeys.h"
27 : #include "collationrootelements.h"
28 : #include "collationsettings.h"
29 : #include "collationtailoring.h"
30 : #include "collunsafe.h"
31 : #include "normalizer2impl.h"
32 : #include "uassert.h"
33 : #include "ucmndata.h"
34 : #include "utrie2.h"
35 :
36 : U_NAMESPACE_BEGIN
37 :
38 : namespace {
39 :
/**
 * Bounds-checked read of one slot of an indexes[] array.
 *
 * @param indexes the array to read from
 * @param length  number of valid slots in indexes[]
 * @param i       slot to read
 * @return indexes[i] if 0 <= i < length, otherwise -1.
 *         Callers subtract two results to get a part length, so the -1 for a
 *         missing slot makes that length fail their ">= minimum" tests.
 */
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    // Check both bounds: a negative i must not read before the array.
    return (0 <= i && i < length) ? indexes[i] : -1;
}
43 :
44 : } // namespace
45 :
46 : void
47 0 : CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
48 : CollationTailoring &tailoring, UErrorCode &errorCode) {
49 0 : if(U_FAILURE(errorCode)) { return; }
50 0 : if(base != NULL) {
51 0 : if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
52 0 : errorCode = U_ILLEGAL_ARGUMENT_ERROR;
53 0 : return;
54 : }
55 0 : const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
56 0 : if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
57 0 : isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
58 0 : errorCode = U_INVALID_FORMAT_ERROR;
59 0 : return;
60 : }
61 0 : if(base->getUCAVersion() != tailoring.getUCAVersion()) {
62 0 : errorCode = U_COLLATOR_VERSION_MISMATCH;
63 0 : return;
64 : }
65 0 : int32_t headerLength = header->dataHeader.headerSize;
66 0 : inBytes += headerLength;
67 0 : if(inLength >= 0) {
68 0 : inLength -= headerLength;
69 : }
70 : }
71 :
72 0 : if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
73 0 : errorCode = U_ILLEGAL_ARGUMENT_ERROR;
74 0 : return;
75 : }
76 0 : const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
77 0 : int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
78 0 : if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
79 0 : errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
80 0 : return;
81 : }
82 :
83 : // Assume that the tailoring data is in initial state,
84 : // with NULL pointers and 0 lengths.
85 :
86 : // Set pointers to non-empty data parts.
87 : // Do this in order of their byte offsets. (Should help porting to Java.)
88 :
89 : int32_t index; // one of the indexes[] slots
90 : int32_t offset; // byte offset for the index part
91 : int32_t length; // number of bytes in the index part
92 :
93 0 : if(indexesLength > IX_TOTAL_SIZE) {
94 0 : length = inIndexes[IX_TOTAL_SIZE];
95 0 : } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
96 0 : length = inIndexes[indexesLength - 1];
97 : } else {
98 0 : length = 0; // only indexes, and inLength was already checked for them
99 : }
100 0 : if(0 <= inLength && inLength < length) {
101 0 : errorCode = U_INVALID_FORMAT_ERROR;
102 0 : return;
103 : }
104 :
105 0 : const CollationData *baseData = base == NULL ? NULL : base->data;
106 0 : const int32_t *reorderCodes = NULL;
107 0 : int32_t reorderCodesLength = 0;
108 0 : const uint32_t *reorderRanges = NULL;
109 0 : int32_t reorderRangesLength = 0;
110 0 : index = IX_REORDER_CODES_OFFSET;
111 0 : offset = getIndex(inIndexes, indexesLength, index);
112 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
113 0 : if(length >= 4) {
114 0 : if(baseData == NULL) {
115 : // We assume for collation settings that
116 : // the base data does not have a reordering.
117 0 : errorCode = U_INVALID_FORMAT_ERROR;
118 0 : return;
119 : }
120 0 : reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
121 0 : reorderCodesLength = length / 4;
122 :
123 : // The reorderRanges (if any) are the trailing reorderCodes entries.
124 : // Split the array at the boundary.
125 : // Script or reorder codes do not exceed 16-bit values.
126 : // Range limits are stored in the upper 16 bits, and are never 0.
127 0 : while(reorderRangesLength < reorderCodesLength &&
128 0 : (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
129 0 : ++reorderRangesLength;
130 : }
131 0 : U_ASSERT(reorderRangesLength < reorderCodesLength);
132 0 : if(reorderRangesLength != 0) {
133 0 : reorderCodesLength -= reorderRangesLength;
134 0 : reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
135 : }
136 : }
137 :
138 : // There should be a reorder table only if there are reorder codes.
139 : // However, when there are reorder codes the reorder table may be omitted to reduce
140 : // the data size.
141 0 : const uint8_t *reorderTable = NULL;
142 0 : index = IX_REORDER_TABLE_OFFSET;
143 0 : offset = getIndex(inIndexes, indexesLength, index);
144 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
145 0 : if(length >= 256) {
146 0 : if(reorderCodesLength == 0) {
147 0 : errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.
148 0 : return;
149 : }
150 0 : reorderTable = inBytes + offset;
151 : } else {
152 : // If we have reorder codes, then build the reorderTable at the end,
153 : // when the CollationData is otherwise complete.
154 : }
155 :
156 0 : if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
157 0 : errorCode = U_INVALID_FORMAT_ERROR;
158 0 : return;
159 : }
160 0 : CollationData *data = NULL; // Remains NULL if there are no mappings.
161 :
162 0 : index = IX_TRIE_OFFSET;
163 0 : offset = getIndex(inIndexes, indexesLength, index);
164 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
165 0 : if(length >= 8) {
166 0 : if(!tailoring.ensureOwnedData(errorCode)) { return; }
167 0 : data = tailoring.ownedData;
168 0 : data->base = baseData;
169 0 : data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
170 0 : data->trie = tailoring.trie = utrie2_openFromSerialized(
171 0 : UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
172 : &errorCode);
173 0 : if(U_FAILURE(errorCode)) { return; }
174 0 : } else if(baseData != NULL) {
175 : // Use the base data. Only the settings are tailored.
176 0 : tailoring.data = baseData;
177 : } else {
178 0 : errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
179 0 : return;
180 : }
181 :
182 0 : index = IX_CES_OFFSET;
183 0 : offset = getIndex(inIndexes, indexesLength, index);
184 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
185 0 : if(length >= 8) {
186 0 : if(data == NULL) {
187 0 : errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.
188 0 : return;
189 : }
190 0 : data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
191 0 : data->cesLength = length / 8;
192 : }
193 :
194 0 : index = IX_CE32S_OFFSET;
195 0 : offset = getIndex(inIndexes, indexesLength, index);
196 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
197 0 : if(length >= 4) {
198 0 : if(data == NULL) {
199 0 : errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.
200 0 : return;
201 : }
202 0 : data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
203 0 : data->ce32sLength = length / 4;
204 : }
205 :
206 0 : int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
207 0 : if(jamoCE32sStart >= 0) {
208 0 : if(data == NULL || data->ce32s == NULL) {
209 0 : errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].
210 0 : return;
211 : }
212 0 : data->jamoCE32s = data->ce32s + jamoCE32sStart;
213 0 : } else if(data == NULL) {
214 : // Nothing to do.
215 0 : } else if(baseData != NULL) {
216 0 : data->jamoCE32s = baseData->jamoCE32s;
217 : } else {
218 0 : errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.
219 0 : return;
220 : }
221 :
222 0 : index = IX_ROOT_ELEMENTS_OFFSET;
223 0 : offset = getIndex(inIndexes, indexesLength, index);
224 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
225 0 : if(length >= 4) {
226 0 : length /= 4;
227 0 : if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
228 0 : errorCode = U_INVALID_FORMAT_ERROR;
229 0 : return;
230 : }
231 0 : data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
232 0 : data->rootElementsLength = length;
233 0 : uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
234 0 : if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
235 0 : errorCode = U_INVALID_FORMAT_ERROR;
236 0 : return;
237 : }
238 0 : uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
239 0 : if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
240 : // [fixed last secondary common byte] is too low,
241 : // and secondary weights would collide with compressed common secondaries.
242 0 : errorCode = U_INVALID_FORMAT_ERROR;
243 0 : return;
244 : }
245 : }
246 :
247 0 : index = IX_CONTEXTS_OFFSET;
248 0 : offset = getIndex(inIndexes, indexesLength, index);
249 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
250 0 : if(length >= 2) {
251 0 : if(data == NULL) {
252 0 : errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.
253 0 : return;
254 : }
255 0 : data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
256 0 : data->contextsLength = length / 2;
257 : }
258 :
259 0 : index = IX_UNSAFE_BWD_OFFSET;
260 0 : offset = getIndex(inIndexes, indexesLength, index);
261 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
262 0 : if(length >= 2) {
263 0 : if(data == NULL) {
264 0 : errorCode = U_INVALID_FORMAT_ERROR;
265 0 : return;
266 : }
267 0 : if(baseData == NULL) {
268 : #if defined(COLLUNSAFE_COLL_VERSION) && defined (COLLUNSAFE_SERIALIZE)
269 0 : tailoring.unsafeBackwardSet = new UnicodeSet(unsafe_serializedData, unsafe_serializedCount, UnicodeSet::kSerialized, errorCode);
270 0 : if(tailoring.unsafeBackwardSet == NULL) {
271 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
272 0 : return;
273 0 : } else if (U_FAILURE(errorCode)) {
274 0 : return;
275 : }
276 : #else
277 : // Create the unsafe-backward set for the root collator.
278 : // Include all non-zero combining marks and trail surrogates.
279 : // We do this at load time, rather than at build time,
280 : // to simplify Unicode version bootstrapping:
281 : // The root data builder only needs the new FractionalUCA.txt data,
282 : // but it need not be built with a version of ICU already updated to
283 : // the corresponding new Unicode Character Database.
284 : //
285 : // The following is an optimized version of
286 : // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
287 : // It is faster and requires fewer code dependencies.
288 : tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
289 : if(tailoring.unsafeBackwardSet == NULL) {
290 : errorCode = U_MEMORY_ALLOCATION_ERROR;
291 : return;
292 : }
293 : data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
294 : #endif // !COLLUNSAFE_SERIALIZE || !COLLUNSAFE_COLL_VERSION
295 : } else {
296 : // Clone the root collator's set contents.
297 0 : tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
298 0 : baseData->unsafeBackwardSet->cloneAsThawed());
299 0 : if(tailoring.unsafeBackwardSet == NULL) {
300 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
301 0 : return;
302 : }
303 : }
304 : // Add the ranges from the data file to the unsafe-backward set.
305 : USerializedSet sset;
306 0 : const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
307 0 : if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
308 0 : errorCode = U_INVALID_FORMAT_ERROR;
309 0 : return;
310 : }
311 0 : int32_t count = uset_getSerializedRangeCount(&sset);
312 0 : for(int32_t i = 0; i < count; ++i) {
313 : UChar32 start, end;
314 0 : uset_getSerializedRange(&sset, i, &start, &end);
315 0 : tailoring.unsafeBackwardSet->add(start, end);
316 : }
317 : // Mark each lead surrogate as "unsafe"
318 : // if any of its 1024 associated supplementary code points is "unsafe".
319 0 : UChar32 c = 0x10000;
320 0 : for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
321 0 : if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
322 0 : tailoring.unsafeBackwardSet->add(lead);
323 : }
324 : }
325 0 : tailoring.unsafeBackwardSet->freeze();
326 0 : data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
327 0 : } else if(data == NULL) {
328 : // Nothing to do.
329 0 : } else if(baseData != NULL) {
330 : // No tailoring-specific data: Alias the root collator's set.
331 0 : data->unsafeBackwardSet = baseData->unsafeBackwardSet;
332 : } else {
333 0 : errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
334 0 : return;
335 : }
336 :
337 : // If the fast Latin format version is different,
338 : // or the version is set to 0 for "no fast Latin table",
339 : // then just always use the normal string comparison path.
340 0 : if(data != NULL) {
341 0 : data->fastLatinTable = NULL;
342 0 : data->fastLatinTableLength = 0;
343 0 : if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
344 0 : index = IX_FAST_LATIN_TABLE_OFFSET;
345 0 : offset = getIndex(inIndexes, indexesLength, index);
346 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
347 0 : if(length >= 2) {
348 0 : data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
349 0 : data->fastLatinTableLength = length / 2;
350 0 : if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
351 0 : errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch
352 0 : return;
353 : }
354 0 : } else if(baseData != NULL) {
355 0 : data->fastLatinTable = baseData->fastLatinTable;
356 0 : data->fastLatinTableLength = baseData->fastLatinTableLength;
357 : }
358 : }
359 : }
360 :
361 0 : index = IX_SCRIPTS_OFFSET;
362 0 : offset = getIndex(inIndexes, indexesLength, index);
363 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
364 0 : if(length >= 2) {
365 0 : if(data == NULL) {
366 0 : errorCode = U_INVALID_FORMAT_ERROR;
367 0 : return;
368 : }
369 0 : const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
370 0 : int32_t scriptsLength = length / 2;
371 0 : data->numScripts = scripts[0];
372 : // There must be enough entries for both arrays, including more than two range starts.
373 0 : data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
374 0 : if(data->scriptStartsLength <= 2 ||
375 0 : CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
376 0 : errorCode = U_INVALID_FORMAT_ERROR;
377 0 : return;
378 : }
379 0 : data->scriptsIndex = scripts + 1;
380 0 : data->scriptStarts = scripts + 1 + data->numScripts + 16;
381 0 : if(!(data->scriptStarts[0] == 0 &&
382 0 : data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
383 0 : data->scriptStarts[data->scriptStartsLength - 1] ==
384 : (Collation::TRAIL_WEIGHT_BYTE << 8))) {
385 0 : errorCode = U_INVALID_FORMAT_ERROR;
386 0 : return;
387 : }
388 0 : } else if(data == NULL) {
389 : // Nothing to do.
390 0 : } else if(baseData != NULL) {
391 0 : data->numScripts = baseData->numScripts;
392 0 : data->scriptsIndex = baseData->scriptsIndex;
393 0 : data->scriptStarts = baseData->scriptStarts;
394 0 : data->scriptStartsLength = baseData->scriptStartsLength;
395 : }
396 :
397 0 : index = IX_COMPRESSIBLE_BYTES_OFFSET;
398 0 : offset = getIndex(inIndexes, indexesLength, index);
399 0 : length = getIndex(inIndexes, indexesLength, index + 1) - offset;
400 0 : if(length >= 256) {
401 0 : if(data == NULL) {
402 0 : errorCode = U_INVALID_FORMAT_ERROR;
403 0 : return;
404 : }
405 0 : data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
406 0 : } else if(data == NULL) {
407 : // Nothing to do.
408 0 : } else if(baseData != NULL) {
409 0 : data->compressibleBytes = baseData->compressibleBytes;
410 : } else {
411 0 : errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
412 0 : return;
413 : }
414 :
415 0 : const CollationSettings &ts = *tailoring.settings;
416 0 : int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
417 : uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
418 0 : int32_t fastLatinOptions = CollationFastLatin::getOptions(
419 0 : tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
420 0 : if(options == ts.options && ts.variableTop != 0 &&
421 0 : reorderCodesLength == ts.reorderCodesLength &&
422 0 : uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 &&
423 0 : fastLatinOptions == ts.fastLatinOptions &&
424 0 : (fastLatinOptions < 0 ||
425 0 : uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
426 : sizeof(fastLatinPrimaries)) == 0)) {
427 0 : return;
428 : }
429 :
430 0 : CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
431 0 : if(settings == NULL) {
432 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
433 0 : return;
434 : }
435 0 : settings->options = options;
436 : // Set variableTop from options and scripts data.
437 0 : settings->variableTop = tailoring.data->getLastPrimaryForGroup(
438 0 : UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
439 0 : if(settings->variableTop == 0) {
440 0 : errorCode = U_INVALID_FORMAT_ERROR;
441 0 : return;
442 : }
443 :
444 0 : if(reorderCodesLength != 0) {
445 : settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
446 : reorderRanges, reorderRangesLength,
447 0 : reorderTable, errorCode);
448 : }
449 :
450 0 : settings->fastLatinOptions = CollationFastLatin::getOptions(
451 : tailoring.data, *settings,
452 : settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
453 : }
454 :
455 : UBool U_CALLCONV
456 0 : CollationDataReader::isAcceptable(void *context,
457 : const char * /* type */, const char * /*name*/,
458 : const UDataInfo *pInfo) {
459 0 : if(
460 0 : pInfo->size >= 20 &&
461 0 : pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
462 0 : pInfo->charsetFamily == U_CHARSET_FAMILY &&
463 0 : pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
464 0 : pInfo->dataFormat[1] == 0x43 &&
465 0 : pInfo->dataFormat[2] == 0x6f &&
466 0 : pInfo->dataFormat[3] == 0x6c &&
467 0 : pInfo->formatVersion[0] == 5
468 : ) {
469 0 : UVersionInfo *version = static_cast<UVersionInfo *>(context);
470 0 : if(version != NULL) {
471 0 : uprv_memcpy(version, pInfo->dataVersion, 4);
472 : }
473 0 : return TRUE;
474 : } else {
475 0 : return FALSE;
476 : }
477 : }
478 :
479 : U_NAMESPACE_END
480 :
481 : #endif // !UCONFIG_NO_COLLATION
|