Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : * Copyright (C) 2013-2015, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : *******************************************************************************
8 : * collationdatawriter.cpp
9 : *
10 : * created on: 2013aug06
11 : * created by: Markus W. Scherer
12 : */
13 :
14 : #include "unicode/utypes.h"
15 :
16 : #if !UCONFIG_NO_COLLATION
17 :
18 : #include "unicode/tblcoll.h"
19 : #include "unicode/udata.h"
20 : #include "unicode/uniset.h"
21 : #include "cmemory.h"
22 : #include "collationdata.h"
23 : #include "collationdatabuilder.h"
24 : #include "collationdatareader.h"
25 : #include "collationdatawriter.h"
26 : #include "collationfastlatin.h"
27 : #include "collationsettings.h"
28 : #include "collationtailoring.h"
29 : #include "uassert.h"
30 : #include "ucmndata.h"
31 :
32 : U_NAMESPACE_BEGIN
33 :
34 : uint8_t *
35 0 : RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
36 0 : if(U_FAILURE(errorCode)) { return NULL; }
37 0 : LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
38 0 : if(buffer.isNull()) {
39 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
40 0 : return NULL;
41 : }
42 0 : length = cloneBinary(buffer.getAlias(), 20000, errorCode);
43 0 : if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
44 0 : if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
45 0 : errorCode = U_MEMORY_ALLOCATION_ERROR;
46 0 : return NULL;
47 : }
48 0 : errorCode = U_ZERO_ERROR;
49 0 : length = cloneBinary(buffer.getAlias(), length, errorCode);
50 : }
51 0 : if(U_FAILURE(errorCode)) { return NULL; }
52 0 : return buffer.orphan();
53 : }
54 :
55 : int32_t
56 0 : RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
57 : int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
58 : return CollationDataWriter::writeTailoring(
59 0 : *tailoring, *settings, indexes, dest, capacity,
60 0 : errorCode);
61 : }
62 :
63 : static const UDataInfo dataInfo = {
64 : sizeof(UDataInfo),
65 : 0,
66 :
67 : U_IS_BIG_ENDIAN,
68 : U_CHARSET_FAMILY,
69 : U_SIZEOF_UCHAR,
70 : 0,
71 :
72 : { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
73 : { 5, 0, 0, 0 }, // formatVersion
74 : { 6, 3, 0, 0 } // dataVersion
75 : };
76 :
77 : int32_t
78 0 : CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
79 : const void *rootElements, int32_t rootElementsLength,
80 : int32_t indexes[], uint8_t *dest, int32_t capacity,
81 : UErrorCode &errorCode) {
82 : return write(TRUE, NULL,
83 : data, settings,
84 : rootElements, rootElementsLength,
85 0 : indexes, dest, capacity, errorCode);
86 : }
87 :
88 : int32_t
89 0 : CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
90 : int32_t indexes[], uint8_t *dest, int32_t capacity,
91 : UErrorCode &errorCode) {
92 0 : return write(FALSE, t.version,
93 0 : *t.data, settings,
94 : NULL, 0,
95 0 : indexes, dest, capacity, errorCode);
96 : }
97 :
98 : int32_t
99 0 : CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
100 : const CollationData &data, const CollationSettings &settings,
101 : const void *rootElements, int32_t rootElementsLength,
102 : int32_t indexes[], uint8_t *dest, int32_t capacity,
103 : UErrorCode &errorCode) {
104 0 : if(U_FAILURE(errorCode)) { return 0; }
105 0 : if(capacity < 0 || (capacity > 0 && dest == NULL)) {
106 0 : errorCode = U_ILLEGAL_ARGUMENT_ERROR;
107 0 : return 0;
108 : }
109 :
110 : // Figure out which data items to write before settling on
111 : // the indexes length and writing offsets.
112 : // For any data item, we need to write the start and limit offsets,
113 : // so the indexes length must be at least index-of-start-offset + 2.
114 : int32_t indexesLength;
115 : UBool hasMappings;
116 0 : UnicodeSet unsafeBackwardSet;
117 0 : const CollationData *baseData = data.base;
118 :
119 : int32_t fastLatinVersion;
120 0 : if(data.fastLatinTable != NULL) {
121 0 : fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
122 : } else {
123 0 : fastLatinVersion = 0;
124 : }
125 0 : int32_t fastLatinTableLength = 0;
126 :
127 0 : if(isBase) {
128 : // For the root collator, we write an even number of indexes
129 : // so that we start with an 8-aligned offset.
130 0 : indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
131 0 : U_ASSERT(settings.reorderCodesLength == 0);
132 0 : hasMappings = TRUE;
133 0 : unsafeBackwardSet = *data.unsafeBackwardSet;
134 0 : fastLatinTableLength = data.fastLatinTableLength;
135 0 : } else if(baseData == NULL) {
136 0 : hasMappings = FALSE;
137 0 : if(settings.reorderCodesLength == 0) {
138 : // only options
139 0 : indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here
140 : } else {
141 : // only options, reorder codes, and the reorder table
142 0 : indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
143 : }
144 : } else {
145 0 : hasMappings = TRUE;
146 : // Tailored mappings, and what else?
147 : // Check in ascending order of optional tailoring data items.
148 0 : indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
149 0 : if(data.contextsLength != 0) {
150 0 : indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
151 : }
152 0 : unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
153 0 : if(!unsafeBackwardSet.isEmpty()) {
154 0 : indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
155 : }
156 0 : if(data.fastLatinTable != baseData->fastLatinTable) {
157 0 : fastLatinTableLength = data.fastLatinTableLength;
158 0 : indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
159 : }
160 : }
161 :
162 0 : UVector32 codesAndRanges(errorCode);
163 0 : const int32_t *reorderCodes = settings.reorderCodes;
164 0 : int32_t reorderCodesLength = settings.reorderCodesLength;
165 0 : if(settings.hasReordering() &&
166 0 : CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
167 : // Rebuild the full list of reorder ranges.
168 : // The list in the settings is truncated for efficiency.
169 0 : data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
170 : // Write the codes, then the ranges.
171 0 : for(int32_t i = 0; i < reorderCodesLength; ++i) {
172 0 : codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
173 : }
174 0 : if(U_FAILURE(errorCode)) { return 0; }
175 0 : reorderCodes = codesAndRanges.getBuffer();
176 0 : reorderCodesLength = codesAndRanges.size();
177 : }
178 :
179 : int32_t headerSize;
180 0 : if(isBase) {
181 0 : headerSize = 0; // udata_create() writes the header
182 : } else {
183 : DataHeader header;
184 0 : header.dataHeader.magic1 = 0xda;
185 0 : header.dataHeader.magic2 = 0x27;
186 0 : uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
187 0 : uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
188 0 : headerSize = (int32_t)sizeof(header);
189 0 : U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes
190 0 : if(hasMappings && data.cesLength != 0) {
191 : // Sum of the sizes of the data items which are
192 : // not automatically multiples of 8 bytes and which are placed before the CEs.
193 0 : int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
194 0 : if((sum & 7) != 0) {
195 : // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
196 : // We add to the header size here.
197 : // Alternatively, we could increment the indexesLength
198 : // or add a few bytes to the reorderTable.
199 0 : headerSize += 4;
200 : }
201 : }
202 0 : header.dataHeader.headerSize = (uint16_t)headerSize;
203 0 : if(headerSize <= capacity) {
204 0 : uprv_memcpy(dest, &header, sizeof(header));
205 : // Write 00 bytes so that the padding is not mistaken for a copyright string.
206 0 : uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
207 0 : dest += headerSize;
208 0 : capacity -= headerSize;
209 : } else {
210 0 : dest = NULL;
211 0 : capacity = 0;
212 : }
213 : }
214 :
215 0 : indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
216 0 : U_ASSERT((settings.options & ~0xffff) == 0);
217 0 : indexes[CollationDataReader::IX_OPTIONS] =
218 0 : data.numericPrimary | fastLatinVersion | settings.options;
219 0 : indexes[CollationDataReader::IX_RESERVED2] = 0;
220 0 : indexes[CollationDataReader::IX_RESERVED3] = 0;
221 :
222 : // Byte offsets of data items all start from the start of the indexes.
223 : // We add the headerSize at the very end.
224 0 : int32_t totalSize = indexesLength * 4;
225 :
226 0 : if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
227 0 : indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
228 : } else {
229 0 : indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
230 : }
231 :
232 0 : indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
233 0 : totalSize += reorderCodesLength * 4;
234 :
235 0 : indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
236 0 : if(settings.reorderTable != NULL) {
237 0 : totalSize += 256;
238 : }
239 :
240 0 : indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
241 0 : if(hasMappings) {
242 0 : UErrorCode errorCode2 = U_ZERO_ERROR;
243 : int32_t length;
244 0 : if(totalSize < capacity) {
245 0 : length = utrie2_serialize(data.trie, dest + totalSize,
246 0 : capacity - totalSize, &errorCode2);
247 : } else {
248 0 : length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
249 : }
250 0 : if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
251 0 : errorCode = errorCode2;
252 0 : return 0;
253 : }
254 : // The trie size should be a multiple of 8 bytes due to the way
255 : // compactIndex2(UNewTrie2 *trie) currently works.
256 0 : U_ASSERT((length & 7) == 0);
257 0 : totalSize += length;
258 : }
259 :
260 0 : indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
261 0 : indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
262 0 : if(hasMappings && data.cesLength != 0) {
263 0 : U_ASSERT(((headerSize + totalSize) & 7) == 0);
264 0 : totalSize += data.cesLength * 8;
265 : }
266 :
267 0 : indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
268 0 : indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
269 0 : if(hasMappings) {
270 0 : totalSize += data.ce32sLength * 4;
271 : }
272 :
273 0 : indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
274 0 : totalSize += rootElementsLength * 4;
275 :
276 0 : indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
277 0 : if(hasMappings) {
278 0 : totalSize += data.contextsLength * 2;
279 : }
280 :
281 0 : indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
282 0 : if(hasMappings && !unsafeBackwardSet.isEmpty()) {
283 0 : UErrorCode errorCode2 = U_ZERO_ERROR;
284 : int32_t length;
285 0 : if(totalSize < capacity) {
286 0 : uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
287 0 : length = unsafeBackwardSet.serialize(
288 0 : p, (capacity - totalSize) / 2, errorCode2);
289 : } else {
290 0 : length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
291 : }
292 0 : if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
293 0 : errorCode = errorCode2;
294 0 : return 0;
295 : }
296 0 : totalSize += length * 2;
297 : }
298 :
299 0 : indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
300 0 : totalSize += fastLatinTableLength * 2;
301 :
302 0 : UnicodeString scripts;
303 0 : indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
304 0 : if(isBase) {
305 0 : scripts.append((UChar)data.numScripts);
306 0 : scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
307 0 : scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
308 0 : totalSize += scripts.length() * 2;
309 : }
310 :
311 0 : indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
312 0 : if(isBase) {
313 0 : totalSize += 256;
314 : }
315 :
316 0 : indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
317 0 : indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
318 :
319 0 : if(totalSize > capacity) {
320 0 : errorCode = U_BUFFER_OVERFLOW_ERROR;
321 0 : return headerSize + totalSize;
322 : }
323 :
324 0 : uprv_memcpy(dest, indexes, indexesLength * 4);
325 0 : copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
326 0 : copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
327 : // The trie has already been serialized into the dest buffer.
328 0 : copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
329 0 : copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
330 0 : copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
331 0 : copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
332 : // The unsafeBackwardSet has already been serialized into the dest buffer.
333 0 : copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
334 0 : copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
335 0 : copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
336 :
337 0 : return headerSize + totalSize;
338 : }
339 :
340 : void
341 0 : CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
342 : const void *src, uint8_t *dest) {
343 0 : int32_t start = indexes[startIndex];
344 0 : int32_t limit = indexes[startIndex + 1];
345 0 : if(start < limit) {
346 0 : uprv_memcpy(dest + start, src, limit - start);
347 : }
348 0 : }
349 :
350 : U_NAMESPACE_END
351 :
352 : #endif // !UCONFIG_NO_COLLATION
|