Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 2001-2014, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : * file name: utrie2_builder.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2008sep26 (split off from utrie2.c)
16 : * created by: Markus W. Scherer
17 : *
18 : * This is a common implementation of a Unicode trie.
19 : * It is a kind of compressed, serializable table of 16- or 32-bit values associated with
20 : * Unicode code points (0..0x10ffff).
21 : * This is the second common version of a Unicode trie (hence the name UTrie2).
22 : * See utrie2.h for a comparison.
23 : *
24 : * This file contains only the builder code.
25 : * See utrie2.c for the runtime and enumeration code.
26 : */
27 : #ifdef UTRIE2_DEBUG
28 : # include <stdio.h>
29 : #endif
30 :
31 : #include "unicode/utypes.h"
32 : #include "cmemory.h"
33 : #include "utrie2.h"
34 : #include "utrie2_impl.h"
35 :
36 : #include "utrie.h" /* for utrie2_fromUTrie() and utrie_swap() */
37 :
38 : /* Implementation notes ----------------------------------------------------- */
39 :
40 : /*
41 : * The UTRIE2_SHIFT_1, UTRIE2_SHIFT_2, UTRIE2_INDEX_SHIFT and other values
42 : * have been chosen to minimize trie sizes overall.
43 : * Most of the code is flexible enough to work with a range of values,
44 : * within certain limits.
45 : *
46 : * Exception: Support for separate values for lead surrogate code _units_
47 : * vs. code _points_ was added after the constants were fixed,
48 : * and has not been tested nor particularly designed for different constant values.
49 : * (Especially the utrie2_enum() code that jumps to the special LSCP index-2
50 : * part and back.)
51 : *
52 : * Requires UTRIE2_SHIFT_2<=6. Otherwise 0xc0 which is the top of the ASCII-linear data
53 : * including the bad-UTF-8-data block is not a multiple of UTRIE2_DATA_BLOCK_LENGTH
54 : * and map[block>>UTRIE2_SHIFT_2] (used in reference counting and compaction
55 : * remapping) stops working.
56 : *
57 : * Requires UTRIE2_SHIFT_1>=10 because utrie2_enumForLeadSurrogate()
58 : * assumes that a single index-2 block is used for 0x400 code points
59 : * corresponding to one lead surrogate.
60 : *
61 : * Requires UTRIE2_SHIFT_1<=16. Otherwise one single index-2 block contains
62 : * more than one Unicode plane, and the split of the index-2 table into a BMP
63 : * part and a supplementary part, with a gap in between, would not work.
64 : *
65 : * Requires UTRIE2_INDEX_SHIFT>=1 not because of the code but because
66 : * there is data with more than 64k distinct values,
67 : * for example for Unihan collation with a separate collation weight per
68 : * Han character.
69 : */
70 :
71 : /* Building a trie ----------------------------------------------------------*/
72 :
73 : enum {
74 : /** The null index-2 block, following the gap in the index-2 table. */
75 : UNEWTRIE2_INDEX_2_NULL_OFFSET=UNEWTRIE2_INDEX_GAP_OFFSET+UNEWTRIE2_INDEX_GAP_LENGTH,
76 :
77 : /** The start of allocated index-2 blocks. */
78 : UNEWTRIE2_INDEX_2_START_OFFSET=UNEWTRIE2_INDEX_2_NULL_OFFSET+UTRIE2_INDEX_2_BLOCK_LENGTH,
79 :
80 : /**
81 : * The null data block.
82 : * Length 64=0x40 even if UTRIE2_DATA_BLOCK_LENGTH is smaller,
83 : * to work with 6-bit trail bytes from 2-byte UTF-8.
84 : */
85 : UNEWTRIE2_DATA_NULL_OFFSET=UTRIE2_DATA_START_OFFSET,
86 :
87 : /** The start of allocated data blocks. */
88 : UNEWTRIE2_DATA_START_OFFSET=UNEWTRIE2_DATA_NULL_OFFSET+0x40,
89 :
90 : /**
91 : * The start of data blocks for U+0800 and above.
92 : * Below, compaction uses a block length of 64 for 2-byte UTF-8.
93 : * From here on, compaction uses UTRIE2_DATA_BLOCK_LENGTH.
94 : * Data values for 0x780 code points beyond ASCII.
95 : */
96 : UNEWTRIE2_DATA_0800_OFFSET=UNEWTRIE2_DATA_START_OFFSET+0x780
97 : };
98 :
99 : /* Start with allocation of 16k data entries. */
100 : #define UNEWTRIE2_INITIAL_DATA_LENGTH ((int32_t)1<<14)
101 :
102 : /* Grow about 8x each time. */
103 : #define UNEWTRIE2_MEDIUM_DATA_LENGTH ((int32_t)1<<17)
104 :
105 : static int32_t
106 : allocIndex2Block(UNewTrie2 *trie);
107 :
108 : U_CAPI UTrie2 * U_EXPORT2
109 0 : utrie2_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
110 : UTrie2 *trie;
111 : UNewTrie2 *newTrie;
112 : uint32_t *data;
113 : int32_t i, j;
114 :
115 0 : if(U_FAILURE(*pErrorCode)) {
116 0 : return NULL;
117 : }
118 :
119 0 : trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
120 0 : newTrie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
121 0 : data=(uint32_t *)uprv_malloc(UNEWTRIE2_INITIAL_DATA_LENGTH*4);
122 0 : if(trie==NULL || newTrie==NULL || data==NULL) {
123 0 : uprv_free(trie);
124 0 : uprv_free(newTrie);
125 0 : uprv_free(data);
126 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
127 0 : return 0;
128 : }
129 :
130 0 : uprv_memset(trie, 0, sizeof(UTrie2));
131 0 : trie->initialValue=initialValue;
132 0 : trie->errorValue=errorValue;
133 0 : trie->highStart=0x110000;
134 0 : trie->newTrie=newTrie;
135 :
136 0 : newTrie->data=data;
137 0 : newTrie->dataCapacity=UNEWTRIE2_INITIAL_DATA_LENGTH;
138 0 : newTrie->initialValue=initialValue;
139 0 : newTrie->errorValue=errorValue;
140 0 : newTrie->highStart=0x110000;
141 0 : newTrie->firstFreeBlock=0; /* no free block in the list */
142 0 : newTrie->isCompacted=FALSE;
143 :
144 : /*
145 : * preallocate and reset
146 : * - ASCII
147 : * - the bad-UTF-8-data block
148 : * - the null data block
149 : */
150 0 : for(i=0; i<0x80; ++i) {
151 0 : newTrie->data[i]=initialValue;
152 : }
153 0 : for(; i<0xc0; ++i) {
154 0 : newTrie->data[i]=errorValue;
155 : }
156 0 : for(i=UNEWTRIE2_DATA_NULL_OFFSET; i<UNEWTRIE2_DATA_START_OFFSET; ++i) {
157 0 : newTrie->data[i]=initialValue;
158 : }
159 0 : newTrie->dataNullOffset=UNEWTRIE2_DATA_NULL_OFFSET;
160 0 : newTrie->dataLength=UNEWTRIE2_DATA_START_OFFSET;
161 :
162 : /* set the index-2 indexes for the 2=0x80>>UTRIE2_SHIFT_2 ASCII data blocks */
163 0 : for(i=0, j=0; j<0x80; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
164 0 : newTrie->index2[i]=j;
165 0 : newTrie->map[i]=1;
166 : }
167 : /* reference counts for the bad-UTF-8-data block */
168 0 : for(; j<0xc0; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
169 0 : newTrie->map[i]=0;
170 : }
171 : /*
172 : * Reference counts for the null data block: all blocks except for the ASCII blocks.
173 : * Plus 1 so that we don't drop this block during compaction.
174 : * Plus as many as needed for lead surrogate code points.
175 : */
176 : /* i==newTrie->dataNullOffset */
177 0 : newTrie->map[i++]=
178 : (0x110000>>UTRIE2_SHIFT_2)-
179 : (0x80>>UTRIE2_SHIFT_2)+
180 : 1+
181 : UTRIE2_LSCP_INDEX_2_LENGTH;
182 0 : j+=UTRIE2_DATA_BLOCK_LENGTH;
183 0 : for(; j<UNEWTRIE2_DATA_START_OFFSET; ++i, j+=UTRIE2_DATA_BLOCK_LENGTH) {
184 0 : newTrie->map[i]=0;
185 : }
186 :
187 : /*
188 : * set the remaining indexes in the BMP index-2 block
189 : * to the null data block
190 : */
191 0 : for(i=0x80>>UTRIE2_SHIFT_2; i<UTRIE2_INDEX_2_BMP_LENGTH; ++i) {
192 0 : newTrie->index2[i]=UNEWTRIE2_DATA_NULL_OFFSET;
193 : }
194 :
195 : /*
196 : * Fill the index gap with impossible values so that compaction
197 : * does not overlap other index-2 blocks with the gap.
198 : */
199 0 : for(i=0; i<UNEWTRIE2_INDEX_GAP_LENGTH; ++i) {
200 0 : newTrie->index2[UNEWTRIE2_INDEX_GAP_OFFSET+i]=-1;
201 : }
202 :
203 : /* set the indexes in the null index-2 block */
204 0 : for(i=0; i<UTRIE2_INDEX_2_BLOCK_LENGTH; ++i) {
205 0 : newTrie->index2[UNEWTRIE2_INDEX_2_NULL_OFFSET+i]=UNEWTRIE2_DATA_NULL_OFFSET;
206 : }
207 0 : newTrie->index2NullOffset=UNEWTRIE2_INDEX_2_NULL_OFFSET;
208 0 : newTrie->index2Length=UNEWTRIE2_INDEX_2_START_OFFSET;
209 :
210 : /* set the index-1 indexes for the linear index-2 block */
211 0 : for(i=0, j=0;
212 0 : i<UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
213 0 : ++i, j+=UTRIE2_INDEX_2_BLOCK_LENGTH
214 : ) {
215 0 : newTrie->index1[i]=j;
216 : }
217 :
218 : /* set the remaining index-1 indexes to the null index-2 block */
219 0 : for(; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
220 0 : newTrie->index1[i]=UNEWTRIE2_INDEX_2_NULL_OFFSET;
221 : }
222 :
223 : /*
224 : * Preallocate and reset data for U+0080..U+07ff,
225 : * for 2-byte UTF-8 which will be compacted in 64-blocks
226 : * even if UTRIE2_DATA_BLOCK_LENGTH is smaller.
227 : */
228 0 : for(i=0x80; i<0x800; i+=UTRIE2_DATA_BLOCK_LENGTH) {
229 0 : utrie2_set32(trie, i, initialValue, pErrorCode);
230 : }
231 :
232 0 : return trie;
233 : }
234 :
235 : static UNewTrie2 *
236 0 : cloneBuilder(const UNewTrie2 *other) {
237 : UNewTrie2 *trie;
238 :
239 0 : trie=(UNewTrie2 *)uprv_malloc(sizeof(UNewTrie2));
240 0 : if(trie==NULL) {
241 0 : return NULL;
242 : }
243 :
244 0 : trie->data=(uint32_t *)uprv_malloc(other->dataCapacity*4);
245 0 : if(trie->data==NULL) {
246 0 : uprv_free(trie);
247 0 : return NULL;
248 : }
249 0 : trie->dataCapacity=other->dataCapacity;
250 :
251 : /* clone data */
252 0 : uprv_memcpy(trie->index1, other->index1, sizeof(trie->index1));
253 0 : uprv_memcpy(trie->index2, other->index2, (size_t)other->index2Length*4);
254 0 : trie->index2NullOffset=other->index2NullOffset;
255 0 : trie->index2Length=other->index2Length;
256 :
257 0 : uprv_memcpy(trie->data, other->data, (size_t)other->dataLength*4);
258 0 : trie->dataNullOffset=other->dataNullOffset;
259 0 : trie->dataLength=other->dataLength;
260 :
261 : /* reference counters */
262 0 : if(other->isCompacted) {
263 0 : trie->firstFreeBlock=0;
264 : } else {
265 0 : uprv_memcpy(trie->map, other->map, ((size_t)other->dataLength>>UTRIE2_SHIFT_2)*4);
266 0 : trie->firstFreeBlock=other->firstFreeBlock;
267 : }
268 :
269 0 : trie->initialValue=other->initialValue;
270 0 : trie->errorValue=other->errorValue;
271 0 : trie->highStart=other->highStart;
272 0 : trie->isCompacted=other->isCompacted;
273 :
274 0 : return trie;
275 : }
276 :
277 : U_CAPI UTrie2 * U_EXPORT2
278 0 : utrie2_clone(const UTrie2 *other, UErrorCode *pErrorCode) {
279 : UTrie2 *trie;
280 :
281 0 : if(U_FAILURE(*pErrorCode)) {
282 0 : return NULL;
283 : }
284 0 : if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
285 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
286 0 : return NULL;
287 : }
288 :
289 0 : trie=(UTrie2 *)uprv_malloc(sizeof(UTrie2));
290 0 : if(trie==NULL) {
291 0 : return NULL;
292 : }
293 0 : uprv_memcpy(trie, other, sizeof(UTrie2));
294 :
295 0 : if(other->memory!=NULL) {
296 0 : trie->memory=uprv_malloc(other->length);
297 0 : if(trie->memory!=NULL) {
298 0 : trie->isMemoryOwned=TRUE;
299 0 : uprv_memcpy(trie->memory, other->memory, other->length);
300 :
301 : /* make the clone's pointers point to its own memory */
302 0 : trie->index=(uint16_t *)trie->memory+(other->index-(uint16_t *)other->memory);
303 0 : if(other->data16!=NULL) {
304 0 : trie->data16=(uint16_t *)trie->memory+(other->data16-(uint16_t *)other->memory);
305 : }
306 0 : if(other->data32!=NULL) {
307 0 : trie->data32=(uint32_t *)trie->memory+(other->data32-(uint32_t *)other->memory);
308 : }
309 : }
310 : } else /* other->newTrie!=NULL */ {
311 0 : trie->newTrie=cloneBuilder(other->newTrie);
312 : }
313 :
314 0 : if(trie->memory==NULL && trie->newTrie==NULL) {
315 0 : uprv_free(trie);
316 0 : trie=NULL;
317 : }
318 0 : return trie;
319 : }
320 :
321 : typedef struct NewTrieAndStatus {
322 : UTrie2 *trie;
323 : UErrorCode errorCode;
324 : UBool exclusiveLimit; /* rather than inclusive range end */
325 : } NewTrieAndStatus;
326 :
327 : static UBool U_CALLCONV
328 0 : copyEnumRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
329 0 : NewTrieAndStatus *nt=(NewTrieAndStatus *)context;
330 0 : if(value!=nt->trie->initialValue) {
331 0 : if(nt->exclusiveLimit) {
332 0 : --end;
333 : }
334 0 : if(start==end) {
335 0 : utrie2_set32(nt->trie, start, value, &nt->errorCode);
336 : } else {
337 0 : utrie2_setRange32(nt->trie, start, end, value, TRUE, &nt->errorCode);
338 : }
339 0 : return U_SUCCESS(nt->errorCode);
340 : } else {
341 0 : return TRUE;
342 : }
343 : }
344 :
345 : #ifdef UTRIE2_DEBUG
346 : static void
347 : utrie_printLengths(const UTrie *trie) {
348 : long indexLength=trie->indexLength;
349 : long dataLength=(long)trie->dataLength;
350 : long totalLength=(long)sizeof(UTrieHeader)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
351 : printf("**UTrieLengths** index:%6ld data:%6ld serialized:%6ld\n",
352 : indexLength, dataLength, totalLength);
353 : }
354 :
355 : static void
356 : utrie2_printLengths(const UTrie2 *trie, const char *which) {
357 : long indexLength=trie->indexLength;
358 : long dataLength=(long)trie->dataLength;
359 : long totalLength=(long)sizeof(UTrie2Header)+indexLength*2+dataLength*(trie->data32!=NULL ? 4 : 2);
360 : printf("**UTrie2Lengths(%s)** index:%6ld data:%6ld serialized:%6ld\n",
361 : which, indexLength, dataLength, totalLength);
362 : }
363 : #endif
364 :
365 : U_CAPI UTrie2 * U_EXPORT2
366 0 : utrie2_cloneAsThawed(const UTrie2 *other, UErrorCode *pErrorCode) {
367 : NewTrieAndStatus context;
368 : UChar lead;
369 :
370 0 : if(U_FAILURE(*pErrorCode)) {
371 0 : return NULL;
372 : }
373 0 : if(other==NULL || (other->memory==NULL && other->newTrie==NULL)) {
374 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
375 0 : return NULL;
376 : }
377 0 : if(other->newTrie!=NULL && !other->newTrie->isCompacted) {
378 0 : return utrie2_clone(other, pErrorCode); /* clone an unfrozen trie */
379 : }
380 :
381 : /* Clone the frozen trie by enumerating it and building a new one. */
382 0 : context.trie=utrie2_open(other->initialValue, other->errorValue, pErrorCode);
383 0 : if(U_FAILURE(*pErrorCode)) {
384 0 : return NULL;
385 : }
386 0 : context.exclusiveLimit=FALSE;
387 0 : context.errorCode=*pErrorCode;
388 0 : utrie2_enum(other, NULL, copyEnumRange, &context);
389 0 : *pErrorCode=context.errorCode;
390 0 : for(lead=0xd800; lead<0xdc00; ++lead) {
391 : uint32_t value;
392 0 : if(other->data32==NULL) {
393 0 : value=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(other, lead);
394 : } else {
395 0 : value=UTRIE2_GET32_FROM_U16_SINGLE_LEAD(other, lead);
396 : }
397 0 : if(value!=other->initialValue) {
398 0 : utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
399 : }
400 : }
401 0 : if(U_FAILURE(*pErrorCode)) {
402 0 : utrie2_close(context.trie);
403 0 : context.trie=NULL;
404 : }
405 0 : return context.trie;
406 : }
407 :
408 : /* Almost the same as utrie2_cloneAsThawed() but copies a UTrie and freezes the clone. */
409 : U_CAPI UTrie2 * U_EXPORT2
410 0 : utrie2_fromUTrie(const UTrie *trie1, uint32_t errorValue, UErrorCode *pErrorCode) {
411 : NewTrieAndStatus context;
412 : UChar lead;
413 :
414 0 : if(U_FAILURE(*pErrorCode)) {
415 0 : return NULL;
416 : }
417 0 : if(trie1==NULL) {
418 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
419 0 : return NULL;
420 : }
421 0 : context.trie=utrie2_open(trie1->initialValue, errorValue, pErrorCode);
422 0 : if(U_FAILURE(*pErrorCode)) {
423 0 : return NULL;
424 : }
425 0 : context.exclusiveLimit=TRUE;
426 0 : context.errorCode=*pErrorCode;
427 0 : utrie_enum(trie1, NULL, copyEnumRange, &context);
428 0 : *pErrorCode=context.errorCode;
429 0 : for(lead=0xd800; lead<0xdc00; ++lead) {
430 : uint32_t value;
431 0 : if(trie1->data32==NULL) {
432 0 : value=UTRIE_GET16_FROM_LEAD(trie1, lead);
433 : } else {
434 0 : value=UTRIE_GET32_FROM_LEAD(trie1, lead);
435 : }
436 0 : if(value!=trie1->initialValue) {
437 0 : utrie2_set32ForLeadSurrogateCodeUnit(context.trie, lead, value, pErrorCode);
438 : }
439 : }
440 0 : if(U_SUCCESS(*pErrorCode)) {
441 0 : utrie2_freeze(context.trie,
442 0 : trie1->data32!=NULL ? UTRIE2_32_VALUE_BITS : UTRIE2_16_VALUE_BITS,
443 0 : pErrorCode);
444 : }
445 : #ifdef UTRIE2_DEBUG
446 : if(U_SUCCESS(*pErrorCode)) {
447 : utrie_printLengths(trie1);
448 : utrie2_printLengths(context.trie, "fromUTrie");
449 : }
450 : #endif
451 0 : if(U_FAILURE(*pErrorCode)) {
452 0 : utrie2_close(context.trie);
453 0 : context.trie=NULL;
454 : }
455 0 : return context.trie;
456 : }
457 :
458 : static inline UBool
459 0 : isInNullBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
460 : int32_t i2, block;
461 :
462 0 : if(U_IS_LEAD(c) && forLSCP) {
463 0 : i2=(UTRIE2_LSCP_INDEX_2_OFFSET-(0xd800>>UTRIE2_SHIFT_2))+
464 0 : (c>>UTRIE2_SHIFT_2);
465 : } else {
466 0 : i2=trie->index1[c>>UTRIE2_SHIFT_1]+
467 0 : ((c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK);
468 : }
469 0 : block=trie->index2[i2];
470 0 : return (UBool)(block==trie->dataNullOffset);
471 : }
472 :
473 : static int32_t
474 0 : allocIndex2Block(UNewTrie2 *trie) {
475 : int32_t newBlock, newTop;
476 :
477 0 : newBlock=trie->index2Length;
478 0 : newTop=newBlock+UTRIE2_INDEX_2_BLOCK_LENGTH;
479 0 : if(newTop>UPRV_LENGTHOF(trie->index2)) {
480 : /*
481 : * Should never occur.
482 : * Either UTRIE2_MAX_BUILD_TIME_INDEX_LENGTH is incorrect,
483 : * or the code writes more values than should be possible.
484 : */
485 0 : return -1;
486 : }
487 0 : trie->index2Length=newTop;
488 0 : uprv_memcpy(trie->index2+newBlock, trie->index2+trie->index2NullOffset, UTRIE2_INDEX_2_BLOCK_LENGTH*4);
489 0 : return newBlock;
490 : }
491 :
492 : static int32_t
493 0 : getIndex2Block(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
494 : int32_t i1, i2;
495 :
496 0 : if(U_IS_LEAD(c) && forLSCP) {
497 0 : return UTRIE2_LSCP_INDEX_2_OFFSET;
498 : }
499 :
500 0 : i1=c>>UTRIE2_SHIFT_1;
501 0 : i2=trie->index1[i1];
502 0 : if(i2==trie->index2NullOffset) {
503 0 : i2=allocIndex2Block(trie);
504 0 : if(i2<0) {
505 0 : return -1; /* program error */
506 : }
507 0 : trie->index1[i1]=i2;
508 : }
509 0 : return i2;
510 : }
511 :
512 : static int32_t
513 0 : allocDataBlock(UNewTrie2 *trie, int32_t copyBlock) {
514 : int32_t newBlock, newTop;
515 :
516 0 : if(trie->firstFreeBlock!=0) {
517 : /* get the first free block */
518 0 : newBlock=trie->firstFreeBlock;
519 0 : trie->firstFreeBlock=-trie->map[newBlock>>UTRIE2_SHIFT_2];
520 : } else {
521 : /* get a new block from the high end */
522 0 : newBlock=trie->dataLength;
523 0 : newTop=newBlock+UTRIE2_DATA_BLOCK_LENGTH;
524 0 : if(newTop>trie->dataCapacity) {
525 : /* out of memory in the data array */
526 : int32_t capacity;
527 : uint32_t *data;
528 :
529 0 : if(trie->dataCapacity<UNEWTRIE2_MEDIUM_DATA_LENGTH) {
530 0 : capacity=UNEWTRIE2_MEDIUM_DATA_LENGTH;
531 0 : } else if(trie->dataCapacity<UNEWTRIE2_MAX_DATA_LENGTH) {
532 0 : capacity=UNEWTRIE2_MAX_DATA_LENGTH;
533 : } else {
534 : /*
535 : * Should never occur.
536 : * Either UNEWTRIE2_MAX_DATA_LENGTH is incorrect,
537 : * or the code writes more values than should be possible.
538 : */
539 0 : return -1;
540 : }
541 0 : data=(uint32_t *)uprv_malloc(capacity*4);
542 0 : if(data==NULL) {
543 0 : return -1;
544 : }
545 0 : uprv_memcpy(data, trie->data, (size_t)trie->dataLength*4);
546 0 : uprv_free(trie->data);
547 0 : trie->data=data;
548 0 : trie->dataCapacity=capacity;
549 : }
550 0 : trie->dataLength=newTop;
551 : }
552 0 : uprv_memcpy(trie->data+newBlock, trie->data+copyBlock, UTRIE2_DATA_BLOCK_LENGTH*4);
553 0 : trie->map[newBlock>>UTRIE2_SHIFT_2]=0;
554 0 : return newBlock;
555 : }
556 :
557 : /* call when the block's reference counter reaches 0 */
558 : static void
559 0 : releaseDataBlock(UNewTrie2 *trie, int32_t block) {
560 : /* put this block at the front of the free-block chain */
561 0 : trie->map[block>>UTRIE2_SHIFT_2]=-trie->firstFreeBlock;
562 0 : trie->firstFreeBlock=block;
563 0 : }
564 :
565 : static inline UBool
566 0 : isWritableBlock(UNewTrie2 *trie, int32_t block) {
567 0 : return (UBool)(block!=trie->dataNullOffset && 1==trie->map[block>>UTRIE2_SHIFT_2]);
568 : }
569 :
570 : static inline void
571 0 : setIndex2Entry(UNewTrie2 *trie, int32_t i2, int32_t block) {
572 : int32_t oldBlock;
573 0 : ++trie->map[block>>UTRIE2_SHIFT_2]; /* increment first, in case block==oldBlock! */
574 0 : oldBlock=trie->index2[i2];
575 0 : if(0 == --trie->map[oldBlock>>UTRIE2_SHIFT_2]) {
576 0 : releaseDataBlock(trie, oldBlock);
577 : }
578 0 : trie->index2[i2]=block;
579 0 : }
580 :
581 : /**
582 : * No error checking for illegal arguments.
583 : *
584 : * @return -1 if no new data block available (out of memory in data array)
585 : * @internal
586 : */
587 : static int32_t
588 0 : getDataBlock(UNewTrie2 *trie, UChar32 c, UBool forLSCP) {
589 : int32_t i2, oldBlock, newBlock;
590 :
591 0 : i2=getIndex2Block(trie, c, forLSCP);
592 0 : if(i2<0) {
593 0 : return -1; /* program error */
594 : }
595 :
596 0 : i2+=(c>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
597 0 : oldBlock=trie->index2[i2];
598 0 : if(isWritableBlock(trie, oldBlock)) {
599 0 : return oldBlock;
600 : }
601 :
602 : /* allocate a new data block */
603 0 : newBlock=allocDataBlock(trie, oldBlock);
604 0 : if(newBlock<0) {
605 : /* out of memory in the data array */
606 0 : return -1;
607 : }
608 0 : setIndex2Entry(trie, i2, newBlock);
609 0 : return newBlock;
610 : }
611 :
612 : /**
613 : * @return TRUE if the value was successfully set
614 : */
615 : static void
616 0 : set32(UNewTrie2 *trie,
617 : UChar32 c, UBool forLSCP, uint32_t value,
618 : UErrorCode *pErrorCode) {
619 : int32_t block;
620 :
621 0 : if(trie==NULL || trie->isCompacted) {
622 0 : *pErrorCode=U_NO_WRITE_PERMISSION;
623 0 : return;
624 : }
625 :
626 0 : block=getDataBlock(trie, c, forLSCP);
627 0 : if(block<0) {
628 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
629 0 : return;
630 : }
631 :
632 0 : trie->data[block+(c&UTRIE2_DATA_MASK)]=value;
633 : }
634 :
635 : U_CAPI void U_EXPORT2
636 0 : utrie2_set32(UTrie2 *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
637 0 : if(U_FAILURE(*pErrorCode)) {
638 0 : return;
639 : }
640 0 : if((uint32_t)c>0x10ffff) {
641 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
642 0 : return;
643 : }
644 0 : set32(trie->newTrie, c, TRUE, value, pErrorCode);
645 : }
646 :
647 : U_CAPI void U_EXPORT2
648 0 : utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
649 : UChar32 c, uint32_t value,
650 : UErrorCode *pErrorCode) {
651 0 : if(U_FAILURE(*pErrorCode)) {
652 0 : return;
653 : }
654 0 : if(!U_IS_LEAD(c)) {
655 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
656 0 : return;
657 : }
658 0 : set32(trie->newTrie, c, FALSE, value, pErrorCode);
659 : }
660 :
661 : static void
662 0 : writeBlock(uint32_t *block, uint32_t value) {
663 0 : uint32_t *limit=block+UTRIE2_DATA_BLOCK_LENGTH;
664 0 : while(block<limit) {
665 0 : *block++=value;
666 : }
667 0 : }
668 :
669 : /**
670 : * initialValue is ignored if overwrite=TRUE
671 : * @internal
672 : */
673 : static void
674 0 : fillBlock(uint32_t *block, UChar32 start, UChar32 limit,
675 : uint32_t value, uint32_t initialValue, UBool overwrite) {
676 : uint32_t *pLimit;
677 :
678 0 : pLimit=block+limit;
679 0 : block+=start;
680 0 : if(overwrite) {
681 0 : while(block<pLimit) {
682 0 : *block++=value;
683 : }
684 : } else {
685 0 : while(block<pLimit) {
686 0 : if(*block==initialValue) {
687 0 : *block=value;
688 : }
689 0 : ++block;
690 : }
691 : }
692 0 : }
693 :
694 : U_CAPI void U_EXPORT2
695 0 : utrie2_setRange32(UTrie2 *trie,
696 : UChar32 start, UChar32 end,
697 : uint32_t value, UBool overwrite,
698 : UErrorCode *pErrorCode) {
699 : /*
700 : * repeat value in [start..end]
701 : * mark index values for repeat-data blocks by setting bit 31 of the index values
702 : * fill around existing values if any, if(overwrite)
703 : */
704 : UNewTrie2 *newTrie;
705 : int32_t block, rest, repeatBlock;
706 : UChar32 limit;
707 :
708 0 : if(U_FAILURE(*pErrorCode)) {
709 0 : return;
710 : }
711 0 : if((uint32_t)start>0x10ffff || (uint32_t)end>0x10ffff || start>end) {
712 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
713 0 : return;
714 : }
715 0 : newTrie=trie->newTrie;
716 0 : if(newTrie==NULL || newTrie->isCompacted) {
717 0 : *pErrorCode=U_NO_WRITE_PERMISSION;
718 0 : return;
719 : }
720 0 : if(!overwrite && value==newTrie->initialValue) {
721 0 : return; /* nothing to do */
722 : }
723 :
724 0 : limit=end+1;
725 0 : if(start&UTRIE2_DATA_MASK) {
726 : UChar32 nextStart;
727 :
728 : /* set partial block at [start..following block boundary[ */
729 0 : block=getDataBlock(newTrie, start, TRUE);
730 0 : if(block<0) {
731 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
732 0 : return;
733 : }
734 :
735 0 : nextStart=(start+UTRIE2_DATA_BLOCK_LENGTH)&~UTRIE2_DATA_MASK;
736 0 : if(nextStart<=limit) {
737 0 : fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, UTRIE2_DATA_BLOCK_LENGTH,
738 0 : value, newTrie->initialValue, overwrite);
739 0 : start=nextStart;
740 : } else {
741 0 : fillBlock(newTrie->data+block, start&UTRIE2_DATA_MASK, limit&UTRIE2_DATA_MASK,
742 0 : value, newTrie->initialValue, overwrite);
743 0 : return;
744 : }
745 : }
746 :
747 : /* number of positions in the last, partial block */
748 0 : rest=limit&UTRIE2_DATA_MASK;
749 :
750 : /* round down limit to a block boundary */
751 0 : limit&=~UTRIE2_DATA_MASK;
752 :
753 : /* iterate over all-value blocks */
754 0 : if(value==newTrie->initialValue) {
755 0 : repeatBlock=newTrie->dataNullOffset;
756 : } else {
757 0 : repeatBlock=-1;
758 : }
759 :
760 0 : while(start<limit) {
761 : int32_t i2;
762 0 : UBool setRepeatBlock=FALSE;
763 :
764 0 : if(value==newTrie->initialValue && isInNullBlock(newTrie, start, TRUE)) {
765 0 : start+=UTRIE2_DATA_BLOCK_LENGTH; /* nothing to do */
766 0 : continue;
767 : }
768 :
769 : /* get index value */
770 0 : i2=getIndex2Block(newTrie, start, TRUE);
771 0 : if(i2<0) {
772 0 : *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
773 0 : return;
774 : }
775 0 : i2+=(start>>UTRIE2_SHIFT_2)&UTRIE2_INDEX_2_MASK;
776 0 : block=newTrie->index2[i2];
777 0 : if(isWritableBlock(newTrie, block)) {
778 : /* already allocated */
779 0 : if(overwrite && block>=UNEWTRIE2_DATA_0800_OFFSET) {
780 : /*
781 : * We overwrite all values, and it's not a
782 : * protected (ASCII-linear or 2-byte UTF-8) block:
783 : * replace with the repeatBlock.
784 : */
785 0 : setRepeatBlock=TRUE;
786 : } else {
787 : /* !overwrite, or protected block: just write the values into this block */
788 0 : fillBlock(newTrie->data+block,
789 : 0, UTRIE2_DATA_BLOCK_LENGTH,
790 0 : value, newTrie->initialValue, overwrite);
791 : }
792 0 : } else if(newTrie->data[block]!=value && (overwrite || block==newTrie->dataNullOffset)) {
793 : /*
794 : * Set the repeatBlock instead of the null block or previous repeat block:
795 : *
796 : * If !isWritableBlock() then all entries in the block have the same value
797 : * because it's the null block or a range block (the repeatBlock from a previous
798 : * call to utrie2_setRange32()).
799 : * No other blocks are used multiple times before compacting.
800 : *
801 : * The null block is the only non-writable block with the initialValue because
802 : * of the repeatBlock initialization above. (If value==initialValue, then
803 : * the repeatBlock will be the null data block.)
804 : *
805 : * We set our repeatBlock if the desired value differs from the block's value,
806 : * and if we overwrite any data or if the data is all initial values
807 : * (which is the same as the block being the null block, see above).
808 : */
809 0 : setRepeatBlock=TRUE;
810 : }
811 0 : if(setRepeatBlock) {
812 0 : if(repeatBlock>=0) {
813 0 : setIndex2Entry(newTrie, i2, repeatBlock);
814 : } else {
815 : /* create and set and fill the repeatBlock */
816 0 : repeatBlock=getDataBlock(newTrie, start, TRUE);
817 0 : if(repeatBlock<0) {
818 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
819 0 : return;
820 : }
821 0 : writeBlock(newTrie->data+repeatBlock, value);
822 : }
823 : }
824 :
825 0 : start+=UTRIE2_DATA_BLOCK_LENGTH;
826 : }
827 :
828 0 : if(rest>0) {
829 : /* set partial block at [last block boundary..limit[ */
830 0 : block=getDataBlock(newTrie, start, TRUE);
831 0 : if(block<0) {
832 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
833 0 : return;
834 : }
835 :
836 0 : fillBlock(newTrie->data+block, 0, rest, value, newTrie->initialValue, overwrite);
837 : }
838 :
839 0 : return;
840 : }
841 :
842 : /* compaction --------------------------------------------------------------- */
843 :
844 : static inline UBool
845 0 : equal_int32(const int32_t *s, const int32_t *t, int32_t length) {
846 0 : while(length>0 && *s==*t) {
847 0 : ++s;
848 0 : ++t;
849 0 : --length;
850 : }
851 0 : return (UBool)(length==0);
852 : }
853 :
854 : static inline UBool
855 0 : equal_uint32(const uint32_t *s, const uint32_t *t, int32_t length) {
856 0 : while(length>0 && *s==*t) {
857 0 : ++s;
858 0 : ++t;
859 0 : --length;
860 : }
861 0 : return (UBool)(length==0);
862 : }
863 :
864 : static int32_t
865 0 : findSameIndex2Block(const int32_t *idx, int32_t index2Length, int32_t otherBlock) {
866 : int32_t block;
867 :
868 : /* ensure that we do not even partially get past index2Length */
869 0 : index2Length-=UTRIE2_INDEX_2_BLOCK_LENGTH;
870 :
871 0 : for(block=0; block<=index2Length; ++block) {
872 0 : if(equal_int32(idx+block, idx+otherBlock, UTRIE2_INDEX_2_BLOCK_LENGTH)) {
873 0 : return block;
874 : }
875 : }
876 0 : return -1;
877 : }
878 :
879 : static int32_t
880 0 : findSameDataBlock(const uint32_t *data, int32_t dataLength, int32_t otherBlock, int32_t blockLength) {
881 : int32_t block;
882 :
883 : /* ensure that we do not even partially get past dataLength */
884 0 : dataLength-=blockLength;
885 :
886 0 : for(block=0; block<=dataLength; block+=UTRIE2_DATA_GRANULARITY) {
887 0 : if(equal_uint32(data+block, data+otherBlock, blockLength)) {
888 0 : return block;
889 : }
890 : }
891 0 : return -1;
892 : }
893 :
894 : /*
895 : * Find the start of the last range in the trie by enumerating backward.
896 : * Indexes for supplementary code points higher than this will be omitted.
897 : */
898 : static UChar32
899 0 : findHighStart(UNewTrie2 *trie, uint32_t highValue) {
900 : const uint32_t *data32;
901 :
902 : uint32_t value, initialValue;
903 : UChar32 c, prev;
904 : int32_t i1, i2, j, i2Block, prevI2Block, index2NullOffset, block, prevBlock, nullBlock;
905 :
906 0 : data32=trie->data;
907 0 : initialValue=trie->initialValue;
908 :
909 0 : index2NullOffset=trie->index2NullOffset;
910 0 : nullBlock=trie->dataNullOffset;
911 :
912 : /* set variables for previous range */
913 0 : if(highValue==initialValue) {
914 0 : prevI2Block=index2NullOffset;
915 0 : prevBlock=nullBlock;
916 : } else {
917 0 : prevI2Block=-1;
918 0 : prevBlock=-1;
919 : }
920 0 : prev=0x110000;
921 :
922 : /* enumerate index-2 blocks */
923 0 : i1=UNEWTRIE2_INDEX_1_LENGTH;
924 0 : c=prev;
925 0 : while(c>0) {
926 0 : i2Block=trie->index1[--i1];
927 0 : if(i2Block==prevI2Block) {
928 : /* the index-2 block is the same as the previous one, and filled with highValue */
929 0 : c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
930 0 : continue;
931 : }
932 0 : prevI2Block=i2Block;
933 0 : if(i2Block==index2NullOffset) {
934 : /* this is the null index-2 block */
935 0 : if(highValue!=initialValue) {
936 0 : return c;
937 : }
938 0 : c-=UTRIE2_CP_PER_INDEX_1_ENTRY;
939 : } else {
940 : /* enumerate data blocks for one index-2 block */
941 0 : for(i2=UTRIE2_INDEX_2_BLOCK_LENGTH; i2>0;) {
942 0 : block=trie->index2[i2Block+ --i2];
943 0 : if(block==prevBlock) {
944 : /* the block is the same as the previous one, and filled with highValue */
945 0 : c-=UTRIE2_DATA_BLOCK_LENGTH;
946 0 : continue;
947 : }
948 0 : prevBlock=block;
949 0 : if(block==nullBlock) {
950 : /* this is the null data block */
951 0 : if(highValue!=initialValue) {
952 0 : return c;
953 : }
954 0 : c-=UTRIE2_DATA_BLOCK_LENGTH;
955 : } else {
956 0 : for(j=UTRIE2_DATA_BLOCK_LENGTH; j>0;) {
957 0 : value=data32[block+ --j];
958 0 : if(value!=highValue) {
959 0 : return c;
960 : }
961 0 : --c;
962 : }
963 : }
964 : }
965 : }
966 : }
967 :
968 : /* deliver last range */
969 0 : return 0;
970 : }
971 :
972 : /*
973 : * Compact a build-time trie.
974 : *
975 : * The compaction
976 : * - removes blocks that are identical with earlier ones
977 : * - overlaps adjacent blocks as much as possible (if overlap==TRUE)
978 : * - moves blocks in steps of the data granularity
979 : * - moves and overlaps blocks that overlap with multiple values in the overlap region
980 : *
981 : * It does not
982 : * - try to move and overlap blocks that are not already adjacent
983 : */
984 : static void
985 0 : compactData(UNewTrie2 *trie) {
986 : int32_t start, newStart, movedStart;
987 : int32_t blockLength, overlap;
988 : int32_t i, mapIndex, blockCount;
989 :
990 : /* do not compact linear-ASCII data */
991 0 : newStart=UTRIE2_DATA_START_OFFSET;
992 0 : for(start=0, i=0; start<newStart; start+=UTRIE2_DATA_BLOCK_LENGTH, ++i) {
993 0 : trie->map[i]=start;
994 : }
995 :
996 : /*
997 : * Start with a block length of 64 for 2-byte UTF-8,
998 : * then switch to UTRIE2_DATA_BLOCK_LENGTH.
999 : */
1000 0 : blockLength=64;
1001 0 : blockCount=blockLength>>UTRIE2_SHIFT_2;
1002 0 : for(start=newStart; start<trie->dataLength;) {
1003 : /*
1004 : * start: index of first entry of current block
1005 : * newStart: index where the current block is to be moved
1006 : * (right after current end of already-compacted data)
1007 : */
1008 0 : if(start==UNEWTRIE2_DATA_0800_OFFSET) {
1009 0 : blockLength=UTRIE2_DATA_BLOCK_LENGTH;
1010 0 : blockCount=1;
1011 : }
1012 :
1013 : /* skip blocks that are not used */
1014 0 : if(trie->map[start>>UTRIE2_SHIFT_2]<=0) {
1015 : /* advance start to the next block */
1016 0 : start+=blockLength;
1017 :
1018 : /* leave newStart with the previous block! */
1019 0 : continue;
1020 : }
1021 :
1022 : /* search for an identical block */
1023 0 : if( (movedStart=findSameDataBlock(trie->data, newStart, start, blockLength))
1024 : >=0
1025 : ) {
1026 : /* found an identical block, set the other block's index value for the current block */
1027 0 : for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1028 0 : trie->map[mapIndex++]=movedStart;
1029 0 : movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1030 : }
1031 :
1032 : /* advance start to the next block */
1033 0 : start+=blockLength;
1034 :
1035 : /* leave newStart with the previous block! */
1036 0 : continue;
1037 : }
1038 :
1039 : /* see if the beginning of this block can be overlapped with the end of the previous block */
1040 : /* look for maximum overlap (modulo granularity) with the previous, adjacent block */
1041 0 : for(overlap=blockLength-UTRIE2_DATA_GRANULARITY;
1042 0 : overlap>0 && !equal_uint32(trie->data+(newStart-overlap), trie->data+start, overlap);
1043 0 : overlap-=UTRIE2_DATA_GRANULARITY) {}
1044 :
1045 0 : if(overlap>0 || newStart<start) {
1046 : /* some overlap, or just move the whole block */
1047 0 : movedStart=newStart-overlap;
1048 0 : for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1049 0 : trie->map[mapIndex++]=movedStart;
1050 0 : movedStart+=UTRIE2_DATA_BLOCK_LENGTH;
1051 : }
1052 :
1053 : /* move the non-overlapping indexes to their new positions */
1054 0 : start+=overlap;
1055 0 : for(i=blockLength-overlap; i>0; --i) {
1056 0 : trie->data[newStart++]=trie->data[start++];
1057 : }
1058 : } else /* no overlap && newStart==start */ {
1059 0 : for(i=blockCount, mapIndex=start>>UTRIE2_SHIFT_2; i>0; --i) {
1060 0 : trie->map[mapIndex++]=start;
1061 0 : start+=UTRIE2_DATA_BLOCK_LENGTH;
1062 : }
1063 0 : newStart=start;
1064 : }
1065 : }
1066 :
1067 : /* now adjust the index-2 table */
1068 0 : for(i=0; i<trie->index2Length; ++i) {
1069 0 : if(i==UNEWTRIE2_INDEX_GAP_OFFSET) {
1070 : /* Gap indexes are invalid (-1). Skip over the gap. */
1071 0 : i+=UNEWTRIE2_INDEX_GAP_LENGTH;
1072 : }
1073 0 : trie->index2[i]=trie->map[trie->index2[i]>>UTRIE2_SHIFT_2];
1074 : }
1075 0 : trie->dataNullOffset=trie->map[trie->dataNullOffset>>UTRIE2_SHIFT_2];
1076 :
1077 : /* ensure dataLength alignment */
1078 0 : while((newStart&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1079 0 : trie->data[newStart++]=trie->initialValue;
1080 : }
1081 :
1082 : #ifdef UTRIE2_DEBUG
1083 : /* we saved some space */
1084 : printf("compacting UTrie2: count of 32-bit data words %lu->%lu\n",
1085 : (long)trie->dataLength, (long)newStart);
1086 : #endif
1087 :
1088 0 : trie->dataLength=newStart;
1089 0 : }
1090 :
1091 : static void
1092 0 : compactIndex2(UNewTrie2 *trie) {
1093 : int32_t i, start, newStart, movedStart, overlap;
1094 :
1095 : /* do not compact linear-BMP index-2 blocks */
1096 0 : newStart=UTRIE2_INDEX_2_BMP_LENGTH;
1097 0 : for(start=0, i=0; start<newStart; start+=UTRIE2_INDEX_2_BLOCK_LENGTH, ++i) {
1098 0 : trie->map[i]=start;
1099 : }
1100 :
1101 : /* Reduce the index table gap to what will be needed at runtime. */
1102 0 : newStart+=UTRIE2_UTF8_2B_INDEX_2_LENGTH+((trie->highStart-0x10000)>>UTRIE2_SHIFT_1);
1103 :
1104 0 : for(start=UNEWTRIE2_INDEX_2_NULL_OFFSET; start<trie->index2Length;) {
1105 : /*
1106 : * start: index of first entry of current block
1107 : * newStart: index where the current block is to be moved
1108 : * (right after current end of already-compacted data)
1109 : */
1110 :
1111 : /* search for an identical block */
1112 0 : if( (movedStart=findSameIndex2Block(trie->index2, newStart, start))
1113 : >=0
1114 : ) {
1115 : /* found an identical block, set the other block's index value for the current block */
1116 0 : trie->map[start>>UTRIE2_SHIFT_1_2]=movedStart;
1117 :
1118 : /* advance start to the next block */
1119 0 : start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1120 :
1121 : /* leave newStart with the previous block! */
1122 0 : continue;
1123 : }
1124 :
1125 : /* see if the beginning of this block can be overlapped with the end of the previous block */
1126 : /* look for maximum overlap with the previous, adjacent block */
1127 0 : for(overlap=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
1128 0 : overlap>0 && !equal_int32(trie->index2+(newStart-overlap), trie->index2+start, overlap);
1129 : --overlap) {}
1130 :
1131 0 : if(overlap>0 || newStart<start) {
1132 : /* some overlap, or just move the whole block */
1133 0 : trie->map[start>>UTRIE2_SHIFT_1_2]=newStart-overlap;
1134 :
1135 : /* move the non-overlapping indexes to their new positions */
1136 0 : start+=overlap;
1137 0 : for(i=UTRIE2_INDEX_2_BLOCK_LENGTH-overlap; i>0; --i) {
1138 0 : trie->index2[newStart++]=trie->index2[start++];
1139 : }
1140 : } else /* no overlap && newStart==start */ {
1141 0 : trie->map[start>>UTRIE2_SHIFT_1_2]=start;
1142 0 : start+=UTRIE2_INDEX_2_BLOCK_LENGTH;
1143 0 : newStart=start;
1144 : }
1145 : }
1146 :
1147 : /* now adjust the index-1 table */
1148 0 : for(i=0; i<UNEWTRIE2_INDEX_1_LENGTH; ++i) {
1149 0 : trie->index1[i]=trie->map[trie->index1[i]>>UTRIE2_SHIFT_1_2];
1150 : }
1151 0 : trie->index2NullOffset=trie->map[trie->index2NullOffset>>UTRIE2_SHIFT_1_2];
1152 :
1153 : /*
1154 : * Ensure data table alignment:
1155 : * Needs to be granularity-aligned for 16-bit trie
1156 : * (so that dataMove will be down-shiftable),
1157 : * and 2-aligned for uint32_t data.
1158 : */
1159 0 : while((newStart&((UTRIE2_DATA_GRANULARITY-1)|1))!=0) {
1160 : /* Arbitrary value: 0x3fffc not possible for real data. */
1161 0 : trie->index2[newStart++]=(int32_t)0xffff<<UTRIE2_INDEX_SHIFT;
1162 : }
1163 :
1164 : #ifdef UTRIE2_DEBUG
1165 : /* we saved some space */
1166 : printf("compacting UTrie2: count of 16-bit index-2 words %lu->%lu\n",
1167 : (long)trie->index2Length, (long)newStart);
1168 : #endif
1169 :
1170 0 : trie->index2Length=newStart;
1171 0 : }
1172 :
1173 : static void
1174 0 : compactTrie(UTrie2 *trie, UErrorCode *pErrorCode) {
1175 : UNewTrie2 *newTrie;
1176 : UChar32 highStart, suppHighStart;
1177 : uint32_t highValue;
1178 :
1179 0 : newTrie=trie->newTrie;
1180 :
1181 : /* find highStart and round it up */
1182 0 : highValue=utrie2_get32(trie, 0x10ffff);
1183 0 : highStart=findHighStart(newTrie, highValue);
1184 0 : highStart=(highStart+(UTRIE2_CP_PER_INDEX_1_ENTRY-1))&~(UTRIE2_CP_PER_INDEX_1_ENTRY-1);
1185 0 : if(highStart==0x110000) {
1186 0 : highValue=trie->errorValue;
1187 : }
1188 :
1189 : /*
1190 : * Set trie->highStart only after utrie2_get32(trie, highStart).
1191 : * Otherwise utrie2_get32(trie, highStart) would try to read the highValue.
1192 : */
1193 0 : trie->highStart=newTrie->highStart=highStart;
1194 :
1195 : #ifdef UTRIE2_DEBUG
1196 : printf("UTrie2: highStart U+%04lx highValue 0x%lx initialValue 0x%lx\n",
1197 : (long)highStart, (long)highValue, (long)trie->initialValue);
1198 : #endif
1199 :
1200 0 : if(highStart<0x110000) {
1201 : /* Blank out [highStart..10ffff] to release associated data blocks. */
1202 0 : suppHighStart= highStart<=0x10000 ? 0x10000 : highStart;
1203 0 : utrie2_setRange32(trie, suppHighStart, 0x10ffff, trie->initialValue, TRUE, pErrorCode);
1204 0 : if(U_FAILURE(*pErrorCode)) {
1205 0 : return;
1206 : }
1207 : }
1208 :
1209 0 : compactData(newTrie);
1210 0 : if(highStart>0x10000) {
1211 0 : compactIndex2(newTrie);
1212 : #ifdef UTRIE2_DEBUG
1213 : } else {
1214 : printf("UTrie2: highStart U+%04lx count of 16-bit index-2 words %lu->%lu\n",
1215 : (long)highStart, (long)trie->newTrie->index2Length, (long)UTRIE2_INDEX_1_OFFSET);
1216 : #endif
1217 : }
1218 :
1219 : /*
1220 : * Store the highValue in the data array and round up the dataLength.
1221 : * Must be done after compactData() because that assumes that dataLength
1222 : * is a multiple of UTRIE2_DATA_BLOCK_LENGTH.
1223 : */
1224 0 : newTrie->data[newTrie->dataLength++]=highValue;
1225 0 : while((newTrie->dataLength&(UTRIE2_DATA_GRANULARITY-1))!=0) {
1226 0 : newTrie->data[newTrie->dataLength++]=trie->initialValue;
1227 : }
1228 :
1229 0 : newTrie->isCompacted=TRUE;
1230 : }
1231 :
1232 : /* serialization ------------------------------------------------------------ */
1233 :
1234 : /**
1235 : * Maximum length of the runtime index array.
1236 : * Limited by its own 16-bit index values, and by uint16_t UTrie2Header.indexLength.
1237 : * (The actual maximum length is lower,
1238 : * (0x110000>>UTRIE2_SHIFT_2)+UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH.)
1239 : */
1240 : #define UTRIE2_MAX_INDEX_LENGTH 0xffff
1241 :
1242 : /**
1243 : * Maximum length of the runtime data array.
1244 : * Limited by 16-bit index values that are left-shifted by UTRIE2_INDEX_SHIFT,
1245 : * and by uint16_t UTrie2Header.shiftedDataLength.
1246 : */
1247 : #define UTRIE2_MAX_DATA_LENGTH (0xffff<<UTRIE2_INDEX_SHIFT)
1248 :
1249 : /* Compact and internally serialize the trie. */
1250 : U_CAPI void U_EXPORT2
1251 0 : utrie2_freeze(UTrie2 *trie, UTrie2ValueBits valueBits, UErrorCode *pErrorCode) {
1252 : UNewTrie2 *newTrie;
1253 : UTrie2Header *header;
1254 : uint32_t *p;
1255 : uint16_t *dest16;
1256 : int32_t i, length;
1257 : int32_t allIndexesLength;
1258 : int32_t dataMove; /* >0 if the data is moved to the end of the index array */
1259 : UChar32 highStart;
1260 :
1261 : /* argument check */
1262 0 : if(U_FAILURE(*pErrorCode)) {
1263 0 : return;
1264 : }
1265 0 : if( trie==NULL ||
1266 0 : valueBits<0 || UTRIE2_COUNT_VALUE_BITS<=valueBits
1267 : ) {
1268 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1269 0 : return;
1270 : }
1271 0 : newTrie=trie->newTrie;
1272 0 : if(newTrie==NULL) {
1273 : /* already frozen */
1274 : UTrie2ValueBits frozenValueBits=
1275 0 : trie->data16!=NULL ? UTRIE2_16_VALUE_BITS : UTRIE2_32_VALUE_BITS;
1276 0 : if(valueBits!=frozenValueBits) {
1277 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1278 : }
1279 0 : return;
1280 : }
1281 :
1282 : /* compact if necessary */
1283 0 : if(!newTrie->isCompacted) {
1284 0 : compactTrie(trie, pErrorCode);
1285 0 : if(U_FAILURE(*pErrorCode)) {
1286 0 : return;
1287 : }
1288 : }
1289 0 : highStart=trie->highStart;
1290 :
1291 0 : if(highStart<=0x10000) {
1292 0 : allIndexesLength=UTRIE2_INDEX_1_OFFSET;
1293 : } else {
1294 0 : allIndexesLength=newTrie->index2Length;
1295 : }
1296 0 : if(valueBits==UTRIE2_16_VALUE_BITS) {
1297 0 : dataMove=allIndexesLength;
1298 : } else {
1299 0 : dataMove=0;
1300 : }
1301 :
1302 : /* are indexLength and dataLength within limits? */
1303 0 : if( /* for unshifted indexLength */
1304 0 : allIndexesLength>UTRIE2_MAX_INDEX_LENGTH ||
1305 : /* for unshifted dataNullOffset */
1306 0 : (dataMove+newTrie->dataNullOffset)>0xffff ||
1307 : /* for unshifted 2-byte UTF-8 index-2 values */
1308 0 : (dataMove+UNEWTRIE2_DATA_0800_OFFSET)>0xffff ||
1309 : /* for shiftedDataLength */
1310 0 : (dataMove+newTrie->dataLength)>UTRIE2_MAX_DATA_LENGTH
1311 : ) {
1312 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1313 0 : return;
1314 : }
1315 :
1316 : /* calculate the total serialized length */
1317 0 : length=sizeof(UTrie2Header)+allIndexesLength*2;
1318 0 : if(valueBits==UTRIE2_16_VALUE_BITS) {
1319 0 : length+=newTrie->dataLength*2;
1320 : } else {
1321 0 : length+=newTrie->dataLength*4;
1322 : }
1323 :
1324 0 : trie->memory=uprv_malloc(length);
1325 0 : if(trie->memory==NULL) {
1326 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1327 0 : return;
1328 : }
1329 0 : trie->length=length;
1330 0 : trie->isMemoryOwned=TRUE;
1331 :
1332 0 : trie->indexLength=allIndexesLength;
1333 0 : trie->dataLength=newTrie->dataLength;
1334 0 : if(highStart<=0x10000) {
1335 0 : trie->index2NullOffset=0xffff;
1336 : } else {
1337 0 : trie->index2NullOffset=UTRIE2_INDEX_2_OFFSET+newTrie->index2NullOffset;
1338 : }
1339 0 : trie->dataNullOffset=(uint16_t)(dataMove+newTrie->dataNullOffset);
1340 0 : trie->highValueIndex=dataMove+trie->dataLength-UTRIE2_DATA_GRANULARITY;
1341 :
1342 : /* set the header fields */
1343 0 : header=(UTrie2Header *)trie->memory;
1344 :
1345 0 : header->signature=UTRIE2_SIG; /* "Tri2" */
1346 0 : header->options=(uint16_t)valueBits;
1347 :
1348 0 : header->indexLength=(uint16_t)trie->indexLength;
1349 0 : header->shiftedDataLength=(uint16_t)(trie->dataLength>>UTRIE2_INDEX_SHIFT);
1350 0 : header->index2NullOffset=trie->index2NullOffset;
1351 0 : header->dataNullOffset=trie->dataNullOffset;
1352 0 : header->shiftedHighStart=(uint16_t)(highStart>>UTRIE2_SHIFT_1);
1353 :
1354 : /* fill the index and data arrays */
1355 0 : dest16=(uint16_t *)(header+1);
1356 0 : trie->index=dest16;
1357 :
1358 : /* write the index-2 array values shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove */
1359 0 : p=(uint32_t *)newTrie->index2;
1360 0 : for(i=UTRIE2_INDEX_2_BMP_LENGTH; i>0; --i) {
1361 0 : *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1362 : }
1363 :
1364 : /* write UTF-8 2-byte index-2 values, not right-shifted */
1365 0 : for(i=0; i<(0xc2-0xc0); ++i) { /* C0..C1 */
1366 0 : *dest16++=(uint16_t)(dataMove+UTRIE2_BAD_UTF8_DATA_OFFSET);
1367 : }
1368 0 : for(; i<(0xe0-0xc0); ++i) { /* C2..DF */
1369 0 : *dest16++=(uint16_t)(dataMove+newTrie->index2[i<<(6-UTRIE2_SHIFT_2)]);
1370 : }
1371 :
1372 0 : if(highStart>0x10000) {
1373 0 : int32_t index1Length=(highStart-0x10000)>>UTRIE2_SHIFT_1;
1374 0 : int32_t index2Offset=UTRIE2_INDEX_2_BMP_LENGTH+UTRIE2_UTF8_2B_INDEX_2_LENGTH+index1Length;
1375 :
1376 : /* write 16-bit index-1 values for supplementary code points */
1377 0 : p=(uint32_t *)newTrie->index1+UTRIE2_OMITTED_BMP_INDEX_1_LENGTH;
1378 0 : for(i=index1Length; i>0; --i) {
1379 0 : *dest16++=(uint16_t)(UTRIE2_INDEX_2_OFFSET + *p++);
1380 : }
1381 :
1382 : /*
1383 : * write the index-2 array values for supplementary code points,
1384 : * shifted right by UTRIE2_INDEX_SHIFT, after adding dataMove
1385 : */
1386 0 : p=(uint32_t *)newTrie->index2+index2Offset;
1387 0 : for(i=newTrie->index2Length-index2Offset; i>0; --i) {
1388 0 : *dest16++=(uint16_t)((dataMove + *p++)>>UTRIE2_INDEX_SHIFT);
1389 : }
1390 : }
1391 :
1392 : /* write the 16/32-bit data array */
1393 0 : switch(valueBits) {
1394 : case UTRIE2_16_VALUE_BITS:
1395 : /* write 16-bit data values */
1396 0 : trie->data16=dest16;
1397 0 : trie->data32=NULL;
1398 0 : p=newTrie->data;
1399 0 : for(i=newTrie->dataLength; i>0; --i) {
1400 0 : *dest16++=(uint16_t)*p++;
1401 : }
1402 0 : break;
1403 : case UTRIE2_32_VALUE_BITS:
1404 : /* write 32-bit data values */
1405 0 : trie->data16=NULL;
1406 0 : trie->data32=(uint32_t *)dest16;
1407 0 : uprv_memcpy(dest16, newTrie->data, (size_t)newTrie->dataLength*4);
1408 0 : break;
1409 : default:
1410 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1411 0 : return;
1412 : }
1413 :
1414 : /* Delete the UNewTrie2. */
1415 0 : uprv_free(newTrie->data);
1416 0 : uprv_free(newTrie);
1417 0 : trie->newTrie=NULL;
1418 : }
1419 :
1420 : /*
1421 : * This is here to avoid a dependency from utrie2.cpp on utrie.c.
1422 : * This file already depends on utrie.c.
1423 : * Otherwise, this should be in utrie2.cpp right after utrie2_swap().
1424 : */
1425 : U_CAPI int32_t U_EXPORT2
1426 0 : utrie2_swapAnyVersion(const UDataSwapper *ds,
1427 : const void *inData, int32_t length, void *outData,
1428 : UErrorCode *pErrorCode) {
1429 0 : if(U_SUCCESS(*pErrorCode)) {
1430 0 : switch(utrie2_getVersion(inData, length, TRUE)) {
1431 : case 1:
1432 0 : return utrie_swap(ds, inData, length, outData, pErrorCode);
1433 : case 2:
1434 0 : return utrie2_swap(ds, inData, length, outData, pErrorCode);
1435 : default:
1436 0 : *pErrorCode=U_INVALID_FORMAT_ERROR;
1437 0 : return 0;
1438 : }
1439 : }
1440 0 : return 0;
1441 : }
|