Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 1999-2014, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : * file name: unames.c
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 1999oct04
16 : * created by: Markus W. Scherer
17 : */
18 :
19 : #include "unicode/utypes.h"
20 : #include "unicode/putil.h"
21 : #include "unicode/uchar.h"
22 : #include "unicode/udata.h"
23 : #include "unicode/utf.h"
24 : #include "unicode/utf16.h"
25 : #include "uassert.h"
26 : #include "ustr_imp.h"
27 : #include "umutex.h"
28 : #include "cmemory.h"
29 : #include "cstring.h"
30 : #include "ucln_cmn.h"
31 : #include "udataswp.h"
32 : #include "uprops.h"
33 :
34 : U_NAMESPACE_BEGIN
35 :
36 : /* prototypes ------------------------------------------------------------- */
37 :
/* type and name of the data item that loadCharNames() opens */
static const char DATA_TYPE[] = "icu";
static const char DATA_NAME[] = "unames";

/*
 * Names are stored in groups of 32 (1<<GROUP_SHIFT) consecutive code points:
 * code>>GROUP_SHIFT selects the group,
 * code&GROUP_MASK selects the line (name) inside the group.
 */
#define GROUP_SHIFT     5
#define LINES_PER_GROUP (1L<<GROUP_SHIFT)
#define GROUP_MASK      (LINES_PER_GROUP-1)
44 :
45 : /*
46 : * This struct was replaced by explicitly accessing equivalent
47 : * fields from triples of uint16_t.
48 : * The Group struct was padded to 8 bytes on compilers for early ARM CPUs,
49 : * which broke the assumption that sizeof(Group)==6 and that the ++ operator
50 : * would advance by 6 bytes (3 uint16_t).
51 : *
52 : * We can't just change the data structure because it's loaded from a data file,
53 : * and we don't want to make it less compact, so we changed the access code.
54 : *
55 : * For details see ICU tickets 6331 and 6008.
56 : typedef struct {
57 : uint16_t groupMSB,
58 : offsetHigh, offsetLow; / * avoid padding * /
59 : } Group;
60 : */
/*
 * Indexes of the fields inside one group, which is stored as a
 * triple of uint16_t; GROUP_LENGTH is the number of uint16_t per group.
 */
enum {
    GROUP_MSB,
    GROUP_OFFSET_HIGH,
    GROUP_OFFSET_LOW,
    GROUP_LENGTH
};

/*
 * Get the 32-bit group offset.
 * The two halves are combined in unsigned arithmetic so that a high half
 * with bit 15 set does not shift into the sign bit of a signed integer
 * (which would be undefined behavior).
 *
 * @param group (const uint16_t *) pointer to a Group triple of uint16_t
 * @return group offset (int32_t)
 */
#define GET_GROUP_OFFSET(group) \
    ((int32_t)((uint32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]))

/* step one whole group (GROUP_LENGTH uint16_t) forward or backward */
#define NEXT_GROUP(group) ((group)+GROUP_LENGTH)
#define PREV_GROUP(group) ((group)-GROUP_LENGTH)
77 :
/*
 * Header of one range of algorithmically named code points.
 * The data for the range follows the header directly; the code reads it
 * starting at (range+1).
 */
typedef struct {
    uint32_t start;     /* first code point of the range */
    uint32_t end;       /* last code point of the range */
    uint8_t type;       /* 0: prefix + hex digits, 1: prefix + factorized elements */
    uint8_t variant;    /* type 0: number of hex digits, type 1: number of factors */
    uint16_t size;      /* not read in this part of the file; presumably the size of the whole range record - TODO confirm */
} AlgorithmicRange;
83 :
/*
 * Indexes at the start of the names data.
 * tokenStringOffset, groupsOffset and groupStringOffset are byte offsets
 * from the start of this struct; algNamesOffset is not read in this part of
 * the file (presumably also a byte offset - TODO confirm).
 */
typedef struct {
    uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
} UCharNames;

/*
 * Get the groups table from a UCharNames struct.
 * The groups table consists of one uint16_t groupCount followed by
 * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH
 * and the comment for the old struct Group above.
 *
 * The argument and the whole expansion are parenthesized so that the macro
 * works with any pointer expression and inside larger expressions.
 * The argument is evaluated twice; do not pass expressions with side effects.
 *
 * @param names (const UCharNames *) pointer to the UCharNames indexes
 * @return (const uint16_t *) pointer to the groups table
 */
#define GET_GROUPS(names) \
    ((const uint16_t *)((const char *)(names)+(names)->groupsOffset))
98 :
99 : typedef struct {
100 : const char *otherName;
101 : UChar32 code;
102 : } FindName;
103 :
104 : #define DO_FIND_NAME NULL
105 :
106 : static UDataMemory *uCharNamesData=NULL;
107 : static UCharNames *uCharNames=NULL;
108 : static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER;
109 :
110 : /*
111 : * Maximum length of character names (regular & 1.0).
112 : */
113 : static int32_t gMaxNameLength=0;
114 :
115 : /*
116 : * Set of chars used in character names (regular & 1.0).
117 : * Chars are platform-dependent (can be EBCDIC).
118 : */
119 : static uint32_t gNameSet[8]={ 0 };
120 :
/*
 * Additional pseudo-categories that continue the numbering after the
 * regular general categories; they index the last entries of charCatNames[].
 * Each expansion is parenthesized so that it is safe inside any expression.
 */
#define U_NONCHARACTER_CODE_POINT (U_CHAR_CATEGORY_COUNT)
#define U_LEAD_SURROGATE (U_CHAR_CATEGORY_COUNT + 1)
#define U_TRAIL_SURROGATE (U_CHAR_CATEGORY_COUNT + 2)

#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3)
126 :
127 : static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = {
128 : "unassigned",
129 : "uppercase letter",
130 : "lowercase letter",
131 : "titlecase letter",
132 : "modifier letter",
133 : "other letter",
134 : "non spacing mark",
135 : "enclosing mark",
136 : "combining spacing mark",
137 : "decimal digit number",
138 : "letter number",
139 : "other number",
140 : "space separator",
141 : "line separator",
142 : "paragraph separator",
143 : "control",
144 : "format",
145 : "private use area",
146 : "surrogate",
147 : "dash punctuation",
148 : "start punctuation",
149 : "end punctuation",
150 : "connector punctuation",
151 : "other punctuation",
152 : "math symbol",
153 : "currency symbol",
154 : "modifier symbol",
155 : "other symbol",
156 : "initial punctuation",
157 : "final punctuation",
158 : "noncharacter",
159 : "lead surrogate",
160 : "trail surrogate"
161 : };
162 :
163 : /* implementation ----------------------------------------------------------- */
164 :
165 0 : static UBool U_CALLCONV unames_cleanup(void)
166 : {
167 0 : if(uCharNamesData) {
168 0 : udata_close(uCharNamesData);
169 0 : uCharNamesData = NULL;
170 : }
171 0 : if(uCharNames) {
172 0 : uCharNames = NULL;
173 : }
174 0 : gCharNamesInitOnce.reset();
175 0 : gMaxNameLength=0;
176 0 : return TRUE;
177 : }
178 :
179 : static UBool U_CALLCONV
180 0 : isAcceptable(void * /*context*/,
181 : const char * /*type*/, const char * /*name*/,
182 : const UDataInfo *pInfo) {
183 : return (UBool)(
184 0 : pInfo->size>=20 &&
185 0 : pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
186 0 : pInfo->charsetFamily==U_CHARSET_FAMILY &&
187 0 : pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
188 0 : pInfo->dataFormat[1]==0x6e &&
189 0 : pInfo->dataFormat[2]==0x61 &&
190 0 : pInfo->dataFormat[3]==0x6d &&
191 0 : pInfo->formatVersion[0]==1);
192 : }
193 :
194 : static void U_CALLCONV
195 0 : loadCharNames(UErrorCode &status) {
196 0 : U_ASSERT(uCharNamesData == NULL);
197 0 : U_ASSERT(uCharNames == NULL);
198 :
199 0 : uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status);
200 0 : if(U_FAILURE(status)) {
201 0 : uCharNamesData = NULL;
202 : } else {
203 0 : uCharNames = (UCharNames *)udata_getMemory(uCharNamesData);
204 : }
205 0 : ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup);
206 0 : }
207 :
208 :
209 : static UBool
210 0 : isDataLoaded(UErrorCode *pErrorCode) {
211 0 : umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode);
212 0 : return U_SUCCESS(*pErrorCode);
213 : }
214 :
/*
 * Append one char to an output buffer with preflighting:
 * c is stored and buffer/bufferLength are advanced only while there is
 * remaining capacity (bufferLength>0); bufferPos is always incremented, so
 * that it counts the full output length even when the buffer is too small.
 *
 * buffer, bufferLength and bufferPos must be modifiable lvalues.
 * The body is wrapped in do { } while(0) so that "WRITE_CHAR(...);" is a
 * single statement and is safe in an unbraced if/else.
 */
#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) do { \
    if((bufferLength)>0) { \
        *(buffer)++=(c); \
        --(bufferLength); \
    } \
    ++(bufferPos); \
} while(0)

/* pseudo name choice for ISO comments, numbered after the public UCharNameChoice values */
#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT
224 :
225 : /*
226 : * Important: expandName() and compareName() are almost the same -
227 : * apply fixes to both.
228 : *
229 : * UnicodeData.txt uses ';' as a field separator, so no
230 : * field can contain ';' as part of its contents.
231 : * In unames.dat, it is marked as token[';']==-1 only if the
232 : * semicolon is used in the data file - which is iff we
233 : * have Unicode 1.0 names or ISO comments or aliases.
234 : * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases
235 : * although we know that it will never be part of a name.
236 : */
237 : static uint16_t
238 0 : expandName(UCharNames *names,
239 : const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
240 : char *buffer, uint16_t bufferLength) {
241 0 : uint16_t *tokens=(uint16_t *)names+8;
242 0 : uint16_t token, tokenCount=*tokens++, bufferPos=0;
243 0 : uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
244 : uint8_t c;
245 :
246 0 : if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
247 : /*
248 : * skip the modern name if it is not requested _and_
249 : * if the semicolon byte value is a character, not a token number
250 : */
251 0 : if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
252 0 : int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
253 0 : do {
254 0 : while(nameLength>0) {
255 0 : --nameLength;
256 0 : if(*name++==';') {
257 0 : break;
258 : }
259 : }
260 0 : } while(--fieldIndex>0);
261 : } else {
262 : /*
263 : * the semicolon byte value is a token number, therefore
264 : * only modern names are stored in unames.dat and there is no
265 : * such requested alternate name here
266 : */
267 0 : nameLength=0;
268 : }
269 : }
270 :
271 : /* write each letter directly, and write a token word per token */
272 0 : while(nameLength>0) {
273 0 : --nameLength;
274 0 : c=*name++;
275 :
276 0 : if(c>=tokenCount) {
277 0 : if(c!=';') {
278 : /* implicit letter */
279 0 : WRITE_CHAR(buffer, bufferLength, bufferPos, c);
280 : } else {
281 : /* finished */
282 0 : break;
283 : }
284 : } else {
285 0 : token=tokens[c];
286 0 : if(token==(uint16_t)(-2)) {
287 : /* this is a lead byte for a double-byte token */
288 0 : token=tokens[c<<8|*name++];
289 0 : --nameLength;
290 : }
291 0 : if(token==(uint16_t)(-1)) {
292 0 : if(c!=';') {
293 : /* explicit letter */
294 0 : WRITE_CHAR(buffer, bufferLength, bufferPos, c);
295 : } else {
296 : /* stop, but skip the semicolon if we are seeking
297 : extended names and there was no 2.0 name but there
298 : is a 1.0 name. */
299 0 : if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
300 0 : if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
301 0 : continue;
302 : }
303 : }
304 : /* finished */
305 0 : break;
306 : }
307 : } else {
308 : /* write token word */
309 0 : uint8_t *tokenString=tokenStrings+token;
310 0 : while((c=*tokenString++)!=0) {
311 0 : WRITE_CHAR(buffer, bufferLength, bufferPos, c);
312 : }
313 : }
314 : }
315 : }
316 :
317 : /* zero-terminate */
318 0 : if(bufferLength>0) {
319 0 : *buffer=0;
320 : }
321 :
322 0 : return bufferPos;
323 : }
324 :
325 : /*
326 : * compareName() is almost the same as expandName() except that it compares
327 : * the currently expanded name to an input name.
328 : * It returns the match/no match result as soon as possible.
329 : */
330 : static UBool
331 0 : compareName(UCharNames *names,
332 : const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
333 : const char *otherName) {
334 0 : uint16_t *tokens=(uint16_t *)names+8;
335 0 : uint16_t token, tokenCount=*tokens++;
336 0 : uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
337 : uint8_t c;
338 0 : const char *origOtherName = otherName;
339 :
340 0 : if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
341 : /*
342 : * skip the modern name if it is not requested _and_
343 : * if the semicolon byte value is a character, not a token number
344 : */
345 0 : if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
346 0 : int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice;
347 0 : do {
348 0 : while(nameLength>0) {
349 0 : --nameLength;
350 0 : if(*name++==';') {
351 0 : break;
352 : }
353 : }
354 0 : } while(--fieldIndex>0);
355 : } else {
356 : /*
357 : * the semicolon byte value is a token number, therefore
358 : * only modern names are stored in unames.dat and there is no
359 : * such requested alternate name here
360 : */
361 0 : nameLength=0;
362 : }
363 : }
364 :
365 : /* compare each letter directly, and compare a token word per token */
366 0 : while(nameLength>0) {
367 0 : --nameLength;
368 0 : c=*name++;
369 :
370 0 : if(c>=tokenCount) {
371 0 : if(c!=';') {
372 : /* implicit letter */
373 0 : if((char)c!=*otherName++) {
374 0 : return FALSE;
375 : }
376 : } else {
377 : /* finished */
378 0 : break;
379 : }
380 : } else {
381 0 : token=tokens[c];
382 0 : if(token==(uint16_t)(-2)) {
383 : /* this is a lead byte for a double-byte token */
384 0 : token=tokens[c<<8|*name++];
385 0 : --nameLength;
386 : }
387 0 : if(token==(uint16_t)(-1)) {
388 0 : if(c!=';') {
389 : /* explicit letter */
390 0 : if((char)c!=*otherName++) {
391 0 : return FALSE;
392 : }
393 : } else {
394 : /* stop, but skip the semicolon if we are seeking
395 : extended names and there was no 2.0 name but there
396 : is a 1.0 name. */
397 0 : if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
398 0 : if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
399 0 : continue;
400 : }
401 : }
402 : /* finished */
403 0 : break;
404 : }
405 : } else {
406 : /* write token word */
407 0 : uint8_t *tokenString=tokenStrings+token;
408 0 : while((c=*tokenString++)!=0) {
409 0 : if((char)c!=*otherName++) {
410 0 : return FALSE;
411 : }
412 : }
413 : }
414 : }
415 : }
416 :
417 : /* complete match? */
418 0 : return (UBool)(*otherName==0);
419 : }
420 :
421 0 : static uint8_t getCharCat(UChar32 cp) {
422 : uint8_t cat;
423 :
424 0 : if (U_IS_UNICODE_NONCHAR(cp)) {
425 0 : return U_NONCHARACTER_CODE_POINT;
426 : }
427 :
428 0 : if ((cat = u_charType(cp)) == U_SURROGATE) {
429 0 : cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE;
430 : }
431 :
432 0 : return cat;
433 : }
434 :
435 0 : static const char *getCharCatName(UChar32 cp) {
436 0 : uint8_t cat = getCharCat(cp);
437 :
438 : /* Return unknown if the table of names above is not up to
439 : date. */
440 :
441 0 : if (cat >= UPRV_LENGTHOF(charCatNames)) {
442 0 : return "unknown";
443 : } else {
444 0 : return charCatNames[cat];
445 : }
446 : }
447 :
448 0 : static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) {
449 0 : const char *catname = getCharCatName(code);
450 0 : uint16_t length = 0;
451 :
452 : UChar32 cp;
453 : int ndigits, i;
454 :
455 0 : WRITE_CHAR(buffer, bufferLength, length, '<');
456 0 : while (catname[length - 1]) {
457 0 : WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]);
458 : }
459 0 : WRITE_CHAR(buffer, bufferLength, length, '-');
460 0 : for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4)
461 : ;
462 0 : if (ndigits < 4)
463 0 : ndigits = 4;
464 0 : for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) {
465 0 : uint8_t v = (uint8_t)(cp & 0xf);
466 0 : buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10);
467 : }
468 0 : buffer += ndigits;
469 0 : length += ndigits;
470 0 : WRITE_CHAR(buffer, bufferLength, length, '>');
471 :
472 0 : return length;
473 : }
474 :
475 : /*
476 : * getGroup() does a binary search for the group that contains the
477 : * Unicode code point "code".
478 : * The return value is always a valid Group* that may contain "code"
479 : * or else is the highest group before "code".
480 : * If the lowest group is after "code", then that one is returned.
481 : */
482 : static const uint16_t *
483 0 : getGroup(UCharNames *names, uint32_t code) {
484 0 : const uint16_t *groups=GET_GROUPS(names);
485 0 : uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT),
486 0 : start=0,
487 0 : limit=*groups++,
488 : number;
489 :
490 : /* binary search for the group of names that contains the one for code */
491 0 : while(start<limit-1) {
492 0 : number=(uint16_t)((start+limit)/2);
493 0 : if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) {
494 0 : limit=number;
495 : } else {
496 0 : start=number;
497 : }
498 : }
499 :
500 : /* return this regardless of whether it is an exact match */
501 0 : return groups+start*GROUP_LENGTH;
502 : }
503 :
504 : /*
505 : * expandGroupLengths() reads a block of compressed lengths of 32 strings and
506 : * expands them into offsets and lengths for each string.
507 : * Lengths are stored with a variable-width encoding in consecutive nibbles:
508 : * If a nibble<0xc, then it is the length itself (0=empty string).
509 : * If a nibble>=0xc, then it forms a length value with the following nibble.
510 : * Calculation see below.
511 : * The offsets and lengths arrays must be at least 33 (one more) long because
512 : * there is no check here at the end if the last nibble is still used.
513 : */
514 : static const uint8_t *
515 0 : expandGroupLengths(const uint8_t *s,
516 : uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) {
517 : /* read the lengths of the 32 strings in this group and get each string's offset */
518 0 : uint16_t i=0, offset=0, length=0;
519 : uint8_t lengthByte;
520 :
521 : /* all 32 lengths must be read to get the offset of the first group string */
522 0 : while(i<LINES_PER_GROUP) {
523 0 : lengthByte=*s++;
524 :
525 : /* read even nibble - MSBs of lengthByte */
526 0 : if(length>=12) {
527 : /* double-nibble length spread across two bytes */
528 0 : length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12);
529 0 : lengthByte&=0xf;
530 0 : } else if((lengthByte /* &0xf0 */)>=0xc0) {
531 : /* double-nibble length spread across this one byte */
532 0 : length=(uint16_t)((lengthByte&0x3f)+12);
533 : } else {
534 : /* single-nibble length in MSBs */
535 0 : length=(uint16_t)(lengthByte>>4);
536 0 : lengthByte&=0xf;
537 : }
538 :
539 0 : *offsets++=offset;
540 0 : *lengths++=length;
541 :
542 0 : offset+=length;
543 0 : ++i;
544 :
545 : /* read odd nibble - LSBs of lengthByte */
546 0 : if((lengthByte&0xf0)==0) {
547 : /* this nibble was not consumed for a double-nibble length above */
548 0 : length=lengthByte;
549 0 : if(length<12) {
550 : /* single-nibble length in LSBs */
551 0 : *offsets++=offset;
552 0 : *lengths++=length;
553 :
554 0 : offset+=length;
555 0 : ++i;
556 : }
557 : } else {
558 0 : length=0; /* prevent double-nibble detection in the next iteration */
559 : }
560 : }
561 :
562 : /* now, s is at the first group string */
563 0 : return s;
564 : }
565 :
566 : static uint16_t
567 0 : expandGroupName(UCharNames *names, const uint16_t *group,
568 : uint16_t lineNumber, UCharNameChoice nameChoice,
569 : char *buffer, uint16_t bufferLength) {
570 : uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
571 0 : const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
572 0 : s=expandGroupLengths(s, offsets, lengths);
573 0 : return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice,
574 0 : buffer, bufferLength);
575 : }
576 :
577 : static uint16_t
578 0 : getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice,
579 : char *buffer, uint16_t bufferLength) {
580 0 : const uint16_t *group=getGroup(names, code);
581 0 : if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) {
582 0 : return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice,
583 0 : buffer, bufferLength);
584 : } else {
585 : /* group not found */
586 : /* zero-terminate */
587 0 : if(bufferLength>0) {
588 0 : *buffer=0;
589 : }
590 0 : return 0;
591 : }
592 : }
593 :
594 : /*
595 : * enumGroupNames() enumerates all the names in a 32-group
596 : * and either calls the enumerator function or finds a given input name.
597 : */
598 : static UBool
599 0 : enumGroupNames(UCharNames *names, const uint16_t *group,
600 : UChar32 start, UChar32 end,
601 : UEnumCharNamesFn *fn, void *context,
602 : UCharNameChoice nameChoice) {
603 : uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
604 0 : const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group);
605 :
606 0 : s=expandGroupLengths(s, offsets, lengths);
607 0 : if(fn!=DO_FIND_NAME) {
608 : char buffer[200];
609 : uint16_t length;
610 :
611 0 : while(start<=end) {
612 0 : length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
613 0 : if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
614 0 : buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
615 : }
616 : /* here, we assume that the buffer is large enough */
617 0 : if(length>0) {
618 0 : if(!fn(context, start, nameChoice, buffer, length)) {
619 0 : return FALSE;
620 : }
621 : }
622 0 : ++start;
623 : }
624 : } else {
625 0 : const char *otherName=((FindName *)context)->otherName;
626 0 : while(start<=end) {
627 0 : if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
628 0 : ((FindName *)context)->code=start;
629 0 : return FALSE;
630 : }
631 0 : ++start;
632 : }
633 : }
634 0 : return TRUE;
635 : }
636 :
637 : /*
638 : * enumExtNames enumerate extended names.
639 : * It only needs to do it if it is called with a real function and not
640 : * with the dummy DO_FIND_NAME, because u_charFromName() does a check
641 : * for extended names by itself.
642 : */
643 : static UBool
644 0 : enumExtNames(UChar32 start, UChar32 end,
645 : UEnumCharNamesFn *fn, void *context)
646 : {
647 0 : if(fn!=DO_FIND_NAME) {
648 : char buffer[200];
649 : uint16_t length;
650 :
651 0 : while(start<=end) {
652 0 : buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
653 : /* here, we assume that the buffer is large enough */
654 0 : if(length>0) {
655 0 : if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) {
656 0 : return FALSE;
657 : }
658 : }
659 0 : ++start;
660 : }
661 : }
662 :
663 0 : return TRUE;
664 : }
665 :
666 : static UBool
667 0 : enumNames(UCharNames *names,
668 : UChar32 start, UChar32 limit,
669 : UEnumCharNamesFn *fn, void *context,
670 : UCharNameChoice nameChoice) {
671 : uint16_t startGroupMSB, endGroupMSB, groupCount;
672 : const uint16_t *group, *groupLimit;
673 :
674 0 : startGroupMSB=(uint16_t)(start>>GROUP_SHIFT);
675 0 : endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT);
676 :
677 : /* find the group that contains start, or the highest before it */
678 0 : group=getGroup(names, start);
679 :
680 0 : if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) {
681 : /* enumerate synthetic names between start and the group start */
682 0 : UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT);
683 0 : if(extLimit>limit) {
684 0 : extLimit=limit;
685 : }
686 0 : if(!enumExtNames(start, extLimit-1, fn, context)) {
687 0 : return FALSE;
688 : }
689 0 : start=extLimit;
690 : }
691 :
692 0 : if(startGroupMSB==endGroupMSB) {
693 0 : if(startGroupMSB==group[GROUP_MSB]) {
694 : /* if start and limit-1 are in the same group, then enumerate only in that one */
695 0 : return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice);
696 : }
697 : } else {
698 0 : const uint16_t *groups=GET_GROUPS(names);
699 0 : groupCount=*groups++;
700 0 : groupLimit=groups+groupCount*GROUP_LENGTH;
701 :
702 0 : if(startGroupMSB==group[GROUP_MSB]) {
703 : /* enumerate characters in the partial start group */
704 0 : if((start&GROUP_MASK)!=0) {
705 0 : if(!enumGroupNames(names, group,
706 0 : start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1,
707 : fn, context, nameChoice)) {
708 0 : return FALSE;
709 : }
710 0 : group=NEXT_GROUP(group); /* continue with the next group */
711 : }
712 0 : } else if(startGroupMSB>group[GROUP_MSB]) {
713 : /* make sure that we start enumerating with the first group after start */
714 0 : const uint16_t *nextGroup=NEXT_GROUP(group);
715 0 : if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) {
716 0 : UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
717 0 : if (end > limit) {
718 0 : end = limit;
719 : }
720 0 : if (!enumExtNames(start, end - 1, fn, context)) {
721 0 : return FALSE;
722 : }
723 : }
724 0 : group=nextGroup;
725 : }
726 :
727 : /* enumerate entire groups between the start- and end-groups */
728 0 : while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) {
729 : const uint16_t *nextGroup;
730 0 : start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT;
731 0 : if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) {
732 0 : return FALSE;
733 : }
734 0 : nextGroup=NEXT_GROUP(group);
735 0 : if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) {
736 0 : UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT;
737 0 : if (end > limit) {
738 0 : end = limit;
739 : }
740 0 : if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) {
741 0 : return FALSE;
742 : }
743 : }
744 0 : group=nextGroup;
745 : }
746 :
747 : /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */
748 0 : if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) {
749 0 : return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice);
750 0 : } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) {
751 0 : UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT;
752 0 : if (next > start) {
753 0 : start = next;
754 0 : }
755 : } else {
756 0 : return TRUE;
757 : }
758 : }
759 :
760 : /* we have not found a group, which means everything is made of
761 : extended names. */
762 0 : if (nameChoice == U_EXTENDED_CHAR_NAME) {
763 0 : if (limit > UCHAR_MAX_VALUE + 1) {
764 0 : limit = UCHAR_MAX_VALUE + 1;
765 : }
766 0 : return enumExtNames(start, limit - 1, fn, context);
767 : }
768 :
769 0 : return TRUE;
770 : }
771 :
772 : static uint16_t
773 0 : writeFactorSuffix(const uint16_t *factors, uint16_t count,
774 : const char *s, /* suffix elements */
775 : uint32_t code,
776 : uint16_t indexes[8], /* output fields from here */
777 : const char *elementBases[8], const char *elements[8],
778 : char *buffer, uint16_t bufferLength) {
779 0 : uint16_t i, factor, bufferPos=0;
780 : char c;
781 :
782 : /* write elements according to the factors */
783 :
784 : /*
785 : * the factorized elements are determined by modulo arithmetic
786 : * with the factors of this algorithm
787 : *
788 : * note that for fewer operations, count is decremented here
789 : */
790 0 : --count;
791 0 : for(i=count; i>0; --i) {
792 0 : factor=factors[i];
793 0 : indexes[i]=(uint16_t)(code%factor);
794 0 : code/=factor;
795 : }
796 : /*
797 : * we don't need to calculate the last modulus because start<=code<=end
798 : * guarantees here that code<=factors[0]
799 : */
800 0 : indexes[0]=(uint16_t)code;
801 :
802 : /* write each element */
803 : for(;;) {
804 0 : if(elementBases!=NULL) {
805 0 : *elementBases++=s;
806 : }
807 :
808 : /* skip indexes[i] strings */
809 0 : factor=indexes[i];
810 0 : while(factor>0) {
811 0 : while(*s++!=0) {}
812 0 : --factor;
813 : }
814 0 : if(elements!=NULL) {
815 0 : *elements++=s;
816 : }
817 :
818 : /* write element */
819 0 : while((c=*s++)!=0) {
820 0 : WRITE_CHAR(buffer, bufferLength, bufferPos, c);
821 : }
822 :
823 : /* we do not need to perform the rest of this loop for i==count - break here */
824 0 : if(i>=count) {
825 0 : break;
826 : }
827 :
828 : /* skip the rest of the strings for this factors[i] */
829 0 : factor=(uint16_t)(factors[i]-indexes[i]-1);
830 0 : while(factor>0) {
831 0 : while(*s++!=0) {}
832 0 : --factor;
833 : }
834 :
835 0 : ++i;
836 : }
837 :
838 : /* zero-terminate */
839 0 : if(bufferLength>0) {
840 0 : *buffer=0;
841 : }
842 :
843 0 : return bufferPos;
844 : }
845 :
846 : /*
847 : * Important:
848 : * Parts of findAlgName() are almost the same as some of getAlgName().
849 : * Fixes must be applied to both.
850 : */
851 : static uint16_t
852 0 : getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice,
853 : char *buffer, uint16_t bufferLength) {
854 0 : uint16_t bufferPos=0;
855 :
856 : /* Only the normative character name can be algorithmic. */
857 0 : if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
858 : /* zero-terminate */
859 0 : if(bufferLength>0) {
860 0 : *buffer=0;
861 : }
862 0 : return 0;
863 : }
864 :
865 0 : switch(range->type) {
866 : case 0: {
867 : /* name = prefix hex-digits */
868 0 : const char *s=(const char *)(range+1);
869 : char c;
870 :
871 : uint16_t i, count;
872 :
873 : /* copy prefix */
874 0 : while((c=*s++)!=0) {
875 0 : WRITE_CHAR(buffer, bufferLength, bufferPos, c);
876 : }
877 :
878 : /* write hexadecimal code point value */
879 0 : count=range->variant;
880 :
881 : /* zero-terminate */
882 0 : if(count<bufferLength) {
883 0 : buffer[count]=0;
884 : }
885 :
886 0 : for(i=count; i>0;) {
887 0 : if(--i<bufferLength) {
888 0 : c=(char)(code&0xf);
889 0 : if(c<10) {
890 0 : c+='0';
891 : } else {
892 0 : c+='A'-10;
893 : }
894 0 : buffer[i]=c;
895 : }
896 0 : code>>=4;
897 : }
898 :
899 0 : bufferPos+=count;
900 0 : break;
901 : }
902 : case 1: {
903 : /* name = prefix factorized-elements */
904 : uint16_t indexes[8];
905 0 : const uint16_t *factors=(const uint16_t *)(range+1);
906 0 : uint16_t count=range->variant;
907 0 : const char *s=(const char *)(factors+count);
908 : char c;
909 :
910 : /* copy prefix */
911 0 : while((c=*s++)!=0) {
912 0 : WRITE_CHAR(buffer, bufferLength, bufferPos, c);
913 : }
914 :
915 0 : bufferPos+=writeFactorSuffix(factors, count,
916 0 : s, code-range->start, indexes, NULL, NULL, buffer, bufferLength);
917 0 : break;
918 : }
919 : default:
920 : /* undefined type */
921 : /* zero-terminate */
922 0 : if(bufferLength>0) {
923 0 : *buffer=0;
924 : }
925 0 : break;
926 : }
927 :
928 0 : return bufferPos;
929 : }
930 :
931 : /*
932 : * Important: enumAlgNames() and findAlgName() are almost the same.
933 : * Any fix must be applied to both.
934 : */
935 : static UBool
936 0 : enumAlgNames(AlgorithmicRange *range,
937 : UChar32 start, UChar32 limit,
938 : UEnumCharNamesFn *fn, void *context,
939 : UCharNameChoice nameChoice) {
940 : char buffer[200];
941 : uint16_t length;
942 :
943 0 : if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
944 0 : return TRUE;
945 : }
946 :
947 0 : switch(range->type) {
948 : case 0: {
949 : char *s, *end;
950 : char c;
951 :
952 : /* get the full name of the start character */
953 0 : length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer));
954 0 : if(length<=0) {
955 0 : return TRUE;
956 : }
957 :
958 : /* call the enumerator function with this first character */
959 0 : if(!fn(context, start, nameChoice, buffer, length)) {
960 0 : return FALSE;
961 : }
962 :
963 : /* go to the end of the name; all these names have the same length */
964 0 : end=buffer;
965 0 : while(*end!=0) {
966 0 : ++end;
967 : }
968 :
969 : /* enumerate the rest of the names */
970 0 : while(++start<limit) {
971 : /* increment the hexadecimal number on a character-basis */
972 0 : s=end;
973 : for (;;) {
974 0 : c=*--s;
975 0 : if(('0'<=c && c<'9') || ('A'<=c && c<'F')) {
976 0 : *s=(char)(c+1);
977 0 : break;
978 0 : } else if(c=='9') {
979 0 : *s='A';
980 0 : break;
981 0 : } else if(c=='F') {
982 0 : *s='0';
983 : }
984 : }
985 :
986 0 : if(!fn(context, start, nameChoice, buffer, length)) {
987 0 : return FALSE;
988 : }
989 : }
990 0 : break;
991 : }
992 : case 1: {
993 : uint16_t indexes[8];
994 : const char *elementBases[8], *elements[8];
995 0 : const uint16_t *factors=(const uint16_t *)(range+1);
996 0 : uint16_t count=range->variant;
997 0 : const char *s=(const char *)(factors+count);
998 : char *suffix, *t;
999 : uint16_t prefixLength, i, idx;
1000 :
1001 : char c;
1002 :
1003 : /* name = prefix factorized-elements */
1004 :
1005 : /* copy prefix */
1006 0 : suffix=buffer;
1007 0 : prefixLength=0;
1008 0 : while((c=*s++)!=0) {
1009 0 : *suffix++=c;
1010 0 : ++prefixLength;
1011 : }
1012 :
1013 : /* append the suffix of the start character */
1014 0 : length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count,
1015 0 : s, (uint32_t)start-range->start,
1016 : indexes, elementBases, elements,
1017 0 : suffix, (uint16_t)(sizeof(buffer)-prefixLength)));
1018 :
1019 : /* call the enumerator function with this first character */
1020 0 : if(!fn(context, start, nameChoice, buffer, length)) {
1021 0 : return FALSE;
1022 : }
1023 :
1024 : /* enumerate the rest of the names */
1025 0 : while(++start<limit) {
1026 : /* increment the indexes in lexical order bound by the factors */
1027 0 : i=count;
1028 : for (;;) {
1029 0 : idx=(uint16_t)(indexes[--i]+1);
1030 0 : if(idx<factors[i]) {
1031 : /* skip one index and its element string */
1032 0 : indexes[i]=idx;
1033 0 : s=elements[i];
1034 0 : while(*s++!=0) {
1035 : }
1036 0 : elements[i]=s;
1037 0 : break;
1038 : } else {
1039 : /* reset this index to 0 and its element string to the first one */
1040 0 : indexes[i]=0;
1041 0 : elements[i]=elementBases[i];
1042 : }
1043 : }
1044 :
1045 : /* to make matters a little easier, just append all elements to the suffix */
1046 0 : t=suffix;
1047 0 : length=prefixLength;
1048 0 : for(i=0; i<count; ++i) {
1049 0 : s=elements[i];
1050 0 : while((c=*s++)!=0) {
1051 0 : *t++=c;
1052 0 : ++length;
1053 : }
1054 : }
1055 : /* zero-terminate */
1056 0 : *t=0;
1057 :
1058 0 : if(!fn(context, start, nameChoice, buffer, length)) {
1059 0 : return FALSE;
1060 : }
1061 : }
1062 0 : break;
1063 : }
1064 : default:
1065 : /* undefined type */
1066 0 : break;
1067 : }
1068 :
1069 0 : return TRUE;
1070 : }
1071 :
1072 : /*
1073 : * findAlgName() is almost the same as enumAlgNames() except that it
1074 : * returns the code point for a name if it fits into the range.
1075 : * It returns 0xffff otherwise.
1076 : */
1077 : static UChar32
1078 0 : findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) {
1079 : UChar32 code;
1080 :
1081 0 : if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) {
1082 0 : return 0xffff;
1083 : }
1084 :
1085 0 : switch(range->type) {
1086 : case 0: {
1087 : /* name = prefix hex-digits */
1088 0 : const char *s=(const char *)(range+1);
1089 : char c;
1090 :
1091 : uint16_t i, count;
1092 :
1093 : /* compare prefix */
1094 0 : while((c=*s++)!=0) {
1095 0 : if((char)c!=*otherName++) {
1096 0 : return 0xffff;
1097 : }
1098 : }
1099 :
1100 : /* read hexadecimal code point value */
1101 0 : count=range->variant;
1102 0 : code=0;
1103 0 : for(i=0; i<count; ++i) {
1104 0 : c=*otherName++;
1105 0 : if('0'<=c && c<='9') {
1106 0 : code=(code<<4)|(c-'0');
1107 0 : } else if('A'<=c && c<='F') {
1108 0 : code=(code<<4)|(c-'A'+10);
1109 : } else {
1110 0 : return 0xffff;
1111 : }
1112 : }
1113 :
1114 : /* does it fit into the range? */
1115 0 : if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) {
1116 0 : return code;
1117 : }
1118 0 : break;
1119 : }
1120 : case 1: {
1121 : char buffer[64];
1122 : uint16_t indexes[8];
1123 : const char *elementBases[8], *elements[8];
1124 0 : const uint16_t *factors=(const uint16_t *)(range+1);
1125 0 : uint16_t count=range->variant;
1126 0 : const char *s=(const char *)(factors+count), *t;
1127 : UChar32 start, limit;
1128 : uint16_t i, idx;
1129 :
1130 : char c;
1131 :
1132 : /* name = prefix factorized-elements */
1133 :
1134 : /* compare prefix */
1135 0 : while((c=*s++)!=0) {
1136 0 : if((char)c!=*otherName++) {
1137 0 : return 0xffff;
1138 : }
1139 : }
1140 :
1141 0 : start=(UChar32)range->start;
1142 0 : limit=(UChar32)(range->end+1);
1143 :
1144 : /* initialize the suffix elements for enumeration; indexes should all be set to 0 */
1145 0 : writeFactorSuffix(factors, count, s, 0,
1146 0 : indexes, elementBases, elements, buffer, sizeof(buffer));
1147 :
1148 : /* compare the first suffix */
1149 0 : if(0==uprv_strcmp(otherName, buffer)) {
1150 0 : return start;
1151 : }
1152 :
1153 : /* enumerate and compare the rest of the suffixes */
1154 0 : while(++start<limit) {
1155 : /* increment the indexes in lexical order bound by the factors */
1156 0 : i=count;
1157 : for (;;) {
1158 0 : idx=(uint16_t)(indexes[--i]+1);
1159 0 : if(idx<factors[i]) {
1160 : /* skip one index and its element string */
1161 0 : indexes[i]=idx;
1162 0 : s=elements[i];
1163 0 : while(*s++!=0) {}
1164 0 : elements[i]=s;
1165 0 : break;
1166 : } else {
1167 : /* reset this index to 0 and its element string to the first one */
1168 0 : indexes[i]=0;
1169 0 : elements[i]=elementBases[i];
1170 : }
1171 : }
1172 :
1173 : /* to make matters a little easier, just compare all elements of the suffix */
1174 0 : t=otherName;
1175 0 : for(i=0; i<count; ++i) {
1176 0 : s=elements[i];
1177 0 : while((c=*s++)!=0) {
1178 0 : if(c!=*t++) {
1179 0 : s=""; /* does not match */
1180 0 : i=99;
1181 : }
1182 : }
1183 : }
1184 0 : if(i<99 && *t==0) {
1185 0 : return start;
1186 : }
1187 : }
1188 0 : break;
1189 : }
1190 : default:
1191 : /* undefined type */
1192 0 : break;
1193 : }
1194 :
1195 0 : return 0xffff;
1196 : }
1197 :
1198 : /* sets of name characters, maximum name lengths ---------------------------- */
1199 :
/*
 * A "set" is an array of 8 uint32_t words used as 256 bit flags, one per byte value.
 * c is converted to uint8_t; bits 7..5 select the word and bits 4..0 the bit in it.
 * Both macros evaluate c twice, so do not pass an argument with side effects.
 * The argument is parenthesized so that expressions such as a|b are cast as a whole.
 */
#define SET_ADD(set, c) ((set)[(uint8_t)(c)>>5]|=((uint32_t)1<<((uint8_t)(c)&0x1f)))
#define SET_CONTAINS(set, c) (((set)[(uint8_t)(c)>>5]&((uint32_t)1<<((uint8_t)(c)&0x1f)))!=0)
1202 :
1203 : static int32_t
1204 0 : calcStringSetLength(uint32_t set[8], const char *s) {
1205 0 : int32_t length=0;
1206 : char c;
1207 :
1208 0 : while((c=*s++)!=0) {
1209 0 : SET_ADD(set, c);
1210 0 : ++length;
1211 : }
1212 0 : return length;
1213 : }
1214 :
1215 : static int32_t
1216 0 : calcAlgNameSetsLengths(int32_t maxNameLength) {
1217 : AlgorithmicRange *range;
1218 : uint32_t *p;
1219 : uint32_t rangeCount;
1220 : int32_t length;
1221 :
1222 : /* enumerate algorithmic ranges */
1223 0 : p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1224 0 : rangeCount=*p;
1225 0 : range=(AlgorithmicRange *)(p+1);
1226 0 : while(rangeCount>0) {
1227 0 : switch(range->type) {
1228 : case 0:
1229 : /* name = prefix + (range->variant times) hex-digits */
1230 : /* prefix */
1231 0 : length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant;
1232 0 : if(length>maxNameLength) {
1233 0 : maxNameLength=length;
1234 : }
1235 0 : break;
1236 : case 1: {
1237 : /* name = prefix factorized-elements */
1238 0 : const uint16_t *factors=(const uint16_t *)(range+1);
1239 : const char *s;
1240 0 : int32_t i, count=range->variant, factor, factorLength, maxFactorLength;
1241 :
1242 : /* prefix length */
1243 0 : s=(const char *)(factors+count);
1244 0 : length=calcStringSetLength(gNameSet, s);
1245 0 : s+=length+1; /* start of factor suffixes */
1246 :
1247 : /* get the set and maximum factor suffix length for each factor */
1248 0 : for(i=0; i<count; ++i) {
1249 0 : maxFactorLength=0;
1250 0 : for(factor=factors[i]; factor>0; --factor) {
1251 0 : factorLength=calcStringSetLength(gNameSet, s);
1252 0 : s+=factorLength+1;
1253 0 : if(factorLength>maxFactorLength) {
1254 0 : maxFactorLength=factorLength;
1255 : }
1256 : }
1257 0 : length+=maxFactorLength;
1258 : }
1259 :
1260 0 : if(length>maxNameLength) {
1261 0 : maxNameLength=length;
1262 : }
1263 0 : break;
1264 : }
1265 : default:
1266 : /* unknown type */
1267 0 : break;
1268 : }
1269 :
1270 0 : range=(AlgorithmicRange *)((uint8_t *)range+range->size);
1271 0 : --rangeCount;
1272 : }
1273 0 : return maxNameLength;
1274 : }
1275 :
1276 : static int32_t
1277 0 : calcExtNameSetsLengths(int32_t maxNameLength) {
1278 : int32_t i, length;
1279 :
1280 0 : for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) {
1281 : /*
1282 : * for each category, count the length of the category name
1283 : * plus 9=
1284 : * 2 for <>
1285 : * 1 for -
1286 : * 6 for most hex digits per code point
1287 : */
1288 0 : length=9+calcStringSetLength(gNameSet, charCatNames[i]);
1289 0 : if(length>maxNameLength) {
1290 0 : maxNameLength=length;
1291 : }
1292 : }
1293 0 : return maxNameLength;
1294 : }
1295 :
1296 : static int32_t
1297 0 : calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths,
1298 : uint32_t set[8],
1299 : const uint8_t **pLine, const uint8_t *lineLimit) {
1300 0 : const uint8_t *line=*pLine;
1301 0 : int32_t length=0, tokenLength;
1302 : uint16_t c, token;
1303 :
1304 0 : while(line!=lineLimit && (c=*line++)!=(uint8_t)';') {
1305 0 : if(c>=tokenCount) {
1306 : /* implicit letter */
1307 0 : SET_ADD(set, c);
1308 0 : ++length;
1309 : } else {
1310 0 : token=tokens[c];
1311 0 : if(token==(uint16_t)(-2)) {
1312 : /* this is a lead byte for a double-byte token */
1313 0 : c=c<<8|*line++;
1314 0 : token=tokens[c];
1315 : }
1316 0 : if(token==(uint16_t)(-1)) {
1317 : /* explicit letter */
1318 0 : SET_ADD(set, c);
1319 0 : ++length;
1320 : } else {
1321 : /* count token word */
1322 0 : if(tokenLengths!=NULL) {
1323 : /* use cached token length */
1324 0 : tokenLength=tokenLengths[c];
1325 0 : if(tokenLength==0) {
1326 0 : tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1327 0 : tokenLengths[c]=(int8_t)tokenLength;
1328 : }
1329 : } else {
1330 0 : tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token);
1331 : }
1332 0 : length+=tokenLength;
1333 : }
1334 : }
1335 : }
1336 :
1337 0 : *pLine=line;
1338 0 : return length;
1339 : }
1340 :
1341 : static void
1342 0 : calcGroupNameSetsLengths(int32_t maxNameLength) {
1343 : uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
1344 :
1345 0 : uint16_t *tokens=(uint16_t *)uCharNames+8;
1346 0 : uint16_t tokenCount=*tokens++;
1347 0 : uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset;
1348 :
1349 : int8_t *tokenLengths;
1350 :
1351 : const uint16_t *group;
1352 : const uint8_t *s, *line, *lineLimit;
1353 :
1354 : int32_t groupCount, lineNumber, length;
1355 :
1356 0 : tokenLengths=(int8_t *)uprv_malloc(tokenCount);
1357 0 : if(tokenLengths!=NULL) {
1358 0 : uprv_memset(tokenLengths, 0, tokenCount);
1359 : }
1360 :
1361 0 : group=GET_GROUPS(uCharNames);
1362 0 : groupCount=*group++;
1363 :
1364 : /* enumerate all groups */
1365 0 : while(groupCount>0) {
1366 0 : s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group);
1367 0 : s=expandGroupLengths(s, offsets, lengths);
1368 :
1369 : /* enumerate all lines in each group */
1370 0 : for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) {
1371 0 : line=s+offsets[lineNumber];
1372 0 : length=lengths[lineNumber];
1373 0 : if(length==0) {
1374 0 : continue;
1375 : }
1376 :
1377 0 : lineLimit=line+length;
1378 :
1379 : /* read regular name */
1380 0 : length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1381 0 : if(length>maxNameLength) {
1382 0 : maxNameLength=length;
1383 : }
1384 0 : if(line==lineLimit) {
1385 0 : continue;
1386 : }
1387 :
1388 : /* read Unicode 1.0 name */
1389 0 : length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit);
1390 0 : if(length>maxNameLength) {
1391 0 : maxNameLength=length;
1392 : }
1393 0 : if(line==lineLimit) {
1394 0 : continue;
1395 : }
1396 :
1397 : /* read ISO comment */
1398 : /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/
1399 : }
1400 :
1401 0 : group=NEXT_GROUP(group);
1402 0 : --groupCount;
1403 : }
1404 :
1405 0 : if(tokenLengths!=NULL) {
1406 0 : uprv_free(tokenLengths);
1407 : }
1408 :
1409 : /* set gMax... - name length last for threading */
1410 0 : gMaxNameLength=maxNameLength;
1411 0 : }
1412 :
1413 : static UBool
1414 0 : calcNameSetsLengths(UErrorCode *pErrorCode) {
1415 : static const char extChars[]="0123456789ABCDEF<>-";
1416 : int32_t i, maxNameLength;
1417 :
1418 0 : if(gMaxNameLength!=0) {
1419 0 : return TRUE;
1420 : }
1421 :
1422 0 : if(!isDataLoaded(pErrorCode)) {
1423 0 : return FALSE;
1424 : }
1425 :
1426 : /* set hex digits, used in various names, and <>-, used in extended names */
1427 0 : for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) {
1428 0 : SET_ADD(gNameSet, extChars[i]);
1429 : }
1430 :
1431 : /* set sets and lengths from algorithmic names */
1432 0 : maxNameLength=calcAlgNameSetsLengths(0);
1433 :
1434 : /* set sets and lengths from extended names */
1435 0 : maxNameLength=calcExtNameSetsLengths(maxNameLength);
1436 :
1437 : /* set sets and lengths from group names, set global maximum values */
1438 0 : calcGroupNameSetsLengths(maxNameLength);
1439 :
1440 0 : return TRUE;
1441 : }
1442 :
1443 : U_NAMESPACE_END
1444 :
1445 : /* public API --------------------------------------------------------------- */
1446 :
1447 : U_NAMESPACE_USE
1448 :
1449 : U_CAPI int32_t U_EXPORT2
1450 0 : u_charName(UChar32 code, UCharNameChoice nameChoice,
1451 : char *buffer, int32_t bufferLength,
1452 : UErrorCode *pErrorCode) {
1453 : AlgorithmicRange *algRange;
1454 : uint32_t *p;
1455 : uint32_t i;
1456 : int32_t length;
1457 :
1458 : /* check the argument values */
1459 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1460 0 : return 0;
1461 0 : } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT ||
1462 0 : bufferLength<0 || (bufferLength>0 && buffer==NULL)
1463 : ) {
1464 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1465 0 : return 0;
1466 : }
1467 :
1468 0 : if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) {
1469 0 : return u_terminateChars(buffer, bufferLength, 0, pErrorCode);
1470 : }
1471 :
1472 0 : length=0;
1473 :
1474 : /* try algorithmic names first */
1475 0 : p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1476 0 : i=*p;
1477 0 : algRange=(AlgorithmicRange *)(p+1);
1478 0 : while(i>0) {
1479 0 : if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) {
1480 0 : length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1481 0 : break;
1482 : }
1483 0 : algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1484 0 : --i;
1485 : }
1486 :
1487 0 : if(i==0) {
1488 0 : if (nameChoice == U_EXTENDED_CHAR_NAME) {
1489 0 : length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength);
1490 0 : if (!length) {
1491 : /* extended character name */
1492 0 : length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength);
1493 : }
1494 : } else {
1495 : /* normal character name */
1496 0 : length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength);
1497 : }
1498 : }
1499 :
1500 0 : return u_terminateChars(buffer, bufferLength, length, pErrorCode);
1501 : }
1502 :
1503 : U_CAPI int32_t U_EXPORT2
1504 0 : u_getISOComment(UChar32 /*c*/,
1505 : char *dest, int32_t destCapacity,
1506 : UErrorCode *pErrorCode) {
1507 : /* check the argument values */
1508 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1509 0 : return 0;
1510 0 : } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1511 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1512 0 : return 0;
1513 : }
1514 :
1515 0 : return u_terminateChars(dest, destCapacity, 0, pErrorCode);
1516 : }
1517 :
1518 : U_CAPI UChar32 U_EXPORT2
1519 0 : u_charFromName(UCharNameChoice nameChoice,
1520 : const char *name,
1521 : UErrorCode *pErrorCode) {
1522 : char upper[120], lower[120];
1523 : FindName findName;
1524 : AlgorithmicRange *algRange;
1525 : uint32_t *p;
1526 : uint32_t i;
1527 0 : UChar32 cp = 0;
1528 : char c0;
1529 0 : UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */
1530 :
1531 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1532 0 : return error;
1533 : }
1534 :
1535 0 : if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) {
1536 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1537 0 : return error;
1538 : }
1539 :
1540 0 : if(!isDataLoaded(pErrorCode)) {
1541 0 : return error;
1542 : }
1543 :
1544 : /* construct the uppercase and lowercase of the name first */
1545 0 : for(i=0; i<sizeof(upper); ++i) {
1546 0 : if((c0=*name++)!=0) {
1547 0 : upper[i]=uprv_toupper(c0);
1548 0 : lower[i]=uprv_tolower(c0);
1549 : } else {
1550 0 : upper[i]=lower[i]=0;
1551 0 : break;
1552 : }
1553 : }
1554 0 : if(i==sizeof(upper)) {
1555 : /* name too long, there is no such character */
1556 0 : *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1557 0 : return error;
1558 : }
1559 : // i==strlen(name)==strlen(lower)==strlen(upper)
1560 :
1561 : /* try extended names first */
1562 0 : if (lower[0] == '<') {
1563 0 : if (nameChoice == U_EXTENDED_CHAR_NAME) {
1564 : // Parse a string like "<category-HHHH>" where HHHH is a hex code point.
1565 0 : if (lower[--i] == '>' && i >= 3 && lower[--i] != '-') {
1566 0 : while (i >= 3 && lower[--i] != '-') {}
1567 :
1568 0 : if (i >= 2 && lower[i] == '-') {
1569 : uint32_t cIdx;
1570 :
1571 0 : lower[i] = 0;
1572 :
1573 0 : for (++i; lower[i] != '>'; ++i) {
1574 0 : if (lower[i] >= '0' && lower[i] <= '9') {
1575 0 : cp = (cp << 4) + lower[i] - '0';
1576 0 : } else if (lower[i] >= 'a' && lower[i] <= 'f') {
1577 0 : cp = (cp << 4) + lower[i] - 'a' + 10;
1578 : } else {
1579 0 : *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1580 0 : return error;
1581 : }
1582 : }
1583 :
1584 : /* Now validate the category name.
1585 : We could use a binary search, or a trie, if
1586 : we really wanted to. */
1587 :
1588 0 : for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) {
1589 :
1590 0 : if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) {
1591 0 : if (getCharCat(cp) == cIdx) {
1592 0 : return cp;
1593 : }
1594 0 : break;
1595 : }
1596 : }
1597 : }
1598 : }
1599 : }
1600 :
1601 0 : *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1602 0 : return error;
1603 : }
1604 :
1605 : /* try algorithmic names now */
1606 0 : p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1607 0 : i=*p;
1608 0 : algRange=(AlgorithmicRange *)(p+1);
1609 0 : while(i>0) {
1610 0 : if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) {
1611 0 : return cp;
1612 : }
1613 0 : algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1614 0 : --i;
1615 : }
1616 :
1617 : /* normal character name */
1618 0 : findName.otherName=upper;
1619 0 : findName.code=error;
1620 0 : enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice);
1621 0 : if (findName.code == error) {
1622 0 : *pErrorCode = U_ILLEGAL_CHAR_FOUND;
1623 : }
1624 0 : return findName.code;
1625 : }
1626 :
1627 : U_CAPI void U_EXPORT2
1628 0 : u_enumCharNames(UChar32 start, UChar32 limit,
1629 : UEnumCharNamesFn *fn,
1630 : void *context,
1631 : UCharNameChoice nameChoice,
1632 : UErrorCode *pErrorCode) {
1633 : AlgorithmicRange *algRange;
1634 : uint32_t *p;
1635 : uint32_t i;
1636 :
1637 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1638 0 : return;
1639 : }
1640 :
1641 0 : if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) {
1642 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1643 0 : return;
1644 : }
1645 :
1646 0 : if((uint32_t) limit > UCHAR_MAX_VALUE + 1) {
1647 0 : limit = UCHAR_MAX_VALUE + 1;
1648 : }
1649 0 : if((uint32_t)start>=(uint32_t)limit) {
1650 0 : return;
1651 : }
1652 :
1653 0 : if(!isDataLoaded(pErrorCode)) {
1654 0 : return;
1655 : }
1656 :
1657 : /* interleave the data-driven ones with the algorithmic ones */
1658 : /* iterate over all algorithmic ranges; assume that they are in ascending order */
1659 0 : p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset);
1660 0 : i=*p;
1661 0 : algRange=(AlgorithmicRange *)(p+1);
1662 0 : while(i>0) {
1663 : /* enumerate the character names before the current algorithmic range */
1664 : /* here: start<limit */
1665 0 : if((uint32_t)start<algRange->start) {
1666 0 : if((uint32_t)limit<=algRange->start) {
1667 0 : enumNames(uCharNames, start, limit, fn, context, nameChoice);
1668 0 : return;
1669 : }
1670 0 : if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) {
1671 0 : return;
1672 : }
1673 0 : start=(UChar32)algRange->start;
1674 : }
1675 : /* enumerate the character names in the current algorithmic range */
1676 : /* here: algRange->start<=start<limit */
1677 0 : if((uint32_t)start<=algRange->end) {
1678 0 : if((uint32_t)limit<=(algRange->end+1)) {
1679 0 : enumAlgNames(algRange, start, limit, fn, context, nameChoice);
1680 0 : return;
1681 : }
1682 0 : if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) {
1683 0 : return;
1684 : }
1685 0 : start=(UChar32)algRange->end+1;
1686 : }
1687 : /* continue to the next algorithmic range (here: start<limit) */
1688 0 : algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size);
1689 0 : --i;
1690 : }
1691 : /* enumerate the character names after the last algorithmic range */
1692 0 : enumNames(uCharNames, start, limit, fn, context, nameChoice);
1693 : }
1694 :
1695 : U_CAPI int32_t U_EXPORT2
1696 0 : uprv_getMaxCharNameLength() {
1697 0 : UErrorCode errorCode=U_ZERO_ERROR;
1698 0 : if(calcNameSetsLengths(&errorCode)) {
1699 0 : return gMaxNameLength;
1700 : } else {
1701 0 : return 0;
1702 : }
1703 : }
1704 :
1705 : /**
1706 : * Converts the char set cset into a Unicode set uset.
1707 : * @param cset Set of 256 bit flags corresponding to a set of chars.
1708 : * @param uset USet to receive characters. Existing contents are deleted.
1709 : */
1710 : static void
1711 0 : charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
1712 : UChar us[256];
1713 : char cs[256];
1714 :
1715 : int32_t i, length;
1716 : UErrorCode errorCode;
1717 :
1718 0 : errorCode=U_ZERO_ERROR;
1719 :
1720 0 : if(!calcNameSetsLengths(&errorCode)) {
1721 0 : return;
1722 : }
1723 :
1724 : /* build a char string with all chars that are used in character names */
1725 0 : length=0;
1726 0 : for(i=0; i<256; ++i) {
1727 0 : if(SET_CONTAINS(cset, i)) {
1728 0 : cs[length++]=(char)i;
1729 : }
1730 : }
1731 :
1732 : /* convert the char string to a UChar string */
1733 0 : u_charsToUChars(cs, us, length);
1734 :
1735 : /* add each UChar to the USet */
1736 0 : for(i=0; i<length; ++i) {
1737 0 : if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */
1738 0 : sa->add(sa->set, us[i]);
1739 : }
1740 : }
1741 : }
1742 :
1743 : /**
1744 : * Fills set with characters that are used in Unicode character names.
1745 : * @param set USet to receive characters.
1746 : */
1747 : U_CAPI void U_EXPORT2
1748 0 : uprv_getCharNameCharacters(const USetAdder *sa) {
1749 0 : charSetToUSet(gNameSet, sa);
1750 0 : }
1751 :
1752 : /* data swapping ------------------------------------------------------------ */
1753 :
1754 : /*
1755 : * The token table contains non-negative entries for token bytes,
1756 : * and -1 for bytes that represent themselves in the data file's charset.
1757 : * -2 entries are used for lead bytes.
1758 : *
1759 : * Direct bytes (-1 entries) must be translated from the input charset family
1760 : * to the output charset family.
1761 : * makeTokenMap() writes a permutation mapping for this.
1762 : * Use it once for single-/lead-byte tokens and once more for all trail byte
1763 : * tokens. (';' is an unused trail byte marked with -1.)
1764 : */
1765 : static void
1766 0 : makeTokenMap(const UDataSwapper *ds,
1767 : int16_t tokens[], uint16_t tokenCount,
1768 : uint8_t map[256],
1769 : UErrorCode *pErrorCode) {
1770 : UBool usedOutChar[256];
1771 : uint16_t i, j;
1772 : uint8_t c1, c2;
1773 :
1774 0 : if(U_FAILURE(*pErrorCode)) {
1775 0 : return;
1776 : }
1777 :
1778 0 : if(ds->inCharset==ds->outCharset) {
1779 : /* Same charset family: identity permutation */
1780 0 : for(i=0; i<256; ++i) {
1781 0 : map[i]=(uint8_t)i;
1782 : }
1783 : } else {
1784 0 : uprv_memset(map, 0, 256);
1785 0 : uprv_memset(usedOutChar, 0, 256);
1786 :
1787 0 : if(tokenCount>256) {
1788 0 : tokenCount=256;
1789 : }
1790 :
1791 : /* set the direct bytes (byte 0 always maps to itself) */
1792 0 : for(i=1; i<tokenCount; ++i) {
1793 0 : if(tokens[i]==-1) {
1794 : /* convert the direct byte character */
1795 0 : c1=(uint8_t)i;
1796 0 : ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode);
1797 0 : if(U_FAILURE(*pErrorCode)) {
1798 0 : udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n",
1799 0 : i, ds->inCharset);
1800 0 : return;
1801 : }
1802 :
1803 : /* enter the converted character into the map and mark it used */
1804 0 : map[c1]=c2;
1805 0 : usedOutChar[c2]=TRUE;
1806 : }
1807 : }
1808 :
1809 : /* set the mappings for the rest of the permutation */
1810 0 : for(i=j=1; i<tokenCount; ++i) {
1811 : /* set mappings that were not set for direct bytes */
1812 0 : if(map[i]==0) {
1813 : /* set an output byte value that was not used as an output byte above */
1814 0 : while(usedOutChar[j]) {
1815 0 : ++j;
1816 : }
1817 0 : map[i]=(uint8_t)j++;
1818 : }
1819 : }
1820 :
1821 : /*
1822 : * leave mappings at tokenCount and above unset if tokenCount<256
1823 : * because they won't be used
1824 : */
1825 : }
1826 : }
1827 :
/*
 * Swaps a unames.icu data image using the callbacks of the swapper ds.
 *
 * With length<0 (preflighting) nothing is written: only the header and the
 * sizes of the algorithmic ranges are read in order to compute the result.
 * Otherwise the bytes after the header are copied to outData (unless input and
 * output are the same) and then swapped piece by piece, in this order:
 * the initial 4 offsets, the token table, the token strings, the group table,
 * the group strings (only if the charset families differ) and the
 * algorithmic ranges.
 *
 * @param ds         swapper with the read/swap callbacks and charset families
 * @param inData     input image, starting with the data header
 * @param length     number of input bytes, or negative for preflighting
 * @param outData    output image
 * @param pErrorCode in/out error code; set to U_UNSUPPORTED_ERROR for an
 *                   unrecognized data format or algorithmic range type,
 *                   U_INDEX_OUTOFBOUNDS_ERROR for too few bytes,
 *                   U_MEMORY_ALLOCATION_ERROR if the temporary token array
 *                   cannot be allocated
 * @return headerSize plus the offset behind the last algorithmic range,
 *         or 0 on failure
 */
1828 : U_CAPI int32_t U_EXPORT2
1829 0 : uchar_swapNames(const UDataSwapper *ds,
1830 : const void *inData, int32_t length, void *outData,
1831 : UErrorCode *pErrorCode) {
1832 : const UDataInfo *pInfo;
1833 : int32_t headerSize;
1834 :
1835 : const uint8_t *inBytes;
1836 : uint8_t *outBytes;
1837 :
1838 : uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset,
1839 : offset, i, count, stringsCount;
1840 :
1841 : const AlgorithmicRange *inRange;
1842 : AlgorithmicRange *outRange;
1843 :
1844 : /* udata_swapDataHeader checks the arguments */
1845 0 : headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
1846 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
1847 0 : return 0;
1848 : }
1849 :
1850 : /* check data format and format version */
1851 0 : pInfo=(const UDataInfo *)((const char *)inData+4);
1852 0 : if(!(
1853 0 : pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */
1854 0 : pInfo->dataFormat[1]==0x6e &&
1855 0 : pInfo->dataFormat[2]==0x61 &&
1856 0 : pInfo->dataFormat[3]==0x6d &&
1857 0 : pInfo->formatVersion[0]==1
1858 : )) {
1859 0 : udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n",
1860 0 : pInfo->dataFormat[0], pInfo->dataFormat[1],
1861 0 : pInfo->dataFormat[2], pInfo->dataFormat[3],
1862 0 : pInfo->formatVersion[0]);
1863 0 : *pErrorCode=U_UNSUPPORTED_ERROR;
1864 0 : return 0;
1865 : }
1866 :
1867 0 : inBytes=(const uint8_t *)inData+headerSize;
1868 0 : outBytes=(uint8_t *)outData+headerSize;
/* the 4th of the initial uint32_t offsets (index 3) is the algorithmic names offset */
1869 0 : if(length<0) {
1870 0 : algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]);
1871 : } else {
/* from here on, length counts only the bytes after the header */
1872 0 : length-=headerSize;
1873 0 : if( length<20 ||
1874 0 : (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]))
1875 : ) {
1876 : udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n",
1877 0 : length);
1878 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
1879 0 : return 0;
1880 : }
1881 : }
1882 :
1883 0 : if(length<0) {
1884 : /* preflighting: iterate through algorithmic ranges */
1885 0 : offset=algNamesOffset;
1886 0 : count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
1887 0 : offset+=4;
1888 :
/* only the range sizes are needed to find the end of the data */
1889 0 : for(i=0; i<count; ++i) {
1890 0 : inRange=(const AlgorithmicRange *)(inBytes+offset);
1891 0 : offset+=ds->readUInt16(inRange->size);
1892 : }
1893 : } else {
1894 : /* swap data */
1895 : const uint16_t *p;
1896 : uint16_t *q, *temp;
1897 :
/* tokens[0..255] are handed to makeTokenMap() for map[], tokens[256..511] for trailMap[] */
1898 : int16_t tokens[512];
1899 : uint16_t tokenCount;
1900 :
1901 : uint8_t map[256], trailMap[256];
1902 :
1903 : /* copy the data for inaccessible bytes */
1904 0 : if(inBytes!=outBytes) {
1905 0 : uprv_memcpy(outBytes, inBytes, length);
1906 : }
1907 :
1908 : /* the initial 4 offsets first */
1909 0 : tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]);
1910 0 : groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]);
1911 0 : groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]);
1912 0 : ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode);
1913 :
1914 : /*
1915 : * now the tokens table
1916 : * it needs to be permutated along with the compressed name strings
1917 : */
1918 0 : p=(const uint16_t *)(inBytes+16);
1919 0 : q=(uint16_t *)(outBytes+16);
1920 :
1921 : /* read and swap the tokenCount */
1922 0 : tokenCount=ds->readUInt16(*p);
1923 0 : ds->swapArray16(ds, p, 2, q, pErrorCode);
1924 0 : ++p;
1925 0 : ++q;
1926 :
1927 : /* read the first 512 tokens and make the token maps */
1928 0 : if(tokenCount<=512) {
1929 0 : count=tokenCount;
1930 : } else {
1931 0 : count=512;
1932 : }
1933 0 : for(i=0; i<count; ++i) {
1934 0 : tokens[i]=udata_readInt16(ds, p[i]);
1935 : }
1936 0 : for(; i<512; ++i) {
1937 0 : tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */
1938 : }
1939 0 : makeTokenMap(ds, tokens, tokenCount, map, pErrorCode);
1940 0 : makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode);
1941 0 : if(U_FAILURE(*pErrorCode)) {
1942 0 : return 0;
1943 : }
1944 :
1945 : /*
1946 : * swap and permutate the tokens
1947 : * go through a temporary array to support in-place swapping
1948 : */
1949 0 : temp=(uint16_t *)uprv_malloc(tokenCount*2);
1950 0 : if(temp==NULL) {
1951 0 : udata_printError(ds, "out of memory swapping %u unames.icu tokens\n",
1952 0 : tokenCount);
1953 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
1954 0 : return 0;
1955 : }
1956 :
1957 : /* swap and permutate single-/lead-byte tokens */
1958 0 : for(i=0; i<tokenCount && i<256; ++i) {
1959 0 : ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode);
1960 : }
1961 :
1962 : /* swap and permutate trail-byte tokens */
/* (i&0xffffff00) keeps the upper part of the token index; only the low byte goes through trailMap[] */
1963 0 : for(; i<tokenCount; ++i) {
1964 0 : ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode);
1965 : }
1966 :
1967 : /* copy the result into the output and free the temporary array */
1968 0 : uprv_memcpy(q, temp, tokenCount*2);
1969 0 : uprv_free(temp);
1970 :
1971 : /*
1972 : * swap the token strings but not a possible padding byte after
1973 : * the terminating NUL of the last string
1974 : */
1975 0 : udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset),
1976 0 : outBytes+tokenStringOffset, pErrorCode);
1977 0 : if(U_FAILURE(*pErrorCode)) {
1978 0 : udata_printError(ds, "uchar_swapNames(token strings) failed\n");
1979 0 : return 0;
1980 : }
1981 :
1982 : /* swap the group table */
/* one uint16_t for the group count, followed by 3 uint16_t per group */
1983 0 : count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset)));
1984 0 : ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2),
1985 0 : outBytes+groupsOffset, pErrorCode);
1986 :
1987 : /*
1988 : * swap the group strings
1989 : * swap the string bytes but not the nibble-encoded string lengths
1990 : */
1991 0 : if(ds->inCharset!=ds->outCharset) {
1992 : uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1];
1993 :
1994 : const uint8_t *inStrings, *nextInStrings;
1995 : uint8_t *outStrings;
1996 :
1997 : uint8_t c;
1998 :
1999 0 : inStrings=inBytes+groupStringOffset;
2000 0 : outStrings=outBytes+groupStringOffset;
2001 :
/* the group strings extend from groupStringOffset up to the algorithmic names */
2002 0 : stringsCount=algNamesOffset-groupStringOffset;
2003 :
2004 : /* iterate through string groups until only a few padding bytes are left */
2005 0 : while(stringsCount>32) {
2006 0 : nextInStrings=expandGroupLengths(inStrings, offsets, lengths);
2007 :
2008 : /* move past the length bytes */
2009 0 : stringsCount-=(uint32_t)(nextInStrings-inStrings);
2010 0 : outStrings+=nextInStrings-inStrings;
2011 0 : inStrings=nextInStrings;
2012 :
2013 0 : count=offsets[31]+lengths[31]; /* total number of string bytes in this group */
2014 0 : stringsCount-=count;
2015 :
2016 : /* swap the string bytes using map[] and trailMap[] */
2017 0 : while(count>0) {
2018 0 : c=*inStrings++;
2019 0 : *outStrings++=map[c];
2020 0 : if(tokens[c]!=-2) {
2021 0 : --count;
2022 : } else {
2023 : /* token lead byte: swap the trail byte, too */
2024 0 : *outStrings++=trailMap[*inStrings++];
2025 0 : count-=2;
2026 : }
2027 : }
2028 : }
2029 : }
2030 :
2031 : /* swap the algorithmic ranges */
2032 0 : offset=algNamesOffset;
2033 0 : count=ds->readUInt32(*((const uint32_t *)(inBytes+offset)));
2034 0 : ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode);
2035 0 : offset+=4;
2036 :
2037 0 : for(i=0; i<count; ++i) {
/* the start of each range must not lie beyond the available bytes */
2038 0 : if(offset>(uint32_t)length) {
2039 : udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n",
2040 0 : length, i);
2041 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
2042 0 : return 0;
2043 : }
2044 :
2045 0 : inRange=(const AlgorithmicRange *)(inBytes+offset);
2046 0 : outRange=(AlgorithmicRange *)(outBytes+offset);
/* advance offset to the next range now; case 1 below uses it as the end of this range */
2047 0 : offset+=ds->readUInt16(inRange->size);
2048 :
/* swap the first 8 bytes of the range as 32-bit units, then its 16-bit size field */
2049 0 : ds->swapArray32(ds, inRange, 8, outRange, pErrorCode);
2050 0 : ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode);
2051 0 : switch(inRange->type) {
2052 : case 0:
2053 : /* swap prefix string */
2054 0 : ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)),
2055 0 : outRange+1, pErrorCode);
2056 0 : if(U_FAILURE(*pErrorCode)) {
2057 : udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n",
2058 0 : i);
2059 0 : return 0;
2060 : }
2061 0 : break;
2062 : case 1:
2063 : {
2064 : /* swap factors and the prefix and factor strings */
2065 : uint32_t factorsCount;
2066 :
2067 0 : factorsCount=inRange->variant;
2068 0 : p=(const uint16_t *)(inRange+1);
2069 0 : q=(uint16_t *)(outRange+1);
2070 0 : ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode);
2071 :
2072 : /* swap the strings, up to the last terminating NUL */
2073 0 : p+=factorsCount;
2074 0 : q+=factorsCount;
2075 0 : stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p);
/* drop trailing non-NUL bytes so that the swapped block ends with a NUL */
2076 0 : while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) {
2077 0 : --stringsCount;
2078 : }
2079 0 : ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode);
2080 : }
2081 0 : break;
2082 : default:
2083 0 : udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n",
2084 0 : inRange->type, i);
2085 0 : *pErrorCode=U_UNSUPPORTED_ERROR;
2086 0 : return 0;
2087 : }
2088 : }
2089 : }
2090 :
/* offset now points behind the last algorithmic range, in both the preflighting and the swapping case */
2091 0 : return headerSize+(int32_t)offset;
2092 : }
2093 :
2094 : /*
2095 : * Hey, Emacs, please set the following:
2096 : *
2097 : * Local Variables:
2098 : * indent-tabs-mode: nil
2099 : * End:
2100 : *
2101 : */
|