Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 1998-2016, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : *
11 : * File ustring.cpp
12 : *
13 : * Modification History:
14 : *
15 : * Date Name Description
16 : * 12/07/98 bertrand Creation.
17 : ******************************************************************************
18 : */
19 :
20 : #include "unicode/utypes.h"
21 : #include "unicode/putil.h"
22 : #include "unicode/uchar.h"
23 : #include "unicode/ustring.h"
24 : #include "unicode/utf16.h"
25 : #include "cstring.h"
26 : #include "cwchar.h"
27 : #include "cmemory.h"
28 : #include "ustr_imp.h"
29 :
30 : /* ANSI string.h - style functions ------------------------------------------ */
31 :
/*
 * U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar.
 * The *chr32() functions in this file compare against this value to decide between
 * a single-code-unit search and a surrogate-pair search.
 */
#define U_BMP_MAX 0xffff
34 :
35 : /* Forward binary string search functions ----------------------------------- */
36 :
37 : /*
38 : * Test if a substring match inside a string is at code point boundaries.
39 : * All pointers refer to the same buffer.
40 : * The limit pointer may be NULL, all others must be real pointers.
41 : */
42 : static inline UBool
43 0 : isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) {
44 0 : if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
45 : /* the leading edge of the match is in the middle of a surrogate pair */
46 0 : return FALSE;
47 : }
48 0 : if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) {
49 : /* the trailing edge of the match is in the middle of a surrogate pair */
50 0 : return FALSE;
51 : }
52 0 : return TRUE;
53 : }
54 :
55 : U_CAPI UChar * U_EXPORT2
56 0 : u_strFindFirst(const UChar *s, int32_t length,
57 : const UChar *sub, int32_t subLength) {
58 : const UChar *start, *p, *q, *subLimit;
59 : UChar c, cs, cq;
60 :
61 0 : if(sub==NULL || subLength<-1) {
62 0 : return (UChar *)s;
63 : }
64 0 : if(s==NULL || length<-1) {
65 0 : return NULL;
66 : }
67 :
68 0 : start=s;
69 :
70 0 : if(length<0 && subLength<0) {
71 : /* both strings are NUL-terminated */
72 0 : if((cs=*sub++)==0) {
73 0 : return (UChar *)s;
74 : }
75 0 : if(*sub==0 && !U16_IS_SURROGATE(cs)) {
76 : /* the substring consists of a single, non-surrogate BMP code point */
77 0 : return u_strchr(s, cs);
78 : }
79 :
80 0 : while((c=*s++)!=0) {
81 0 : if(c==cs) {
82 : /* found first substring UChar, compare rest */
83 0 : p=s;
84 0 : q=sub;
85 : for(;;) {
86 0 : if((cq=*q)==0) {
87 0 : if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
88 0 : return (UChar *)(s-1); /* well-formed match */
89 : } else {
90 0 : break; /* no match because surrogate pair is split */
91 : }
92 : }
93 0 : if((c=*p)==0) {
94 0 : return NULL; /* no match, and none possible after s */
95 : }
96 0 : if(c!=cq) {
97 0 : break; /* no match */
98 : }
99 0 : ++p;
100 0 : ++q;
101 : }
102 : }
103 : }
104 :
105 : /* not found */
106 0 : return NULL;
107 : }
108 :
109 0 : if(subLength<0) {
110 0 : subLength=u_strlen(sub);
111 : }
112 0 : if(subLength==0) {
113 0 : return (UChar *)s;
114 : }
115 :
116 : /* get sub[0] to search for it fast */
117 0 : cs=*sub++;
118 0 : --subLength;
119 0 : subLimit=sub+subLength;
120 :
121 0 : if(subLength==0 && !U16_IS_SURROGATE(cs)) {
122 : /* the substring consists of a single, non-surrogate BMP code point */
123 0 : return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
124 : }
125 :
126 0 : if(length<0) {
127 : /* s is NUL-terminated */
128 0 : while((c=*s++)!=0) {
129 0 : if(c==cs) {
130 : /* found first substring UChar, compare rest */
131 0 : p=s;
132 0 : q=sub;
133 : for(;;) {
134 0 : if(q==subLimit) {
135 0 : if(isMatchAtCPBoundary(start, s-1, p, NULL)) {
136 0 : return (UChar *)(s-1); /* well-formed match */
137 : } else {
138 0 : break; /* no match because surrogate pair is split */
139 : }
140 : }
141 0 : if((c=*p)==0) {
142 0 : return NULL; /* no match, and none possible after s */
143 : }
144 0 : if(c!=*q) {
145 0 : break; /* no match */
146 : }
147 0 : ++p;
148 0 : ++q;
149 : }
150 : }
151 : }
152 : } else {
153 : const UChar *limit, *preLimit;
154 :
155 : /* subLength was decremented above */
156 0 : if(length<=subLength) {
157 0 : return NULL; /* s is shorter than sub */
158 : }
159 :
160 0 : limit=s+length;
161 :
162 : /* the substring must start before preLimit */
163 0 : preLimit=limit-subLength;
164 :
165 0 : while(s!=preLimit) {
166 0 : c=*s++;
167 0 : if(c==cs) {
168 : /* found first substring UChar, compare rest */
169 0 : p=s;
170 0 : q=sub;
171 : for(;;) {
172 0 : if(q==subLimit) {
173 0 : if(isMatchAtCPBoundary(start, s-1, p, limit)) {
174 0 : return (UChar *)(s-1); /* well-formed match */
175 : } else {
176 0 : break; /* no match because surrogate pair is split */
177 : }
178 : }
179 0 : if(*p!=*q) {
180 0 : break; /* no match */
181 : }
182 0 : ++p;
183 0 : ++q;
184 : }
185 : }
186 : }
187 : }
188 :
189 : /* not found */
190 0 : return NULL;
191 : }
192 :
193 : U_CAPI UChar * U_EXPORT2
194 0 : u_strstr(const UChar *s, const UChar *substring) {
195 0 : return u_strFindFirst(s, -1, substring, -1);
196 : }
197 :
198 : U_CAPI UChar * U_EXPORT2
199 0 : u_strchr(const UChar *s, UChar c) {
200 0 : if(U16_IS_SURROGATE(c)) {
201 : /* make sure to not find half of a surrogate pair */
202 0 : return u_strFindFirst(s, -1, &c, 1);
203 : } else {
204 : UChar cs;
205 :
206 : /* trivial search for a BMP code point */
207 : for(;;) {
208 0 : if((cs=*s)==c) {
209 0 : return (UChar *)s;
210 : }
211 0 : if(cs==0) {
212 0 : return NULL;
213 : }
214 0 : ++s;
215 : }
216 : }
217 : }
218 :
219 : U_CAPI UChar * U_EXPORT2
220 0 : u_strchr32(const UChar *s, UChar32 c) {
221 0 : if((uint32_t)c<=U_BMP_MAX) {
222 : /* find BMP code point */
223 0 : return u_strchr(s, (UChar)c);
224 0 : } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
225 : /* find supplementary code point as surrogate pair */
226 0 : UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
227 :
228 0 : while((cs=*s++)!=0) {
229 0 : if(cs==lead && *s==trail) {
230 0 : return (UChar *)(s-1);
231 : }
232 : }
233 0 : return NULL;
234 : } else {
235 : /* not a Unicode code point, not findable */
236 0 : return NULL;
237 : }
238 : }
239 :
240 : U_CAPI UChar * U_EXPORT2
241 0 : u_memchr(const UChar *s, UChar c, int32_t count) {
242 0 : if(count<=0) {
243 0 : return NULL; /* no string */
244 0 : } else if(U16_IS_SURROGATE(c)) {
245 : /* make sure to not find half of a surrogate pair */
246 0 : return u_strFindFirst(s, count, &c, 1);
247 : } else {
248 : /* trivial search for a BMP code point */
249 0 : const UChar *limit=s+count;
250 0 : do {
251 0 : if(*s==c) {
252 0 : return (UChar *)s;
253 : }
254 : } while(++s!=limit);
255 0 : return NULL;
256 : }
257 : }
258 :
259 : U_CAPI UChar * U_EXPORT2
260 0 : u_memchr32(const UChar *s, UChar32 c, int32_t count) {
261 0 : if((uint32_t)c<=U_BMP_MAX) {
262 : /* find BMP code point */
263 0 : return u_memchr(s, (UChar)c, count);
264 0 : } else if(count<2) {
265 : /* too short for a surrogate pair */
266 0 : return NULL;
267 0 : } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
268 : /* find supplementary code point as surrogate pair */
269 0 : const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
270 0 : UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
271 :
272 0 : do {
273 0 : if(*s==lead && *(s+1)==trail) {
274 0 : return (UChar *)s;
275 : }
276 : } while(++s!=limit);
277 0 : return NULL;
278 : } else {
279 : /* not a Unicode code point, not findable */
280 0 : return NULL;
281 : }
282 : }
283 :
284 : /* Backward binary string search functions ---------------------------------- */
285 :
286 : U_CAPI UChar * U_EXPORT2
287 0 : u_strFindLast(const UChar *s, int32_t length,
288 : const UChar *sub, int32_t subLength) {
289 : const UChar *start, *limit, *p, *q, *subLimit;
290 : UChar c, cs;
291 :
292 0 : if(sub==NULL || subLength<-1) {
293 0 : return (UChar *)s;
294 : }
295 0 : if(s==NULL || length<-1) {
296 0 : return NULL;
297 : }
298 :
299 : /*
300 : * This implementation is more lazy than the one for u_strFindFirst():
301 : * There is no special search code for NUL-terminated strings.
302 : * It does not seem to be worth it for searching substrings to
303 : * search forward and find all matches like in u_strrchr() and similar.
304 : * Therefore, we simply get both string lengths and search backward.
305 : *
306 : * markus 2002oct23
307 : */
308 :
309 0 : if(subLength<0) {
310 0 : subLength=u_strlen(sub);
311 : }
312 0 : if(subLength==0) {
313 0 : return (UChar *)s;
314 : }
315 :
316 : /* get sub[subLength-1] to search for it fast */
317 0 : subLimit=sub+subLength;
318 0 : cs=*(--subLimit);
319 0 : --subLength;
320 :
321 0 : if(subLength==0 && !U16_IS_SURROGATE(cs)) {
322 : /* the substring consists of a single, non-surrogate BMP code point */
323 0 : return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
324 : }
325 :
326 0 : if(length<0) {
327 0 : length=u_strlen(s);
328 : }
329 :
330 : /* subLength was decremented above */
331 0 : if(length<=subLength) {
332 0 : return NULL; /* s is shorter than sub */
333 : }
334 :
335 0 : start=s;
336 0 : limit=s+length;
337 :
338 : /* the substring must start no later than s+subLength */
339 0 : s+=subLength;
340 :
341 0 : while(s!=limit) {
342 0 : c=*(--limit);
343 0 : if(c==cs) {
344 : /* found last substring UChar, compare rest */
345 0 : p=limit;
346 0 : q=subLimit;
347 : for(;;) {
348 0 : if(q==sub) {
349 0 : if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
350 0 : return (UChar *)p; /* well-formed match */
351 : } else {
352 0 : break; /* no match because surrogate pair is split */
353 : }
354 : }
355 0 : if(*(--p)!=*(--q)) {
356 0 : break; /* no match */
357 : }
358 : }
359 : }
360 : }
361 :
362 : /* not found */
363 0 : return NULL;
364 : }
365 :
366 : U_CAPI UChar * U_EXPORT2
367 0 : u_strrstr(const UChar *s, const UChar *substring) {
368 0 : return u_strFindLast(s, -1, substring, -1);
369 : }
370 :
371 : U_CAPI UChar * U_EXPORT2
372 0 : u_strrchr(const UChar *s, UChar c) {
373 0 : if(U16_IS_SURROGATE(c)) {
374 : /* make sure to not find half of a surrogate pair */
375 0 : return u_strFindLast(s, -1, &c, 1);
376 : } else {
377 0 : const UChar *result=NULL;
378 : UChar cs;
379 :
380 : /* trivial search for a BMP code point */
381 : for(;;) {
382 0 : if((cs=*s)==c) {
383 0 : result=s;
384 : }
385 0 : if(cs==0) {
386 0 : return (UChar *)result;
387 : }
388 0 : ++s;
389 : }
390 : }
391 : }
392 :
393 : U_CAPI UChar * U_EXPORT2
394 0 : u_strrchr32(const UChar *s, UChar32 c) {
395 0 : if((uint32_t)c<=U_BMP_MAX) {
396 : /* find BMP code point */
397 0 : return u_strrchr(s, (UChar)c);
398 0 : } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
399 : /* find supplementary code point as surrogate pair */
400 0 : const UChar *result=NULL;
401 0 : UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
402 :
403 0 : while((cs=*s++)!=0) {
404 0 : if(cs==lead && *s==trail) {
405 0 : result=s-1;
406 : }
407 : }
408 0 : return (UChar *)result;
409 : } else {
410 : /* not a Unicode code point, not findable */
411 0 : return NULL;
412 : }
413 : }
414 :
415 : U_CAPI UChar * U_EXPORT2
416 0 : u_memrchr(const UChar *s, UChar c, int32_t count) {
417 0 : if(count<=0) {
418 0 : return NULL; /* no string */
419 0 : } else if(U16_IS_SURROGATE(c)) {
420 : /* make sure to not find half of a surrogate pair */
421 0 : return u_strFindLast(s, count, &c, 1);
422 : } else {
423 : /* trivial search for a BMP code point */
424 0 : const UChar *limit=s+count;
425 0 : do {
426 0 : if(*(--limit)==c) {
427 0 : return (UChar *)limit;
428 : }
429 0 : } while(s!=limit);
430 0 : return NULL;
431 : }
432 : }
433 :
434 : U_CAPI UChar * U_EXPORT2
435 0 : u_memrchr32(const UChar *s, UChar32 c, int32_t count) {
436 0 : if((uint32_t)c<=U_BMP_MAX) {
437 : /* find BMP code point */
438 0 : return u_memrchr(s, (UChar)c, count);
439 0 : } else if(count<2) {
440 : /* too short for a surrogate pair */
441 0 : return NULL;
442 0 : } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
443 : /* find supplementary code point as surrogate pair */
444 0 : const UChar *limit=s+count-1;
445 0 : UChar lead=U16_LEAD(c), trail=U16_TRAIL(c);
446 :
447 0 : do {
448 0 : if(*limit==trail && *(limit-1)==lead) {
449 0 : return (UChar *)(limit-1);
450 : }
451 : } while(s!=--limit);
452 0 : return NULL;
453 : } else {
454 : /* not a Unicode code point, not findable */
455 0 : return NULL;
456 : }
457 : }
458 :
459 : /* Tokenization functions --------------------------------------------------- */
460 :
461 : /*
462 : * Match each code point in a string against each code point in the matchSet.
463 : * Return the index of the first string code point that
464 : * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
465 : * Return -(string length)-1 if there is no such code point.
466 : */
467 : static int32_t
468 0 : _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
469 : int32_t matchLen, matchBMPLen, strItr, matchItr;
470 : UChar32 stringCh, matchCh;
471 : UChar c, c2;
472 :
473 : /* first part of matchSet contains only BMP code points */
474 0 : matchBMPLen = 0;
475 0 : while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
476 0 : ++matchBMPLen;
477 : }
478 :
479 : /* second part of matchSet contains BMP and supplementary code points */
480 0 : matchLen = matchBMPLen;
481 0 : while(matchSet[matchLen] != 0) {
482 0 : ++matchLen;
483 : }
484 :
485 0 : for(strItr = 0; (c = string[strItr]) != 0;) {
486 0 : ++strItr;
487 0 : if(U16_IS_SINGLE(c)) {
488 0 : if(polarity) {
489 0 : for(matchItr = 0; matchItr < matchLen; ++matchItr) {
490 0 : if(c == matchSet[matchItr]) {
491 0 : return strItr - 1; /* one matches */
492 : }
493 : }
494 : } else {
495 0 : for(matchItr = 0; matchItr < matchLen; ++matchItr) {
496 0 : if(c == matchSet[matchItr]) {
497 0 : goto endloop;
498 : }
499 : }
500 0 : return strItr - 1; /* none matches */
501 : }
502 : } else {
503 : /*
504 : * No need to check for string length before U16_IS_TRAIL
505 : * because c2 could at worst be the terminating NUL.
506 : */
507 0 : if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
508 0 : ++strItr;
509 0 : stringCh = U16_GET_SUPPLEMENTARY(c, c2);
510 : } else {
511 0 : stringCh = c; /* unpaired trail surrogate */
512 : }
513 :
514 0 : if(polarity) {
515 0 : for(matchItr = matchBMPLen; matchItr < matchLen;) {
516 0 : U16_NEXT(matchSet, matchItr, matchLen, matchCh);
517 0 : if(stringCh == matchCh) {
518 0 : return strItr - U16_LENGTH(stringCh); /* one matches */
519 : }
520 : }
521 : } else {
522 0 : for(matchItr = matchBMPLen; matchItr < matchLen;) {
523 0 : U16_NEXT(matchSet, matchItr, matchLen, matchCh);
524 0 : if(stringCh == matchCh) {
525 0 : goto endloop;
526 : }
527 : }
528 0 : return strItr - U16_LENGTH(stringCh); /* none matches */
529 : }
530 : }
531 : endloop:
532 : /* wish C had continue with labels like Java... */;
533 : }
534 :
535 : /* Didn't find it. */
536 0 : return -strItr-1;
537 : }
538 :
539 : /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
540 : U_CAPI UChar * U_EXPORT2
541 0 : u_strpbrk(const UChar *string, const UChar *matchSet)
542 : {
543 0 : int32_t idx = _matchFromSet(string, matchSet, TRUE);
544 0 : if(idx >= 0) {
545 0 : return (UChar *)string + idx;
546 : } else {
547 0 : return NULL;
548 : }
549 : }
550 :
551 : /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
552 : U_CAPI int32_t U_EXPORT2
553 0 : u_strcspn(const UChar *string, const UChar *matchSet)
554 : {
555 0 : int32_t idx = _matchFromSet(string, matchSet, TRUE);
556 0 : if(idx >= 0) {
557 0 : return idx;
558 : } else {
559 0 : return -idx - 1; /* == u_strlen(string) */
560 : }
561 : }
562 :
563 : /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
564 : U_CAPI int32_t U_EXPORT2
565 0 : u_strspn(const UChar *string, const UChar *matchSet)
566 : {
567 0 : int32_t idx = _matchFromSet(string, matchSet, FALSE);
568 0 : if(idx >= 0) {
569 0 : return idx;
570 : } else {
571 0 : return -idx - 1; /* == u_strlen(string) */
572 : }
573 : }
574 :
575 : /* ----- Text manipulation functions --- */
576 :
577 : U_CAPI UChar* U_EXPORT2
578 0 : u_strtok_r(UChar *src,
579 : const UChar *delim,
580 : UChar **saveState)
581 : {
582 : UChar *tokSource;
583 : UChar *nextToken;
584 : uint32_t nonDelimIdx;
585 :
586 : /* If saveState is NULL, the user messed up. */
587 0 : if (src != NULL) {
588 0 : tokSource = src;
589 0 : *saveState = src; /* Set to "src" in case there are no delimiters */
590 : }
591 0 : else if (*saveState) {
592 0 : tokSource = *saveState;
593 : }
594 : else {
595 : /* src == NULL && *saveState == NULL */
596 : /* This shouldn't happen. We already finished tokenizing. */
597 0 : return NULL;
598 : }
599 :
600 : /* Skip initial delimiters */
601 0 : nonDelimIdx = u_strspn(tokSource, delim);
602 0 : tokSource = &tokSource[nonDelimIdx];
603 :
604 0 : if (*tokSource) {
605 0 : nextToken = u_strpbrk(tokSource, delim);
606 0 : if (nextToken != NULL) {
607 : /* Create a token */
608 0 : *(nextToken++) = 0;
609 0 : *saveState = nextToken;
610 0 : return tokSource;
611 : }
612 0 : else if (*saveState) {
613 : /* Return the last token */
614 0 : *saveState = NULL;
615 0 : return tokSource;
616 : }
617 : }
618 : else {
619 : /* No tokens were found. Only delimiters were left. */
620 0 : *saveState = NULL;
621 : }
622 0 : return NULL;
623 : }
624 :
625 : /* Miscellaneous functions -------------------------------------------------- */
626 :
627 : U_CAPI UChar* U_EXPORT2
628 0 : u_strcat(UChar *dst,
629 : const UChar *src)
630 : {
631 0 : UChar *anchor = dst; /* save a pointer to start of dst */
632 :
633 0 : while(*dst != 0) { /* To end of first string */
634 0 : ++dst;
635 : }
636 0 : while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
637 : }
638 :
639 0 : return anchor;
640 : }
641 :
642 : U_CAPI UChar* U_EXPORT2
643 0 : u_strncat(UChar *dst,
644 : const UChar *src,
645 : int32_t n )
646 : {
647 0 : if(n > 0) {
648 0 : UChar *anchor = dst; /* save a pointer to start of dst */
649 :
650 0 : while(*dst != 0) { /* To end of first string */
651 0 : ++dst;
652 : }
653 0 : while((*dst = *src) != 0) { /* copy string 2 over */
654 0 : ++dst;
655 0 : if(--n == 0) {
656 0 : *dst = 0;
657 0 : break;
658 : }
659 0 : ++src;
660 : }
661 :
662 0 : return anchor;
663 : } else {
664 0 : return dst;
665 : }
666 : }
667 :
668 : /* ----- Text property functions --- */
669 :
670 : U_CAPI int32_t U_EXPORT2
671 0 : u_strcmp(const UChar *s1,
672 : const UChar *s2)
673 : {
674 : UChar c1, c2;
675 :
676 : for(;;) {
677 0 : c1=*s1++;
678 0 : c2=*s2++;
679 0 : if (c1 != c2 || c1 == 0) {
680 : break;
681 : }
682 : }
683 0 : return (int32_t)c1 - (int32_t)c2;
684 : }
685 :
686 : U_CFUNC int32_t U_EXPORT2
687 0 : uprv_strCompare(const UChar *s1, int32_t length1,
688 : const UChar *s2, int32_t length2,
689 : UBool strncmpStyle, UBool codePointOrder) {
690 : const UChar *start1, *start2, *limit1, *limit2;
691 : UChar c1, c2;
692 :
693 : /* setup for fix-up */
694 0 : start1=s1;
695 0 : start2=s2;
696 :
697 : /* compare identical prefixes - they do not need to be fixed up */
698 0 : if(length1<0 && length2<0) {
699 : /* strcmp style, both NUL-terminated */
700 0 : if(s1==s2) {
701 0 : return 0;
702 : }
703 :
704 : for(;;) {
705 0 : c1=*s1;
706 0 : c2=*s2;
707 0 : if(c1!=c2) {
708 0 : break;
709 : }
710 0 : if(c1==0) {
711 0 : return 0;
712 : }
713 0 : ++s1;
714 0 : ++s2;
715 : }
716 :
717 : /* setup for fix-up */
718 0 : limit1=limit2=NULL;
719 0 : } else if(strncmpStyle) {
720 : /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
721 0 : if(s1==s2) {
722 0 : return 0;
723 : }
724 :
725 0 : limit1=start1+length1;
726 :
727 : for(;;) {
728 : /* both lengths are same, check only one limit */
729 0 : if(s1==limit1) {
730 0 : return 0;
731 : }
732 :
733 0 : c1=*s1;
734 0 : c2=*s2;
735 0 : if(c1!=c2) {
736 0 : break;
737 : }
738 0 : if(c1==0) {
739 0 : return 0;
740 : }
741 0 : ++s1;
742 0 : ++s2;
743 : }
744 :
745 : /* setup for fix-up */
746 0 : limit2=start2+length1; /* use length1 here, too, to enforce assumption */
747 : } else {
748 : /* memcmp/UnicodeString style, both length-specified */
749 : int32_t lengthResult;
750 :
751 0 : if(length1<0) {
752 0 : length1=u_strlen(s1);
753 : }
754 0 : if(length2<0) {
755 0 : length2=u_strlen(s2);
756 : }
757 :
758 : /* limit1=start1+min(lenght1, length2) */
759 0 : if(length1<length2) {
760 0 : lengthResult=-1;
761 0 : limit1=start1+length1;
762 0 : } else if(length1==length2) {
763 0 : lengthResult=0;
764 0 : limit1=start1+length1;
765 : } else /* length1>length2 */ {
766 0 : lengthResult=1;
767 0 : limit1=start1+length2;
768 : }
769 :
770 0 : if(s1==s2) {
771 0 : return lengthResult;
772 : }
773 :
774 : for(;;) {
775 : /* check pseudo-limit */
776 0 : if(s1==limit1) {
777 0 : return lengthResult;
778 : }
779 :
780 0 : c1=*s1;
781 0 : c2=*s2;
782 0 : if(c1!=c2) {
783 0 : break;
784 : }
785 0 : ++s1;
786 0 : ++s2;
787 : }
788 :
789 : /* setup for fix-up */
790 0 : limit1=start1+length1;
791 0 : limit2=start2+length2;
792 : }
793 :
794 : /* if both values are in or above the surrogate range, fix them up */
795 0 : if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
796 : /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
797 0 : if(
798 0 : (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
799 0 : (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
800 : ) {
801 : /* part of a surrogate pair, leave >=d800 */
802 : } else {
803 : /* BMP code point - may be surrogate code point - make <d800 */
804 0 : c1-=0x2800;
805 : }
806 :
807 0 : if(
808 0 : (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
809 0 : (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
810 : ) {
811 : /* part of a surrogate pair, leave >=d800 */
812 : } else {
813 : /* BMP code point - may be surrogate code point - make <d800 */
814 0 : c2-=0x2800;
815 : }
816 : }
817 :
818 : /* now c1 and c2 are in the requested (code unit or code point) order */
819 0 : return (int32_t)c1-(int32_t)c2;
820 : }
821 :
822 : /*
823 : * Compare two strings as presented by UCharIterators.
824 : * Use code unit or code point order.
825 : * When the function returns, it is undefined where the iterators
826 : * have stopped.
827 : */
828 : U_CAPI int32_t U_EXPORT2
829 0 : u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
830 : UChar32 c1, c2;
831 :
832 : /* argument checking */
833 0 : if(iter1==NULL || iter2==NULL) {
834 0 : return 0; /* bad arguments */
835 : }
836 0 : if(iter1==iter2) {
837 0 : return 0; /* identical iterators */
838 : }
839 :
840 : /* reset iterators to start? */
841 0 : iter1->move(iter1, 0, UITER_START);
842 0 : iter2->move(iter2, 0, UITER_START);
843 :
844 : /* compare identical prefixes - they do not need to be fixed up */
845 : for(;;) {
846 0 : c1=iter1->next(iter1);
847 0 : c2=iter2->next(iter2);
848 0 : if(c1!=c2) {
849 0 : break;
850 : }
851 0 : if(c1==-1) {
852 0 : return 0;
853 : }
854 : }
855 :
856 : /* if both values are in or above the surrogate range, fix them up */
857 0 : if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
858 : /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
859 0 : if(
860 0 : (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
861 0 : (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
862 : ) {
863 : /* part of a surrogate pair, leave >=d800 */
864 : } else {
865 : /* BMP code point - may be surrogate code point - make <d800 */
866 0 : c1-=0x2800;
867 : }
868 :
869 0 : if(
870 0 : (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
871 0 : (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
872 : ) {
873 : /* part of a surrogate pair, leave >=d800 */
874 : } else {
875 : /* BMP code point - may be surrogate code point - make <d800 */
876 0 : c2-=0x2800;
877 : }
878 : }
879 :
880 : /* now c1 and c2 are in the requested (code unit or code point) order */
881 0 : return (int32_t)c1-(int32_t)c2;
882 : }
883 :
884 : #if 0
885 : /*
886 : * u_strCompareIter() does not leave the iterators _on_ the different units.
887 : * This is possible but would cost a few extra indirect function calls to back
888 : * up if the last unit (c1 or c2 respectively) was >=0.
889 : *
890 : * Consistently leaving them _behind_ the different units is not an option
891 : * because the current "unit" is the end of the string if that is reached,
892 : * and in such a case the iterator does not move.
893 : * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
894 : * of their strings. Calling previous() on each does not move them to where
895 : * the comparison fails.
896 : *
897 : * So the simplest semantics is to not define where the iterators end up.
898 : *
899 : * The following fragment is part of what would need to be done for backing up.
900 : */
901 : void fragment {
902 : /* iff a surrogate is part of a surrogate pair, leave >=d800 */
903 : if(c1<=0xdbff) {
904 : if(!U16_IS_TRAIL(iter1->current(iter1))) {
905 : /* lead surrogate code point - make <d800 */
906 : c1-=0x2800;
907 : }
908 : } else if(c1<=0xdfff) {
909 : int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
910 : iter1->previous(iter1); /* ==c1 */
911 : if(!U16_IS_LEAD(iter1->previous(iter1))) {
912 : /* trail surrogate code point - make <d800 */
913 : c1-=0x2800;
914 : }
915 : /* go back to behind where the difference is */
916 : iter1->move(iter1, idx, UITER_ZERO);
917 : } else /* 0xe000<=c1<=0xffff */ {
918 : /* BMP code point - make <d800 */
919 : c1-=0x2800;
920 : }
921 : }
922 : #endif
923 :
924 : U_CAPI int32_t U_EXPORT2
925 0 : u_strCompare(const UChar *s1, int32_t length1,
926 : const UChar *s2, int32_t length2,
927 : UBool codePointOrder) {
928 : /* argument checking */
929 0 : if(s1==NULL || length1<-1 || s2==NULL || length2<-1) {
930 0 : return 0;
931 : }
932 0 : return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder);
933 : }
934 :
935 : /* String compare in code point order - u_strcmp() compares in code unit order. */
936 : U_CAPI int32_t U_EXPORT2
937 0 : u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
938 0 : return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE);
939 : }
940 :
941 : U_CAPI int32_t U_EXPORT2
942 0 : u_strncmp(const UChar *s1,
943 : const UChar *s2,
944 : int32_t n)
945 : {
946 0 : if(n > 0) {
947 : int32_t rc;
948 : for(;;) {
949 0 : rc = (int32_t)*s1 - (int32_t)*s2;
950 0 : if(rc != 0 || *s1 == 0 || --n == 0) {
951 0 : return rc;
952 : }
953 0 : ++s1;
954 0 : ++s2;
955 : }
956 : } else {
957 0 : return 0;
958 : }
959 : }
960 :
961 : U_CAPI int32_t U_EXPORT2
962 0 : u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
963 0 : return uprv_strCompare(s1, n, s2, n, TRUE, TRUE);
964 : }
965 :
966 : U_CAPI UChar* U_EXPORT2
967 0 : u_strcpy(UChar *dst,
968 : const UChar *src)
969 : {
970 0 : UChar *anchor = dst; /* save a pointer to start of dst */
971 :
972 0 : while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */
973 : }
974 :
975 0 : return anchor;
976 : }
977 :
978 : U_CAPI UChar* U_EXPORT2
979 0 : u_strncpy(UChar *dst,
980 : const UChar *src,
981 : int32_t n)
982 : {
983 0 : UChar *anchor = dst; /* save a pointer to start of dst */
984 :
985 : /* copy string 2 over */
986 0 : while(n > 0 && (*(dst++) = *(src++)) != 0) {
987 0 : --n;
988 : }
989 :
990 0 : return anchor;
991 : }
992 :
993 : U_CAPI int32_t U_EXPORT2
994 2 : u_strlen(const UChar *s)
995 : {
996 : #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
997 : return (int32_t)uprv_wcslen((const wchar_t *)s);
998 : #else
999 2 : const UChar *t = s;
1000 42 : while(*t != 0) {
1001 20 : ++t;
1002 : }
1003 2 : return t - s;
1004 : #endif
1005 : }
1006 :
1007 : U_CAPI int32_t U_EXPORT2
1008 0 : u_countChar32(const UChar *s, int32_t length) {
1009 : int32_t count;
1010 :
1011 0 : if(s==NULL || length<-1) {
1012 0 : return 0;
1013 : }
1014 :
1015 0 : count=0;
1016 0 : if(length>=0) {
1017 0 : while(length>0) {
1018 0 : ++count;
1019 0 : if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
1020 0 : s+=2;
1021 0 : length-=2;
1022 : } else {
1023 0 : ++s;
1024 0 : --length;
1025 : }
1026 : }
1027 : } else /* length==-1 */ {
1028 : UChar c;
1029 :
1030 : for(;;) {
1031 0 : if((c=*s++)==0) {
1032 0 : break;
1033 : }
1034 0 : ++count;
1035 :
1036 : /*
1037 : * sufficient to look ahead one because of UTF-16;
1038 : * safe to look ahead one because at worst that would be the terminating NUL
1039 : */
1040 0 : if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1041 0 : ++s;
1042 : }
1043 : }
1044 : }
1045 0 : return count;
1046 : }
1047 :
1048 : U_CAPI UBool U_EXPORT2
1049 0 : u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) {
1050 :
1051 0 : if(number<0) {
1052 0 : return TRUE;
1053 : }
1054 0 : if(s==NULL || length<-1) {
1055 0 : return FALSE;
1056 : }
1057 :
1058 0 : if(length==-1) {
1059 : /* s is NUL-terminated */
1060 : UChar c;
1061 :
1062 : /* count code points until they exceed */
1063 : for(;;) {
1064 0 : if((c=*s++)==0) {
1065 0 : return FALSE;
1066 : }
1067 0 : if(number==0) {
1068 0 : return TRUE;
1069 : }
1070 0 : if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
1071 0 : ++s;
1072 : }
1073 0 : --number;
1074 : }
1075 : } else {
1076 : /* length>=0 known */
1077 : const UChar *limit;
1078 : int32_t maxSupplementary;
1079 :
1080 : /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
1081 0 : if(((length+1)/2)>number) {
1082 0 : return TRUE;
1083 : }
1084 :
1085 : /* check if s does not even contain enough UChars */
1086 0 : maxSupplementary=length-number;
1087 0 : if(maxSupplementary<=0) {
1088 0 : return FALSE;
1089 : }
1090 : /* there are maxSupplementary=length-number more UChars than asked-for code points */
1091 :
1092 : /*
1093 : * count code points until they exceed and also check that there are
1094 : * no more than maxSupplementary supplementary code points (UChar pairs)
1095 : */
1096 0 : limit=s+length;
1097 : for(;;) {
1098 0 : if(s==limit) {
1099 0 : return FALSE;
1100 : }
1101 0 : if(number==0) {
1102 0 : return TRUE;
1103 : }
1104 0 : if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
1105 0 : ++s;
1106 0 : if(--maxSupplementary<=0) {
1107 : /* too many pairs - too few code points */
1108 0 : return FALSE;
1109 : }
1110 : }
1111 0 : --number;
1112 : }
1113 : }
1114 : }
1115 :
1116 : U_CAPI UChar * U_EXPORT2
1117 0 : u_memcpy(UChar *dest, const UChar *src, int32_t count) {
1118 0 : if(count > 0) {
1119 0 : uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1120 : }
1121 0 : return dest;
1122 : }
1123 :
1124 : U_CAPI UChar * U_EXPORT2
1125 0 : u_memmove(UChar *dest, const UChar *src, int32_t count) {
1126 0 : if(count > 0) {
1127 0 : uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
1128 : }
1129 0 : return dest;
1130 : }
1131 :
1132 : U_CAPI UChar * U_EXPORT2
1133 0 : u_memset(UChar *dest, UChar c, int32_t count) {
1134 0 : if(count > 0) {
1135 0 : UChar *ptr = dest;
1136 0 : UChar *limit = dest + count;
1137 :
1138 0 : while (ptr < limit) {
1139 0 : *(ptr++) = c;
1140 : }
1141 : }
1142 0 : return dest;
1143 : }
1144 :
1145 : U_CAPI int32_t U_EXPORT2
1146 0 : u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
1147 0 : if(count > 0) {
1148 0 : const UChar *limit = buf1 + count;
1149 : int32_t result;
1150 :
1151 0 : while (buf1 < limit) {
1152 0 : result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
1153 0 : if (result != 0) {
1154 0 : return result;
1155 : }
1156 0 : buf1++;
1157 0 : buf2++;
1158 : }
1159 : }
1160 0 : return 0;
1161 : }
1162 :
1163 : U_CAPI int32_t U_EXPORT2
1164 0 : u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
1165 0 : return uprv_strCompare(s1, count, s2, count, FALSE, TRUE);
1166 : }
1167 :
1168 : /* u_unescape & support fns ------------------------------------------------- */
1169 :
1170 : /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
1171 : static const UChar UNESCAPE_MAP[] = {
1172 : /*" 0x22, 0x22 */
1173 : /*' 0x27, 0x27 */
1174 : /*? 0x3F, 0x3F */
1175 : /*\ 0x5C, 0x5C */
1176 : /*a*/ 0x61, 0x07,
1177 : /*b*/ 0x62, 0x08,
1178 : /*e*/ 0x65, 0x1b,
1179 : /*f*/ 0x66, 0x0c,
1180 : /*n*/ 0x6E, 0x0a,
1181 : /*r*/ 0x72, 0x0d,
1182 : /*t*/ 0x74, 0x09,
1183 : /*v*/ 0x76, 0x0b
1184 : };
1185 : enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
1186 :
1187 : /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
1188 0 : static int8_t _digit8(UChar c) {
1189 0 : if (c >= 0x0030 && c <= 0x0037) {
1190 0 : return (int8_t)(c - 0x0030);
1191 : }
1192 0 : return -1;
1193 : }
1194 :
1195 : /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
1196 0 : static int8_t _digit16(UChar c) {
1197 0 : if (c >= 0x0030 && c <= 0x0039) {
1198 0 : return (int8_t)(c - 0x0030);
1199 : }
1200 0 : if (c >= 0x0041 && c <= 0x0046) {
1201 0 : return (int8_t)(c - (0x0041 - 10));
1202 : }
1203 0 : if (c >= 0x0061 && c <= 0x0066) {
1204 0 : return (int8_t)(c - (0x0061 - 10));
1205 : }
1206 0 : return -1;
1207 : }
1208 :
1209 : /* Parse a single escape sequence. Although this method deals in
1210 : * UChars, it does not use C++ or UnicodeString. This allows it to
1211 : * be used from C contexts. */
1212 : U_CAPI UChar32 U_EXPORT2
1213 0 : u_unescapeAt(UNESCAPE_CHAR_AT charAt,
1214 : int32_t *offset,
1215 : int32_t length,
1216 : void *context) {
1217 :
1218 0 : int32_t start = *offset;
1219 : UChar c;
1220 0 : UChar32 result = 0;
1221 0 : int8_t n = 0;
1222 0 : int8_t minDig = 0;
1223 0 : int8_t maxDig = 0;
1224 0 : int8_t bitsPerDigit = 4;
1225 : int8_t dig;
1226 : int32_t i;
1227 0 : UBool braces = FALSE;
1228 :
1229 : /* Check that offset is in range */
1230 0 : if (*offset < 0 || *offset >= length) {
1231 : goto err;
1232 : }
1233 :
1234 : /* Fetch first UChar after '\\' */
1235 0 : c = charAt((*offset)++, context);
1236 :
1237 : /* Convert hexadecimal and octal escapes */
1238 0 : switch (c) {
1239 : case 0x0075 /*'u'*/:
1240 0 : minDig = maxDig = 4;
1241 0 : break;
1242 : case 0x0055 /*'U'*/:
1243 0 : minDig = maxDig = 8;
1244 0 : break;
1245 : case 0x0078 /*'x'*/:
1246 0 : minDig = 1;
1247 0 : if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
1248 0 : ++(*offset);
1249 0 : braces = TRUE;
1250 0 : maxDig = 8;
1251 : } else {
1252 0 : maxDig = 2;
1253 : }
1254 0 : break;
1255 : default:
1256 0 : dig = _digit8(c);
1257 0 : if (dig >= 0) {
1258 0 : minDig = 1;
1259 0 : maxDig = 3;
1260 0 : n = 1; /* Already have first octal digit */
1261 0 : bitsPerDigit = 3;
1262 0 : result = dig;
1263 : }
1264 0 : break;
1265 : }
1266 0 : if (minDig != 0) {
1267 0 : while (*offset < length && n < maxDig) {
1268 0 : c = charAt(*offset, context);
1269 0 : dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
1270 0 : if (dig < 0) {
1271 0 : break;
1272 : }
1273 0 : result = (result << bitsPerDigit) | dig;
1274 0 : ++(*offset);
1275 0 : ++n;
1276 : }
1277 0 : if (n < minDig) {
1278 0 : goto err;
1279 : }
1280 0 : if (braces) {
1281 0 : if (c != 0x7D /*}*/) {
1282 0 : goto err;
1283 : }
1284 0 : ++(*offset);
1285 : }
1286 0 : if (result < 0 || result >= 0x110000) {
1287 : goto err;
1288 : }
1289 : /* If an escape sequence specifies a lead surrogate, see if
1290 : * there is a trail surrogate after it, either as an escape or
1291 : * as a literal. If so, join them up into a supplementary.
1292 : */
1293 0 : if (*offset < length && U16_IS_LEAD(result)) {
1294 0 : int32_t ahead = *offset + 1;
1295 0 : c = charAt(*offset, context);
1296 0 : if (c == 0x5C /*'\\'*/ && ahead < length) {
1297 0 : c = (UChar) u_unescapeAt(charAt, &ahead, length, context);
1298 : }
1299 0 : if (U16_IS_TRAIL(c)) {
1300 0 : *offset = ahead;
1301 0 : result = U16_GET_SUPPLEMENTARY(result, c);
1302 : }
1303 : }
1304 0 : return result;
1305 : }
1306 :
1307 : /* Convert C-style escapes in table */
1308 0 : for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
1309 0 : if (c == UNESCAPE_MAP[i]) {
1310 0 : return UNESCAPE_MAP[i+1];
1311 0 : } else if (c < UNESCAPE_MAP[i]) {
1312 0 : break;
1313 : }
1314 : }
1315 :
1316 : /* Map \cX to control-X: X & 0x1F */
1317 0 : if (c == 0x0063 /*'c'*/ && *offset < length) {
1318 0 : c = charAt((*offset)++, context);
1319 0 : if (U16_IS_LEAD(c) && *offset < length) {
1320 0 : UChar c2 = charAt(*offset, context);
1321 0 : if (U16_IS_TRAIL(c2)) {
1322 0 : ++(*offset);
1323 0 : c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */
1324 : }
1325 : }
1326 0 : return 0x1F & c;
1327 : }
1328 :
1329 : /* If no special forms are recognized, then consider
1330 : * the backslash to generically escape the next character.
1331 : * Deal with surrogate pairs. */
1332 0 : if (U16_IS_LEAD(c) && *offset < length) {
1333 0 : UChar c2 = charAt(*offset, context);
1334 0 : if (U16_IS_TRAIL(c2)) {
1335 0 : ++(*offset);
1336 0 : return U16_GET_SUPPLEMENTARY(c, c2);
1337 : }
1338 : }
1339 0 : return c;
1340 :
1341 : err:
1342 : /* Invalid escape sequence */
1343 0 : *offset = start; /* Reset to initial value */
1344 0 : return (UChar32)0xFFFFFFFF;
1345 : }
1346 :
1347 : /* u_unescapeAt() callback to return a UChar from a char* */
1348 : static UChar U_CALLCONV
1349 0 : _charPtr_charAt(int32_t offset, void *context) {
1350 : UChar c16;
1351 : /* It would be more efficient to access the invariant tables
1352 : * directly but there is no API for that. */
1353 0 : u_charsToUChars(((char*) context) + offset, &c16, 1);
1354 0 : return c16;
1355 : }
1356 :
1357 : /* Append an escape-free segment of the text; used by u_unescape() */
1358 0 : static void _appendUChars(UChar *dest, int32_t destCapacity,
1359 : const char *src, int32_t srcLen) {
1360 0 : if (destCapacity < 0) {
1361 0 : destCapacity = 0;
1362 : }
1363 0 : if (srcLen > destCapacity) {
1364 0 : srcLen = destCapacity;
1365 : }
1366 0 : u_charsToUChars(src, dest, srcLen);
1367 0 : }
1368 :
1369 : /* Do an invariant conversion of char* -> UChar*, with escape parsing */
1370 : U_CAPI int32_t U_EXPORT2
1371 0 : u_unescape(const char *src, UChar *dest, int32_t destCapacity) {
1372 0 : const char *segment = src;
1373 0 : int32_t i = 0;
1374 : char c;
1375 :
1376 0 : while ((c=*src) != 0) {
1377 : /* '\\' intentionally written as compiler-specific
1378 : * character constant to correspond to compiler-specific
1379 : * char* constants. */
1380 0 : if (c == '\\') {
1381 0 : int32_t lenParsed = 0;
1382 : UChar32 c32;
1383 0 : if (src != segment) {
1384 0 : if (dest != NULL) {
1385 0 : _appendUChars(dest + i, destCapacity - i,
1386 0 : segment, (int32_t)(src - segment));
1387 : }
1388 0 : i += (int32_t)(src - segment);
1389 : }
1390 0 : ++src; /* advance past '\\' */
1391 0 : c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src);
1392 0 : if (lenParsed == 0) {
1393 0 : goto err;
1394 : }
1395 0 : src += lenParsed; /* advance past escape seq. */
1396 0 : if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) {
1397 0 : U16_APPEND_UNSAFE(dest, i, c32);
1398 : } else {
1399 0 : i += U16_LENGTH(c32);
1400 : }
1401 0 : segment = src;
1402 : } else {
1403 0 : ++src;
1404 : }
1405 : }
1406 0 : if (src != segment) {
1407 0 : if (dest != NULL) {
1408 0 : _appendUChars(dest + i, destCapacity - i,
1409 0 : segment, (int32_t)(src - segment));
1410 : }
1411 0 : i += (int32_t)(src - segment);
1412 : }
1413 0 : if (dest != NULL && i < destCapacity) {
1414 0 : dest[i] = 0;
1415 : }
1416 0 : return i;
1417 :
1418 : err:
1419 0 : if (dest != NULL && destCapacity > 0) {
1420 0 : *dest = 0;
1421 : }
1422 0 : return 0;
1423 : }
1424 :
1425 : /* NUL-termination of strings ----------------------------------------------- */
1426 :
/**
 * NUL-terminate a string no matter what its type.
 * Set warning and error codes accordingly.
 *
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Otherwise:
 *   length<0               nothing is done (left to the caller)
 *   length<destCapacity    dest[length]=0; a pending
 *                          U_STRING_NOT_TERMINATED_WARNING is cleared
 *   length==destCapacity   *pErrorCode=U_STRING_NOT_TERMINATED_WARNING
 *   length>destCapacity    *pErrorCode=U_BUFFER_OVERFLOW_ERROR
 *
 * Wrapped in do { } while(0) so that an invocation followed by ';' is a
 * single statement (safe in an unbraced if/else); every macro argument is
 * parenthesized so that expression arguments expand correctly.
 */
#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \
    do { \
        if((pErrorCode)!=NULL && U_SUCCESS(*(pErrorCode))) { \
            /* not a public function, so no complete argument checking */ \
            \
            if((length)<0) { \
                /* assume that the caller handles this */ \
            } else if((length)<(destCapacity)) { \
                /* NUL-terminate the string, the NUL fits */ \
                (dest)[(length)]=0; \
                /* unset the not-terminated warning but leave all others */ \
                if(*(pErrorCode)==U_STRING_NOT_TERMINATED_WARNING) { \
                    *(pErrorCode)=U_ZERO_ERROR; \
                } \
            } else if((length)==(destCapacity)) { \
                /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
                *(pErrorCode)=U_STRING_NOT_TERMINATED_WARNING; \
            } else /* length>destCapacity */ { \
                /* even the string itself did not fit - set an error code */ \
                *(pErrorCode)=U_BUFFER_OVERFLOW_ERROR; \
            } \
        } \
    } while(0)
1452 :
1453 : U_CAPI int32_t U_EXPORT2
1454 0 : u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1455 0 : __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1456 0 : return length;
1457 : }
1458 :
1459 : U_CAPI int32_t U_EXPORT2
1460 164 : u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1461 164 : __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1462 164 : return length;
1463 : }
1464 :
1465 : U_CAPI int32_t U_EXPORT2
1466 0 : u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1467 0 : __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1468 0 : return length;
1469 : }
1470 :
1471 : U_CAPI int32_t U_EXPORT2
1472 0 : u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
1473 0 : __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
1474 0 : return length;
1475 : }
1476 :
1477 : // Compute the hash code for a string -------------------------------------- ***
1478 :
1479 : // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
1480 : // on UHashtable code.
1481 :
/*
  Compute the hash by iterating sparsely over about 32 (up to 63)
  characters spaced evenly through the string.  For each character,
  multiply the previous hash value by a prime number and add the new
  character in, like a linear congruential random number generator,
  producing a pseudorandom deterministic value well distributed over
  the output range. [LIU]

  STRING_HASH expands to the complete body of a hash function, including
  the final return statement:
    TYPE    element type the string is read as
    STR     pointer to the string; a NULL pointer hashes to 0
    STRLEN  number of elements
    DEREF   expression yielding the value of the current element; it may
            refer to the cursor variable p
  The step between sampled elements is ((STRLEN - 32) / 32) + 1, so strings
  shorter than 64 elements are hashed in full.
*/

#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \
    uint32_t hash = 0;                        \
    const TYPE *p = (const TYPE*) STR;        \
    if (p != NULL) {                          \
        int32_t count = (int32_t)(STRLEN);    \
        int32_t stride = ((count - 32) / 32) + 1; \
        for (const TYPE *end = p + count; p < end; p += stride) { \
            hash = (hash * 37) + DEREF;       \
        }                                     \
    }                                         \
    return static_cast<int32_t>(hash)
1504 :
1505 : /* Used by UnicodeString to compute its hashcode - Not public API. */
1506 : U_CAPI int32_t U_EXPORT2
1507 0 : ustr_hashUCharsN(const UChar *str, int32_t length) {
1508 0 : STRING_HASH(UChar, str, length, *p);
1509 : }
1510 :
1511 : U_CAPI int32_t U_EXPORT2
1512 28 : ustr_hashCharsN(const char *str, int32_t length) {
1513 28 : STRING_HASH(uint8_t, str, length, *p);
1514 : }
1515 :
1516 : U_CAPI int32_t U_EXPORT2
1517 0 : ustr_hashICharsN(const char *str, int32_t length) {
1518 0 : STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
1519 : }
|