Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 2001-2016, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : *
11 : * File ustrtrns.cpp
12 : *
13 : * Modification History:
14 : *
15 : * Date Name Description
16 : * 9/10/2001 Ram Creation.
17 : ******************************************************************************
18 : */
19 :
20 : /*******************************************************************************
21 : *
22 : * u_strTo* and u_strFrom* APIs
23 : * WCS functions moved to ustr_wcs.c for better modularization
24 : *
25 : *******************************************************************************
26 : */
27 :
28 :
29 : #include "unicode/putil.h"
30 : #include "unicode/ustring.h"
31 : #include "unicode/utf.h"
32 : #include "unicode/utf8.h"
33 : #include "unicode/utf16.h"
34 : #include "cstring.h"
35 : #include "cmemory.h"
36 : #include "ustr_imp.h"
37 : #include "uassert.h"
38 :
39 : U_CAPI UChar* U_EXPORT2
40 0 : u_strFromUTF32WithSub(UChar *dest,
41 : int32_t destCapacity,
42 : int32_t *pDestLength,
43 : const UChar32 *src,
44 : int32_t srcLength,
45 : UChar32 subchar, int32_t *pNumSubstitutions,
46 : UErrorCode *pErrorCode) {
47 : const UChar32 *srcLimit;
48 : UChar32 ch;
49 : UChar *destLimit;
50 : UChar *pDest;
51 : int32_t reqLength;
52 : int32_t numSubstitutions;
53 :
54 : /* args check */
55 0 : if(U_FAILURE(*pErrorCode)){
56 0 : return NULL;
57 : }
58 0 : if( (src==NULL && srcLength!=0) || srcLength < -1 ||
59 0 : (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
60 0 : subchar > 0x10ffff || U_IS_SURROGATE(subchar)
61 : ) {
62 0 : *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
63 0 : return NULL;
64 : }
65 :
66 0 : if(pNumSubstitutions != NULL) {
67 0 : *pNumSubstitutions = 0;
68 : }
69 :
70 0 : pDest = dest;
71 0 : destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
72 0 : reqLength = 0;
73 0 : numSubstitutions = 0;
74 :
75 0 : if(srcLength < 0) {
76 : /* simple loop for conversion of a NUL-terminated BMP string */
77 0 : while((ch=*src) != 0 &&
78 0 : ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
79 0 : ++src;
80 0 : if(pDest < destLimit) {
81 0 : *pDest++ = (UChar)ch;
82 : } else {
83 0 : ++reqLength;
84 : }
85 : }
86 0 : srcLimit = src;
87 0 : if(ch != 0) {
88 : /* "complicated" case, find the end of the remaining string */
89 0 : while(*++srcLimit != 0) {}
90 : }
91 : } else {
92 0 : srcLimit = (src!=NULL)?(src + srcLength):NULL;
93 : }
94 :
95 : /* convert with length */
96 0 : while(src < srcLimit) {
97 0 : ch = *src++;
98 : do {
99 : /* usually "loops" once; twice only for writing subchar */
100 0 : if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
101 0 : if(pDest < destLimit) {
102 0 : *pDest++ = (UChar)ch;
103 : } else {
104 0 : ++reqLength;
105 : }
106 0 : break;
107 0 : } else if(0x10000 <= ch && ch <= 0x10ffff) {
108 0 : if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
109 0 : *pDest++ = U16_LEAD(ch);
110 0 : *pDest++ = U16_TRAIL(ch);
111 : } else {
112 0 : reqLength += 2;
113 : }
114 0 : break;
115 0 : } else if((ch = subchar) < 0) {
116 : /* surrogate code point, or not a Unicode code point at all */
117 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
118 0 : return NULL;
119 : } else {
120 0 : ++numSubstitutions;
121 : }
122 : } while(TRUE);
123 : }
124 :
125 0 : reqLength += (int32_t)(pDest - dest);
126 0 : if(pDestLength) {
127 0 : *pDestLength = reqLength;
128 : }
129 0 : if(pNumSubstitutions != NULL) {
130 0 : *pNumSubstitutions = numSubstitutions;
131 : }
132 :
133 : /* Terminate the buffer */
134 0 : u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
135 :
136 0 : return dest;
137 : }
138 :
139 : U_CAPI UChar* U_EXPORT2
140 0 : u_strFromUTF32(UChar *dest,
141 : int32_t destCapacity,
142 : int32_t *pDestLength,
143 : const UChar32 *src,
144 : int32_t srcLength,
145 : UErrorCode *pErrorCode) {
146 : return u_strFromUTF32WithSub(
147 : dest, destCapacity, pDestLength,
148 : src, srcLength,
149 : U_SENTINEL, NULL,
150 0 : pErrorCode);
151 : }
152 :
153 : U_CAPI UChar32* U_EXPORT2
154 0 : u_strToUTF32WithSub(UChar32 *dest,
155 : int32_t destCapacity,
156 : int32_t *pDestLength,
157 : const UChar *src,
158 : int32_t srcLength,
159 : UChar32 subchar, int32_t *pNumSubstitutions,
160 : UErrorCode *pErrorCode) {
161 : const UChar *srcLimit;
162 : UChar32 ch;
163 : UChar ch2;
164 : UChar32 *destLimit;
165 : UChar32 *pDest;
166 : int32_t reqLength;
167 : int32_t numSubstitutions;
168 :
169 : /* args check */
170 0 : if(U_FAILURE(*pErrorCode)){
171 0 : return NULL;
172 : }
173 0 : if( (src==NULL && srcLength!=0) || srcLength < -1 ||
174 0 : (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
175 0 : subchar > 0x10ffff || U_IS_SURROGATE(subchar)
176 : ) {
177 0 : *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
178 0 : return NULL;
179 : }
180 :
181 0 : if(pNumSubstitutions != NULL) {
182 0 : *pNumSubstitutions = 0;
183 : }
184 :
185 0 : pDest = dest;
186 0 : destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
187 0 : reqLength = 0;
188 0 : numSubstitutions = 0;
189 :
190 0 : if(srcLength < 0) {
191 : /* simple loop for conversion of a NUL-terminated BMP string */
192 0 : while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
193 0 : ++src;
194 0 : if(pDest < destLimit) {
195 0 : *pDest++ = ch;
196 : } else {
197 0 : ++reqLength;
198 : }
199 : }
200 0 : srcLimit = src;
201 0 : if(ch != 0) {
202 : /* "complicated" case, find the end of the remaining string */
203 0 : while(*++srcLimit != 0) {}
204 : }
205 : } else {
206 0 : srcLimit = (src!=NULL)?(src + srcLength):NULL;
207 : }
208 :
209 : /* convert with length */
210 0 : while(src < srcLimit) {
211 0 : ch = *src++;
212 0 : if(!U16_IS_SURROGATE(ch)) {
213 : /* write or count ch below */
214 0 : } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
215 0 : ++src;
216 0 : ch = U16_GET_SUPPLEMENTARY(ch, ch2);
217 0 : } else if((ch = subchar) < 0) {
218 : /* unpaired surrogate */
219 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
220 0 : return NULL;
221 : } else {
222 0 : ++numSubstitutions;
223 : }
224 0 : if(pDest < destLimit) {
225 0 : *pDest++ = ch;
226 : } else {
227 0 : ++reqLength;
228 : }
229 : }
230 :
231 0 : reqLength += (int32_t)(pDest - dest);
232 0 : if(pDestLength) {
233 0 : *pDestLength = reqLength;
234 : }
235 0 : if(pNumSubstitutions != NULL) {
236 0 : *pNumSubstitutions = numSubstitutions;
237 : }
238 :
239 : /* Terminate the buffer */
240 0 : u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
241 :
242 0 : return dest;
243 : }
244 :
245 : U_CAPI UChar32* U_EXPORT2
246 0 : u_strToUTF32(UChar32 *dest,
247 : int32_t destCapacity,
248 : int32_t *pDestLength,
249 : const UChar *src,
250 : int32_t srcLength,
251 : UErrorCode *pErrorCode) {
252 : return u_strToUTF32WithSub(
253 : dest, destCapacity, pDestLength,
254 : src, srcLength,
255 : U_SENTINEL, NULL,
256 0 : pErrorCode);
257 : }
258 :
259 : /* for utf8_nextCharSafeBodyTerminated() */
260 : static const UChar32
261 : utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
262 :
263 : /*
264 : * Version of utf8_nextCharSafeBody() with the following differences:
265 : * - checks for NUL termination instead of length
266 : * - works with pointers instead of indexes
267 : * - always strict (strict==-1)
268 : *
269 : * *ps points to after the lead byte and will be moved to after the last trail byte.
270 : * c is the lead byte.
271 : * @return the code point, or U_SENTINEL
272 : */
273 : static UChar32
274 0 : utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
275 0 : const uint8_t *s=*ps;
276 0 : uint8_t trail, illegal=0;
277 0 : uint8_t count=U8_COUNT_TRAIL_BYTES(c);
278 0 : U_ASSERT(count<6);
279 0 : U8_MASK_LEAD_BYTE((c), count);
280 : /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
281 0 : switch(count) {
282 : /* each branch falls through to the next one */
283 : case 5:
284 : case 4:
285 : /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
286 0 : illegal=1;
287 0 : break;
288 : case 3:
289 0 : trail=(uint8_t)(*s++ - 0x80);
290 0 : c=(c<<6)|trail;
291 0 : if(trail>0x3f || c>=0x110) {
292 : /* not a trail byte, or code point>0x10ffff (outside Unicode) */
293 0 : illegal=1;
294 0 : break;
295 : }
296 : U_FALLTHROUGH;
297 : case 2:
298 0 : trail=(uint8_t)(*s++ - 0x80);
299 0 : if(trail>0x3f) {
300 : /* not a trail byte */
301 0 : illegal=1;
302 0 : break;
303 : }
304 0 : c=(c<<6)|trail;
305 : U_FALLTHROUGH;
306 : case 1:
307 0 : trail=(uint8_t)(*s++ - 0x80);
308 0 : if(trail>0x3f) {
309 : /* not a trail byte */
310 0 : illegal=1;
311 : }
312 0 : c=(c<<6)|trail;
313 0 : break;
314 : case 0:
315 0 : return U_SENTINEL;
316 : /* no default branch to optimize switch() - all values are covered */
317 : }
318 :
319 : /* correct sequence - all trail bytes have (b7..b6)==(10)? */
320 : /* illegal is also set if count>=4 */
321 0 : if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
322 : /* error handling */
323 : /* don't go beyond this sequence */
324 0 : s=*ps;
325 0 : while(count>0 && U8_IS_TRAIL(*s)) {
326 0 : ++s;
327 0 : --count;
328 : }
329 0 : c=U_SENTINEL;
330 : }
331 0 : *ps=s;
332 0 : return c;
333 : }
334 :
335 : /*
336 : * Version of utf8_nextCharSafeBody() with the following differences:
337 : * - works with pointers instead of indexes
338 : * - always strict (strict==-1)
339 : *
340 : * *ps points to after the lead byte and will be moved to after the last trail byte.
341 : * c is the lead byte.
342 : * @return the code point, or U_SENTINEL
343 : */
344 : static UChar32
345 0 : utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
346 0 : const uint8_t *s=*ps;
347 0 : uint8_t trail, illegal=0;
348 0 : uint8_t count=U8_COUNT_TRAIL_BYTES(c);
349 0 : if((limit-s)>=count) {
350 0 : U8_MASK_LEAD_BYTE((c), count);
351 : /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
352 0 : switch(count) {
353 : /* each branch falls through to the next one */
354 : case 5:
355 : case 4:
356 : /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
357 0 : illegal=1;
358 0 : break;
359 : case 3:
360 0 : trail=*s++;
361 0 : c=(c<<6)|(trail&0x3f);
362 0 : if(c<0x110) {
363 0 : illegal|=(trail&0xc0)^0x80;
364 : } else {
365 : /* code point>0x10ffff, outside Unicode */
366 0 : illegal=1;
367 0 : break;
368 : }
369 : U_FALLTHROUGH;
370 : case 2:
371 0 : trail=*s++;
372 0 : c=(c<<6)|(trail&0x3f);
373 0 : illegal|=(trail&0xc0)^0x80;
374 : U_FALLTHROUGH;
375 : case 1:
376 0 : trail=*s++;
377 0 : c=(c<<6)|(trail&0x3f);
378 0 : illegal|=(trail&0xc0)^0x80;
379 0 : break;
380 : case 0:
381 0 : return U_SENTINEL;
382 : /* no default branch to optimize switch() - all values are covered */
383 : }
384 : } else {
385 0 : illegal=1; /* too few bytes left */
386 : }
387 :
388 : /* correct sequence - all trail bytes have (b7..b6)==(10)? */
389 : /* illegal is also set if count>=4 */
390 0 : U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
391 0 : if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
392 : /* error handling */
393 : /* don't go beyond this sequence */
394 0 : s=*ps;
395 0 : while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
396 0 : ++s;
397 0 : --count;
398 : }
399 0 : c=U_SENTINEL;
400 : }
401 0 : *ps=s;
402 0 : return c;
403 : }
404 :
405 : U_CAPI UChar* U_EXPORT2
406 0 : u_strFromUTF8WithSub(UChar *dest,
407 : int32_t destCapacity,
408 : int32_t *pDestLength,
409 : const char* src,
410 : int32_t srcLength,
411 : UChar32 subchar, int32_t *pNumSubstitutions,
412 : UErrorCode *pErrorCode){
413 0 : UChar *pDest = dest;
414 0 : UChar *pDestLimit = dest+destCapacity;
415 : UChar32 ch;
416 0 : int32_t reqLength = 0;
417 0 : const uint8_t* pSrc = (const uint8_t*) src;
418 : uint8_t t1, t2; /* trail bytes */
419 : int32_t numSubstitutions;
420 :
421 : /* args check */
422 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
423 0 : return NULL;
424 : }
425 :
426 0 : if( (src==NULL && srcLength!=0) || srcLength < -1 ||
427 0 : (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
428 0 : subchar > 0x10ffff || U_IS_SURROGATE(subchar)
429 : ) {
430 0 : *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
431 0 : return NULL;
432 : }
433 :
434 0 : if(pNumSubstitutions!=NULL) {
435 0 : *pNumSubstitutions=0;
436 : }
437 0 : numSubstitutions=0;
438 :
439 : /*
440 : * Inline processing of UTF-8 byte sequences:
441 : *
442 : * Byte sequences for the most common characters are handled inline in
443 : * the conversion loops. In order to reduce the path lengths for those
444 : * characters, the tests are arranged in a kind of binary search.
445 : * ASCII (<=0x7f) is checked first, followed by the dividing point
446 : * between 2- and 3-byte sequences (0xe0).
447 : * The 3-byte branch is tested first to speed up CJK text.
448 : * The compiler should combine the subtractions for the two tests for 0xe0.
449 : * Each branch then tests for the other end of its range.
450 : */
451 :
452 0 : if(srcLength < 0){
453 : /*
454 : * Transform a NUL-terminated string.
455 : * The code explicitly checks for NULs only in the lead byte position.
456 : * A NUL byte in the trail byte position fails the trail byte range check anyway.
457 : */
458 0 : while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
459 0 : if(ch <= 0x7f){
460 0 : *pDest++=(UChar)ch;
461 0 : ++pSrc;
462 : } else {
463 0 : if(ch > 0xe0) {
464 0 : if( /* handle U+1000..U+CFFF inline */
465 0 : ch <= 0xec &&
466 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
467 0 : (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
468 : ) {
469 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
470 0 : *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
471 0 : pSrc += 3;
472 0 : continue;
473 : }
474 0 : } else if(ch < 0xe0) {
475 0 : if( /* handle U+0080..U+07FF inline */
476 0 : ch >= 0xc2 &&
477 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
478 : ) {
479 0 : *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
480 0 : pSrc += 2;
481 0 : continue;
482 : }
483 : }
484 :
485 : /* function call for "complicated" and error cases */
486 0 : ++pSrc; /* continue after the lead byte */
487 0 : ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
488 0 : if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
489 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
490 0 : return NULL;
491 0 : } else if(ch<=0xFFFF) {
492 0 : *(pDest++)=(UChar)ch;
493 : } else {
494 0 : *(pDest++)=U16_LEAD(ch);
495 0 : if(pDest<pDestLimit) {
496 0 : *(pDest++)=U16_TRAIL(ch);
497 : } else {
498 0 : reqLength++;
499 0 : break;
500 : }
501 : }
502 : }
503 : }
504 :
505 : /* Pre-flight the rest of the string. */
506 0 : while((ch = *pSrc) != 0) {
507 0 : if(ch <= 0x7f){
508 0 : ++reqLength;
509 0 : ++pSrc;
510 : } else {
511 0 : if(ch > 0xe0) {
512 0 : if( /* handle U+1000..U+CFFF inline */
513 0 : ch <= 0xec &&
514 0 : (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
515 0 : (uint8_t)(pSrc[2] - 0x80) <= 0x3f
516 : ) {
517 0 : ++reqLength;
518 0 : pSrc += 3;
519 0 : continue;
520 : }
521 0 : } else if(ch < 0xe0) {
522 0 : if( /* handle U+0080..U+07FF inline */
523 0 : ch >= 0xc2 &&
524 0 : (uint8_t)(pSrc[1] - 0x80) <= 0x3f
525 : ) {
526 0 : ++reqLength;
527 0 : pSrc += 2;
528 0 : continue;
529 : }
530 : }
531 :
532 : /* function call for "complicated" and error cases */
533 0 : ++pSrc; /* continue after the lead byte */
534 0 : ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
535 0 : if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
536 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
537 0 : return NULL;
538 : }
539 0 : reqLength += U16_LENGTH(ch);
540 : }
541 : }
542 : } else /* srcLength >= 0 */ {
543 0 : const uint8_t *pSrcLimit = pSrc + srcLength;
544 : int32_t count;
545 :
546 : /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
547 : for(;;) {
548 : /*
549 : * Each iteration of the inner loop progresses by at most 3 UTF-8
550 : * bytes and one UChar, for most characters.
551 : * For supplementary code points (4 & 2), which are rare,
552 : * there is an additional adjustment.
553 : */
554 0 : count = (int32_t)(pDestLimit - pDest);
555 0 : srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
556 0 : if(count > srcLength) {
557 0 : count = srcLength; /* min(remaining dest, remaining src/3) */
558 : }
559 0 : if(count < 3) {
560 : /*
561 : * Too much overhead if we get near the end of the string,
562 : * continue with the next loop.
563 : */
564 0 : break;
565 : }
566 :
567 0 : do {
568 0 : ch = *pSrc;
569 0 : if(ch <= 0x7f){
570 0 : *pDest++=(UChar)ch;
571 0 : ++pSrc;
572 : } else {
573 0 : if(ch > 0xe0) {
574 0 : if( /* handle U+1000..U+CFFF inline */
575 0 : ch <= 0xec &&
576 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
577 0 : (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
578 : ) {
579 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
580 0 : *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
581 0 : pSrc += 3;
582 0 : continue;
583 : }
584 0 : } else if(ch < 0xe0) {
585 0 : if( /* handle U+0080..U+07FF inline */
586 0 : ch >= 0xc2 &&
587 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
588 : ) {
589 0 : *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
590 0 : pSrc += 2;
591 0 : continue;
592 : }
593 : }
594 :
595 0 : if(ch >= 0xf0 || subchar > 0xffff) {
596 : /*
597 : * We may read up to six bytes and write up to two UChars,
598 : * which we didn't account for with computing count,
599 : * so we adjust it here.
600 : */
601 0 : if(--count == 0) {
602 0 : break;
603 : }
604 : }
605 :
606 : /* function call for "complicated" and error cases */
607 0 : ++pSrc; /* continue after the lead byte */
608 0 : ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
609 0 : if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
610 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
611 0 : return NULL;
612 0 : }else if(ch<=0xFFFF){
613 0 : *(pDest++)=(UChar)ch;
614 : }else{
615 0 : *(pDest++)=U16_LEAD(ch);
616 0 : *(pDest++)=U16_TRAIL(ch);
617 : }
618 : }
619 : } while(--count > 0);
620 : }
621 :
622 0 : while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
623 0 : ch = *pSrc;
624 0 : if(ch <= 0x7f){
625 0 : *pDest++=(UChar)ch;
626 0 : ++pSrc;
627 : } else {
628 0 : if(ch > 0xe0) {
629 0 : if( /* handle U+1000..U+CFFF inline */
630 0 : ch <= 0xec &&
631 0 : ((pSrcLimit - pSrc) >= 3) &&
632 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
633 0 : (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
634 : ) {
635 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
636 0 : *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
637 0 : pSrc += 3;
638 0 : continue;
639 : }
640 0 : } else if(ch < 0xe0) {
641 0 : if( /* handle U+0080..U+07FF inline */
642 0 : ch >= 0xc2 &&
643 0 : ((pSrcLimit - pSrc) >= 2) &&
644 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
645 : ) {
646 0 : *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
647 0 : pSrc += 2;
648 0 : continue;
649 : }
650 : }
651 :
652 : /* function call for "complicated" and error cases */
653 0 : ++pSrc; /* continue after the lead byte */
654 0 : ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
655 0 : if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
656 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
657 0 : return NULL;
658 0 : }else if(ch<=0xFFFF){
659 0 : *(pDest++)=(UChar)ch;
660 : }else{
661 0 : *(pDest++)=U16_LEAD(ch);
662 0 : if(pDest<pDestLimit){
663 0 : *(pDest++)=U16_TRAIL(ch);
664 : }else{
665 0 : reqLength++;
666 0 : break;
667 : }
668 : }
669 : }
670 : }
671 : /* do not fill the dest buffer just count the UChars needed */
672 0 : while(pSrc < pSrcLimit){
673 0 : ch = *pSrc;
674 0 : if(ch <= 0x7f){
675 0 : reqLength++;
676 0 : ++pSrc;
677 : } else {
678 0 : if(ch > 0xe0) {
679 0 : if( /* handle U+1000..U+CFFF inline */
680 0 : ch <= 0xec &&
681 0 : ((pSrcLimit - pSrc) >= 3) &&
682 0 : (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
683 0 : (uint8_t)(pSrc[2] - 0x80) <= 0x3f
684 : ) {
685 0 : reqLength++;
686 0 : pSrc += 3;
687 0 : continue;
688 : }
689 0 : } else if(ch < 0xe0) {
690 0 : if( /* handle U+0080..U+07FF inline */
691 0 : ch >= 0xc2 &&
692 0 : ((pSrcLimit - pSrc) >= 2) &&
693 0 : (uint8_t)(pSrc[1] - 0x80) <= 0x3f
694 : ) {
695 0 : reqLength++;
696 0 : pSrc += 2;
697 0 : continue;
698 : }
699 : }
700 :
701 : /* function call for "complicated" and error cases */
702 0 : ++pSrc; /* continue after the lead byte */
703 0 : ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
704 0 : if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
705 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
706 0 : return NULL;
707 : }
708 0 : reqLength+=U16_LENGTH(ch);
709 : }
710 : }
711 : }
712 :
713 0 : reqLength+=(int32_t)(pDest - dest);
714 :
715 0 : if(pNumSubstitutions!=NULL) {
716 0 : *pNumSubstitutions=numSubstitutions;
717 : }
718 :
719 0 : if(pDestLength){
720 0 : *pDestLength = reqLength;
721 : }
722 :
723 : /* Terminate the buffer */
724 0 : u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
725 :
726 0 : return dest;
727 : }
728 :
729 : U_CAPI UChar* U_EXPORT2
730 0 : u_strFromUTF8(UChar *dest,
731 : int32_t destCapacity,
732 : int32_t *pDestLength,
733 : const char* src,
734 : int32_t srcLength,
735 : UErrorCode *pErrorCode){
736 : return u_strFromUTF8WithSub(
737 : dest, destCapacity, pDestLength,
738 : src, srcLength,
739 : U_SENTINEL, NULL,
740 0 : pErrorCode);
741 : }
742 :
743 : U_CAPI UChar * U_EXPORT2
744 0 : u_strFromUTF8Lenient(UChar *dest,
745 : int32_t destCapacity,
746 : int32_t *pDestLength,
747 : const char *src,
748 : int32_t srcLength,
749 : UErrorCode *pErrorCode) {
750 0 : UChar *pDest = dest;
751 : UChar32 ch;
752 0 : int32_t reqLength = 0;
753 0 : uint8_t* pSrc = (uint8_t*) src;
754 :
755 : /* args check */
756 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
757 0 : return NULL;
758 : }
759 :
760 0 : if( (src==NULL && srcLength!=0) || srcLength < -1 ||
761 0 : (destCapacity<0) || (dest == NULL && destCapacity > 0)
762 : ) {
763 0 : *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
764 0 : return NULL;
765 : }
766 :
767 0 : if(srcLength < 0) {
768 : /* Transform a NUL-terminated string. */
769 0 : UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
770 : uint8_t t1, t2, t3; /* trail bytes */
771 :
772 0 : while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
773 0 : if(ch < 0xc0) {
774 : /*
775 : * ASCII, or a trail byte in lead position which is treated like
776 : * a single-byte sequence for better character boundary
777 : * resynchronization after illegal sequences.
778 : */
779 0 : *pDest++=(UChar)ch;
780 0 : ++pSrc;
781 0 : continue;
782 0 : } else if(ch < 0xe0) { /* U+0080..U+07FF */
783 0 : if((t1 = pSrc[1]) != 0) {
784 : /* 0x3080 = (0xc0 << 6) + 0x80 */
785 0 : *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
786 0 : pSrc += 2;
787 0 : continue;
788 : }
789 0 : } else if(ch < 0xf0) { /* U+0800..U+FFFF */
790 0 : if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
791 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
792 : /* 0x2080 = (0x80 << 6) + 0x80 */
793 0 : *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
794 0 : pSrc += 3;
795 0 : continue;
796 : }
797 : } else /* f0..f4 */ { /* U+10000..U+10FFFF */
798 0 : if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
799 0 : pSrc += 4;
800 : /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
801 0 : ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
802 0 : *(pDest++) = U16_LEAD(ch);
803 0 : if(pDest < pDestLimit) {
804 0 : *(pDest++) = U16_TRAIL(ch);
805 : } else {
806 0 : reqLength = 1;
807 0 : break;
808 : }
809 0 : continue;
810 : }
811 : }
812 :
813 : /* truncated character at the end */
814 0 : *pDest++ = 0xfffd;
815 0 : while(*++pSrc != 0) {}
816 0 : break;
817 : }
818 :
819 : /* Pre-flight the rest of the string. */
820 0 : while((ch = *pSrc) != 0) {
821 0 : if(ch < 0xc0) {
822 : /*
823 : * ASCII, or a trail byte in lead position which is treated like
824 : * a single-byte sequence for better character boundary
825 : * resynchronization after illegal sequences.
826 : */
827 0 : ++reqLength;
828 0 : ++pSrc;
829 0 : continue;
830 0 : } else if(ch < 0xe0) { /* U+0080..U+07FF */
831 0 : if(pSrc[1] != 0) {
832 0 : ++reqLength;
833 0 : pSrc += 2;
834 0 : continue;
835 : }
836 0 : } else if(ch < 0xf0) { /* U+0800..U+FFFF */
837 0 : if(pSrc[1] != 0 && pSrc[2] != 0) {
838 0 : ++reqLength;
839 0 : pSrc += 3;
840 0 : continue;
841 : }
842 : } else /* f0..f4 */ { /* U+10000..U+10FFFF */
843 0 : if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
844 0 : reqLength += 2;
845 0 : pSrc += 4;
846 0 : continue;
847 : }
848 : }
849 :
850 : /* truncated character at the end */
851 0 : ++reqLength;
852 0 : break;
853 : }
854 : } else /* srcLength >= 0 */ {
855 0 : const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
856 :
857 : /*
858 : * This function requires that if srcLength is given, then it must be
859 : * destCapatity >= srcLength so that we need not check for
860 : * destination buffer overflow in the loop.
861 : */
862 0 : if(destCapacity < srcLength) {
863 0 : if(pDestLength != NULL) {
864 0 : *pDestLength = srcLength; /* this likely overestimates the true destLength! */
865 : }
866 0 : *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
867 0 : return NULL;
868 : }
869 :
870 0 : if((pSrcLimit - pSrc) >= 4) {
871 0 : pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
872 :
873 : /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
874 0 : do {
875 0 : ch = *pSrc++;
876 0 : if(ch < 0xc0) {
877 : /*
878 : * ASCII, or a trail byte in lead position which is treated like
879 : * a single-byte sequence for better character boundary
880 : * resynchronization after illegal sequences.
881 : */
882 0 : *pDest++=(UChar)ch;
883 0 : } else if(ch < 0xe0) { /* U+0080..U+07FF */
884 : /* 0x3080 = (0xc0 << 6) + 0x80 */
885 0 : *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
886 0 : } else if(ch < 0xf0) { /* U+0800..U+FFFF */
887 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
888 : /* 0x2080 = (0x80 << 6) + 0x80 */
889 0 : ch = (ch << 12) + (*pSrc++ << 6);
890 0 : *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
891 : } else /* f0..f4 */ { /* U+10000..U+10FFFF */
892 : /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
893 0 : ch = (ch << 18) + (*pSrc++ << 12);
894 0 : ch += *pSrc++ << 6;
895 0 : ch += *pSrc++ - 0x3c82080;
896 0 : *(pDest++) = U16_LEAD(ch);
897 0 : *(pDest++) = U16_TRAIL(ch);
898 : }
899 0 : } while(pSrc < pSrcLimit);
900 :
901 0 : pSrcLimit += 3; /* restore original pSrcLimit */
902 : }
903 :
904 0 : while(pSrc < pSrcLimit) {
905 0 : ch = *pSrc++;
906 0 : if(ch < 0xc0) {
907 : /*
908 : * ASCII, or a trail byte in lead position which is treated like
909 : * a single-byte sequence for better character boundary
910 : * resynchronization after illegal sequences.
911 : */
912 0 : *pDest++=(UChar)ch;
913 0 : continue;
914 0 : } else if(ch < 0xe0) { /* U+0080..U+07FF */
915 0 : if(pSrc < pSrcLimit) {
916 : /* 0x3080 = (0xc0 << 6) + 0x80 */
917 0 : *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
918 0 : continue;
919 : }
920 0 : } else if(ch < 0xf0) { /* U+0800..U+FFFF */
921 0 : if((pSrcLimit - pSrc) >= 2) {
922 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
923 : /* 0x2080 = (0x80 << 6) + 0x80 */
924 0 : ch = (ch << 12) + (*pSrc++ << 6);
925 0 : *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
926 0 : pSrc += 3;
927 0 : continue;
928 : }
929 : } else /* f0..f4 */ { /* U+10000..U+10FFFF */
930 0 : if((pSrcLimit - pSrc) >= 3) {
931 : /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
932 0 : ch = (ch << 18) + (*pSrc++ << 12);
933 0 : ch += *pSrc++ << 6;
934 0 : ch += *pSrc++ - 0x3c82080;
935 0 : *(pDest++) = U16_LEAD(ch);
936 0 : *(pDest++) = U16_TRAIL(ch);
937 0 : pSrc += 4;
938 0 : continue;
939 : }
940 : }
941 :
942 : /* truncated character at the end */
943 0 : *pDest++ = 0xfffd;
944 0 : break;
945 : }
946 : }
947 :
948 0 : reqLength+=(int32_t)(pDest - dest);
949 :
950 0 : if(pDestLength){
951 0 : *pDestLength = reqLength;
952 : }
953 :
954 : /* Terminate the buffer */
955 0 : u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
956 :
957 0 : return dest;
958 : }
959 :
960 : static inline uint8_t *
961 0 : _appendUTF8(uint8_t *pDest, UChar32 c) {
962 : /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
963 0 : if((c)<=0x7f) {
964 0 : *pDest++=(uint8_t)c;
965 0 : } else if(c<=0x7ff) {
966 0 : *pDest++=(uint8_t)((c>>6)|0xc0);
967 0 : *pDest++=(uint8_t)((c&0x3f)|0x80);
968 0 : } else if(c<=0xffff) {
969 0 : *pDest++=(uint8_t)((c>>12)|0xe0);
970 0 : *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
971 0 : *pDest++=(uint8_t)(((c)&0x3f)|0x80);
972 : } else /* if((uint32_t)(c)<=0x10ffff) */ {
973 0 : *pDest++=(uint8_t)(((c)>>18)|0xf0);
974 0 : *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
975 0 : *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
976 0 : *pDest++=(uint8_t)(((c)&0x3f)|0x80);
977 : }
978 0 : return pDest;
979 : }
980 :
981 :
982 : U_CAPI char* U_EXPORT2
983 0 : u_strToUTF8WithSub(char *dest,
984 : int32_t destCapacity,
985 : int32_t *pDestLength,
986 : const UChar *pSrc,
987 : int32_t srcLength,
988 : UChar32 subchar, int32_t *pNumSubstitutions,
989 : UErrorCode *pErrorCode){
990 0 : int32_t reqLength=0;
991 0 : uint32_t ch=0,ch2=0;
992 0 : uint8_t *pDest = (uint8_t *)dest;
993 0 : uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
994 : int32_t numSubstitutions;
995 :
996 : /* args check */
997 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
998 0 : return NULL;
999 : }
1000 :
1001 0 : if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
1002 0 : (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
1003 0 : subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1004 : ) {
1005 0 : *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1006 0 : return NULL;
1007 : }
1008 :
1009 0 : if(pNumSubstitutions!=NULL) {
1010 0 : *pNumSubstitutions=0;
1011 : }
1012 0 : numSubstitutions=0;
1013 :
1014 0 : if(srcLength==-1) {
1015 0 : while((ch=*pSrc)!=0) {
1016 0 : ++pSrc;
1017 0 : if(ch <= 0x7f) {
1018 0 : if(pDest<pDestLimit) {
1019 0 : *pDest++ = (uint8_t)ch;
1020 : } else {
1021 0 : reqLength = 1;
1022 0 : break;
1023 : }
1024 0 : } else if(ch <= 0x7ff) {
1025 0 : if((pDestLimit - pDest) >= 2) {
1026 0 : *pDest++=(uint8_t)((ch>>6)|0xc0);
1027 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1028 : } else {
1029 0 : reqLength = 2;
1030 0 : break;
1031 : }
1032 0 : } else if(ch <= 0xd7ff || ch >= 0xe000) {
1033 0 : if((pDestLimit - pDest) >= 3) {
1034 0 : *pDest++=(uint8_t)((ch>>12)|0xe0);
1035 0 : *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1036 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1037 : } else {
1038 0 : reqLength = 3;
1039 0 : break;
1040 : }
1041 : } else /* ch is a surrogate */ {
1042 : int32_t length;
1043 :
1044 : /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
1045 0 : if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1046 0 : ++pSrc;
1047 0 : ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1048 0 : } else if(subchar>=0) {
1049 0 : ch=subchar;
1050 0 : ++numSubstitutions;
1051 : } else {
1052 : /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1053 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1054 0 : return NULL;
1055 : }
1056 :
1057 0 : length = U8_LENGTH(ch);
1058 0 : if((pDestLimit - pDest) >= length) {
1059 : /* convert and append*/
1060 0 : pDest=_appendUTF8(pDest, ch);
1061 : } else {
1062 0 : reqLength = length;
1063 0 : break;
1064 : }
1065 : }
1066 : }
1067 0 : while((ch=*pSrc++)!=0) {
1068 0 : if(ch<=0x7f) {
1069 0 : ++reqLength;
1070 0 : } else if(ch<=0x7ff) {
1071 0 : reqLength+=2;
1072 0 : } else if(!U16_IS_SURROGATE(ch)) {
1073 0 : reqLength+=3;
1074 0 : } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1075 0 : ++pSrc;
1076 0 : reqLength+=4;
1077 0 : } else if(subchar>=0) {
1078 0 : reqLength+=U8_LENGTH(subchar);
1079 0 : ++numSubstitutions;
1080 : } else {
1081 : /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1082 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1083 0 : return NULL;
1084 : }
1085 : }
1086 : } else {
1087 0 : const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
1088 : int32_t count;
1089 :
1090 : /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1091 : for(;;) {
1092 : /*
1093 : * Each iteration of the inner loop progresses by at most 3 UTF-8
1094 : * bytes and one UChar, for most characters.
1095 : * For supplementary code points (4 & 2), which are rare,
1096 : * there is an additional adjustment.
1097 : */
1098 0 : count = (int32_t)((pDestLimit - pDest) / 3);
1099 0 : srcLength = (int32_t)(pSrcLimit - pSrc);
1100 0 : if(count > srcLength) {
1101 0 : count = srcLength; /* min(remaining dest/3, remaining src) */
1102 : }
1103 0 : if(count < 3) {
1104 : /*
1105 : * Too much overhead if we get near the end of the string,
1106 : * continue with the next loop.
1107 : */
1108 0 : break;
1109 : }
1110 0 : do {
1111 0 : ch=*pSrc++;
1112 0 : if(ch <= 0x7f) {
1113 0 : *pDest++ = (uint8_t)ch;
1114 0 : } else if(ch <= 0x7ff) {
1115 0 : *pDest++=(uint8_t)((ch>>6)|0xc0);
1116 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1117 0 : } else if(ch <= 0xd7ff || ch >= 0xe000) {
1118 0 : *pDest++=(uint8_t)((ch>>12)|0xe0);
1119 0 : *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1120 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1121 : } else /* ch is a surrogate */ {
1122 : /*
1123 : * We will read two UChars and probably output four bytes,
1124 : * which we didn't account for with computing count,
1125 : * so we adjust it here.
1126 : */
1127 0 : if(--count == 0) {
1128 0 : --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
1129 0 : break; /* recompute count */
1130 : }
1131 :
1132 0 : if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
1133 0 : ++pSrc;
1134 0 : ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1135 :
1136 : /* writing 4 bytes per 2 UChars is ok */
1137 0 : *pDest++=(uint8_t)((ch>>18)|0xf0);
1138 0 : *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
1139 0 : *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1140 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1141 : } else {
1142 : /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1143 0 : if(subchar>=0) {
1144 0 : ch=subchar;
1145 0 : ++numSubstitutions;
1146 : } else {
1147 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1148 0 : return NULL;
1149 : }
1150 :
1151 : /* convert and append*/
1152 0 : pDest=_appendUTF8(pDest, ch);
1153 : }
1154 : }
1155 : } while(--count > 0);
1156 : }
1157 :
1158 0 : while(pSrc<pSrcLimit) {
1159 0 : ch=*pSrc++;
1160 0 : if(ch <= 0x7f) {
1161 0 : if(pDest<pDestLimit) {
1162 0 : *pDest++ = (uint8_t)ch;
1163 : } else {
1164 0 : reqLength = 1;
1165 0 : break;
1166 : }
1167 0 : } else if(ch <= 0x7ff) {
1168 0 : if((pDestLimit - pDest) >= 2) {
1169 0 : *pDest++=(uint8_t)((ch>>6)|0xc0);
1170 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1171 : } else {
1172 0 : reqLength = 2;
1173 0 : break;
1174 : }
1175 0 : } else if(ch <= 0xd7ff || ch >= 0xe000) {
1176 0 : if((pDestLimit - pDest) >= 3) {
1177 0 : *pDest++=(uint8_t)((ch>>12)|0xe0);
1178 0 : *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1179 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1180 : } else {
1181 0 : reqLength = 3;
1182 0 : break;
1183 : }
1184 : } else /* ch is a surrogate */ {
1185 : int32_t length;
1186 :
1187 0 : if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1188 0 : ++pSrc;
1189 0 : ch=U16_GET_SUPPLEMENTARY(ch, ch2);
1190 0 : } else if(subchar>=0) {
1191 0 : ch=subchar;
1192 0 : ++numSubstitutions;
1193 : } else {
1194 : /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1195 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1196 0 : return NULL;
1197 : }
1198 :
1199 0 : length = U8_LENGTH(ch);
1200 0 : if((pDestLimit - pDest) >= length) {
1201 : /* convert and append*/
1202 0 : pDest=_appendUTF8(pDest, ch);
1203 : } else {
1204 0 : reqLength = length;
1205 0 : break;
1206 : }
1207 : }
1208 : }
1209 0 : while(pSrc<pSrcLimit) {
1210 0 : ch=*pSrc++;
1211 0 : if(ch<=0x7f) {
1212 0 : ++reqLength;
1213 0 : } else if(ch<=0x7ff) {
1214 0 : reqLength+=2;
1215 0 : } else if(!U16_IS_SURROGATE(ch)) {
1216 0 : reqLength+=3;
1217 0 : } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
1218 0 : ++pSrc;
1219 0 : reqLength+=4;
1220 0 : } else if(subchar>=0) {
1221 0 : reqLength+=U8_LENGTH(subchar);
1222 0 : ++numSubstitutions;
1223 : } else {
1224 : /* Unicode 3.2 forbids surrogate code points in UTF-8 */
1225 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1226 0 : return NULL;
1227 : }
1228 : }
1229 : }
1230 :
1231 0 : reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1232 :
1233 0 : if(pNumSubstitutions!=NULL) {
1234 0 : *pNumSubstitutions=numSubstitutions;
1235 : }
1236 :
1237 0 : if(pDestLength){
1238 0 : *pDestLength = reqLength;
1239 : }
1240 :
1241 : /* Terminate the buffer */
1242 0 : u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1243 0 : return dest;
1244 : }
1245 :
1246 : U_CAPI char* U_EXPORT2
1247 0 : u_strToUTF8(char *dest,
1248 : int32_t destCapacity,
1249 : int32_t *pDestLength,
1250 : const UChar *pSrc,
1251 : int32_t srcLength,
1252 : UErrorCode *pErrorCode){
1253 : return u_strToUTF8WithSub(
1254 : dest, destCapacity, pDestLength,
1255 : pSrc, srcLength,
1256 : U_SENTINEL, NULL,
1257 0 : pErrorCode);
1258 : }
1259 :
1260 : U_CAPI UChar* U_EXPORT2
1261 0 : u_strFromJavaModifiedUTF8WithSub(
1262 : UChar *dest,
1263 : int32_t destCapacity,
1264 : int32_t *pDestLength,
1265 : const char *src,
1266 : int32_t srcLength,
1267 : UChar32 subchar, int32_t *pNumSubstitutions,
1268 : UErrorCode *pErrorCode) {
1269 0 : UChar *pDest = dest;
1270 0 : UChar *pDestLimit = dest+destCapacity;
1271 : UChar32 ch;
1272 0 : int32_t reqLength = 0;
1273 0 : const uint8_t* pSrc = (const uint8_t*) src;
1274 : const uint8_t *pSrcLimit;
1275 : int32_t count;
1276 : uint8_t t1, t2; /* trail bytes */
1277 : int32_t numSubstitutions;
1278 :
1279 : /* args check */
1280 0 : if(U_FAILURE(*pErrorCode)){
1281 0 : return NULL;
1282 : }
1283 0 : if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1284 0 : (dest==NULL && destCapacity!=0) || destCapacity<0 ||
1285 0 : subchar > 0x10ffff || U_IS_SURROGATE(subchar)
1286 : ) {
1287 0 : *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1288 0 : return NULL;
1289 : }
1290 :
1291 0 : if(pNumSubstitutions!=NULL) {
1292 0 : *pNumSubstitutions=0;
1293 : }
1294 0 : numSubstitutions=0;
1295 :
1296 0 : if(srcLength < 0) {
1297 : /*
1298 : * Transform a NUL-terminated ASCII string.
1299 : * Handle non-ASCII strings with slower code.
1300 : */
1301 0 : while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
1302 0 : *pDest++=(UChar)ch;
1303 0 : ++pSrc;
1304 : }
1305 0 : if(ch == 0) {
1306 0 : reqLength=(int32_t)(pDest - dest);
1307 0 : if(pDestLength) {
1308 0 : *pDestLength = reqLength;
1309 : }
1310 :
1311 : /* Terminate the buffer */
1312 0 : u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1313 0 : return dest;
1314 : }
1315 0 : srcLength = uprv_strlen((const char *)pSrc);
1316 : }
1317 :
1318 : /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1319 0 : pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
1320 : for(;;) {
1321 0 : count = (int32_t)(pDestLimit - pDest);
1322 0 : srcLength = (int32_t)(pSrcLimit - pSrc);
1323 0 : if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
1324 : /* fast ASCII loop */
1325 0 : const uint8_t *prevSrc = pSrc;
1326 : int32_t delta;
1327 0 : while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
1328 0 : *pDest++=(UChar)ch;
1329 0 : ++pSrc;
1330 : }
1331 0 : delta = (int32_t)(pSrc - prevSrc);
1332 0 : count -= delta;
1333 0 : srcLength -= delta;
1334 : }
1335 : /*
1336 : * Each iteration of the inner loop progresses by at most 3 UTF-8
1337 : * bytes and one UChar.
1338 : */
1339 0 : srcLength /= 3;
1340 0 : if(count > srcLength) {
1341 0 : count = srcLength; /* min(remaining dest, remaining src/3) */
1342 : }
1343 0 : if(count < 3) {
1344 : /*
1345 : * Too much overhead if we get near the end of the string,
1346 : * continue with the next loop.
1347 : */
1348 0 : break;
1349 : }
1350 0 : do {
1351 0 : ch = *pSrc;
1352 0 : if(ch <= 0x7f){
1353 0 : *pDest++=(UChar)ch;
1354 0 : ++pSrc;
1355 : } else {
1356 0 : if(ch >= 0xe0) {
1357 0 : if( /* handle U+0000..U+FFFF inline */
1358 0 : ch <= 0xef &&
1359 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1360 0 : (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1361 : ) {
1362 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1363 0 : *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1364 0 : pSrc += 3;
1365 0 : continue;
1366 : }
1367 : } else {
1368 0 : if( /* handle U+0000..U+07FF inline */
1369 0 : ch >= 0xc0 &&
1370 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1371 : ) {
1372 0 : *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1373 0 : pSrc += 2;
1374 0 : continue;
1375 : }
1376 : }
1377 :
1378 0 : if(subchar < 0) {
1379 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1380 0 : return NULL;
1381 0 : } else if(subchar > 0xffff && --count == 0) {
1382 : /*
1383 : * We need to write two UChars, adjusted count for that,
1384 : * and ran out of space.
1385 : */
1386 0 : break;
1387 : } else {
1388 : /* function call for error cases */
1389 0 : ++pSrc; /* continue after the lead byte */
1390 0 : utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1391 0 : ++numSubstitutions;
1392 0 : if(subchar<=0xFFFF) {
1393 0 : *(pDest++)=(UChar)subchar;
1394 : } else {
1395 0 : *(pDest++)=U16_LEAD(subchar);
1396 0 : *(pDest++)=U16_TRAIL(subchar);
1397 : }
1398 : }
1399 : }
1400 : } while(--count > 0);
1401 0 : }
1402 :
1403 0 : while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
1404 0 : ch = *pSrc;
1405 0 : if(ch <= 0x7f){
1406 0 : *pDest++=(UChar)ch;
1407 0 : ++pSrc;
1408 : } else {
1409 0 : if(ch >= 0xe0) {
1410 0 : if( /* handle U+0000..U+FFFF inline */
1411 0 : ch <= 0xef &&
1412 0 : ((pSrcLimit - pSrc) >= 3) &&
1413 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
1414 0 : (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
1415 : ) {
1416 : /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
1417 0 : *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
1418 0 : pSrc += 3;
1419 0 : continue;
1420 : }
1421 : } else {
1422 0 : if( /* handle U+0000..U+07FF inline */
1423 0 : ch >= 0xc0 &&
1424 0 : ((pSrcLimit - pSrc) >= 2) &&
1425 0 : (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
1426 : ) {
1427 0 : *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
1428 0 : pSrc += 2;
1429 0 : continue;
1430 : }
1431 : }
1432 :
1433 0 : if(subchar < 0) {
1434 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1435 0 : return NULL;
1436 : } else {
1437 : /* function call for error cases */
1438 0 : ++pSrc; /* continue after the lead byte */
1439 0 : utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1440 0 : ++numSubstitutions;
1441 0 : if(subchar<=0xFFFF) {
1442 0 : *(pDest++)=(UChar)subchar;
1443 : } else {
1444 0 : *(pDest++)=U16_LEAD(subchar);
1445 0 : if(pDest<pDestLimit) {
1446 0 : *(pDest++)=U16_TRAIL(subchar);
1447 : } else {
1448 0 : reqLength++;
1449 0 : break;
1450 : }
1451 : }
1452 : }
1453 : }
1454 : }
1455 :
1456 : /* do not fill the dest buffer just count the UChars needed */
1457 0 : while(pSrc < pSrcLimit){
1458 0 : ch = *pSrc;
1459 0 : if(ch <= 0x7f) {
1460 0 : reqLength++;
1461 0 : ++pSrc;
1462 : } else {
1463 0 : if(ch >= 0xe0) {
1464 0 : if( /* handle U+0000..U+FFFF inline */
1465 0 : ch <= 0xef &&
1466 0 : ((pSrcLimit - pSrc) >= 3) &&
1467 0 : (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
1468 0 : (uint8_t)(pSrc[2] - 0x80) <= 0x3f
1469 : ) {
1470 0 : reqLength++;
1471 0 : pSrc += 3;
1472 0 : continue;
1473 : }
1474 : } else {
1475 0 : if( /* handle U+0000..U+07FF inline */
1476 0 : ch >= 0xc0 &&
1477 0 : ((pSrcLimit - pSrc) >= 2) &&
1478 0 : (uint8_t)(pSrc[1] - 0x80) <= 0x3f
1479 : ) {
1480 0 : reqLength++;
1481 0 : pSrc += 2;
1482 0 : continue;
1483 : }
1484 : }
1485 :
1486 0 : if(subchar < 0) {
1487 0 : *pErrorCode = U_INVALID_CHAR_FOUND;
1488 0 : return NULL;
1489 : } else {
1490 : /* function call for error cases */
1491 0 : ++pSrc; /* continue after the lead byte */
1492 0 : utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
1493 0 : ++numSubstitutions;
1494 0 : reqLength+=U16_LENGTH(ch);
1495 : }
1496 : }
1497 : }
1498 :
1499 0 : if(pNumSubstitutions!=NULL) {
1500 0 : *pNumSubstitutions=numSubstitutions;
1501 : }
1502 :
1503 0 : reqLength+=(int32_t)(pDest - dest);
1504 0 : if(pDestLength) {
1505 0 : *pDestLength = reqLength;
1506 : }
1507 :
1508 : /* Terminate the buffer */
1509 0 : u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
1510 0 : return dest;
1511 : }
1512 :
1513 : U_CAPI char* U_EXPORT2
1514 0 : u_strToJavaModifiedUTF8(
1515 : char *dest,
1516 : int32_t destCapacity,
1517 : int32_t *pDestLength,
1518 : const UChar *src,
1519 : int32_t srcLength,
1520 : UErrorCode *pErrorCode) {
1521 0 : int32_t reqLength=0;
1522 0 : uint32_t ch=0;
1523 0 : uint8_t *pDest = (uint8_t *)dest;
1524 0 : uint8_t *pDestLimit = pDest + destCapacity;
1525 : const UChar *pSrcLimit;
1526 : int32_t count;
1527 :
1528 : /* args check */
1529 0 : if(U_FAILURE(*pErrorCode)){
1530 0 : return NULL;
1531 : }
1532 0 : if( (src==NULL && srcLength!=0) || srcLength < -1 ||
1533 0 : (dest==NULL && destCapacity!=0) || destCapacity<0
1534 : ) {
1535 0 : *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1536 0 : return NULL;
1537 : }
1538 :
1539 0 : if(srcLength==-1) {
1540 : /* Convert NUL-terminated ASCII, then find the string length. */
1541 0 : while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
1542 0 : *pDest++ = (uint8_t)ch;
1543 0 : ++src;
1544 : }
1545 0 : if(ch == 0) {
1546 0 : reqLength=(int32_t)(pDest - (uint8_t *)dest);
1547 0 : if(pDestLength) {
1548 0 : *pDestLength = reqLength;
1549 : }
1550 :
1551 : /* Terminate the buffer */
1552 0 : u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1553 0 : return dest;
1554 : }
1555 0 : srcLength = u_strlen(src);
1556 : }
1557 :
1558 : /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
1559 0 : pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
1560 : for(;;) {
1561 0 : count = (int32_t)(pDestLimit - pDest);
1562 0 : srcLength = (int32_t)(pSrcLimit - src);
1563 0 : if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
1564 : /* fast ASCII loop */
1565 0 : const UChar *prevSrc = src;
1566 : int32_t delta;
1567 0 : while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
1568 0 : *pDest++=(uint8_t)ch;
1569 0 : ++src;
1570 : }
1571 0 : delta = (int32_t)(src - prevSrc);
1572 0 : count -= delta;
1573 0 : srcLength -= delta;
1574 : }
1575 : /*
1576 : * Each iteration of the inner loop progresses by at most 3 UTF-8
1577 : * bytes and one UChar.
1578 : */
1579 0 : count /= 3;
1580 0 : if(count > srcLength) {
1581 0 : count = srcLength; /* min(remaining dest/3, remaining src) */
1582 : }
1583 0 : if(count < 3) {
1584 : /*
1585 : * Too much overhead if we get near the end of the string,
1586 : * continue with the next loop.
1587 : */
1588 0 : break;
1589 : }
1590 0 : do {
1591 0 : ch=*src++;
1592 0 : if(ch <= 0x7f && ch != 0) {
1593 0 : *pDest++ = (uint8_t)ch;
1594 0 : } else if(ch <= 0x7ff) {
1595 0 : *pDest++=(uint8_t)((ch>>6)|0xc0);
1596 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1597 : } else {
1598 0 : *pDest++=(uint8_t)((ch>>12)|0xe0);
1599 0 : *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1600 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1601 : }
1602 : } while(--count > 0);
1603 0 : }
1604 :
1605 0 : while(src<pSrcLimit) {
1606 0 : ch=*src++;
1607 0 : if(ch <= 0x7f && ch != 0) {
1608 0 : if(pDest<pDestLimit) {
1609 0 : *pDest++ = (uint8_t)ch;
1610 : } else {
1611 0 : reqLength = 1;
1612 0 : break;
1613 : }
1614 0 : } else if(ch <= 0x7ff) {
1615 0 : if((pDestLimit - pDest) >= 2) {
1616 0 : *pDest++=(uint8_t)((ch>>6)|0xc0);
1617 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1618 : } else {
1619 0 : reqLength = 2;
1620 0 : break;
1621 : }
1622 : } else {
1623 0 : if((pDestLimit - pDest) >= 3) {
1624 0 : *pDest++=(uint8_t)((ch>>12)|0xe0);
1625 0 : *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
1626 0 : *pDest++=(uint8_t)((ch&0x3f)|0x80);
1627 : } else {
1628 0 : reqLength = 3;
1629 0 : break;
1630 : }
1631 : }
1632 : }
1633 0 : while(src<pSrcLimit) {
1634 0 : ch=*src++;
1635 0 : if(ch <= 0x7f && ch != 0) {
1636 0 : ++reqLength;
1637 0 : } else if(ch<=0x7ff) {
1638 0 : reqLength+=2;
1639 : } else {
1640 0 : reqLength+=3;
1641 : }
1642 : }
1643 :
1644 0 : reqLength+=(int32_t)(pDest - (uint8_t *)dest);
1645 0 : if(pDestLength){
1646 0 : *pDestLength = reqLength;
1647 : }
1648 :
1649 : /* Terminate the buffer */
1650 0 : u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
1651 0 : return dest;
1652 : }
|