Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (C) 2002-2016, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : * file name: ucnv_u8.c
9 : * encoding: UTF-8
10 : * tab size: 8 (not used)
11 : * indentation:4
12 : *
13 : * created on: 2002jul01
14 : * created by: Markus W. Scherer
15 : *
16 : * UTF-8 converter implementation. Used to be in ucnv_utf.c.
17 : *
18 : * Also, CESU-8 implementation, see UTR 26.
19 : * The CESU-8 converter uses all the same functions as the
20 : * UTF-8 converter, with a branch for converting supplementary code points.
21 : */
22 :
23 : #include "unicode/utypes.h"
24 :
25 : #if !UCONFIG_NO_CONVERSION
26 :
27 : #include "unicode/ucnv.h"
28 : #include "unicode/utf.h"
29 : #include "unicode/utf8.h"
30 : #include "unicode/utf16.h"
31 : #include "ucnv_bld.h"
32 : #include "ucnv_cnv.h"
33 : #include "cmemory.h"
34 :
35 : /* Prototypes --------------------------------------------------------------- */
36 :
37 : /* Keep these here to make finicky compilers happy */
38 :
39 : U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
40 : UErrorCode *err);
41 : U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
42 : UErrorCode *err);
43 :
44 :
45 : /* UTF-8 -------------------------------------------------------------------- */
46 :
47 : /* UTF-8 Conversion DATA
48 : * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
49 : */
50 : /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
/* Upper bounds of the code spaces this file distinguishes. */
#define MAXIMUM_UCS2            0x0000FFFF  /* last code point that fits one 16-bit unit */
#define MAXIMUM_UTF             0x0010FFFF  /* last Unicode code point */
#define MAXIMUM_UCS4            0x7FFFFFFF

/* Arithmetic for splitting a supplementary code point into a surrogate pair:
 * subtract HALF_BASE, then the top bits (>> HALF_SHIFT) select the high
 * surrogate and the low bits (& HALF_MASK) select the low surrogate. */
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF

/* Surrogate code unit ranges. */
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_HIGH_END      0xDBFF
#define SURROGATE_LOW_START     0xDC00
#define SURROGATE_LOW_END       0xDFFF

/* -SURROGATE_LOW_START + HALF_BASE, i.e. 0x10000 - 0xDC00 */
#define SURROGATE_LOW_BASE      9216
64 :
/*
 * offsetsFromUTF8[n] is the value subtracted from the raw accumulation
 * (((lead << 6) + trail1) << 6) + ... of an n-byte sequence in order to strip
 * the marker bits of the lead and trail bytes. Index 0 is a placeholder so
 * that the table can be indexed directly with the sequence length.
 */
static const uint32_t offsetsFromUTF8[7] = {
    0,              /* unused: there is no 0-byte sequence */
    0x00000000u,    /* 1 byte  */
    0x00003080u,    /* 2 bytes */
    0x000E2080u,    /* 3 bytes */
    0x03C82080u,    /* 4 bytes */
    0xFA082080u,    /* 5 bytes */
    0x82082080u     /* 6 bytes */
};
69 :
70 : /* END OF UTF-8 Conversion DATA */
71 :
/*
 * Sequence length announced by each possible first byte.
 * 0 marks bytes that cannot start a sequence (trail bytes 0x80..0xBF and
 * 0xFE/0xFF). Lengths 5 and 6 are still listed here; the converters in this
 * file reject such sequences later via utf8_minChar32[].
 */
static const int8_t bytesFromUTF8[256] = {
    /* 0x00..0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: trail bytes, never a lead */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE..0xFF: none */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
82 :
83 : /*
84 : * Starting with Unicode 3.0.1:
85 : * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
86 : * byte sequences with more than 4 bytes are illegal in UTF-8,
87 : * which is tested with impossible values for them
88 : */
/*
 * utf8_minChar32[n]: smallest code point an n-byte sequence may encode.
 * The entries for n = 5 and n = 6 are 0xffffffff, a value no code point
 * <= 0x10ffff can reach, so every 5- or 6-byte sequence fails the check.
 */
static const uint32_t utf8_minChar32[7] = {
    0,          /* n = 0 (placeholder) */
    0,          /* n = 1 */
    0x80,       /* n = 2 */
    0x800,      /* n = 3 */
    0x10000,    /* n = 4 */
    0xffffffff, /* n = 5: always illegal */
    0xffffffff  /* n = 6: always illegal */
};
91 :
92 0 : static UBool hasCESU8Data(const UConverter *cnv)
93 : {
94 : #if UCONFIG_ONLY_HTML_CONVERSION
95 : return FALSE;
96 : #else
97 0 : return (UBool)(cnv->sharedData == &_CESU8Data);
98 : #endif
99 : }
100 : U_CDECL_BEGIN
101 0 : static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
102 : UErrorCode * err)
103 : {
104 0 : UConverter *cnv = args->converter;
105 0 : const unsigned char *mySource = (unsigned char *) args->source;
106 0 : UChar *myTarget = args->target;
107 0 : const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
108 0 : const UChar *targetLimit = args->targetLimit;
109 0 : unsigned char *toUBytes = cnv->toUBytes;
110 0 : UBool isCESU8 = hasCESU8Data(cnv);
111 0 : uint32_t ch, ch2 = 0;
112 : int32_t i, inBytes;
113 :
114 : /* Restore size of current sequence */
115 0 : if (cnv->toUnicodeStatus && myTarget < targetLimit)
116 : {
117 0 : inBytes = cnv->mode; /* restore # of bytes to consume */
118 0 : i = cnv->toULength; /* restore # of bytes consumed */
119 0 : cnv->toULength = 0;
120 :
121 0 : ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
122 0 : cnv->toUnicodeStatus = 0;
123 0 : goto morebytes;
124 : }
125 :
126 :
127 0 : while (mySource < sourceLimit && myTarget < targetLimit)
128 : {
129 0 : ch = *(mySource++);
130 0 : if (ch < 0x80) /* Simple case */
131 : {
132 0 : *(myTarget++) = (UChar) ch;
133 : }
134 : else
135 : {
136 : /* store the first char */
137 0 : toUBytes[0] = (char)ch;
138 0 : inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
139 0 : i = 1;
140 :
141 : morebytes:
142 0 : while (i < inBytes)
143 : {
144 0 : if (mySource < sourceLimit)
145 : {
146 0 : toUBytes[i] = (char) (ch2 = *mySource);
147 0 : if (!U8_IS_TRAIL(ch2))
148 : {
149 0 : break; /* i < inBytes */
150 : }
151 0 : ch = (ch << 6) + ch2;
152 0 : ++mySource;
153 0 : i++;
154 : }
155 : else
156 : {
157 : /* stores a partially calculated target*/
158 0 : cnv->toUnicodeStatus = ch;
159 0 : cnv->mode = inBytes;
160 0 : cnv->toULength = (int8_t) i;
161 0 : goto donefornow;
162 : }
163 : }
164 :
165 : /* Remove the accumulated high bits */
166 0 : ch -= offsetsFromUTF8[inBytes];
167 :
168 : /*
169 : * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
170 : * - use only trail bytes after a lead byte (checked above)
171 : * - use the right number of trail bytes for a given lead byte
172 : * - encode a code point <= U+10ffff
173 : * - use the fewest possible number of bytes for their code points
174 : * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
175 : *
176 : * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
177 : * There are no irregular sequences any more.
178 : * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
179 : */
180 0 : if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
181 0 : (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
182 : {
183 : /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
184 0 : if (ch <= MAXIMUM_UCS2)
185 : {
186 : /* fits in 16 bits */
187 0 : *(myTarget++) = (UChar) ch;
188 : }
189 : else
190 : {
191 : /* write out the surrogates */
192 0 : ch -= HALF_BASE;
193 0 : *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
194 0 : ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
195 0 : if (myTarget < targetLimit)
196 : {
197 0 : *(myTarget++) = (UChar)ch;
198 : }
199 : else
200 : {
201 : /* Put in overflow buffer (not handled here) */
202 0 : cnv->UCharErrorBuffer[0] = (UChar) ch;
203 0 : cnv->UCharErrorBufferLength = 1;
204 0 : *err = U_BUFFER_OVERFLOW_ERROR;
205 0 : break;
206 : }
207 : }
208 : }
209 : else
210 : {
211 0 : cnv->toULength = (int8_t)i;
212 0 : *err = U_ILLEGAL_CHAR_FOUND;
213 0 : break;
214 : }
215 : }
216 : }
217 :
218 : donefornow:
219 0 : if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
220 : {
221 : /* End of target buffer */
222 0 : *err = U_BUFFER_OVERFLOW_ERROR;
223 : }
224 :
225 0 : args->target = myTarget;
226 0 : args->source = (const char *) mySource;
227 0 : }
228 :
229 0 : static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
230 : UErrorCode * err)
231 : {
232 0 : UConverter *cnv = args->converter;
233 0 : const unsigned char *mySource = (unsigned char *) args->source;
234 0 : UChar *myTarget = args->target;
235 0 : int32_t *myOffsets = args->offsets;
236 0 : int32_t offsetNum = 0;
237 0 : const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
238 0 : const UChar *targetLimit = args->targetLimit;
239 0 : unsigned char *toUBytes = cnv->toUBytes;
240 0 : UBool isCESU8 = hasCESU8Data(cnv);
241 0 : uint32_t ch, ch2 = 0;
242 : int32_t i, inBytes;
243 :
244 : /* Restore size of current sequence */
245 0 : if (cnv->toUnicodeStatus && myTarget < targetLimit)
246 : {
247 0 : inBytes = cnv->mode; /* restore # of bytes to consume */
248 0 : i = cnv->toULength; /* restore # of bytes consumed */
249 0 : cnv->toULength = 0;
250 :
251 0 : ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
252 0 : cnv->toUnicodeStatus = 0;
253 0 : goto morebytes;
254 : }
255 :
256 0 : while (mySource < sourceLimit && myTarget < targetLimit)
257 : {
258 0 : ch = *(mySource++);
259 0 : if (ch < 0x80) /* Simple case */
260 : {
261 0 : *(myTarget++) = (UChar) ch;
262 0 : *(myOffsets++) = offsetNum++;
263 : }
264 : else
265 : {
266 0 : toUBytes[0] = (char)ch;
267 0 : inBytes = bytesFromUTF8[ch];
268 0 : i = 1;
269 :
270 : morebytes:
271 0 : while (i < inBytes)
272 : {
273 0 : if (mySource < sourceLimit)
274 : {
275 0 : toUBytes[i] = (char) (ch2 = *mySource);
276 0 : if (!U8_IS_TRAIL(ch2))
277 : {
278 0 : break; /* i < inBytes */
279 : }
280 0 : ch = (ch << 6) + ch2;
281 0 : ++mySource;
282 0 : i++;
283 : }
284 : else
285 : {
286 0 : cnv->toUnicodeStatus = ch;
287 0 : cnv->mode = inBytes;
288 0 : cnv->toULength = (int8_t)i;
289 0 : goto donefornow;
290 : }
291 : }
292 :
293 : /* Remove the accumulated high bits */
294 0 : ch -= offsetsFromUTF8[inBytes];
295 :
296 : /*
297 : * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
298 : * - use only trail bytes after a lead byte (checked above)
299 : * - use the right number of trail bytes for a given lead byte
300 : * - encode a code point <= U+10ffff
301 : * - use the fewest possible number of bytes for their code points
302 : * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
303 : *
304 : * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
305 : * There are no irregular sequences any more.
306 : * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
307 : */
308 0 : if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
309 0 : (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
310 : {
311 : /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
312 0 : if (ch <= MAXIMUM_UCS2)
313 : {
314 : /* fits in 16 bits */
315 0 : *(myTarget++) = (UChar) ch;
316 0 : *(myOffsets++) = offsetNum;
317 : }
318 : else
319 : {
320 : /* write out the surrogates */
321 0 : ch -= HALF_BASE;
322 0 : *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
323 0 : *(myOffsets++) = offsetNum;
324 0 : ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
325 0 : if (myTarget < targetLimit)
326 : {
327 0 : *(myTarget++) = (UChar)ch;
328 0 : *(myOffsets++) = offsetNum;
329 : }
330 : else
331 : {
332 0 : cnv->UCharErrorBuffer[0] = (UChar) ch;
333 0 : cnv->UCharErrorBufferLength = 1;
334 0 : *err = U_BUFFER_OVERFLOW_ERROR;
335 : }
336 : }
337 0 : offsetNum += i;
338 : }
339 : else
340 : {
341 0 : cnv->toULength = (int8_t)i;
342 0 : *err = U_ILLEGAL_CHAR_FOUND;
343 0 : break;
344 : }
345 : }
346 : }
347 :
348 : donefornow:
349 0 : if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
350 : { /* End of target buffer */
351 0 : *err = U_BUFFER_OVERFLOW_ERROR;
352 : }
353 :
354 0 : args->target = myTarget;
355 0 : args->source = (const char *) mySource;
356 0 : args->offsets = myOffsets;
357 0 : }
358 : U_CDECL_END
359 :
360 0 : U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
361 : UErrorCode * err)
362 : {
363 0 : UConverter *cnv = args->converter;
364 0 : const UChar *mySource = args->source;
365 0 : const UChar *sourceLimit = args->sourceLimit;
366 0 : uint8_t *myTarget = (uint8_t *) args->target;
367 0 : const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
368 : uint8_t *tempPtr;
369 : UChar32 ch;
370 : uint8_t tempBuf[4];
371 : int32_t indexToWrite;
372 0 : UBool isNotCESU8 = !hasCESU8Data(cnv);
373 :
374 0 : if (cnv->fromUChar32 && myTarget < targetLimit)
375 : {
376 0 : ch = cnv->fromUChar32;
377 0 : cnv->fromUChar32 = 0;
378 0 : goto lowsurrogate;
379 : }
380 :
381 0 : while (mySource < sourceLimit && myTarget < targetLimit)
382 : {
383 0 : ch = *(mySource++);
384 :
385 0 : if (ch < 0x80) /* Single byte */
386 : {
387 0 : *(myTarget++) = (uint8_t) ch;
388 : }
389 0 : else if (ch < 0x800) /* Double byte */
390 : {
391 0 : *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
392 0 : if (myTarget < targetLimit)
393 : {
394 0 : *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
395 : }
396 : else
397 : {
398 0 : cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
399 0 : cnv->charErrorBufferLength = 1;
400 0 : *err = U_BUFFER_OVERFLOW_ERROR;
401 : }
402 : }
403 : else {
404 : /* Check for surrogates */
405 0 : if(U16_IS_SURROGATE(ch) && isNotCESU8) {
406 : lowsurrogate:
407 0 : if (mySource < sourceLimit) {
408 : /* test both code units */
409 0 : if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
410 : /* convert and consume this supplementary code point */
411 0 : ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
412 0 : ++mySource;
413 : /* exit this condition tree */
414 : }
415 : else {
416 : /* this is an unpaired trail or lead code unit */
417 : /* callback(illegal) */
418 0 : cnv->fromUChar32 = ch;
419 0 : *err = U_ILLEGAL_CHAR_FOUND;
420 0 : break;
421 : }
422 : }
423 : else {
424 : /* no more input */
425 0 : cnv->fromUChar32 = ch;
426 0 : break;
427 : }
428 : }
429 :
430 : /* Do we write the buffer directly for speed,
431 : or do we have to be careful about target buffer space? */
432 0 : tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
433 :
434 0 : if (ch <= MAXIMUM_UCS2) {
435 0 : indexToWrite = 2;
436 0 : tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
437 : }
438 : else {
439 0 : indexToWrite = 3;
440 0 : tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
441 0 : tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
442 : }
443 0 : tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
444 0 : tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
445 :
446 0 : if (tempPtr == myTarget) {
447 : /* There was enough space to write the codepoint directly. */
448 0 : myTarget += (indexToWrite + 1);
449 : }
450 : else {
451 : /* We might run out of room soon. Write it slowly. */
452 0 : for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
453 0 : if (myTarget < targetLimit) {
454 0 : *(myTarget++) = *tempPtr;
455 : }
456 : else {
457 0 : cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
458 0 : *err = U_BUFFER_OVERFLOW_ERROR;
459 : }
460 : }
461 : }
462 : }
463 : }
464 :
465 0 : if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
466 : {
467 0 : *err = U_BUFFER_OVERFLOW_ERROR;
468 : }
469 :
470 0 : args->target = (char *) myTarget;
471 0 : args->source = mySource;
472 0 : }
473 :
474 0 : U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
475 : UErrorCode * err)
476 : {
477 0 : UConverter *cnv = args->converter;
478 0 : const UChar *mySource = args->source;
479 0 : int32_t *myOffsets = args->offsets;
480 0 : const UChar *sourceLimit = args->sourceLimit;
481 0 : uint8_t *myTarget = (uint8_t *) args->target;
482 0 : const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
483 : uint8_t *tempPtr;
484 : UChar32 ch;
485 : int32_t offsetNum, nextSourceIndex;
486 : int32_t indexToWrite;
487 : uint8_t tempBuf[4];
488 0 : UBool isNotCESU8 = !hasCESU8Data(cnv);
489 :
490 0 : if (cnv->fromUChar32 && myTarget < targetLimit)
491 : {
492 0 : ch = cnv->fromUChar32;
493 0 : cnv->fromUChar32 = 0;
494 0 : offsetNum = -1;
495 0 : nextSourceIndex = 0;
496 0 : goto lowsurrogate;
497 : } else {
498 0 : offsetNum = 0;
499 : }
500 :
501 0 : while (mySource < sourceLimit && myTarget < targetLimit)
502 : {
503 0 : ch = *(mySource++);
504 :
505 0 : if (ch < 0x80) /* Single byte */
506 : {
507 0 : *(myOffsets++) = offsetNum++;
508 0 : *(myTarget++) = (char) ch;
509 : }
510 0 : else if (ch < 0x800) /* Double byte */
511 : {
512 0 : *(myOffsets++) = offsetNum;
513 0 : *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
514 0 : if (myTarget < targetLimit)
515 : {
516 0 : *(myOffsets++) = offsetNum++;
517 0 : *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
518 : }
519 : else
520 : {
521 0 : cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
522 0 : cnv->charErrorBufferLength = 1;
523 0 : *err = U_BUFFER_OVERFLOW_ERROR;
524 : }
525 : }
526 : else
527 : /* Check for surrogates */
528 : {
529 0 : nextSourceIndex = offsetNum + 1;
530 :
531 0 : if(U16_IS_SURROGATE(ch) && isNotCESU8) {
532 : lowsurrogate:
533 0 : if (mySource < sourceLimit) {
534 : /* test both code units */
535 0 : if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
536 : /* convert and consume this supplementary code point */
537 0 : ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
538 0 : ++mySource;
539 0 : ++nextSourceIndex;
540 : /* exit this condition tree */
541 : }
542 : else {
543 : /* this is an unpaired trail or lead code unit */
544 : /* callback(illegal) */
545 0 : cnv->fromUChar32 = ch;
546 0 : *err = U_ILLEGAL_CHAR_FOUND;
547 0 : break;
548 : }
549 : }
550 : else {
551 : /* no more input */
552 0 : cnv->fromUChar32 = ch;
553 0 : break;
554 : }
555 : }
556 :
557 : /* Do we write the buffer directly for speed,
558 : or do we have to be careful about target buffer space? */
559 0 : tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
560 :
561 0 : if (ch <= MAXIMUM_UCS2) {
562 0 : indexToWrite = 2;
563 0 : tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
564 : }
565 : else {
566 0 : indexToWrite = 3;
567 0 : tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
568 0 : tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
569 : }
570 0 : tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
571 0 : tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
572 :
573 0 : if (tempPtr == myTarget) {
574 : /* There was enough space to write the codepoint directly. */
575 0 : myTarget += (indexToWrite + 1);
576 0 : myOffsets[0] = offsetNum;
577 0 : myOffsets[1] = offsetNum;
578 0 : myOffsets[2] = offsetNum;
579 0 : if (indexToWrite >= 3) {
580 0 : myOffsets[3] = offsetNum;
581 : }
582 0 : myOffsets += (indexToWrite + 1);
583 : }
584 : else {
585 : /* We might run out of room soon. Write it slowly. */
586 0 : for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
587 0 : if (myTarget < targetLimit)
588 : {
589 0 : *(myOffsets++) = offsetNum;
590 0 : *(myTarget++) = *tempPtr;
591 : }
592 : else
593 : {
594 0 : cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
595 0 : *err = U_BUFFER_OVERFLOW_ERROR;
596 : }
597 : }
598 : }
599 0 : offsetNum = nextSourceIndex;
600 : }
601 : }
602 :
603 0 : if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
604 : {
605 0 : *err = U_BUFFER_OVERFLOW_ERROR;
606 : }
607 :
608 0 : args->target = (char *) myTarget;
609 0 : args->source = mySource;
610 0 : args->offsets = myOffsets;
611 0 : }
612 :
613 : U_CDECL_BEGIN
614 0 : static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
615 : UErrorCode *err) {
616 : UConverter *cnv;
617 : const uint8_t *sourceInitial;
618 : const uint8_t *source;
619 : uint16_t extraBytesToWrite;
620 : uint8_t myByte;
621 : UChar32 ch;
622 : int8_t i, isLegalSequence;
623 :
624 : /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
625 :
626 0 : cnv = args->converter;
627 0 : sourceInitial = source = (const uint8_t *)args->source;
628 0 : if (source >= (const uint8_t *)args->sourceLimit)
629 : {
630 : /* no input */
631 0 : *err = U_INDEX_OUTOFBOUNDS_ERROR;
632 0 : return 0xffff;
633 : }
634 :
635 0 : myByte = (uint8_t)*(source++);
636 0 : if (myByte < 0x80)
637 : {
638 0 : args->source = (const char *)source;
639 0 : return (UChar32)myByte;
640 : }
641 :
642 0 : extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
643 0 : if (extraBytesToWrite == 0) {
644 0 : cnv->toUBytes[0] = myByte;
645 0 : cnv->toULength = 1;
646 0 : *err = U_ILLEGAL_CHAR_FOUND;
647 0 : args->source = (const char *)source;
648 0 : return 0xffff;
649 : }
650 :
651 : /*The byte sequence is longer than the buffer area passed*/
652 0 : if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
653 : {
654 : /* check if all of the remaining bytes are trail bytes */
655 0 : cnv->toUBytes[0] = myByte;
656 0 : i = 1;
657 0 : *err = U_TRUNCATED_CHAR_FOUND;
658 0 : while(source < (const uint8_t *)args->sourceLimit) {
659 0 : if(U8_IS_TRAIL(myByte = *source)) {
660 0 : cnv->toUBytes[i++] = myByte;
661 0 : ++source;
662 : } else {
663 : /* error even before we run out of input */
664 0 : *err = U_ILLEGAL_CHAR_FOUND;
665 0 : break;
666 : }
667 : }
668 0 : cnv->toULength = i;
669 0 : args->source = (const char *)source;
670 0 : return 0xffff;
671 : }
672 :
673 0 : isLegalSequence = 1;
674 0 : ch = myByte << 6;
675 0 : switch(extraBytesToWrite)
676 : {
677 : /* note: code falls through cases! (sic)*/
678 : case 6:
679 0 : ch += (myByte = *source);
680 0 : ch <<= 6;
681 0 : if (!U8_IS_TRAIL(myByte))
682 : {
683 0 : isLegalSequence = 0;
684 0 : break;
685 : }
686 0 : ++source;
687 : U_FALLTHROUGH;
688 : case 5:
689 0 : ch += (myByte = *source);
690 0 : ch <<= 6;
691 0 : if (!U8_IS_TRAIL(myByte))
692 : {
693 0 : isLegalSequence = 0;
694 0 : break;
695 : }
696 0 : ++source;
697 : U_FALLTHROUGH;
698 : case 4:
699 0 : ch += (myByte = *source);
700 0 : ch <<= 6;
701 0 : if (!U8_IS_TRAIL(myByte))
702 : {
703 0 : isLegalSequence = 0;
704 0 : break;
705 : }
706 0 : ++source;
707 : U_FALLTHROUGH;
708 : case 3:
709 0 : ch += (myByte = *source);
710 0 : ch <<= 6;
711 0 : if (!U8_IS_TRAIL(myByte))
712 : {
713 0 : isLegalSequence = 0;
714 0 : break;
715 : }
716 0 : ++source;
717 : U_FALLTHROUGH;
718 : case 2:
719 0 : ch += (myByte = *source);
720 0 : if (!U8_IS_TRAIL(myByte))
721 : {
722 0 : isLegalSequence = 0;
723 0 : break;
724 : }
725 0 : ++source;
726 : };
727 0 : ch -= offsetsFromUTF8[extraBytesToWrite];
728 0 : args->source = (const char *)source;
729 :
730 : /*
731 : * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
732 : * - use only trail bytes after a lead byte (checked above)
733 : * - use the right number of trail bytes for a given lead byte
734 : * - encode a code point <= U+10ffff
735 : * - use the fewest possible number of bytes for their code points
736 : * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
737 : *
738 : * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
739 : * There are no irregular sequences any more.
740 : */
741 0 : if (isLegalSequence &&
742 0 : (uint32_t)ch <= MAXIMUM_UTF &&
743 0 : (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
744 0 : !U_IS_SURROGATE(ch)
745 : ) {
746 0 : return ch; /* return the code point */
747 : }
748 :
749 0 : for(i = 0; sourceInitial < source; ++i) {
750 0 : cnv->toUBytes[i] = *sourceInitial++;
751 : }
752 0 : cnv->toULength = i;
753 0 : *err = U_ILLEGAL_CHAR_FOUND;
754 0 : return 0xffff;
755 : }
756 : U_CDECL_END
757 :
758 : /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
759 :
760 : /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
761 : static const UChar32
762 : utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
763 :
764 : /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
765 : static const UChar32
766 : utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
767 :
768 : U_CDECL_BEGIN
769 : /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
770 : static void U_CALLCONV
771 0 : ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
772 : UConverterToUnicodeArgs *pToUArgs,
773 : UErrorCode *pErrorCode) {
774 : UConverter *utf8;
775 : const uint8_t *source, *sourceLimit;
776 : uint8_t *target;
777 : int32_t targetCapacity;
778 : int32_t count;
779 :
780 : int8_t oldToULength, toULength, toULimit;
781 :
782 : UChar32 c;
783 : uint8_t b, t1, t2;
784 :
785 : /* set up the local pointers */
786 0 : utf8=pToUArgs->converter;
787 0 : source=(uint8_t *)pToUArgs->source;
788 0 : sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
789 0 : target=(uint8_t *)pFromUArgs->target;
790 0 : targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
791 :
792 : /* get the converter state from the UTF-8 UConverter */
793 0 : c=(UChar32)utf8->toUnicodeStatus;
794 0 : if(c!=0) {
795 0 : toULength=oldToULength=utf8->toULength;
796 0 : toULimit=(int8_t)utf8->mode;
797 : } else {
798 0 : toULength=oldToULength=toULimit=0;
799 : }
800 :
801 0 : count=(int32_t)(sourceLimit-source)+oldToULength;
802 0 : if(count<toULimit) {
803 : /*
804 : * Not enough input to complete the partial character.
805 : * Jump to moreBytes below - it will not output to target.
806 : */
807 0 : } else if(targetCapacity<toULimit) {
808 : /*
809 : * Not enough target capacity to output the partial character.
810 : * Let the standard converter handle this.
811 : */
812 0 : *pErrorCode=U_USING_DEFAULT_WARNING;
813 0 : return;
814 : } else {
815 : /*
816 : * Use a single counter for source and target, counting the minimum of
817 : * the source length and the target capacity.
818 : * As a result, the source length is checked only once per multi-byte
819 : * character instead of twice.
820 : *
821 : * Make sure that the last byte sequence is complete, or else
822 : * stop just before it.
823 : * (The longest legal byte sequence has 3 trail bytes.)
824 : * Count oldToULength (number of source bytes from a previous buffer)
825 : * into the source length but reduce the source index by toULimit
826 : * while going back over trail bytes in order to not go back into
827 : * the bytes that will be read for finishing a partial
828 : * sequence from the previous buffer.
829 : * Let the standard converter handle edge cases.
830 : */
831 : int32_t i;
832 :
833 0 : if(count>targetCapacity) {
834 0 : count=targetCapacity;
835 : }
836 :
837 0 : i=0;
838 0 : while(i<3 && i<(count-toULimit)) {
839 0 : b=source[count-oldToULength-i-1];
840 0 : if(U8_IS_TRAIL(b)) {
841 0 : ++i;
842 : } else {
843 0 : if(i<U8_COUNT_TRAIL_BYTES(b)) {
844 : /* stop converting before the lead byte if there are not enough trail bytes for it */
845 0 : count-=i+1;
846 : }
847 0 : break;
848 : }
849 : }
850 : }
851 :
852 0 : if(c!=0) {
853 0 : utf8->toUnicodeStatus=0;
854 0 : utf8->toULength=0;
855 0 : goto moreBytes;
856 : /* See note in ucnv_SBCSFromUTF8() about this goto. */
857 : }
858 :
859 : /* conversion loop */
860 0 : while(count>0) {
861 0 : b=*source++;
862 0 : if((int8_t)b>=0) {
863 : /* convert ASCII */
864 0 : *target++=b;
865 0 : --count;
866 0 : continue;
867 : } else {
868 0 : if(b>0xe0) {
869 0 : if( /* handle U+1000..U+D7FF inline */
870 0 : (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
871 0 : (b==0xed && (t1 <= 0x9f))) &&
872 0 : (t2=source[1]) >= 0x80 && t2 <= 0xbf
873 : ) {
874 0 : source+=2;
875 0 : *target++=b;
876 0 : *target++=t1;
877 0 : *target++=t2;
878 0 : count-=3;
879 0 : continue;
880 : }
881 0 : } else if(b<0xe0) {
882 0 : if( /* handle U+0080..U+07FF inline */
883 0 : b>=0xc2 &&
884 0 : (t1=*source) >= 0x80 && t1 <= 0xbf
885 : ) {
886 0 : ++source;
887 0 : *target++=b;
888 0 : *target++=t1;
889 0 : count-=2;
890 0 : continue;
891 : }
892 0 : } else if(b==0xe0) {
893 0 : if( /* handle U+0800..U+0FFF inline */
894 0 : (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
895 0 : (t2=source[1]) >= 0x80 && t2 <= 0xbf
896 : ) {
897 0 : source+=2;
898 0 : *target++=b;
899 0 : *target++=t1;
900 0 : *target++=t2;
901 0 : count-=3;
902 0 : continue;
903 : }
904 : }
905 :
906 : /* handle "complicated" and error cases, and continuing partial characters */
907 0 : oldToULength=0;
908 0 : toULength=1;
909 0 : toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
910 0 : c=b;
911 : moreBytes:
912 0 : while(toULength<toULimit) {
913 0 : if(source<sourceLimit) {
914 0 : b=*source;
915 0 : if(U8_IS_TRAIL(b)) {
916 0 : ++source;
917 0 : ++toULength;
918 0 : c=(c<<6)+b;
919 : } else {
920 0 : break; /* sequence too short, stop with toULength<toULimit */
921 : }
922 : } else {
923 : /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
924 0 : source-=(toULength-oldToULength);
925 0 : while(oldToULength<toULength) {
926 0 : utf8->toUBytes[oldToULength++]=*source++;
927 : }
928 0 : utf8->toUnicodeStatus=c;
929 0 : utf8->toULength=toULength;
930 0 : utf8->mode=toULimit;
931 0 : pToUArgs->source=(char *)source;
932 0 : pFromUArgs->target=(char *)target;
933 0 : return;
934 : }
935 : }
936 :
937 0 : if( toULength==toULimit && /* consumed all trail bytes */
938 0 : (toULength==3 || toULength==2) && /* BMP */
939 0 : (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
940 0 : (c<=0xd7ff || 0xe000<=c) /* not a surrogate */
941 : ) {
942 : /* legal byte sequence for BMP code point */
943 0 : } else if(
944 0 : toULength==toULimit && toULength==4 &&
945 0 : (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
946 : ) {
947 : /* legal byte sequence for supplementary code point */
948 : } else {
949 : /* error handling: illegal UTF-8 byte sequence */
950 0 : source-=(toULength-oldToULength);
951 0 : while(oldToULength<toULength) {
952 0 : utf8->toUBytes[oldToULength++]=*source++;
953 : }
954 0 : utf8->toULength=toULength;
955 0 : pToUArgs->source=(char *)source;
956 0 : pFromUArgs->target=(char *)target;
957 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
958 0 : return;
959 : }
960 :
961 : /* copy the legal byte sequence to the target */
962 : {
963 : int8_t i;
964 :
965 0 : for(i=0; i<oldToULength; ++i) {
966 0 : *target++=utf8->toUBytes[i];
967 : }
968 0 : source-=(toULength-oldToULength);
969 0 : for(; i<toULength; ++i) {
970 0 : *target++=*source++;
971 : }
972 0 : count-=toULength;
973 : }
974 : }
975 : }
976 :
977 0 : if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
978 0 : if(target==(const uint8_t *)pFromUArgs->targetLimit) {
979 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
980 : } else {
981 0 : b=*source;
982 0 : toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
983 0 : if(toULimit>(sourceLimit-source)) {
984 : /* collect a truncated byte sequence */
985 0 : toULength=0;
986 0 : c=b;
987 : for(;;) {
988 0 : utf8->toUBytes[toULength++]=b;
989 0 : if(++source==sourceLimit) {
990 : /* partial byte sequence at end of source */
991 0 : utf8->toUnicodeStatus=c;
992 0 : utf8->toULength=toULength;
993 0 : utf8->mode=toULimit;
994 0 : break;
995 0 : } else if(!U8_IS_TRAIL(b=*source)) {
996 : /* lead byte in trail byte position */
997 0 : utf8->toULength=toULength;
998 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
999 0 : break;
1000 : }
1001 0 : c=(c<<6)+b;
1002 : }
1003 : } else {
1004 : /* partial-sequence target overflow: fall back to the pivoting implementation */
1005 0 : *pErrorCode=U_USING_DEFAULT_WARNING;
1006 : }
1007 : }
1008 : }
1009 :
1010 : /* write back the updated pointers */
1011 0 : pToUArgs->source=(char *)source;
1012 0 : pFromUArgs->target=(char *)target;
1013 : }
1014 :
1015 : U_CDECL_END
1016 :
1017 : /* UTF-8 converter data ----------------------------------------------------- */
1018 :
1019 : static const UConverterImpl _UTF8Impl={
1020 : UCNV_UTF8,
1021 :
1022 : NULL,
1023 : NULL,
1024 :
1025 : NULL,
1026 : NULL,
1027 : NULL,
1028 :
1029 : ucnv_toUnicode_UTF8,
1030 : ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1031 : ucnv_fromUnicode_UTF8,
1032 : ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1033 : ucnv_getNextUChar_UTF8,
1034 :
1035 : NULL,
1036 : NULL,
1037 : NULL,
1038 : NULL,
1039 : ucnv_getNonSurrogateUnicodeSet,
1040 :
1041 : ucnv_UTF8FromUTF8,
1042 : ucnv_UTF8FromUTF8
1043 : };
1044 :
1045 : /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
1046 : static const UConverterStaticData _UTF8StaticData={
1047 : sizeof(UConverterStaticData),
1048 : "UTF-8",
1049 : 1208, UCNV_IBM, UCNV_UTF8,
1050 : 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
1051 : { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1052 : 0,
1053 : 0,
1054 : { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1055 : };
1056 :
1057 :
1058 : const UConverterSharedData _UTF8Data=
1059 : UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
1060 :
1061 : /* CESU-8 converter data ---------------------------------------------------- */
1062 :
1063 : static const UConverterImpl _CESU8Impl={
1064 : UCNV_CESU8,
1065 :
1066 : NULL,
1067 : NULL,
1068 :
1069 : NULL,
1070 : NULL,
1071 : NULL,
1072 :
1073 : ucnv_toUnicode_UTF8,
1074 : ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
1075 : ucnv_fromUnicode_UTF8,
1076 : ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
1077 : NULL,
1078 :
1079 : NULL,
1080 : NULL,
1081 : NULL,
1082 : NULL,
1083 : ucnv_getCompleteUnicodeSet,
1084 :
1085 : NULL,
1086 : NULL
1087 : };
1088 :
1089 : static const UConverterStaticData _CESU8StaticData={
1090 : sizeof(UConverterStaticData),
1091 : "CESU-8",
1092 : 9400, /* CCSID for CESU-8 */
1093 : UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
1094 : { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
1095 : 0,
1096 : 0,
1097 : { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1098 : };
1099 :
1100 :
1101 : const UConverterSharedData _CESU8Data=
1102 : UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
1103 :
1104 : #endif
|