Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 2002-2016, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : * file name: ucnvbocu.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2002mar27
16 : * created by: Markus W. Scherer
17 : *
18 : * This is an implementation of the Binary Ordered Compression for Unicode,
19 : * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
20 : */
21 :
22 : #include "unicode/utypes.h"
23 :
24 : #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
25 :
26 : #include "unicode/ucnv.h"
27 : #include "unicode/ucnv_cb.h"
28 : #include "unicode/utf16.h"
29 : #include "putilimp.h"
30 : #include "ucnv_bld.h"
31 : #include "ucnv_cnv.h"
32 : #include "uassert.h"
33 :
34 : /* BOCU-1 constants and macros ---------------------------------------------- */
35 :
36 : /*
37 : * BOCU-1 encodes the code points of a Unicode string as
38 : * a sequence of byte-encoded differences (slope detection),
39 : * preserving lexical order.
40 : *
41 : * Optimize the difference-taking for runs of Unicode text within
42 : * small scripts:
43 : *
44 : * Most small scripts are allocated within aligned 128-blocks of Unicode
45 : * code points. Lexical order is preserved if the "previous code point" state
46 : * is always moved into the middle of such a block.
47 : *
48 : * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
49 : * areas into the middle of those areas.
50 : *
51 : * C0 control codes and space are encoded with their US-ASCII bytes.
52 : * "prev" is reset for C0 controls but not for space.
53 : */
54 :
/* Initial "prev" state: the middle of the ASCII range. */
#define BOCU1_ASCII_PREV            0x40

/* Byte values that bound the difference encoding. */
#define BOCU1_MIN                   0x21
#define BOCU1_MIDDLE                0x90
#define BOCU1_MAX_LEAD              0xfe
#define BOCU1_MAX_TRAIL             0xff
#define BOCU1_RESET                 0xff

/* How many byte values can serve as lead bytes. */
#define BOCU1_COUNT                 (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* Some C0 control byte values double as trail bytes; account for them. */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* How many byte values can serve as trail bytes. */
#define BOCU1_TRAIL_COUNT           ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * Number of single-byte codes on each side of zero
 * (the difference 0, encoded as BOCU1_MIDDLE, counts as positive).
 */
#define BOCU1_SINGLE                64

/* Lead byte counts per direction for 2-, 3- and 4-byte sequences. */
#define BOCU1_LEAD_2                43
#define BOCU1_LEAD_3                3
#define BOCU1_LEAD_4                1

/* Largest/smallest difference that fits into one byte. */
#define BOCU1_REACH_POS_1           (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1           (-BOCU1_SINGLE)

/* Largest/smallest difference that fits into two bytes. */
#define BOCU1_REACH_POS_2           (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2           (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* Largest/smallest difference that fits into three bytes. */
#define BOCU1_REACH_POS_3           (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_3           (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* First lead byte of each positive multi-byte range. */
#define BOCU1_START_POS_2           (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3           (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4           (BOCU1_START_POS_3+BOCU1_LEAD_3)
    /* ==BOCU1_MAX_LEAD */

/* Boundary lead bytes of the negative multi-byte ranges. */
#define BOCU1_START_NEG_2           (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3           (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4           (BOCU1_START_NEG_3-BOCU1_LEAD_3)
    /* ==BOCU1_MIN+1 */

/* Sequence length (1..4) implied by a lead byte b, where b!=BOCU1_RESET. */
#define BOCU1_LENGTH_FROM_LEAD(b) \
    ((BOCU1_START_NEG_2<=(b) && (b)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(b) && (b)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(b) && (b)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * Sequence length of a packed result p:
 * the top byte holds the length 1..3, or is itself the lead byte of a 4-byte sequence.
 */
#define BOCU1_LENGTH_FROM_PACKED(p) \
    ((uint32_t)(p)<0x04000000 ? (p)>>24 : 4)
120 :
121 : /*
122 : * 12 commonly used C0 control codes (and space) are only used to encode
123 : * themselves directly,
124 : * which makes BOCU-1 MIME-usable and reasonably safe for
125 : * ASCII-oriented software.
126 : *
127 : * These controls are
128 : * 0 NUL
129 : *
130 : * 7 BEL
131 : * 8 BS
132 : *
133 : * 9 TAB
134 : * a LF
135 : * b VT
136 : * c FF
137 : * d CR
138 : *
139 : * e SO
140 : * f SI
141 : *
142 : * 1a SUB
143 : * 1b ESC
144 : *
145 : * The other 20 C0 controls are also encoded directly (to preserve order)
146 : * but are also used as trail bytes in difference encoding
147 : * (for better compression).
148 : */
/*
 * Map a trail value t (0..BOCU1_TRAIL_COUNT-1) to its external byte:
 * the first BOCU1_TRAIL_CONTROLS_COUNT values go through the bocu1TrailToByte[] table,
 * all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 */
#define BOCU1_TRAIL_TO_BYTE(t) \
    ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
150 :
151 : /*
152 : * Byte value map for control codes,
153 : * from external byte values 0x00..0x20
154 : * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
155 : * External byte values that are illegal as trail bytes are mapped to -1.
156 : */
157 : static const int8_t
158 : bocu1ByteToTrail[BOCU1_MIN]={
159 : /* 0 1 2 3 4 5 6 7 */
160 : -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
161 :
162 : /* 8 9 a b c d e f */
163 : -1, -1, -1, -1, -1, -1, -1, -1,
164 :
165 : /* 10 11 12 13 14 15 16 17 */
166 : 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
167 :
168 : /* 18 19 1a 1b 1c 1d 1e 1f */
169 : 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
170 :
171 : /* 20 */
172 : -1
173 : };
174 :
175 : /*
176 : * Byte value map for control codes,
177 : * from trail byte values 0..19 (0..0x13) as used in the difference calculation
178 : * to external byte values 0x00..0x20.
179 : */
180 : static const int8_t
181 : bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
182 : /* 0 1 2 3 4 5 6 7 */
183 : 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
184 :
185 : /* 8 9 a b c d e f */
186 : 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
187 :
188 : /* 10 11 12 13 */
189 : 0x1c, 0x1d, 0x1e, 0x1f
190 : };
191 :
/**
 * Integer division and modulo with negative numerators
 * yield negative modulo results and quotients that are one more than
 * what we need here.
 * This macro adjusts the results so that the modulo-value m is always >=0.
 *
 * For positive n, the if() condition is always false.
 *
 * The expansion is wrapped in do { } while(0) so that the macro behaves like
 * a single statement, including as the body of an unbraced if/else.
 * Note that n, d and m are each evaluated more than once;
 * pass only side-effect-free expressions.
 *
 * @param n Number to be split into quotient and rest.
 *          Will be modified to contain the quotient.
 * @param d Divisor.
 * @param m Output variable for the rest (modulo result).
 */
#define NEGDIVMOD(n, d, m) do { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
} while(0)
213 :
214 : /* Faster versions of packDiff() for single-byte-encoded diff values. */
215 :
/** True if the difference d lies in the single-byte range. */
#define DIFF_IS_SINGLE(d) ((d)>=BOCU1_REACH_NEG_1 && (d)<=BOCU1_REACH_POS_1)

/** The one byte that encodes a difference d from the single-byte range. */
#define PACK_SINGLE_DIFF(d) ((d)+BOCU1_MIDDLE)

/** True if the difference d lies within the two-byte range. */
#define DIFF_IS_DOUBLE(d) ((d)>=BOCU1_REACH_NEG_2 && (d)<=BOCU1_REACH_POS_2)
224 :
225 : /* BOCU-1 implementation functions ------------------------------------------ */
226 :
227 : #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
228 :
229 : /**
230 : * Compute the next "previous" value for differencing
231 : * from the current code point.
232 : *
233 : * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
234 : * @return "previous code point" state value
235 : */
236 : static inline int32_t
237 0 : bocu1Prev(int32_t c) {
238 : /* compute new prev */
239 0 : if(/* 0x3040<=c && */ c<=0x309f) {
240 : /* Hiragana is not 128-aligned */
241 0 : return 0x3070;
242 0 : } else if(0x4e00<=c && c<=0x9fa5) {
243 : /* CJK Unihan */
244 0 : return 0x4e00-BOCU1_REACH_NEG_2;
245 0 : } else if(0xac00<=c /* && c<=0xd7a3 */) {
246 : /* Korean Hangul */
247 0 : return (0xd7a3+0xac00)/2;
248 : } else {
249 : /* mostly small scripts */
250 0 : return BOCU1_SIMPLE_PREV(c);
251 : }
252 : }
253 :
254 : /** Fast version of bocu1Prev() for most scripts. */
255 : #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
256 :
257 : /*
258 : * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
259 : * The UConverter fields are used as follows:
260 : *
261 : * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
262 : *
263 : * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
264 : * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
265 : */
266 :
267 : /* BOCU-1-from-Unicode conversion functions --------------------------------- */
268 :
269 : /**
270 : * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
271 : * and return a packed integer with them.
272 : *
273 : * The encoding favors small absolute differences with short encodings
274 : * to compress runs of same-script characters.
275 : *
276 : * Optimized version with unrolled loops and fewer floating-point operations
277 : * than the standard packDiff().
278 : *
279 : * @param diff difference value -0x10ffff..0x10ffff
280 : * @return
281 : * 0x010000zz for 1-byte sequence zz
282 : * 0x0200yyzz for 2-byte sequence yy zz
283 : * 0x03xxyyzz for 3-byte sequence xx yy zz
284 : * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
285 : */
286 : static int32_t
287 0 : packDiff(int32_t diff) {
288 : int32_t result, m;
289 :
290 0 : U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
291 0 : if(diff>=BOCU1_REACH_NEG_1) {
292 : /* mostly positive differences, and single-byte negative ones */
293 : #if 0 /* single-byte case handled in macros, see below */
294 : if(diff<=BOCU1_REACH_POS_1) {
295 : /* single byte */
296 : return 0x01000000|(BOCU1_MIDDLE+diff);
297 : } else
298 : #endif
299 0 : if(diff<=BOCU1_REACH_POS_2) {
300 : /* two bytes */
301 0 : diff-=BOCU1_REACH_POS_1+1;
302 0 : result=0x02000000;
303 :
304 0 : m=diff%BOCU1_TRAIL_COUNT;
305 0 : diff/=BOCU1_TRAIL_COUNT;
306 0 : result|=BOCU1_TRAIL_TO_BYTE(m);
307 :
308 0 : result|=(BOCU1_START_POS_2+diff)<<8;
309 0 : } else if(diff<=BOCU1_REACH_POS_3) {
310 : /* three bytes */
311 0 : diff-=BOCU1_REACH_POS_2+1;
312 0 : result=0x03000000;
313 :
314 0 : m=diff%BOCU1_TRAIL_COUNT;
315 0 : diff/=BOCU1_TRAIL_COUNT;
316 0 : result|=BOCU1_TRAIL_TO_BYTE(m);
317 :
318 0 : m=diff%BOCU1_TRAIL_COUNT;
319 0 : diff/=BOCU1_TRAIL_COUNT;
320 0 : result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
321 :
322 0 : result|=(BOCU1_START_POS_3+diff)<<16;
323 : } else {
324 : /* four bytes */
325 0 : diff-=BOCU1_REACH_POS_3+1;
326 :
327 0 : m=diff%BOCU1_TRAIL_COUNT;
328 0 : diff/=BOCU1_TRAIL_COUNT;
329 0 : result=BOCU1_TRAIL_TO_BYTE(m);
330 :
331 0 : m=diff%BOCU1_TRAIL_COUNT;
332 0 : diff/=BOCU1_TRAIL_COUNT;
333 0 : result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
334 :
335 : /*
336 : * We know that / and % would deliver quotient 0 and rest=diff.
337 : * Avoid division and modulo for performance.
338 : */
339 0 : result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
340 :
341 0 : result|=((uint32_t)BOCU1_START_POS_4)<<24;
342 : }
343 : } else {
344 : /* two- to four-byte negative differences */
345 0 : if(diff>=BOCU1_REACH_NEG_2) {
346 : /* two bytes */
347 0 : diff-=BOCU1_REACH_NEG_1;
348 0 : result=0x02000000;
349 :
350 0 : NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
351 0 : result|=BOCU1_TRAIL_TO_BYTE(m);
352 :
353 0 : result|=(BOCU1_START_NEG_2+diff)<<8;
354 0 : } else if(diff>=BOCU1_REACH_NEG_3) {
355 : /* three bytes */
356 0 : diff-=BOCU1_REACH_NEG_2;
357 0 : result=0x03000000;
358 :
359 0 : NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
360 0 : result|=BOCU1_TRAIL_TO_BYTE(m);
361 :
362 0 : NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
363 0 : result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
364 :
365 0 : result|=(BOCU1_START_NEG_3+diff)<<16;
366 : } else {
367 : /* four bytes */
368 0 : diff-=BOCU1_REACH_NEG_3;
369 :
370 0 : NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
371 0 : result=BOCU1_TRAIL_TO_BYTE(m);
372 :
373 0 : NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
374 0 : result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
375 :
376 : /*
377 : * We know that NEGDIVMOD would deliver
378 : * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
379 : * Avoid division and modulo for performance.
380 : */
381 0 : m=diff+BOCU1_TRAIL_COUNT;
382 0 : result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
383 :
384 0 : result|=BOCU1_MIN<<24;
385 : }
386 : }
387 0 : return result;
388 : }
389 :
390 :
391 : static void U_CALLCONV
392 0 : _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
393 : UErrorCode *pErrorCode) {
394 : UConverter *cnv;
395 : const UChar *source, *sourceLimit;
396 : uint8_t *target;
397 : int32_t targetCapacity;
398 : int32_t *offsets;
399 :
400 : int32_t prev, c, diff;
401 :
402 : int32_t sourceIndex, nextSourceIndex;
403 :
404 : /* set up the local pointers */
405 0 : cnv=pArgs->converter;
406 0 : source=pArgs->source;
407 0 : sourceLimit=pArgs->sourceLimit;
408 0 : target=(uint8_t *)pArgs->target;
409 0 : targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
410 0 : offsets=pArgs->offsets;
411 :
412 : /* get the converter state from UConverter */
413 0 : c=cnv->fromUChar32;
414 0 : prev=(int32_t)cnv->fromUnicodeStatus;
415 0 : if(prev==0) {
416 0 : prev=BOCU1_ASCII_PREV;
417 : }
418 :
419 : /* sourceIndex=-1 if the current character began in the previous buffer */
420 0 : sourceIndex= c==0 ? 0 : -1;
421 0 : nextSourceIndex=0;
422 :
423 : /* conversion loop */
424 0 : if(c!=0 && targetCapacity>0) {
425 0 : goto getTrail;
426 : }
427 :
428 : fastSingle:
429 : /* fast loop for single-byte differences */
430 : /* use only one loop counter variable, targetCapacity, not also source */
431 0 : diff=(int32_t)(sourceLimit-source);
432 0 : if(targetCapacity>diff) {
433 0 : targetCapacity=diff;
434 : }
435 0 : while(targetCapacity>0 && (c=*source)<0x3000) {
436 0 : if(c<=0x20) {
437 0 : if(c!=0x20) {
438 0 : prev=BOCU1_ASCII_PREV;
439 : }
440 0 : *target++=(uint8_t)c;
441 0 : *offsets++=nextSourceIndex++;
442 0 : ++source;
443 0 : --targetCapacity;
444 : } else {
445 0 : diff=c-prev;
446 0 : if(DIFF_IS_SINGLE(diff)) {
447 0 : prev=BOCU1_SIMPLE_PREV(c);
448 0 : *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
449 0 : *offsets++=nextSourceIndex++;
450 0 : ++source;
451 0 : --targetCapacity;
452 : } else {
453 : break;
454 : }
455 : }
456 : }
457 : /* restore real values */
458 0 : targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
459 0 : sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
460 :
461 : /* regular loop for all cases */
462 0 : while(source<sourceLimit) {
463 0 : if(targetCapacity>0) {
464 0 : c=*source++;
465 0 : ++nextSourceIndex;
466 :
467 0 : if(c<=0x20) {
468 : /*
469 : * ISO C0 control & space:
470 : * Encode directly for MIME compatibility,
471 : * and reset state except for space, to not disrupt compression.
472 : */
473 0 : if(c!=0x20) {
474 0 : prev=BOCU1_ASCII_PREV;
475 : }
476 0 : *target++=(uint8_t)c;
477 0 : *offsets++=sourceIndex;
478 0 : --targetCapacity;
479 :
480 0 : sourceIndex=nextSourceIndex;
481 0 : continue;
482 : }
483 :
484 0 : if(U16_IS_LEAD(c)) {
485 : getTrail:
486 0 : if(source<sourceLimit) {
487 : /* test the following code unit */
488 0 : UChar trail=*source;
489 0 : if(U16_IS_TRAIL(trail)) {
490 0 : ++source;
491 0 : ++nextSourceIndex;
492 0 : c=U16_GET_SUPPLEMENTARY(c, trail);
493 : }
494 : } else {
495 : /* no more input */
496 0 : c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
497 0 : break;
498 : }
499 : }
500 :
501 : /*
502 : * all other Unicode code points c==U+0021..U+10ffff
503 : * are encoded with the difference c-prev
504 : *
505 : * a new prev is computed from c,
506 : * placed in the middle of a 0x80-block (for most small scripts) or
507 : * in the middle of the Unihan and Hangul blocks
508 : * to statistically minimize the following difference
509 : */
510 0 : diff=c-prev;
511 0 : prev=BOCU1_PREV(c);
512 0 : if(DIFF_IS_SINGLE(diff)) {
513 0 : *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
514 0 : *offsets++=sourceIndex;
515 0 : --targetCapacity;
516 0 : sourceIndex=nextSourceIndex;
517 0 : if(c<0x3000) {
518 0 : goto fastSingle;
519 : }
520 0 : } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
521 : /* optimize 2-byte case */
522 : int32_t m;
523 :
524 0 : if(diff>=0) {
525 0 : diff-=BOCU1_REACH_POS_1+1;
526 0 : m=diff%BOCU1_TRAIL_COUNT;
527 0 : diff/=BOCU1_TRAIL_COUNT;
528 0 : diff+=BOCU1_START_POS_2;
529 : } else {
530 0 : diff-=BOCU1_REACH_NEG_1;
531 0 : NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
532 0 : diff+=BOCU1_START_NEG_2;
533 : }
534 0 : *target++=(uint8_t)diff;
535 0 : *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
536 0 : *offsets++=sourceIndex;
537 0 : *offsets++=sourceIndex;
538 0 : targetCapacity-=2;
539 0 : sourceIndex=nextSourceIndex;
540 : } else {
541 : int32_t length; /* will be 2..4 */
542 :
543 0 : diff=packDiff(diff);
544 0 : length=BOCU1_LENGTH_FROM_PACKED(diff);
545 :
546 : /* write the output character bytes from diff and length */
547 : /* from the first if in the loop we know that targetCapacity>0 */
548 0 : if(length<=targetCapacity) {
549 0 : switch(length) {
550 : /* each branch falls through to the next one */
551 : case 4:
552 0 : *target++=(uint8_t)(diff>>24);
553 0 : *offsets++=sourceIndex;
554 : U_FALLTHROUGH;
555 : case 3:
556 0 : *target++=(uint8_t)(diff>>16);
557 0 : *offsets++=sourceIndex;
558 : U_FALLTHROUGH;
559 : case 2:
560 0 : *target++=(uint8_t)(diff>>8);
561 0 : *offsets++=sourceIndex;
562 : /* case 1: handled above */
563 0 : *target++=(uint8_t)diff;
564 0 : *offsets++=sourceIndex;
565 : U_FALLTHROUGH;
566 : default:
567 : /* will never occur */
568 0 : break;
569 : }
570 0 : targetCapacity-=length;
571 0 : sourceIndex=nextSourceIndex;
572 : } else {
573 : uint8_t *charErrorBuffer;
574 :
575 : /*
576 : * We actually do this backwards here:
577 : * In order to save an intermediate variable, we output
578 : * first to the overflow buffer what does not fit into the
579 : * regular target.
580 : */
581 : /* we know that 1<=targetCapacity<length<=4 */
582 0 : length-=targetCapacity;
583 0 : charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
584 0 : switch(length) {
585 : /* each branch falls through to the next one */
586 : case 3:
587 0 : *charErrorBuffer++=(uint8_t)(diff>>16);
588 : U_FALLTHROUGH;
589 : case 2:
590 0 : *charErrorBuffer++=(uint8_t)(diff>>8);
591 : U_FALLTHROUGH;
592 : case 1:
593 0 : *charErrorBuffer=(uint8_t)diff;
594 : U_FALLTHROUGH;
595 : default:
596 : /* will never occur */
597 0 : break;
598 : }
599 0 : cnv->charErrorBufferLength=(int8_t)length;
600 :
601 : /* now output what fits into the regular target */
602 0 : diff>>=8*length; /* length was reduced by targetCapacity */
603 0 : switch(targetCapacity) {
604 : /* each branch falls through to the next one */
605 : case 3:
606 0 : *target++=(uint8_t)(diff>>16);
607 0 : *offsets++=sourceIndex;
608 : U_FALLTHROUGH;
609 : case 2:
610 0 : *target++=(uint8_t)(diff>>8);
611 0 : *offsets++=sourceIndex;
612 : U_FALLTHROUGH;
613 : case 1:
614 0 : *target++=(uint8_t)diff;
615 0 : *offsets++=sourceIndex;
616 : U_FALLTHROUGH;
617 : default:
618 : /* will never occur */
619 0 : break;
620 : }
621 :
622 : /* target overflow */
623 0 : targetCapacity=0;
624 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
625 0 : break;
626 : }
627 : }
628 : } else {
629 : /* target is full */
630 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
631 0 : break;
632 : }
633 : }
634 :
635 : /* set the converter state back into UConverter */
636 0 : cnv->fromUChar32= c<0 ? -c : 0;
637 0 : cnv->fromUnicodeStatus=(uint32_t)prev;
638 :
639 : /* write back the updated pointers */
640 0 : pArgs->source=source;
641 0 : pArgs->target=(char *)target;
642 0 : pArgs->offsets=offsets;
643 0 : }
644 :
645 : /*
646 : * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
647 : * If a change is made in the original function, then either
648 : * change this function the same way or
649 : * re-copy the original function and remove the variables
650 : * offsets, sourceIndex, and nextSourceIndex.
651 : */
652 : static void U_CALLCONV
653 0 : _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
654 : UErrorCode *pErrorCode) {
655 : UConverter *cnv;
656 : const UChar *source, *sourceLimit;
657 : uint8_t *target;
658 : int32_t targetCapacity;
659 :
660 : int32_t prev, c, diff;
661 :
662 : /* set up the local pointers */
663 0 : cnv=pArgs->converter;
664 0 : source=pArgs->source;
665 0 : sourceLimit=pArgs->sourceLimit;
666 0 : target=(uint8_t *)pArgs->target;
667 0 : targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
668 :
669 : /* get the converter state from UConverter */
670 0 : c=cnv->fromUChar32;
671 0 : prev=(int32_t)cnv->fromUnicodeStatus;
672 0 : if(prev==0) {
673 0 : prev=BOCU1_ASCII_PREV;
674 : }
675 :
676 : /* conversion loop */
677 0 : if(c!=0 && targetCapacity>0) {
678 0 : goto getTrail;
679 : }
680 :
681 : fastSingle:
682 : /* fast loop for single-byte differences */
683 : /* use only one loop counter variable, targetCapacity, not also source */
684 0 : diff=(int32_t)(sourceLimit-source);
685 0 : if(targetCapacity>diff) {
686 0 : targetCapacity=diff;
687 : }
688 0 : while(targetCapacity>0 && (c=*source)<0x3000) {
689 0 : if(c<=0x20) {
690 0 : if(c!=0x20) {
691 0 : prev=BOCU1_ASCII_PREV;
692 : }
693 0 : *target++=(uint8_t)c;
694 : } else {
695 0 : diff=c-prev;
696 0 : if(DIFF_IS_SINGLE(diff)) {
697 0 : prev=BOCU1_SIMPLE_PREV(c);
698 0 : *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
699 : } else {
700 : break;
701 : }
702 : }
703 0 : ++source;
704 0 : --targetCapacity;
705 : }
706 : /* restore real values */
707 0 : targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
708 :
709 : /* regular loop for all cases */
710 0 : while(source<sourceLimit) {
711 0 : if(targetCapacity>0) {
712 0 : c=*source++;
713 :
714 0 : if(c<=0x20) {
715 : /*
716 : * ISO C0 control & space:
717 : * Encode directly for MIME compatibility,
718 : * and reset state except for space, to not disrupt compression.
719 : */
720 0 : if(c!=0x20) {
721 0 : prev=BOCU1_ASCII_PREV;
722 : }
723 0 : *target++=(uint8_t)c;
724 0 : --targetCapacity;
725 0 : continue;
726 : }
727 :
728 0 : if(U16_IS_LEAD(c)) {
729 : getTrail:
730 0 : if(source<sourceLimit) {
731 : /* test the following code unit */
732 0 : UChar trail=*source;
733 0 : if(U16_IS_TRAIL(trail)) {
734 0 : ++source;
735 0 : c=U16_GET_SUPPLEMENTARY(c, trail);
736 : }
737 : } else {
738 : /* no more input */
739 0 : c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
740 0 : break;
741 : }
742 : }
743 :
744 : /*
745 : * all other Unicode code points c==U+0021..U+10ffff
746 : * are encoded with the difference c-prev
747 : *
748 : * a new prev is computed from c,
749 : * placed in the middle of a 0x80-block (for most small scripts) or
750 : * in the middle of the Unihan and Hangul blocks
751 : * to statistically minimize the following difference
752 : */
753 0 : diff=c-prev;
754 0 : prev=BOCU1_PREV(c);
755 0 : if(DIFF_IS_SINGLE(diff)) {
756 0 : *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
757 0 : --targetCapacity;
758 0 : if(c<0x3000) {
759 0 : goto fastSingle;
760 : }
761 0 : } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
762 : /* optimize 2-byte case */
763 : int32_t m;
764 :
765 0 : if(diff>=0) {
766 0 : diff-=BOCU1_REACH_POS_1+1;
767 0 : m=diff%BOCU1_TRAIL_COUNT;
768 0 : diff/=BOCU1_TRAIL_COUNT;
769 0 : diff+=BOCU1_START_POS_2;
770 : } else {
771 0 : diff-=BOCU1_REACH_NEG_1;
772 0 : NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
773 0 : diff+=BOCU1_START_NEG_2;
774 : }
775 0 : *target++=(uint8_t)diff;
776 0 : *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
777 0 : targetCapacity-=2;
778 : } else {
779 : int32_t length; /* will be 2..4 */
780 :
781 0 : diff=packDiff(diff);
782 0 : length=BOCU1_LENGTH_FROM_PACKED(diff);
783 :
784 : /* write the output character bytes from diff and length */
785 : /* from the first if in the loop we know that targetCapacity>0 */
786 0 : if(length<=targetCapacity) {
787 0 : switch(length) {
788 : /* each branch falls through to the next one */
789 : case 4:
790 0 : *target++=(uint8_t)(diff>>24);
791 : U_FALLTHROUGH;
792 : case 3:
793 0 : *target++=(uint8_t)(diff>>16);
794 : /* case 2: handled above */
795 0 : *target++=(uint8_t)(diff>>8);
796 : /* case 1: handled above */
797 0 : *target++=(uint8_t)diff;
798 : U_FALLTHROUGH;
799 : default:
800 : /* will never occur */
801 0 : break;
802 : }
803 0 : targetCapacity-=length;
804 : } else {
805 : uint8_t *charErrorBuffer;
806 :
807 : /*
808 : * We actually do this backwards here:
809 : * In order to save an intermediate variable, we output
810 : * first to the overflow buffer what does not fit into the
811 : * regular target.
812 : */
813 : /* we know that 1<=targetCapacity<length<=4 */
814 0 : length-=targetCapacity;
815 0 : charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
816 0 : switch(length) {
817 : /* each branch falls through to the next one */
818 : case 3:
819 0 : *charErrorBuffer++=(uint8_t)(diff>>16);
820 : U_FALLTHROUGH;
821 : case 2:
822 0 : *charErrorBuffer++=(uint8_t)(diff>>8);
823 : U_FALLTHROUGH;
824 : case 1:
825 0 : *charErrorBuffer=(uint8_t)diff;
826 : U_FALLTHROUGH;
827 : default:
828 : /* will never occur */
829 0 : break;
830 : }
831 0 : cnv->charErrorBufferLength=(int8_t)length;
832 :
833 : /* now output what fits into the regular target */
834 0 : diff>>=8*length; /* length was reduced by targetCapacity */
835 0 : switch(targetCapacity) {
836 : /* each branch falls through to the next one */
837 : case 3:
838 0 : *target++=(uint8_t)(diff>>16);
839 : U_FALLTHROUGH;
840 : case 2:
841 0 : *target++=(uint8_t)(diff>>8);
842 : U_FALLTHROUGH;
843 : case 1:
844 0 : *target++=(uint8_t)diff;
845 : U_FALLTHROUGH;
846 : default:
847 : /* will never occur */
848 0 : break;
849 : }
850 :
851 : /* target overflow */
852 0 : targetCapacity=0;
853 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
854 0 : break;
855 : }
856 : }
857 : } else {
858 : /* target is full */
859 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
860 0 : break;
861 : }
862 : }
863 :
864 : /* set the converter state back into UConverter */
865 0 : cnv->fromUChar32= c<0 ? -c : 0;
866 0 : cnv->fromUnicodeStatus=(uint32_t)prev;
867 :
868 : /* write back the updated pointers */
869 0 : pArgs->source=source;
870 0 : pArgs->target=(char *)target;
871 0 : }
872 :
873 : /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
874 :
875 : /**
876 : * Function for BOCU-1 decoder; handles multi-byte lead bytes.
877 : *
878 : * @param b lead byte;
879 : * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
880 : * @return (diff<<2)|count
881 : */
882 : static inline int32_t
883 0 : decodeBocu1LeadByte(int32_t b) {
884 : int32_t diff, count;
885 :
886 0 : if(b>=BOCU1_START_NEG_2) {
887 : /* positive difference */
888 0 : if(b<BOCU1_START_POS_3) {
889 : /* two bytes */
890 0 : diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
891 0 : count=1;
892 0 : } else if(b<BOCU1_START_POS_4) {
893 : /* three bytes */
894 0 : diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
895 0 : count=2;
896 : } else {
897 : /* four bytes */
898 0 : diff=BOCU1_REACH_POS_3+1;
899 0 : count=3;
900 : }
901 : } else {
902 : /* negative difference */
903 0 : if(b>=BOCU1_START_NEG_3) {
904 : /* two bytes */
905 0 : diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
906 0 : count=1;
907 0 : } else if(b>BOCU1_MIN) {
908 : /* three bytes */
909 0 : diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
910 0 : count=2;
911 : } else {
912 : /* four bytes */
913 0 : diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
914 0 : count=3;
915 : }
916 : }
917 :
918 : /* return the state for decoding the trail byte(s) */
919 0 : return (diff<<2)|count;
920 : }
921 :
922 : /**
923 : * Function for BOCU-1 decoder; handles multi-byte trail bytes.
924 : *
925 : * @param count number of remaining trail bytes including this one
926 : * @param b trail byte
927 : * @return new delta for diff including b - <0 indicates an error
928 : *
929 : * @see decodeBocu1
930 : */
931 : static inline int32_t
932 0 : decodeBocu1TrailByte(int32_t count, int32_t b) {
933 0 : if(b<=0x20) {
934 : /* skip some C0 controls and make the trail byte range contiguous */
935 0 : b=bocu1ByteToTrail[b];
936 : /* b<0 for an illegal trail byte value will result in return<0 below */
937 : #if BOCU1_MAX_TRAIL<0xff
938 : } else if(b>BOCU1_MAX_TRAIL) {
939 : return -99;
940 : #endif
941 : } else {
942 0 : b-=BOCU1_TRAIL_BYTE_OFFSET;
943 : }
944 :
945 : /* add trail byte into difference and decrement count */
946 0 : if(count==1) {
947 0 : return b;
948 0 : } else if(count==2) {
949 0 : return b*BOCU1_TRAIL_COUNT;
950 : } else /* count==3 */ {
951 0 : return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
952 : }
953 : }
954 :
955 : static void U_CALLCONV
956 0 : _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
957 : UErrorCode *pErrorCode) {
958 : UConverter *cnv;
959 : const uint8_t *source, *sourceLimit;
960 : UChar *target;
961 : const UChar *targetLimit;
962 : int32_t *offsets;
963 :
964 : int32_t prev, count, diff, c;
965 :
966 : int8_t byteIndex;
967 : uint8_t *bytes;
968 :
969 : int32_t sourceIndex, nextSourceIndex;
970 :
971 : /* set up the local pointers */
972 0 : cnv=pArgs->converter;
973 0 : source=(const uint8_t *)pArgs->source;
974 0 : sourceLimit=(const uint8_t *)pArgs->sourceLimit;
975 0 : target=pArgs->target;
976 0 : targetLimit=pArgs->targetLimit;
977 0 : offsets=pArgs->offsets;
978 :
979 : /* get the converter state from UConverter */
980 0 : prev=(int32_t)cnv->toUnicodeStatus;
981 0 : if(prev==0) {
982 0 : prev=BOCU1_ASCII_PREV;
983 : }
984 0 : diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
985 0 : count=diff&3;
986 0 : diff>>=2;
987 :
988 0 : byteIndex=cnv->toULength;
989 0 : bytes=cnv->toUBytes;
990 :
991 : /* sourceIndex=-1 if the current character began in the previous buffer */
992 0 : sourceIndex=byteIndex==0 ? 0 : -1;
993 0 : nextSourceIndex=0;
994 :
995 : /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
996 0 : if(count>0 && byteIndex>0 && target<targetLimit) {
997 0 : goto getTrail;
998 : }
999 :
1000 : fastSingle:
1001 : /* fast loop for single-byte differences */
1002 : /* use count as the only loop counter variable */
1003 0 : diff=(int32_t)(sourceLimit-source);
1004 0 : count=(int32_t)(pArgs->targetLimit-target);
1005 0 : if(count>diff) {
1006 0 : count=diff;
1007 : }
1008 0 : while(count>0) {
1009 0 : if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
1010 0 : c=prev+(c-BOCU1_MIDDLE);
1011 0 : if(c<0x3000) {
1012 0 : *target++=(UChar)c;
1013 0 : *offsets++=nextSourceIndex++;
1014 0 : prev=BOCU1_SIMPLE_PREV(c);
1015 : } else {
1016 0 : break;
1017 : }
1018 0 : } else if(c<=0x20) {
1019 0 : if(c!=0x20) {
1020 0 : prev=BOCU1_ASCII_PREV;
1021 : }
1022 0 : *target++=(UChar)c;
1023 0 : *offsets++=nextSourceIndex++;
1024 : } else {
1025 0 : break;
1026 : }
1027 0 : ++source;
1028 0 : --count;
1029 : }
1030 0 : sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
1031 :
1032 : /* decode a sequence of single and lead bytes */
1033 0 : while(source<sourceLimit) {
1034 0 : if(target>=targetLimit) {
1035 : /* target is full */
1036 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1037 0 : break;
1038 : }
1039 :
1040 0 : ++nextSourceIndex;
1041 0 : c=*source++;
1042 0 : if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1043 : /* Write a code point directly from a single-byte difference. */
1044 0 : c=prev+(c-BOCU1_MIDDLE);
1045 0 : if(c<0x3000) {
1046 0 : *target++=(UChar)c;
1047 0 : *offsets++=sourceIndex;
1048 0 : prev=BOCU1_SIMPLE_PREV(c);
1049 0 : sourceIndex=nextSourceIndex;
1050 0 : goto fastSingle;
1051 : }
1052 0 : } else if(c<=0x20) {
1053 : /*
1054 : * Direct-encoded C0 control code or space.
1055 : * Reset prev for C0 control codes but not for space.
1056 : */
1057 0 : if(c!=0x20) {
1058 0 : prev=BOCU1_ASCII_PREV;
1059 : }
1060 0 : *target++=(UChar)c;
1061 0 : *offsets++=sourceIndex;
1062 0 : sourceIndex=nextSourceIndex;
1063 0 : continue;
1064 0 : } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
1065 : /* Optimize two-byte case. */
1066 0 : if(c>=BOCU1_MIDDLE) {
1067 0 : diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
1068 : } else {
1069 0 : diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
1070 : }
1071 :
1072 : /* trail byte */
1073 0 : ++nextSourceIndex;
1074 0 : c=decodeBocu1TrailByte(1, *source++);
1075 0 : if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
1076 0 : bytes[0]=source[-2];
1077 0 : bytes[1]=source[-1];
1078 0 : byteIndex=2;
1079 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1080 0 : break;
1081 : }
1082 0 : } else if(c==BOCU1_RESET) {
1083 : /* only reset the state, no code point */
1084 0 : prev=BOCU1_ASCII_PREV;
1085 0 : sourceIndex=nextSourceIndex;
1086 0 : continue;
1087 : } else {
1088 : /*
1089 : * For multi-byte difference lead bytes, set the decoder state
1090 : * with the partial difference value from the lead byte and
1091 : * with the number of trail bytes.
1092 : */
1093 0 : bytes[0]=(uint8_t)c;
1094 0 : byteIndex=1;
1095 :
1096 0 : diff=decodeBocu1LeadByte(c);
1097 0 : count=diff&3;
1098 0 : diff>>=2;
1099 : getTrail:
1100 : for(;;) {
1101 0 : if(source>=sourceLimit) {
1102 0 : goto endloop;
1103 : }
1104 0 : ++nextSourceIndex;
1105 0 : c=bytes[byteIndex++]=*source++;
1106 :
1107 : /* trail byte in any position */
1108 0 : c=decodeBocu1TrailByte(count, c);
1109 0 : if(c<0) {
1110 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1111 0 : goto endloop;
1112 : }
1113 :
1114 0 : diff+=c;
1115 0 : if(--count==0) {
1116 : /* final trail byte, deliver a code point */
1117 0 : byteIndex=0;
1118 0 : c=prev+diff;
1119 0 : if((uint32_t)c>0x10ffff) {
1120 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121 0 : goto endloop;
1122 : }
1123 0 : break;
1124 : }
1125 : }
1126 : }
1127 :
1128 : /* calculate the next prev and output c */
1129 0 : prev=BOCU1_PREV(c);
1130 0 : if(c<=0xffff) {
1131 0 : *target++=(UChar)c;
1132 0 : *offsets++=sourceIndex;
1133 : } else {
1134 : /* output surrogate pair */
1135 0 : *target++=U16_LEAD(c);
1136 0 : if(target<targetLimit) {
1137 0 : *target++=U16_TRAIL(c);
1138 0 : *offsets++=sourceIndex;
1139 0 : *offsets++=sourceIndex;
1140 : } else {
1141 : /* target overflow */
1142 0 : *offsets++=sourceIndex;
1143 0 : cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1144 0 : cnv->UCharErrorBufferLength=1;
1145 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1146 0 : break;
1147 : }
1148 : }
1149 0 : sourceIndex=nextSourceIndex;
1150 : }
1151 : endloop:
1152 :
1153 0 : if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1154 : /* set the converter state in UConverter to deal with the next character */
1155 0 : cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1156 0 : cnv->mode=0;
1157 : } else {
1158 : /* set the converter state back into UConverter */
1159 0 : cnv->toUnicodeStatus=(uint32_t)prev;
1160 0 : cnv->mode=(diff<<2)|count;
1161 : }
1162 0 : cnv->toULength=byteIndex;
1163 :
1164 : /* write back the updated pointers */
1165 0 : pArgs->source=(const char *)source;
1166 0 : pArgs->target=target;
1167 0 : pArgs->offsets=offsets;
1168 0 : return;
1169 : }
1170 :
1171 : /*
1172 : * Decodes BOCU-1 bytes from [pArgs->source, pArgs->sourceLimit) into UTF-16 code units at [pArgs->target, pArgs->targetLimit).
1173 : * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling: if a change is made in the original function, then either
1174 : * change this function the same way or re-copy the original function and remove the variables offsets, sourceIndex, and nextSourceIndex.
1175 : * State carried across calls lives in pArgs->converter: toUnicodeStatus (prev), mode ((diff<<2)|count), toUBytes/toULength (pending bytes).
1176 : * On return, pArgs->source and pArgs->target are advanced; *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR or U_ILLEGAL_CHAR_FOUND on those conditions.
1177 : */
1178 : static void U_CALLCONV
1179 0 : _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
1180 : UErrorCode *pErrorCode) {
1181 : UConverter *cnv;
1182 : const uint8_t *source, *sourceLimit;
1183 : UChar *target;
1184 : const UChar *targetLimit;
1185 :
1186 : int32_t prev, count, diff, c; /* prev: BOCU-1 "previous code point" state; count: trail bytes still expected; diff: accumulated difference; c: current byte or code point */
1187 :
1188 : int8_t byteIndex; /* number of bytes of the current multi-byte sequence stored in bytes[] */
1189 : uint8_t *bytes; /* alias for cnv->toUBytes */
1190 :
1191 : /* set up the local pointers */
1192 0 : cnv=pArgs->converter;
1193 0 : source=(const uint8_t *)pArgs->source;
1194 0 : sourceLimit=(const uint8_t *)pArgs->sourceLimit;
1195 0 : target=pArgs->target;
1196 0 : targetLimit=pArgs->targetLimit;
1197 :
1198 : /* get the converter state from UConverter */
1199 0 : prev=(int32_t)cnv->toUnicodeStatus;
1200 0 : if(prev==0) { /* a stored status of 0 is treated as "no state yet" */
1201 0 : prev=BOCU1_ASCII_PREV;
1202 : }
1203 0 : diff=cnv->mode; /* packed as (diff<<2)|count, see endloop; mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
1204 0 : count=diff&3; /* low two bits: remaining trail byte count */
1205 0 : diff>>=2; /* remaining bits: partial difference value */
1206 :
1207 0 : byteIndex=cnv->toULength;
1208 0 : bytes=cnv->toUBytes;
1209 :
1210 : /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
1211 0 : if(count>0 && byteIndex>0 && target<targetLimit) { /* a multi-byte sequence was cut off by the previous buffer: resume with its trail bytes */
1212 0 : goto getTrail;
1213 : }
1214 :
1215 : fastSingle:
1216 : /* fast loop for single-byte differences */
1217 : /* use count as the only loop counter variable */
1218 0 : diff=(int32_t)(sourceLimit-source);
1219 0 : count=(int32_t)(pArgs->targetLimit-target); /* reads pArgs->targetLimit directly; same value as the local targetLimit set above */
1220 0 : if(count>diff) {
1221 0 : count=diff; /* count=min(remaining source bytes, remaining target units), so the loop needs no per-iteration limit checks */
1222 : }
1223 0 : while(count>0) {
1224 0 : if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { /* byte is a single-byte difference; note source is not advanced yet */
1225 0 : c=prev+(c-BOCU1_MIDDLE);
1226 0 : if(c<0x3000) { /* only results below U+3000 are handled here; presumably the range where BOCU1_SIMPLE_PREV is valid (macro defined outside this view) */
1227 0 : *target++=(UChar)c;
1228 0 : prev=BOCU1_SIMPLE_PREV(c);
1229 : } else {
1230 0 : break; /* leave this byte unconsumed; the general loop below re-reads it */
1231 : }
1232 0 : } else if(c<=0x20) {
1233 0 : if(c!=0x20) { /* reset prev for everything at or below 0x20 except space */
1234 0 : prev=BOCU1_ASCII_PREV;
1235 : }
1236 0 : *target++=(UChar)c;
1237 : } else {
1238 0 : break; /* lead byte of a longer sequence, or another special byte: handled by the general loop */
1239 : }
1240 0 : ++source;
1241 0 : --count;
1242 : }
1243 :
1244 : /* decode a sequence of single and lead bytes */
1245 0 : while(source<sourceLimit) {
1246 0 : if(target>=targetLimit) {
1247 : /* target is full */
1248 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1249 0 : break;
1250 : }
1251 :
1252 0 : c=*source++;
1253 0 : if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
1254 : /* Write a code point directly from a single-byte difference. */
1255 0 : c=prev+(c-BOCU1_MIDDLE);
1256 0 : if(c<0x3000) {
1257 0 : *target++=(UChar)c;
1258 0 : prev=BOCU1_SIMPLE_PREV(c);
1259 0 : goto fastSingle; /* re-enter the fast loop with freshly computed limits */
1260 : } /* otherwise (c>=0x3000) fall through to the BOCU1_PREV/output code after the if-chain */
1261 0 : } else if(c<=0x20) {
1262 : /*
1263 : * Direct-encoded C0 control code or space.
1264 : * Reset prev for C0 control codes but not for space.
1265 : */
1266 0 : if(c!=0x20) {
1267 0 : prev=BOCU1_ASCII_PREV;
1268 : }
1269 0 : *target++=(UChar)c;
1270 0 : continue;
1271 0 : } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { /* two-byte lead AND its trail byte is available in this buffer; otherwise a later branch is taken */
1272 : /* Optimize two-byte case. */
1273 0 : if(c>=BOCU1_MIDDLE) {
1274 0 : diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; /* positive difference */
1275 : } else {
1276 0 : diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; /* negative difference */
1277 : }
1278 :
1279 : /* trail byte */
1280 0 : c=decodeBocu1TrailByte(1, *source++); /* a negative result is treated as an illegal trail byte below */
1281 0 : if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { /* the unsigned compare rejects both negative results and values above U+10FFFF */
1282 0 : bytes[0]=source[-2]; /* record both consumed bytes as the illegal sequence */
1283 0 : bytes[1]=source[-1];
1284 0 : byteIndex=2;
1285 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1286 0 : break;
1287 : }
1288 0 : } else if(c==BOCU1_RESET) {
1289 : /* only reset the state, no code point */
1290 0 : prev=BOCU1_ASCII_PREV;
1291 0 : continue;
1292 : } else {
1293 : /*
1294 : * For multi-byte difference lead bytes, set the decoder state
1295 : * with the partial difference value from the lead byte and
1296 : * with the number of trail bytes.
1297 : */
1298 0 : bytes[0]=(uint8_t)c; /* keep the lead byte so the sequence can be resumed or reported */
1299 0 : byteIndex=1;
1300 :
1301 0 : diff=decodeBocu1LeadByte(c); /* result is unpacked below the same way as cnv->mode: low two bits count, rest diff */
1302 0 : count=diff&3;
1303 0 : diff>>=2;
1304 : getTrail: /* also entered directly from the top of the function when resuming a sequence from the previous buffer */
1305 : for(;;) {
1306 0 : if(source>=sourceLimit) {
1307 0 : goto endloop; /* input exhausted mid-sequence: diff, count and byteIndex are saved at endloop for the next call */
1308 : }
1309 0 : c=bytes[byteIndex++]=*source++; /* store each trail byte alongside the lead byte */
1310 :
1311 : /* trail byte in any position */
1312 0 : c=decodeBocu1TrailByte(count, c);
1313 0 : if(c<0) {
1314 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND; /* byteIndex still counts the bytes stored so far, including this one */
1315 0 : goto endloop;
1316 : }
1317 :
1318 0 : diff+=c;
1319 0 : if(--count==0) {
1320 : /* final trail byte, deliver a code point */
1321 0 : byteIndex=0; /* NOTE(review): cleared before the range check below, so an out-of-range result stores toULength==0, unlike the two-byte path which keeps byteIndex=2 - confirm this is intended */
1322 0 : c=prev+diff;
1323 0 : if((uint32_t)c>0x10ffff) { /* negative or above U+10FFFF */
1324 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1325 0 : goto endloop;
1326 : }
1327 0 : break;
1328 : }
1329 : }
1330 : }
1331 :
1332 : /* calculate the next prev and output c; reached for multi-byte results and for single-byte results >=0x3000 */
1333 0 : prev=BOCU1_PREV(c);
1334 0 : if(c<=0xffff) {
1335 0 : *target++=(UChar)c; /* room for one unit was checked at the top of the loop, or by the resume check for a sequence continued from the previous buffer */
1336 : } else {
1337 : /* output surrogate pair */
1338 0 : *target++=U16_LEAD(c);
1339 0 : if(target<targetLimit) {
1340 0 : *target++=U16_TRAIL(c);
1341 : } else {
1342 : /* target overflow: park the trail surrogate in the converter's UCharErrorBuffer */
1343 0 : cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
1344 0 : cnv->UCharErrorBufferLength=1;
1345 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1346 0 : break;
1347 : }
1348 : }
1349 : }
1350 : endloop:
1351 :
1352 0 : if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
1353 : /* set the converter state in UConverter to deal with the next character */
1354 0 : cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
1355 0 : cnv->mode=0;
1356 : } else {
1357 : /* set the converter state back into UConverter */
1358 0 : cnv->toUnicodeStatus=(uint32_t)prev;
1359 0 : cnv->mode=(diff<<2)|count; /* may hold leftover fast-loop values; the resume check at the top also requires byteIndex>0 */
1360 : }
1361 0 : cnv->toULength=byteIndex; /* number of valid bytes in cnv->toUBytes (pending or illegal sequence) */
1362 :
1363 : /* write back the updated pointers */
1364 0 : pArgs->source=(const char *)source;
1365 0 : pArgs->target=target;
1366 0 : return;
1367 : }
1368 :
1369 : /* miscellaneous ------------------------------------------------------------ */
1370 :
1371 : static const UConverterImpl _Bocu1Impl={ /* implementation table for the BOCU-1 converter; positional initializer, NULL marks slots this converter does not provide */
1372 : UCNV_BOCU1, /* converter type constant */
1373 :
1374 : NULL, /* NOTE(review): slot meanings follow the UConverterImpl declaration, which is not visible here (presumably in ucnv_cnv.h) - confirm before reordering */
1375 : NULL,
1376 :
1377 : NULL,
1378 : NULL,
1379 : NULL,
1380 :
1381 : _Bocu1ToUnicode, /* BOCU-1 to UTF-16, no offsets */
1382 : _Bocu1ToUnicodeWithOffsets, /* BOCU-1 to UTF-16, also writing source offsets */
1383 : _Bocu1FromUnicode, /* from-Unicode counterparts, defined outside this view */
1384 : _Bocu1FromUnicodeWithOffsets,
1385 : NULL,
1386 :
1387 : NULL,
1388 : NULL,
1389 : NULL,
1390 : NULL,
1391 : ucnv_getCompleteUnicodeSet, /* external helper; by its name it reports the complete Unicode set - confirm slot semantics */
1392 :
1393 : NULL,
1394 : NULL
1395 : };
1396 :
1397 : static const UConverterStaticData _Bocu1StaticData={ /* constant descriptive data for the BOCU-1 converter; positional initializer */
1398 : sizeof(UConverterStaticData), /* size of this structure */
1399 : "BOCU-1", /* converter name string */
1400 : 1214, /* CCSID for BOCU-1 */
1401 : UCNV_IBM, UCNV_BOCU1, /* presumably platform and converter type - confirm against the UConverterStaticData field order */
1402 : 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
1403 : { 0x1a, 0, 0, 0 }, 1, /* substitution bytes and their length; BOCU-1 never needs to write a subchar */
1404 : FALSE, FALSE, /* NOTE(review): two boolean fields whose names are not visible here - confirm in the struct declaration */
1405 : 0,
1406 : 0,
1407 : { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1408 : };
1409 :
1410 : const UConverterSharedData _Bocu1Data= /* shared-data record for BOCU-1, built from the static data and implementation table defined above */
1411 : UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); /* initializer macro is defined elsewhere; its expansion is not visible in this file */
1412 :
1413 : #endif
|