Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : ******************************************************************************
5 : *
6 : * Copyright (C) 2000-2016, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : ******************************************************************************
10 : * file name: ucnvscsu.c
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2000nov18
16 : * created by: Markus W. Scherer
17 : *
18 : * This is an implementation of the Standard Compression Scheme for Unicode
19 : * as defined in http://www.unicode.org/unicode/reports/tr6/ .
20 : * Reserved commands and window settings are treated as illegal sequences and
21 : * will result in callback calls.
22 : */
23 :
24 : #include "unicode/utypes.h"
25 :
26 : #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
27 :
28 : #include "unicode/ucnv.h"
29 : #include "unicode/ucnv_cb.h"
30 : #include "unicode/utf16.h"
31 : #include "ucnv_bld.h"
32 : #include "ucnv_cnv.h"
33 : #include "cmemory.h"
34 :
35 : /* SCSU definitions --------------------------------------------------------- */
36 :
/*
 * SCSU command byte values.
 * S* commands are recognized in single-byte mode, U* commands in Unicode mode.
 * Only the first and last value of each contiguous command range is named.
 */
enum {
    /* single-byte mode */
    SQ0=0x01,   /* Quote from window pair 0 */
    SQ7=0x08,   /* Quote from window pair 7 */
    SDX=0x0B,   /* Define a window as extended */
    Srs=0x0C,   /* reserved */
    SQU=0x0E,   /* Quote a single Unicode character */
    SCU=0x0F,   /* Change to Unicode mode */
    SC0=0x10,   /* Select window 0 */
    SC7=0x17,   /* Select window 7 */
    SD0=0x18,   /* Define and select window 0 */
    SD7=0x1F,   /* Define and select window 7 */

    /* Unicode mode */
    UC0=0xE0,   /* Select window 0 */
    UC7=0xE7,   /* Select window 7 */
    UD0=0xE8,   /* Define and select window 0 */
    UD7=0xEF,   /* Define and select window 7 */
    UQU=0xF0,   /* Quote a single Unicode character */
    UDX=0xF1,   /* Define a Window as extended */
    Urs=0xF2    /* reserved */
};
58 :
/* thresholds for interpreting the one-byte window offset value of SDn/UDn */
enum {
    /*
     * Unicode code points from 3400 to E000 are not addressable by a
     * dynamic window, since no short-run alphabets are found in these
     * areas. Therefore gapOffset is added to all values from gapThreshold.
     */
    gapThreshold=0x68,
    gapOffset=0xAC00,

    /* values between reservedStart and fixedThreshold are reserved */
    reservedStart=0xA8,

    /* values from fixedThreshold select an entry of the predefined fixed offsets table */
    fixedThreshold=0xF9
};
74 :
/* start offsets of the 8 static windows; these never change */
static const uint32_t staticOffsets[8]={
    0x0000,     /* ASCII for quoted tags */
    0x0080,     /* Latin - 1 Supplement (for access to punctuation) */
    0x0100,     /* Latin Extended-A */
    0x0300,     /* Combining Diacritical Marks */
    0x2000,     /* General Punctuation */
    0x2080,     /* Currency Symbols */
    0x2100,     /* Letterlike Symbols and Number Forms */
    0x3000      /* CJK Symbols and punctuation */
};
86 :
/* start offsets that the 8 dynamic (sliding) windows have after a reset */
static const uint32_t initialDynamicOffsets[8]={
    0x0080,     /* Latin-1 */
    0x00C0,     /* Latin Extended A */
    0x0400,     /* Cyrillic */
    0x0600,     /* Arabic */
    0x0900,     /* Devanagari */
    0x3040,     /* Hiragana */
    0x30A0,     /* Katakana */
    0xFF00      /* Fullwidth ASCII */
};
98 :
/*
 * Table of fixed predefined offsets,
 * indexed by (window offset value - fixedThreshold), i.e., values 0xF9..0xFF
 */
static const uint32_t fixedOffsets[]={
    /* 0xF9 */ 0x00C0,  /* Latin-1 Letters + half of Latin Extended A */
    /* 0xFA */ 0x0250,  /* IPA extensions */
    /* 0xFB */ 0x0370,  /* Greek */
    /* 0xFC */ 0x0530,  /* Armenian */
    /* 0xFD */ 0x3040,  /* Hiragana */
    /* 0xFE */ 0x30A0,  /* Katakana */
    /* 0xFF */ 0xFF60   /* Halfwidth Katakana */
};
109 :
/*
 * state values of the toUnicode state machine:
 * readCommand means that the next byte starts a new sequence,
 * every other value names the byte that is expected next
 */
enum {
    readCommand,        /* between sequences */
    quotePairOne,       /* first byte of a quoted UTF-16 code unit */
    quotePairTwo,       /* second byte of a UTF-16 code unit */
    quoteOne,           /* byte quoted from a window */
    definePairOne,      /* first byte of an extended window definition */
    definePairTwo,      /* second byte of an extended window definition */
    defineOne           /* offset byte of a window definition */
};
120 :
121 : typedef struct SCSUData {
122 : /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */
123 : uint32_t toUDynamicOffsets[8];
124 : uint32_t fromUDynamicOffsets[8];
125 :
126 : /* state machine state - toUnicode */
127 : UBool toUIsSingleByteMode;
128 : uint8_t toUState;
129 : int8_t toUQuoteWindow, toUDynamicWindow;
130 : uint8_t toUByteOne;
131 : uint8_t toUPadding[3];
132 :
133 : /* state machine state - fromUnicode */
134 : UBool fromUIsSingleByteMode;
135 : int8_t fromUDynamicWindow;
136 :
137 : /*
138 : * windowUse[] keeps track of the use of the dynamic windows:
139 : * At nextWindowUseIndex there is the least recently used window,
140 : * and the following windows (in a wrapping manner) are more and more
141 : * recently used.
142 : * At nextWindowUseIndex-1 there is the most recently used window.
143 : */
144 : uint8_t locale;
145 : int8_t nextWindowUseIndex;
146 : int8_t windowUse[8];
147 : } SCSUData;
148 :
/*
 * initial contents of SCSUData.windowUse[]:
 * the generic order, and the order used for the Japanese locale
 */
static const int8_t initialWindowUse[8]=   { 7, 0, 3, 2, 4, 5, 6, 1 };
static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 };
151 :
/* values for SCSUData.locale */
enum {
    lGeneric,   /* any locale other than Japanese */
    l_ja        /* Japanese */
};
155 :
156 : /* SCSU setup functions ----------------------------------------------------- */
157 : U_CDECL_BEGIN
158 : static void U_CALLCONV
159 0 : _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
160 0 : SCSUData *scsu=(SCSUData *)cnv->extraInfo;
161 :
162 0 : if(choice<=UCNV_RESET_TO_UNICODE) {
163 : /* reset toUnicode */
164 0 : uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
165 :
166 0 : scsu->toUIsSingleByteMode=TRUE;
167 0 : scsu->toUState=readCommand;
168 0 : scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
169 0 : scsu->toUByteOne=0;
170 :
171 0 : cnv->toULength=0;
172 : }
173 0 : if(choice!=UCNV_RESET_TO_UNICODE) {
174 : /* reset fromUnicode */
175 0 : uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
176 :
177 0 : scsu->fromUIsSingleByteMode=TRUE;
178 0 : scsu->fromUDynamicWindow=0;
179 :
180 0 : scsu->nextWindowUseIndex=0;
181 0 : switch(scsu->locale) {
182 : case l_ja:
183 0 : uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
184 0 : break;
185 : default:
186 0 : uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
187 0 : break;
188 : }
189 :
190 0 : cnv->fromUChar32=0;
191 : }
192 0 : }
193 :
194 : static void U_CALLCONV
195 0 : _SCSUOpen(UConverter *cnv,
196 : UConverterLoadArgs *pArgs,
197 : UErrorCode *pErrorCode) {
198 0 : const char *locale=pArgs->locale;
199 0 : if(pArgs->onlyTestIsLoadable) {
200 0 : return;
201 : }
202 0 : cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
203 0 : if(cnv->extraInfo!=NULL) {
204 0 : if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
205 0 : ((SCSUData *)cnv->extraInfo)->locale=l_ja;
206 : } else {
207 0 : ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
208 : }
209 0 : _SCSUReset(cnv, UCNV_RESET_BOTH);
210 : } else {
211 0 : *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
212 : }
213 :
214 : /* Set the substitution character U+fffd as a Unicode string. */
215 0 : cnv->subUChars[0]=0xfffd;
216 0 : cnv->subCharLen=-1;
217 : }
218 :
219 : static void U_CALLCONV
220 0 : _SCSUClose(UConverter *cnv) {
221 0 : if(cnv->extraInfo!=NULL) {
222 0 : if(!cnv->isExtraLocal) {
223 0 : uprv_free(cnv->extraInfo);
224 : }
225 0 : cnv->extraInfo=NULL;
226 : }
227 0 : }
228 :
229 : /* SCSU-to-Unicode conversion functions ------------------------------------- */
230 :
231 : static void U_CALLCONV
232 0 : _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
233 : UErrorCode *pErrorCode) {
234 : UConverter *cnv;
235 : SCSUData *scsu;
236 : const uint8_t *source, *sourceLimit;
237 : UChar *target;
238 : const UChar *targetLimit;
239 : int32_t *offsets;
240 : UBool isSingleByteMode;
241 : uint8_t state, byteOne;
242 : int8_t quoteWindow, dynamicWindow;
243 :
244 : int32_t sourceIndex, nextSourceIndex;
245 :
246 : uint8_t b;
247 :
248 : /* set up the local pointers */
249 0 : cnv=pArgs->converter;
250 0 : scsu=(SCSUData *)cnv->extraInfo;
251 :
252 0 : source=(const uint8_t *)pArgs->source;
253 0 : sourceLimit=(const uint8_t *)pArgs->sourceLimit;
254 0 : target=pArgs->target;
255 0 : targetLimit=pArgs->targetLimit;
256 0 : offsets=pArgs->offsets;
257 :
258 : /* get the state machine state */
259 0 : isSingleByteMode=scsu->toUIsSingleByteMode;
260 0 : state=scsu->toUState;
261 0 : quoteWindow=scsu->toUQuoteWindow;
262 0 : dynamicWindow=scsu->toUDynamicWindow;
263 0 : byteOne=scsu->toUByteOne;
264 :
265 : /* sourceIndex=-1 if the current character began in the previous buffer */
266 0 : sourceIndex=state==readCommand ? 0 : -1;
267 0 : nextSourceIndex=0;
268 :
269 : /*
270 : * conversion "loop"
271 : *
272 : * For performance, this is not a normal C loop.
273 : * Instead, there are two code blocks for the two SCSU modes.
274 : * The function branches to either one, and a change of the mode is done with a goto to
275 : * the other branch.
276 : *
277 : * Each branch has two conventional loops:
278 : * - a fast-path loop for the most common codes in the mode
279 : * - a loop for all other codes in the mode
280 : * When the fast-path runs into a code that it cannot handle, its loop ends and it
281 : * runs into the following loop to handle the other codes.
282 : * The end of the input or output buffer is also handled by the slower loop.
283 : * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
284 : *
285 : * The callback handling is done by returning with an error code.
286 : * The conversion framework actually calls the callback function.
287 : */
288 0 : if(isSingleByteMode) {
289 : /* fast path for single-byte mode */
290 0 : if(state==readCommand) {
291 : fastSingle:
292 0 : while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
293 0 : ++source;
294 0 : ++nextSourceIndex;
295 0 : if(b<=0x7f) {
296 : /* write US-ASCII graphic character or DEL */
297 0 : *target++=(UChar)b;
298 0 : if(offsets!=NULL) {
299 0 : *offsets++=sourceIndex;
300 : }
301 : } else {
302 : /* write from dynamic window */
303 0 : uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
304 0 : if(c<=0xffff) {
305 0 : *target++=(UChar)c;
306 0 : if(offsets!=NULL) {
307 0 : *offsets++=sourceIndex;
308 : }
309 : } else {
310 : /* output surrogate pair */
311 0 : *target++=(UChar)(0xd7c0+(c>>10));
312 0 : if(target<targetLimit) {
313 0 : *target++=(UChar)(0xdc00|(c&0x3ff));
314 0 : if(offsets!=NULL) {
315 0 : *offsets++=sourceIndex;
316 0 : *offsets++=sourceIndex;
317 : }
318 : } else {
319 : /* target overflow */
320 0 : if(offsets!=NULL) {
321 0 : *offsets++=sourceIndex;
322 : }
323 0 : cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
324 0 : cnv->UCharErrorBufferLength=1;
325 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
326 0 : goto endloop;
327 : }
328 : }
329 : }
330 0 : sourceIndex=nextSourceIndex;
331 : }
332 : }
333 :
334 : /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
335 : singleByteMode:
336 0 : while(source<sourceLimit) {
337 0 : if(target>=targetLimit) {
338 : /* target is full */
339 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
340 0 : break;
341 : }
342 0 : b=*source++;
343 0 : ++nextSourceIndex;
344 0 : switch(state) {
345 : case readCommand:
346 : /* redundant conditions are commented out */
347 : /* here: b<0x20 because otherwise we would be in fastSingle */
348 0 : if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
349 : /* CR/LF/TAB/NUL */
350 0 : *target++=(UChar)b;
351 0 : if(offsets!=NULL) {
352 0 : *offsets++=sourceIndex;
353 : }
354 0 : sourceIndex=nextSourceIndex;
355 0 : goto fastSingle;
356 0 : } else if(SC0<=b) {
357 0 : if(b<=SC7) {
358 0 : dynamicWindow=(int8_t)(b-SC0);
359 0 : sourceIndex=nextSourceIndex;
360 0 : goto fastSingle;
361 : } else /* if(SD0<=b && b<=SD7) */ {
362 0 : dynamicWindow=(int8_t)(b-SD0);
363 0 : state=defineOne;
364 : }
365 0 : } else if(/* SQ0<=b && */ b<=SQ7) {
366 0 : quoteWindow=(int8_t)(b-SQ0);
367 0 : state=quoteOne;
368 0 : } else if(b==SDX) {
369 0 : state=definePairOne;
370 0 : } else if(b==SQU) {
371 0 : state=quotePairOne;
372 0 : } else if(b==SCU) {
373 0 : sourceIndex=nextSourceIndex;
374 0 : isSingleByteMode=FALSE;
375 0 : goto fastUnicode;
376 : } else /* Srs */ {
377 : /* callback(illegal) */
378 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
379 0 : cnv->toUBytes[0]=b;
380 0 : cnv->toULength=1;
381 0 : goto endloop;
382 : }
383 :
384 : /* store the first byte of a multibyte sequence in toUBytes[] */
385 0 : cnv->toUBytes[0]=b;
386 0 : cnv->toULength=1;
387 0 : break;
388 : case quotePairOne:
389 0 : byteOne=b;
390 0 : cnv->toUBytes[1]=b;
391 0 : cnv->toULength=2;
392 0 : state=quotePairTwo;
393 0 : break;
394 : case quotePairTwo:
395 0 : *target++=(UChar)((byteOne<<8)|b);
396 0 : if(offsets!=NULL) {
397 0 : *offsets++=sourceIndex;
398 : }
399 0 : sourceIndex=nextSourceIndex;
400 0 : state=readCommand;
401 0 : goto fastSingle;
402 : case quoteOne:
403 0 : if(b<0x80) {
404 : /* all static offsets are in the BMP */
405 0 : *target++=(UChar)(staticOffsets[quoteWindow]+b);
406 0 : if(offsets!=NULL) {
407 0 : *offsets++=sourceIndex;
408 : }
409 : } else {
410 : /* write from dynamic window */
411 0 : uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
412 0 : if(c<=0xffff) {
413 0 : *target++=(UChar)c;
414 0 : if(offsets!=NULL) {
415 0 : *offsets++=sourceIndex;
416 : }
417 : } else {
418 : /* output surrogate pair */
419 0 : *target++=(UChar)(0xd7c0+(c>>10));
420 0 : if(target<targetLimit) {
421 0 : *target++=(UChar)(0xdc00|(c&0x3ff));
422 0 : if(offsets!=NULL) {
423 0 : *offsets++=sourceIndex;
424 0 : *offsets++=sourceIndex;
425 : }
426 : } else {
427 : /* target overflow */
428 0 : if(offsets!=NULL) {
429 0 : *offsets++=sourceIndex;
430 : }
431 0 : cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
432 0 : cnv->UCharErrorBufferLength=1;
433 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
434 0 : goto endloop;
435 : }
436 : }
437 : }
438 0 : sourceIndex=nextSourceIndex;
439 0 : state=readCommand;
440 0 : goto fastSingle;
441 : case definePairOne:
442 0 : dynamicWindow=(int8_t)((b>>5)&7);
443 0 : byteOne=(uint8_t)(b&0x1f);
444 0 : cnv->toUBytes[1]=b;
445 0 : cnv->toULength=2;
446 0 : state=definePairTwo;
447 0 : break;
448 : case definePairTwo:
449 0 : scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
450 0 : sourceIndex=nextSourceIndex;
451 0 : state=readCommand;
452 0 : goto fastSingle;
453 : case defineOne:
454 0 : if(b==0) {
455 : /* callback(illegal): Reserved window offset value 0 */
456 0 : cnv->toUBytes[1]=b;
457 0 : cnv->toULength=2;
458 0 : goto endloop;
459 0 : } else if(b<gapThreshold) {
460 0 : scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
461 0 : } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
462 0 : scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
463 0 : } else if(b>=fixedThreshold) {
464 0 : scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
465 : } else {
466 : /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
467 0 : cnv->toUBytes[1]=b;
468 0 : cnv->toULength=2;
469 0 : goto endloop;
470 : }
471 0 : sourceIndex=nextSourceIndex;
472 0 : state=readCommand;
473 0 : goto fastSingle;
474 : }
475 : }
476 : } else {
477 : /* fast path for Unicode mode */
478 0 : if(state==readCommand) {
479 : fastUnicode:
480 0 : while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
481 0 : *target++=(UChar)((b<<8)|source[1]);
482 0 : if(offsets!=NULL) {
483 0 : *offsets++=sourceIndex;
484 : }
485 0 : sourceIndex=nextSourceIndex;
486 0 : nextSourceIndex+=2;
487 0 : source+=2;
488 : }
489 : }
490 :
491 : /* normal state machine for Unicode mode */
492 : /* unicodeByteMode: */
493 0 : while(source<sourceLimit) {
494 0 : if(target>=targetLimit) {
495 : /* target is full */
496 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
497 0 : break;
498 : }
499 0 : b=*source++;
500 0 : ++nextSourceIndex;
501 0 : switch(state) {
502 : case readCommand:
503 0 : if((uint8_t)(b-UC0)>(Urs-UC0)) {
504 0 : byteOne=b;
505 0 : cnv->toUBytes[0]=b;
506 0 : cnv->toULength=1;
507 0 : state=quotePairTwo;
508 0 : } else if(/* UC0<=b && */ b<=UC7) {
509 0 : dynamicWindow=(int8_t)(b-UC0);
510 0 : sourceIndex=nextSourceIndex;
511 0 : isSingleByteMode=TRUE;
512 0 : goto fastSingle;
513 0 : } else if(/* UD0<=b && */ b<=UD7) {
514 0 : dynamicWindow=(int8_t)(b-UD0);
515 0 : isSingleByteMode=TRUE;
516 0 : cnv->toUBytes[0]=b;
517 0 : cnv->toULength=1;
518 0 : state=defineOne;
519 0 : goto singleByteMode;
520 0 : } else if(b==UDX) {
521 0 : isSingleByteMode=TRUE;
522 0 : cnv->toUBytes[0]=b;
523 0 : cnv->toULength=1;
524 0 : state=definePairOne;
525 0 : goto singleByteMode;
526 0 : } else if(b==UQU) {
527 0 : cnv->toUBytes[0]=b;
528 0 : cnv->toULength=1;
529 0 : state=quotePairOne;
530 : } else /* Urs */ {
531 : /* callback(illegal) */
532 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
533 0 : cnv->toUBytes[0]=b;
534 0 : cnv->toULength=1;
535 0 : goto endloop;
536 : }
537 0 : break;
538 : case quotePairOne:
539 0 : byteOne=b;
540 0 : cnv->toUBytes[1]=b;
541 0 : cnv->toULength=2;
542 0 : state=quotePairTwo;
543 0 : break;
544 : case quotePairTwo:
545 0 : *target++=(UChar)((byteOne<<8)|b);
546 0 : if(offsets!=NULL) {
547 0 : *offsets++=sourceIndex;
548 : }
549 0 : sourceIndex=nextSourceIndex;
550 0 : state=readCommand;
551 0 : goto fastUnicode;
552 : }
553 : }
554 : }
555 : endloop:
556 :
557 : /* set the converter state back into UConverter */
558 0 : if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
559 : /* reset to deal with the next character */
560 0 : state=readCommand;
561 0 : } else if(state==readCommand) {
562 : /* not in a multi-byte sequence, reset toULength */
563 0 : cnv->toULength=0;
564 : }
565 0 : scsu->toUIsSingleByteMode=isSingleByteMode;
566 0 : scsu->toUState=state;
567 0 : scsu->toUQuoteWindow=quoteWindow;
568 0 : scsu->toUDynamicWindow=dynamicWindow;
569 0 : scsu->toUByteOne=byteOne;
570 :
571 : /* write back the updated pointers */
572 0 : pArgs->source=(const char *)source;
573 0 : pArgs->target=target;
574 0 : pArgs->offsets=offsets;
575 0 : return;
576 : }
577 :
578 : /*
579 : * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
580 : * If a change is made in the original function, then either
581 : * change this function the same way or
582 : * re-copy the original function and remove the variables
583 : * offsets, sourceIndex, and nextSourceIndex.
584 : */
585 : static void U_CALLCONV
586 0 : _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
587 : UErrorCode *pErrorCode) {
588 : UConverter *cnv;
589 : SCSUData *scsu;
590 : const uint8_t *source, *sourceLimit;
591 : UChar *target;
592 : const UChar *targetLimit;
593 : UBool isSingleByteMode;
594 : uint8_t state, byteOne;
595 : int8_t quoteWindow, dynamicWindow;
596 :
597 : uint8_t b;
598 :
599 : /* set up the local pointers */
600 0 : cnv=pArgs->converter;
601 0 : scsu=(SCSUData *)cnv->extraInfo;
602 :
603 0 : source=(const uint8_t *)pArgs->source;
604 0 : sourceLimit=(const uint8_t *)pArgs->sourceLimit;
605 0 : target=pArgs->target;
606 0 : targetLimit=pArgs->targetLimit;
607 :
608 : /* get the state machine state */
609 0 : isSingleByteMode=scsu->toUIsSingleByteMode;
610 0 : state=scsu->toUState;
611 0 : quoteWindow=scsu->toUQuoteWindow;
612 0 : dynamicWindow=scsu->toUDynamicWindow;
613 0 : byteOne=scsu->toUByteOne;
614 :
615 : /*
616 : * conversion "loop"
617 : *
618 : * For performance, this is not a normal C loop.
619 : * Instead, there are two code blocks for the two SCSU modes.
620 : * The function branches to either one, and a change of the mode is done with a goto to
621 : * the other branch.
622 : *
623 : * Each branch has two conventional loops:
624 : * - a fast-path loop for the most common codes in the mode
625 : * - a loop for all other codes in the mode
626 : * When the fast-path runs into a code that it cannot handle, its loop ends and it
627 : * runs into the following loop to handle the other codes.
628 : * The end of the input or output buffer is also handled by the slower loop.
629 : * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
630 : *
631 : * The callback handling is done by returning with an error code.
632 : * The conversion framework actually calls the callback function.
633 : */
634 0 : if(isSingleByteMode) {
635 : /* fast path for single-byte mode */
636 0 : if(state==readCommand) {
637 : fastSingle:
638 0 : while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
639 0 : ++source;
640 0 : if(b<=0x7f) {
641 : /* write US-ASCII graphic character or DEL */
642 0 : *target++=(UChar)b;
643 : } else {
644 : /* write from dynamic window */
645 0 : uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
646 0 : if(c<=0xffff) {
647 0 : *target++=(UChar)c;
648 : } else {
649 : /* output surrogate pair */
650 0 : *target++=(UChar)(0xd7c0+(c>>10));
651 0 : if(target<targetLimit) {
652 0 : *target++=(UChar)(0xdc00|(c&0x3ff));
653 : } else {
654 : /* target overflow */
655 0 : cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
656 0 : cnv->UCharErrorBufferLength=1;
657 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
658 0 : goto endloop;
659 : }
660 : }
661 : }
662 : }
663 : }
664 :
665 : /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
666 : singleByteMode:
667 0 : while(source<sourceLimit) {
668 0 : if(target>=targetLimit) {
669 : /* target is full */
670 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
671 0 : break;
672 : }
673 0 : b=*source++;
674 0 : switch(state) {
675 : case readCommand:
676 : /* redundant conditions are commented out */
677 : /* here: b<0x20 because otherwise we would be in fastSingle */
678 0 : if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
679 : /* CR/LF/TAB/NUL */
680 0 : *target++=(UChar)b;
681 0 : goto fastSingle;
682 0 : } else if(SC0<=b) {
683 0 : if(b<=SC7) {
684 0 : dynamicWindow=(int8_t)(b-SC0);
685 0 : goto fastSingle;
686 : } else /* if(SD0<=b && b<=SD7) */ {
687 0 : dynamicWindow=(int8_t)(b-SD0);
688 0 : state=defineOne;
689 : }
690 0 : } else if(/* SQ0<=b && */ b<=SQ7) {
691 0 : quoteWindow=(int8_t)(b-SQ0);
692 0 : state=quoteOne;
693 0 : } else if(b==SDX) {
694 0 : state=definePairOne;
695 0 : } else if(b==SQU) {
696 0 : state=quotePairOne;
697 0 : } else if(b==SCU) {
698 0 : isSingleByteMode=FALSE;
699 0 : goto fastUnicode;
700 : } else /* Srs */ {
701 : /* callback(illegal) */
702 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
703 0 : cnv->toUBytes[0]=b;
704 0 : cnv->toULength=1;
705 0 : goto endloop;
706 : }
707 :
708 : /* store the first byte of a multibyte sequence in toUBytes[] */
709 0 : cnv->toUBytes[0]=b;
710 0 : cnv->toULength=1;
711 0 : break;
712 : case quotePairOne:
713 0 : byteOne=b;
714 0 : cnv->toUBytes[1]=b;
715 0 : cnv->toULength=2;
716 0 : state=quotePairTwo;
717 0 : break;
718 : case quotePairTwo:
719 0 : *target++=(UChar)((byteOne<<8)|b);
720 0 : state=readCommand;
721 0 : goto fastSingle;
722 : case quoteOne:
723 0 : if(b<0x80) {
724 : /* all static offsets are in the BMP */
725 0 : *target++=(UChar)(staticOffsets[quoteWindow]+b);
726 : } else {
727 : /* write from dynamic window */
728 0 : uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
729 0 : if(c<=0xffff) {
730 0 : *target++=(UChar)c;
731 : } else {
732 : /* output surrogate pair */
733 0 : *target++=(UChar)(0xd7c0+(c>>10));
734 0 : if(target<targetLimit) {
735 0 : *target++=(UChar)(0xdc00|(c&0x3ff));
736 : } else {
737 : /* target overflow */
738 0 : cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff));
739 0 : cnv->UCharErrorBufferLength=1;
740 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
741 0 : goto endloop;
742 : }
743 : }
744 : }
745 0 : state=readCommand;
746 0 : goto fastSingle;
747 : case definePairOne:
748 0 : dynamicWindow=(int8_t)((b>>5)&7);
749 0 : byteOne=(uint8_t)(b&0x1f);
750 0 : cnv->toUBytes[1]=b;
751 0 : cnv->toULength=2;
752 0 : state=definePairTwo;
753 0 : break;
754 : case definePairTwo:
755 0 : scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
756 0 : state=readCommand;
757 0 : goto fastSingle;
758 : case defineOne:
759 0 : if(b==0) {
760 : /* callback(illegal): Reserved window offset value 0 */
761 0 : cnv->toUBytes[1]=b;
762 0 : cnv->toULength=2;
763 0 : goto endloop;
764 0 : } else if(b<gapThreshold) {
765 0 : scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
766 0 : } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
767 0 : scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
768 0 : } else if(b>=fixedThreshold) {
769 0 : scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
770 : } else {
771 : /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
772 0 : cnv->toUBytes[1]=b;
773 0 : cnv->toULength=2;
774 0 : goto endloop;
775 : }
776 0 : state=readCommand;
777 0 : goto fastSingle;
778 : }
779 : }
780 : } else {
781 : /* fast path for Unicode mode */
782 0 : if(state==readCommand) {
783 : fastUnicode:
784 0 : while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
785 0 : *target++=(UChar)((b<<8)|source[1]);
786 0 : source+=2;
787 : }
788 : }
789 :
790 : /* normal state machine for Unicode mode */
791 : /* unicodeByteMode: */
792 0 : while(source<sourceLimit) {
793 0 : if(target>=targetLimit) {
794 : /* target is full */
795 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
796 0 : break;
797 : }
798 0 : b=*source++;
799 0 : switch(state) {
800 : case readCommand:
801 0 : if((uint8_t)(b-UC0)>(Urs-UC0)) {
802 0 : byteOne=b;
803 0 : cnv->toUBytes[0]=b;
804 0 : cnv->toULength=1;
805 0 : state=quotePairTwo;
806 0 : } else if(/* UC0<=b && */ b<=UC7) {
807 0 : dynamicWindow=(int8_t)(b-UC0);
808 0 : isSingleByteMode=TRUE;
809 0 : goto fastSingle;
810 0 : } else if(/* UD0<=b && */ b<=UD7) {
811 0 : dynamicWindow=(int8_t)(b-UD0);
812 0 : isSingleByteMode=TRUE;
813 0 : cnv->toUBytes[0]=b;
814 0 : cnv->toULength=1;
815 0 : state=defineOne;
816 0 : goto singleByteMode;
817 0 : } else if(b==UDX) {
818 0 : isSingleByteMode=TRUE;
819 0 : cnv->toUBytes[0]=b;
820 0 : cnv->toULength=1;
821 0 : state=definePairOne;
822 0 : goto singleByteMode;
823 0 : } else if(b==UQU) {
824 0 : cnv->toUBytes[0]=b;
825 0 : cnv->toULength=1;
826 0 : state=quotePairOne;
827 : } else /* Urs */ {
828 : /* callback(illegal) */
829 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
830 0 : cnv->toUBytes[0]=b;
831 0 : cnv->toULength=1;
832 0 : goto endloop;
833 : }
834 0 : break;
835 : case quotePairOne:
836 0 : byteOne=b;
837 0 : cnv->toUBytes[1]=b;
838 0 : cnv->toULength=2;
839 0 : state=quotePairTwo;
840 0 : break;
841 : case quotePairTwo:
842 0 : *target++=(UChar)((byteOne<<8)|b);
843 0 : state=readCommand;
844 0 : goto fastUnicode;
845 : }
846 : }
847 : }
848 : endloop:
849 :
850 : /* set the converter state back into UConverter */
851 0 : if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
852 : /* reset to deal with the next character */
853 0 : state=readCommand;
854 0 : } else if(state==readCommand) {
855 : /* not in a multi-byte sequence, reset toULength */
856 0 : cnv->toULength=0;
857 : }
858 0 : scsu->toUIsSingleByteMode=isSingleByteMode;
859 0 : scsu->toUState=state;
860 0 : scsu->toUQuoteWindow=quoteWindow;
861 0 : scsu->toUDynamicWindow=dynamicWindow;
862 0 : scsu->toUByteOne=byteOne;
863 :
864 : /* write back the updated pointers */
865 0 : pArgs->source=(const char *)source;
866 0 : pArgs->target=target;
867 0 : return;
868 : }
869 : U_CDECL_END
870 : /* SCSU-from-Unicode conversion functions ----------------------------------- */
871 :
872 : /*
873 : * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
874 : * reasonable results. The lookahead is minimal.
875 : * Many cases are simple:
876 : * A character fits directly into the current mode, a dynamic or static window,
877 : * or is not compressible. These cases are tested first.
878 : * Real compression heuristics are applied to the rest, in code branches for
879 : * single/Unicode mode and BMP/supplementary code points.
880 : * The heuristics used here are extremely simple.
881 : */
882 :
883 : /* get the number of the window that this character is in, or -1 */
884 : static int8_t
885 0 : getWindow(const uint32_t offsets[8], uint32_t c) {
886 : int i;
887 0 : for(i=0; i<8; ++i) {
888 0 : if((uint32_t)(c-offsets[i])<=0x7f) {
889 0 : return (int8_t)(i);
890 : }
891 : }
892 0 : return -1;
893 : }
894 :
895 : /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
896 : static UBool
897 0 : isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
898 0 : return (UBool)(c<=offset+0x7f &&
899 0 : (c>=offset || (c<=0x7f &&
900 0 : (c>=0x20 || (1UL<<c)&0x2601))));
901 : /* binary 0010 0110 0000 0001,
902 : check for b==0xd || b==0xa || b==9 || b==0 */
903 : }
904 :
905 : /*
906 : * getNextDynamicWindow returns the next dynamic window to be redefined
907 : */
908 : static int8_t
909 0 : getNextDynamicWindow(SCSUData *scsu) {
910 0 : int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
911 0 : if(++scsu->nextWindowUseIndex==8) {
912 0 : scsu->nextWindowUseIndex=0;
913 : }
914 0 : return window;
915 : }
916 :
917 : /*
918 : * useDynamicWindow() adjusts
919 : * windowUse[] and nextWindowUseIndex for the algorithm to choose
920 : * the next dynamic window to be defined;
921 : * a subclass may override it and provide its own algorithm.
922 : */
923 : static void
924 0 : useDynamicWindow(SCSUData *scsu, int8_t window) {
925 : /*
926 : * move the existing window, which just became the most recently used one,
927 : * up in windowUse[] to nextWindowUseIndex-1
928 : */
929 :
930 : /* first, find the index of the window - backwards to favor the more recently used windows */
931 : int i, j;
932 :
933 0 : i=scsu->nextWindowUseIndex;
934 0 : do {
935 0 : if(--i<0) {
936 0 : i=7;
937 : }
938 0 : } while(scsu->windowUse[i]!=window);
939 :
940 : /* now copy each windowUse[i+1] to [i] */
941 0 : j=i+1;
942 0 : if(j==8) {
943 0 : j=0;
944 : }
945 0 : while(j!=scsu->nextWindowUseIndex) {
946 0 : scsu->windowUse[i]=scsu->windowUse[j];
947 0 : i=j;
948 0 : if(++j==8) { j=0; }
949 : }
950 :
951 : /* finally, set the window into the most recently used index */
952 0 : scsu->windowUse[i]=window;
953 0 : }
954 :
955 : /*
956 : * calculate the offset and the code for a dynamic window that contains the character
957 : * takes fixed offsets into account
958 : * the offset of the window is stored in the offset variable,
959 : * the code is returned
960 : *
961 : * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
962 : */
963 : static int
964 0 : getDynamicOffset(uint32_t c, uint32_t *pOffset) {
965 : int i;
966 :
967 0 : for(i=0; i<7; ++i) {
968 0 : if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
969 0 : *pOffset=fixedOffsets[i];
970 0 : return 0xf9+i;
971 : }
972 : }
973 :
974 0 : if(c<0x80) {
975 : /* No dynamic window for US-ASCII. */
976 0 : return -1;
977 0 : } else if(c<0x3400 ||
978 0 : (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
979 0 : (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
980 : ) {
981 : /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
982 0 : *pOffset=c&0x7fffff80;
983 0 : return (int)(c>>7);
984 0 : } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
985 : /* For these characters we need to take the gapOffset into account. */
986 0 : *pOffset=c&0x7fffff80;
987 0 : return (int)((c-gapOffset)>>7);
988 : } else {
989 0 : return -1;
990 : }
991 : }
992 : U_CDECL_BEGIN
993 : /*
994 : * Idea for compression:
995 : * - save SCSUData and other state before really starting work
996 : * - at endloop, see if compression could be better with just unicode mode
997 : * - don't do this if a callback has been called
998 : * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
999 : * - different buffer handling!
1000 : *
1001 : * Drawback or need for corrective handling:
1002 : * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
1003 : * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
1004 : * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
1005 : *
1006 : * How to achieve both?
1007 : * - Only replace the result after an SDX or SCU?
1008 : */
1009 :
1010 : static void U_CALLCONV
1011 0 : _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1012 : UErrorCode *pErrorCode) {
1013 : UConverter *cnv;
1014 : SCSUData *scsu;
1015 : const UChar *source, *sourceLimit;
1016 : uint8_t *target;
1017 : int32_t targetCapacity;
1018 : int32_t *offsets;
1019 :
1020 : UBool isSingleByteMode;
1021 : uint8_t dynamicWindow;
1022 : uint32_t currentOffset;
1023 :
1024 : uint32_t c, delta;
1025 :
1026 : int32_t sourceIndex, nextSourceIndex;
1027 :
1028 : int32_t length;
1029 :
1030 : /* variables for compression heuristics */
1031 : uint32_t offset;
1032 : UChar lead, trail;
1033 : int code;
1034 : int8_t window;
1035 :
1036 : /* set up the local pointers */
1037 0 : cnv=pArgs->converter;
1038 0 : scsu=(SCSUData *)cnv->extraInfo;
1039 :
1040 : /* set up the local pointers */
1041 0 : source=pArgs->source;
1042 0 : sourceLimit=pArgs->sourceLimit;
1043 0 : target=(uint8_t *)pArgs->target;
1044 0 : targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1045 0 : offsets=pArgs->offsets;
1046 :
1047 : /* get the state machine state */
1048 0 : isSingleByteMode=scsu->fromUIsSingleByteMode;
1049 0 : dynamicWindow=scsu->fromUDynamicWindow;
1050 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1051 :
1052 0 : c=cnv->fromUChar32;
1053 :
1054 : /* sourceIndex=-1 if the current character began in the previous buffer */
1055 0 : sourceIndex= c==0 ? 0 : -1;
1056 0 : nextSourceIndex=0;
1057 :
1058 : /* similar conversion "loop" as in toUnicode */
1059 : loop:
1060 0 : if(isSingleByteMode) {
1061 0 : if(c!=0 && targetCapacity>0) {
1062 0 : goto getTrailSingle;
1063 : }
1064 :
1065 : /* state machine for single-byte mode */
1066 : /* singleByteMode: */
1067 0 : while(source<sourceLimit) {
1068 0 : if(targetCapacity<=0) {
1069 : /* target is full */
1070 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1071 0 : break;
1072 : }
1073 0 : c=*source++;
1074 0 : ++nextSourceIndex;
1075 :
1076 0 : if((c-0x20)<=0x5f) {
1077 : /* pass US-ASCII graphic character through */
1078 0 : *target++=(uint8_t)c;
1079 0 : if(offsets!=NULL) {
1080 0 : *offsets++=sourceIndex;
1081 : }
1082 0 : --targetCapacity;
1083 0 : } else if(c<0x20) {
1084 0 : if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1085 : /* CR/LF/TAB/NUL */
1086 0 : *target++=(uint8_t)c;
1087 0 : if(offsets!=NULL) {
1088 0 : *offsets++=sourceIndex;
1089 : }
1090 0 : --targetCapacity;
1091 : } else {
1092 : /* quote C0 control character */
1093 0 : c|=SQ0<<8;
1094 0 : length=2;
1095 0 : goto outputBytes;
1096 : }
1097 0 : } else if((delta=c-currentOffset)<=0x7f) {
1098 : /* use the current dynamic window */
1099 0 : *target++=(uint8_t)(delta|0x80);
1100 0 : if(offsets!=NULL) {
1101 0 : *offsets++=sourceIndex;
1102 : }
1103 0 : --targetCapacity;
1104 0 : } else if(U16_IS_SURROGATE(c)) {
1105 0 : if(U16_IS_SURROGATE_LEAD(c)) {
1106 : getTrailSingle:
1107 0 : lead=(UChar)c;
1108 0 : if(source<sourceLimit) {
1109 : /* test the following code unit */
1110 0 : trail=*source;
1111 0 : if(U16_IS_TRAIL(trail)) {
1112 0 : ++source;
1113 0 : ++nextSourceIndex;
1114 0 : c=U16_GET_SUPPLEMENTARY(c, trail);
1115 : /* convert this surrogate code point */
1116 : /* exit this condition tree */
1117 : } else {
1118 : /* this is an unmatched lead code unit (1st surrogate) */
1119 : /* callback(illegal) */
1120 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1121 0 : goto endloop;
1122 : }
1123 : } else {
1124 : /* no more input */
1125 0 : break;
1126 : }
1127 : } else {
1128 : /* this is an unmatched trail code unit (2nd surrogate) */
1129 : /* callback(illegal) */
1130 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1131 0 : goto endloop;
1132 : }
1133 :
1134 : /* compress supplementary character U+10000..U+10ffff */
1135 0 : if((delta=c-currentOffset)<=0x7f) {
1136 : /* use the current dynamic window */
1137 0 : *target++=(uint8_t)(delta|0x80);
1138 0 : if(offsets!=NULL) {
1139 0 : *offsets++=sourceIndex;
1140 : }
1141 0 : --targetCapacity;
1142 0 : } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1143 : /* there is a dynamic window that contains this character, change to it */
1144 0 : dynamicWindow=window;
1145 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1146 0 : useDynamicWindow(scsu, dynamicWindow);
1147 0 : c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1148 0 : length=2;
1149 0 : goto outputBytes;
1150 0 : } else if((code=getDynamicOffset(c, &offset))>=0) {
1151 : /* might check if there are more characters in this window to come */
1152 : /* define an extended window with this character */
1153 0 : code-=0x200;
1154 0 : dynamicWindow=getNextDynamicWindow(scsu);
1155 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1156 0 : useDynamicWindow(scsu, dynamicWindow);
1157 0 : c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1158 0 : length=4;
1159 0 : goto outputBytes;
1160 : } else {
1161 : /* change to Unicode mode and output this (lead, trail) pair */
1162 0 : isSingleByteMode=FALSE;
1163 0 : *target++=(uint8_t)SCU;
1164 0 : if(offsets!=NULL) {
1165 0 : *offsets++=sourceIndex;
1166 : }
1167 0 : --targetCapacity;
1168 0 : c=((uint32_t)lead<<16)|trail;
1169 0 : length=4;
1170 0 : goto outputBytes;
1171 : }
1172 0 : } else if(c<0xa0) {
1173 : /* quote C1 control character */
1174 0 : c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1175 0 : length=2;
1176 0 : goto outputBytes;
1177 0 : } else if(c==0xfeff || c>=0xfff0) {
1178 : /* quote signature character=byte order mark and specials */
1179 0 : c|=SQU<<16;
1180 0 : length=3;
1181 0 : goto outputBytes;
1182 : } else {
1183 : /* compress all other BMP characters */
1184 0 : if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1185 : /* there is a window defined that contains this character - switch to it or quote from it? */
1186 0 : if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1187 : /* change to dynamic window */
1188 0 : dynamicWindow=window;
1189 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1190 0 : useDynamicWindow(scsu, dynamicWindow);
1191 0 : c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1192 0 : length=2;
1193 0 : goto outputBytes;
1194 : } else {
1195 : /* quote from dynamic window */
1196 0 : c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1197 0 : length=2;
1198 0 : goto outputBytes;
1199 : }
1200 0 : } else if((window=getWindow(staticOffsets, c))>=0) {
1201 : /* quote from static window */
1202 0 : c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1203 0 : length=2;
1204 0 : goto outputBytes;
1205 0 : } else if((code=getDynamicOffset(c, &offset))>=0) {
1206 : /* define a dynamic window with this character */
1207 0 : dynamicWindow=getNextDynamicWindow(scsu);
1208 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1209 0 : useDynamicWindow(scsu, dynamicWindow);
1210 0 : c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1211 0 : length=3;
1212 0 : goto outputBytes;
1213 0 : } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1214 0 : (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1215 : ) {
1216 : /*
1217 : * this character is not compressible (a BMP ideograph or similar);
1218 : * switch to Unicode mode if this is the last character in the block
1219 : * or there is at least one more ideograph following immediately
1220 : */
1221 0 : isSingleByteMode=FALSE;
1222 0 : c|=SCU<<16;
1223 0 : length=3;
1224 0 : goto outputBytes;
1225 : } else {
1226 : /* quote Unicode */
1227 0 : c|=SQU<<16;
1228 0 : length=3;
1229 0 : goto outputBytes;
1230 : }
1231 : }
1232 :
1233 : /* normal end of conversion: prepare for a new character */
1234 0 : c=0;
1235 0 : sourceIndex=nextSourceIndex;
1236 : }
1237 : } else {
1238 0 : if(c!=0 && targetCapacity>0) {
1239 0 : goto getTrailUnicode;
1240 : }
1241 :
1242 : /* state machine for Unicode mode */
1243 : /* unicodeByteMode: */
1244 0 : while(source<sourceLimit) {
1245 0 : if(targetCapacity<=0) {
1246 : /* target is full */
1247 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1248 0 : break;
1249 : }
1250 0 : c=*source++;
1251 0 : ++nextSourceIndex;
1252 :
1253 0 : if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1254 : /* not compressible, write character directly */
1255 0 : if(targetCapacity>=2) {
1256 0 : *target++=(uint8_t)(c>>8);
1257 0 : *target++=(uint8_t)c;
1258 0 : if(offsets!=NULL) {
1259 0 : *offsets++=sourceIndex;
1260 0 : *offsets++=sourceIndex;
1261 : }
1262 0 : targetCapacity-=2;
1263 : } else {
1264 0 : length=2;
1265 0 : goto outputBytes;
1266 : }
1267 0 : } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1268 : /* compress BMP character if the following one is not an uncompressible ideograph */
1269 0 : if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1270 0 : if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1271 : /* ASCII digit or letter */
1272 0 : isSingleByteMode=TRUE;
1273 0 : c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1274 0 : length=2;
1275 0 : goto outputBytes;
1276 0 : } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1277 : /* there is a dynamic window that contains this character, change to it */
1278 0 : isSingleByteMode=TRUE;
1279 0 : dynamicWindow=window;
1280 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1281 0 : useDynamicWindow(scsu, dynamicWindow);
1282 0 : c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1283 0 : length=2;
1284 0 : goto outputBytes;
1285 0 : } else if((code=getDynamicOffset(c, &offset))>=0) {
1286 : /* define a dynamic window with this character */
1287 0 : isSingleByteMode=TRUE;
1288 0 : dynamicWindow=getNextDynamicWindow(scsu);
1289 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1290 0 : useDynamicWindow(scsu, dynamicWindow);
1291 0 : c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1292 0 : length=3;
1293 0 : goto outputBytes;
1294 : }
1295 : }
1296 :
1297 : /* don't know how to compress this character, just write it directly */
1298 0 : length=2;
1299 0 : goto outputBytes;
1300 0 : } else if(c<0xe000) {
1301 : /* c is a surrogate */
1302 0 : if(U16_IS_SURROGATE_LEAD(c)) {
1303 : getTrailUnicode:
1304 0 : lead=(UChar)c;
1305 0 : if(source<sourceLimit) {
1306 : /* test the following code unit */
1307 0 : trail=*source;
1308 0 : if(U16_IS_TRAIL(trail)) {
1309 0 : ++source;
1310 0 : ++nextSourceIndex;
1311 0 : c=U16_GET_SUPPLEMENTARY(c, trail);
1312 : /* convert this surrogate code point */
1313 : /* exit this condition tree */
1314 : } else {
1315 : /* this is an unmatched lead code unit (1st surrogate) */
1316 : /* callback(illegal) */
1317 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1318 0 : goto endloop;
1319 : }
1320 : } else {
1321 : /* no more input */
1322 0 : break;
1323 : }
1324 : } else {
1325 : /* this is an unmatched trail code unit (2nd surrogate) */
1326 : /* callback(illegal) */
1327 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1328 0 : goto endloop;
1329 : }
1330 :
1331 : /* compress supplementary character */
1332 0 : if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1333 0 : !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1334 : ) {
1335 : /*
1336 : * there is a dynamic window that contains this character and
1337 : * the following character is not uncompressible,
1338 : * change to the window
1339 : */
1340 0 : isSingleByteMode=TRUE;
1341 0 : dynamicWindow=window;
1342 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1343 0 : useDynamicWindow(scsu, dynamicWindow);
1344 0 : c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1345 0 : length=2;
1346 0 : goto outputBytes;
1347 0 : } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1348 : (code=getDynamicOffset(c, &offset))>=0
1349 : ) {
1350 : /* two supplementary characters in (probably) the same window - define an extended one */
1351 0 : isSingleByteMode=TRUE;
1352 0 : code-=0x200;
1353 0 : dynamicWindow=getNextDynamicWindow(scsu);
1354 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1355 0 : useDynamicWindow(scsu, dynamicWindow);
1356 0 : c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1357 0 : length=4;
1358 0 : goto outputBytes;
1359 : } else {
1360 : /* don't know how to compress this character, just write it directly */
1361 0 : c=((uint32_t)lead<<16)|trail;
1362 0 : length=4;
1363 0 : goto outputBytes;
1364 : }
1365 : } else /* 0xe000<=c<0xf300 */ {
1366 : /* quote to avoid SCSU tags */
1367 0 : c|=UQU<<16;
1368 0 : length=3;
1369 0 : goto outputBytes;
1370 : }
1371 :
1372 : /* normal end of conversion: prepare for a new character */
1373 0 : c=0;
1374 0 : sourceIndex=nextSourceIndex;
1375 : }
1376 : }
1377 : endloop:
1378 :
1379 : /* set the converter state back into UConverter */
1380 0 : scsu->fromUIsSingleByteMode=isSingleByteMode;
1381 0 : scsu->fromUDynamicWindow=dynamicWindow;
1382 :
1383 0 : cnv->fromUChar32=c;
1384 :
1385 : /* write back the updated pointers */
1386 0 : pArgs->source=source;
1387 0 : pArgs->target=(char *)target;
1388 0 : pArgs->offsets=offsets;
1389 0 : return;
1390 :
1391 : outputBytes:
1392 : /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1393 : /* from the first if in the loop we know that targetCapacity>0 */
1394 0 : if(length<=targetCapacity) {
1395 0 : if(offsets==NULL) {
1396 0 : switch(length) {
1397 : /* each branch falls through to the next one */
1398 : case 4:
1399 0 : *target++=(uint8_t)(c>>24);
1400 : U_FALLTHROUGH;
1401 : case 3:
1402 0 : *target++=(uint8_t)(c>>16);
1403 : U_FALLTHROUGH;
1404 : case 2:
1405 0 : *target++=(uint8_t)(c>>8);
1406 : U_FALLTHROUGH;
1407 : case 1:
1408 0 : *target++=(uint8_t)c;
1409 : U_FALLTHROUGH;
1410 : default:
1411 : /* will never occur */
1412 0 : break;
1413 : }
1414 : } else {
1415 0 : switch(length) {
1416 : /* each branch falls through to the next one */
1417 : case 4:
1418 0 : *target++=(uint8_t)(c>>24);
1419 0 : *offsets++=sourceIndex;
1420 : U_FALLTHROUGH;
1421 : case 3:
1422 0 : *target++=(uint8_t)(c>>16);
1423 0 : *offsets++=sourceIndex;
1424 : U_FALLTHROUGH;
1425 : case 2:
1426 0 : *target++=(uint8_t)(c>>8);
1427 0 : *offsets++=sourceIndex;
1428 : U_FALLTHROUGH;
1429 : case 1:
1430 0 : *target++=(uint8_t)c;
1431 0 : *offsets++=sourceIndex;
1432 : U_FALLTHROUGH;
1433 : default:
1434 : /* will never occur */
1435 0 : break;
1436 : }
1437 : }
1438 0 : targetCapacity-=length;
1439 :
1440 : /* normal end of conversion: prepare for a new character */
1441 0 : c=0;
1442 0 : sourceIndex=nextSourceIndex;
1443 0 : goto loop;
1444 : } else {
1445 : uint8_t *p;
1446 :
1447 : /*
1448 : * We actually do this backwards here:
1449 : * In order to save an intermediate variable, we output
1450 : * first to the overflow buffer what does not fit into the
1451 : * regular target.
1452 : */
1453 : /* we know that 0<=targetCapacity<length<=4 */
1454 : /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1455 0 : length-=targetCapacity;
1456 0 : p=(uint8_t *)cnv->charErrorBuffer;
1457 0 : switch(length) {
1458 : /* each branch falls through to the next one */
1459 : case 4:
1460 0 : *p++=(uint8_t)(c>>24);
1461 : U_FALLTHROUGH;
1462 : case 3:
1463 0 : *p++=(uint8_t)(c>>16);
1464 : U_FALLTHROUGH;
1465 : case 2:
1466 0 : *p++=(uint8_t)(c>>8);
1467 : U_FALLTHROUGH;
1468 : case 1:
1469 0 : *p=(uint8_t)c;
1470 : U_FALLTHROUGH;
1471 : default:
1472 : /* will never occur */
1473 0 : break;
1474 : }
1475 0 : cnv->charErrorBufferLength=(int8_t)length;
1476 :
1477 : /* now output what fits into the regular target */
1478 0 : c>>=8*length; /* length was reduced by targetCapacity */
1479 0 : switch(targetCapacity) {
1480 : /* each branch falls through to the next one */
1481 : case 3:
1482 0 : *target++=(uint8_t)(c>>16);
1483 0 : if(offsets!=NULL) {
1484 0 : *offsets++=sourceIndex;
1485 : }
1486 : U_FALLTHROUGH;
1487 : case 2:
1488 0 : *target++=(uint8_t)(c>>8);
1489 0 : if(offsets!=NULL) {
1490 0 : *offsets++=sourceIndex;
1491 : }
1492 : U_FALLTHROUGH;
1493 : case 1:
1494 0 : *target++=(uint8_t)c;
1495 0 : if(offsets!=NULL) {
1496 0 : *offsets++=sourceIndex;
1497 : }
1498 : U_FALLTHROUGH;
1499 : default:
1500 0 : break;
1501 : }
1502 :
1503 : /* target overflow */
1504 0 : targetCapacity=0;
1505 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1506 0 : c=0;
1507 0 : goto endloop;
1508 : }
1509 : }
1510 :
1511 : /*
1512 : * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
1513 : * If a change is made in the original function, then either
1514 : * change this function the same way or
1515 : * re-copy the original function and remove the variables
1516 : * offsets, sourceIndex, and nextSourceIndex.
1517 : */
1518 : static void U_CALLCONV
1519 0 : _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
1520 : UErrorCode *pErrorCode) {
1521 : UConverter *cnv;
1522 : SCSUData *scsu;
1523 : const UChar *source, *sourceLimit;
1524 : uint8_t *target;
1525 : int32_t targetCapacity;
1526 :
1527 : UBool isSingleByteMode;
1528 : uint8_t dynamicWindow;
1529 : uint32_t currentOffset;
1530 :
1531 : uint32_t c, delta;
1532 :
1533 : int32_t length;
1534 :
1535 : /* variables for compression heuristics */
1536 : uint32_t offset;
1537 : UChar lead, trail;
1538 : int code;
1539 : int8_t window;
1540 :
1541 : /* set up the local pointers */
1542 0 : cnv=pArgs->converter;
1543 0 : scsu=(SCSUData *)cnv->extraInfo;
1544 :
1545 : /* set up the local pointers */
1546 0 : source=pArgs->source;
1547 0 : sourceLimit=pArgs->sourceLimit;
1548 0 : target=(uint8_t *)pArgs->target;
1549 0 : targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
1550 :
1551 : /* get the state machine state */
1552 0 : isSingleByteMode=scsu->fromUIsSingleByteMode;
1553 0 : dynamicWindow=scsu->fromUDynamicWindow;
1554 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1555 :
1556 0 : c=cnv->fromUChar32;
1557 :
1558 : /* similar conversion "loop" as in toUnicode */
1559 : loop:
1560 0 : if(isSingleByteMode) {
1561 0 : if(c!=0 && targetCapacity>0) {
1562 0 : goto getTrailSingle;
1563 : }
1564 :
1565 : /* state machine for single-byte mode */
1566 : /* singleByteMode: */
1567 0 : while(source<sourceLimit) {
1568 0 : if(targetCapacity<=0) {
1569 : /* target is full */
1570 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1571 0 : break;
1572 : }
1573 0 : c=*source++;
1574 :
1575 0 : if((c-0x20)<=0x5f) {
1576 : /* pass US-ASCII graphic character through */
1577 0 : *target++=(uint8_t)c;
1578 0 : --targetCapacity;
1579 0 : } else if(c<0x20) {
1580 0 : if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
1581 : /* CR/LF/TAB/NUL */
1582 0 : *target++=(uint8_t)c;
1583 0 : --targetCapacity;
1584 : } else {
1585 : /* quote C0 control character */
1586 0 : c|=SQ0<<8;
1587 0 : length=2;
1588 0 : goto outputBytes;
1589 : }
1590 0 : } else if((delta=c-currentOffset)<=0x7f) {
1591 : /* use the current dynamic window */
1592 0 : *target++=(uint8_t)(delta|0x80);
1593 0 : --targetCapacity;
1594 0 : } else if(U16_IS_SURROGATE(c)) {
1595 0 : if(U16_IS_SURROGATE_LEAD(c)) {
1596 : getTrailSingle:
1597 0 : lead=(UChar)c;
1598 0 : if(source<sourceLimit) {
1599 : /* test the following code unit */
1600 0 : trail=*source;
1601 0 : if(U16_IS_TRAIL(trail)) {
1602 0 : ++source;
1603 0 : c=U16_GET_SUPPLEMENTARY(c, trail);
1604 : /* convert this surrogate code point */
1605 : /* exit this condition tree */
1606 : } else {
1607 : /* this is an unmatched lead code unit (1st surrogate) */
1608 : /* callback(illegal) */
1609 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1610 0 : goto endloop;
1611 : }
1612 : } else {
1613 : /* no more input */
1614 0 : break;
1615 : }
1616 : } else {
1617 : /* this is an unmatched trail code unit (2nd surrogate) */
1618 : /* callback(illegal) */
1619 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1620 0 : goto endloop;
1621 : }
1622 :
1623 : /* compress supplementary character U+10000..U+10ffff */
1624 0 : if((delta=c-currentOffset)<=0x7f) {
1625 : /* use the current dynamic window */
1626 0 : *target++=(uint8_t)(delta|0x80);
1627 0 : --targetCapacity;
1628 0 : } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1629 : /* there is a dynamic window that contains this character, change to it */
1630 0 : dynamicWindow=window;
1631 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1632 0 : useDynamicWindow(scsu, dynamicWindow);
1633 0 : c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1634 0 : length=2;
1635 0 : goto outputBytes;
1636 0 : } else if((code=getDynamicOffset(c, &offset))>=0) {
1637 : /* might check if there are more characters in this window to come */
1638 : /* define an extended window with this character */
1639 0 : code-=0x200;
1640 0 : dynamicWindow=getNextDynamicWindow(scsu);
1641 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1642 0 : useDynamicWindow(scsu, dynamicWindow);
1643 0 : c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1644 0 : length=4;
1645 0 : goto outputBytes;
1646 : } else {
1647 : /* change to Unicode mode and output this (lead, trail) pair */
1648 0 : isSingleByteMode=FALSE;
1649 0 : *target++=(uint8_t)SCU;
1650 0 : --targetCapacity;
1651 0 : c=((uint32_t)lead<<16)|trail;
1652 0 : length=4;
1653 0 : goto outputBytes;
1654 : }
1655 0 : } else if(c<0xa0) {
1656 : /* quote C1 control character */
1657 0 : c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
1658 0 : length=2;
1659 0 : goto outputBytes;
1660 0 : } else if(c==0xfeff || c>=0xfff0) {
1661 : /* quote signature character=byte order mark and specials */
1662 0 : c|=SQU<<16;
1663 0 : length=3;
1664 0 : goto outputBytes;
1665 : } else {
1666 : /* compress all other BMP characters */
1667 0 : if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1668 : /* there is a window defined that contains this character - switch to it or quote from it? */
1669 0 : if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
1670 : /* change to dynamic window */
1671 0 : dynamicWindow=window;
1672 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1673 0 : useDynamicWindow(scsu, dynamicWindow);
1674 0 : c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1675 0 : length=2;
1676 0 : goto outputBytes;
1677 : } else {
1678 : /* quote from dynamic window */
1679 0 : c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
1680 0 : length=2;
1681 0 : goto outputBytes;
1682 : }
1683 0 : } else if((window=getWindow(staticOffsets, c))>=0) {
1684 : /* quote from static window */
1685 0 : c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
1686 0 : length=2;
1687 0 : goto outputBytes;
1688 0 : } else if((code=getDynamicOffset(c, &offset))>=0) {
1689 : /* define a dynamic window with this character */
1690 0 : dynamicWindow=getNextDynamicWindow(scsu);
1691 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1692 0 : useDynamicWindow(scsu, dynamicWindow);
1693 0 : c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1694 0 : length=3;
1695 0 : goto outputBytes;
1696 0 : } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
1697 0 : (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1698 : ) {
1699 : /*
1700 : * this character is not compressible (a BMP ideograph or similar);
1701 : * switch to Unicode mode if this is the last character in the block
1702 : * or there is at least one more ideograph following immediately
1703 : */
1704 0 : isSingleByteMode=FALSE;
1705 0 : c|=SCU<<16;
1706 0 : length=3;
1707 0 : goto outputBytes;
1708 : } else {
1709 : /* quote Unicode */
1710 0 : c|=SQU<<16;
1711 0 : length=3;
1712 0 : goto outputBytes;
1713 : }
1714 : }
1715 :
1716 : /* normal end of conversion: prepare for a new character */
1717 0 : c=0;
1718 : }
1719 : } else {
1720 0 : if(c!=0 && targetCapacity>0) {
1721 0 : goto getTrailUnicode;
1722 : }
1723 :
1724 : /* state machine for Unicode mode */
1725 : /* unicodeByteMode: */
1726 0 : while(source<sourceLimit) {
1727 0 : if(targetCapacity<=0) {
1728 : /* target is full */
1729 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1730 0 : break;
1731 : }
1732 0 : c=*source++;
1733 :
1734 0 : if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
1735 : /* not compressible, write character directly */
1736 0 : if(targetCapacity>=2) {
1737 0 : *target++=(uint8_t)(c>>8);
1738 0 : *target++=(uint8_t)c;
1739 0 : targetCapacity-=2;
1740 : } else {
1741 0 : length=2;
1742 0 : goto outputBytes;
1743 : }
1744 0 : } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
1745 : /* compress BMP character if the following one is not an uncompressible ideograph */
1746 0 : if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
1747 0 : if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
1748 : /* ASCII digit or letter */
1749 0 : isSingleByteMode=TRUE;
1750 0 : c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
1751 0 : length=2;
1752 0 : goto outputBytes;
1753 0 : } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
1754 : /* there is a dynamic window that contains this character, change to it */
1755 0 : isSingleByteMode=TRUE;
1756 0 : dynamicWindow=window;
1757 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1758 0 : useDynamicWindow(scsu, dynamicWindow);
1759 0 : c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1760 0 : length=2;
1761 0 : goto outputBytes;
1762 0 : } else if((code=getDynamicOffset(c, &offset))>=0) {
1763 : /* define a dynamic window with this character */
1764 0 : isSingleByteMode=TRUE;
1765 0 : dynamicWindow=getNextDynamicWindow(scsu);
1766 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1767 0 : useDynamicWindow(scsu, dynamicWindow);
1768 0 : c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1769 0 : length=3;
1770 0 : goto outputBytes;
1771 : }
1772 : }
1773 :
1774 : /* don't know how to compress this character, just write it directly */
1775 0 : length=2;
1776 0 : goto outputBytes;
1777 0 : } else if(c<0xe000) {
1778 : /* c is a surrogate */
1779 0 : if(U16_IS_SURROGATE_LEAD(c)) {
1780 : getTrailUnicode:
1781 0 : lead=(UChar)c;
1782 0 : if(source<sourceLimit) {
1783 : /* test the following code unit */
1784 0 : trail=*source;
1785 0 : if(U16_IS_TRAIL(trail)) {
1786 0 : ++source;
1787 0 : c=U16_GET_SUPPLEMENTARY(c, trail);
1788 : /* convert this surrogate code point */
1789 : /* exit this condition tree */
1790 : } else {
1791 : /* this is an unmatched lead code unit (1st surrogate) */
1792 : /* callback(illegal) */
1793 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1794 0 : goto endloop;
1795 : }
1796 : } else {
1797 : /* no more input */
1798 0 : break;
1799 : }
1800 : } else {
1801 : /* this is an unmatched trail code unit (2nd surrogate) */
1802 : /* callback(illegal) */
1803 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1804 0 : goto endloop;
1805 : }
1806 :
1807 : /* compress supplementary character */
1808 0 : if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
1809 0 : !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
1810 : ) {
1811 : /*
1812 : * there is a dynamic window that contains this character and
1813 : * the following character is not uncompressible,
1814 : * change to the window
1815 : */
1816 0 : isSingleByteMode=TRUE;
1817 0 : dynamicWindow=window;
1818 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
1819 0 : useDynamicWindow(scsu, dynamicWindow);
1820 0 : c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
1821 0 : length=2;
1822 0 : goto outputBytes;
1823 0 : } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
1824 : (code=getDynamicOffset(c, &offset))>=0
1825 : ) {
1826 : /* two supplementary characters in (probably) the same window - define an extended one */
1827 0 : isSingleByteMode=TRUE;
1828 0 : code-=0x200;
1829 0 : dynamicWindow=getNextDynamicWindow(scsu);
1830 0 : currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
1831 0 : useDynamicWindow(scsu, dynamicWindow);
1832 0 : c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
1833 0 : length=4;
1834 0 : goto outputBytes;
1835 : } else {
1836 : /* don't know how to compress this character, just write it directly */
1837 0 : c=((uint32_t)lead<<16)|trail;
1838 0 : length=4;
1839 0 : goto outputBytes;
1840 : }
1841 : } else /* 0xe000<=c<0xf300 */ {
1842 : /* quote to avoid SCSU tags */
1843 0 : c|=UQU<<16;
1844 0 : length=3;
1845 0 : goto outputBytes;
1846 : }
1847 :
1848 : /* normal end of conversion: prepare for a new character */
1849 0 : c=0;
1850 : }
1851 : }
1852 : endloop:
1853 :
1854 : /* set the converter state back into UConverter */
1855 0 : scsu->fromUIsSingleByteMode=isSingleByteMode;
1856 0 : scsu->fromUDynamicWindow=dynamicWindow;
1857 :
1858 0 : cnv->fromUChar32=c;
1859 :
1860 : /* write back the updated pointers */
1861 0 : pArgs->source=source;
1862 0 : pArgs->target=(char *)target;
1863 0 : return;
1864 :
1865 : outputBytes:
1866 : /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
1867 : /* from the first if in the loop we know that targetCapacity>0 */
1868 0 : if(length<=targetCapacity) {
1869 0 : switch(length) {
1870 : /* each branch falls through to the next one */
1871 : case 4:
1872 0 : *target++=(uint8_t)(c>>24);
1873 : U_FALLTHROUGH;
1874 : case 3:
1875 0 : *target++=(uint8_t)(c>>16);
1876 : U_FALLTHROUGH;
1877 : case 2:
1878 0 : *target++=(uint8_t)(c>>8);
1879 : U_FALLTHROUGH;
1880 : case 1:
1881 0 : *target++=(uint8_t)c;
1882 : U_FALLTHROUGH;
1883 : default:
1884 : /* will never occur */
1885 0 : break;
1886 : }
1887 0 : targetCapacity-=length;
1888 :
1889 : /* normal end of conversion: prepare for a new character */
1890 0 : c=0;
1891 0 : goto loop;
1892 : } else {
1893 : uint8_t *p;
1894 :
1895 : /*
1896 : * We actually do this backwards here:
1897 : * In order to save an intermediate variable, we output
1898 : * first to the overflow buffer what does not fit into the
1899 : * regular target.
1900 : */
1901 : /* we know that 0<=targetCapacity<length<=4 */
1902 : /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
1903 0 : length-=targetCapacity;
1904 0 : p=(uint8_t *)cnv->charErrorBuffer;
1905 0 : switch(length) {
1906 : /* each branch falls through to the next one */
1907 : case 4:
1908 0 : *p++=(uint8_t)(c>>24);
1909 : U_FALLTHROUGH;
1910 : case 3:
1911 0 : *p++=(uint8_t)(c>>16);
1912 : U_FALLTHROUGH;
1913 : case 2:
1914 0 : *p++=(uint8_t)(c>>8);
1915 : U_FALLTHROUGH;
1916 : case 1:
1917 0 : *p=(uint8_t)c;
1918 : U_FALLTHROUGH;
1919 : default:
1920 : /* will never occur */
1921 0 : break;
1922 : }
1923 0 : cnv->charErrorBufferLength=(int8_t)length;
1924 :
1925 : /* now output what fits into the regular target */
1926 0 : c>>=8*length; /* length was reduced by targetCapacity */
1927 0 : switch(targetCapacity) {
1928 : /* each branch falls through to the next one */
1929 : case 3:
1930 0 : *target++=(uint8_t)(c>>16);
1931 : U_FALLTHROUGH;
1932 : case 2:
1933 0 : *target++=(uint8_t)(c>>8);
1934 : U_FALLTHROUGH;
1935 : case 1:
1936 0 : *target++=(uint8_t)c;
1937 : U_FALLTHROUGH;
1938 : default:
1939 0 : break;
1940 : }
1941 :
1942 : /* target overflow */
1943 0 : targetCapacity=0;
1944 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1945 0 : c=0;
1946 0 : goto endloop;
1947 : }
1948 : }
1949 :
1950 : /* miscellaneous ------------------------------------------------------------ */
1951 :
/*
 * Name callback for the SCSU converter (listed in the _SCSUImpl table).
 * Reads the SCSUData reached through cnv->extraInfo and returns a string
 * literal: "SCSU,locale=ja" when the stored locale is l_ja, plain "SCSU"
 * for every other locale value.
 * cnv->extraInfo is dereferenced without a NULL check.
 */
1952 : static const char * U_CALLCONV
1953 0 : _SCSUGetName(const UConverter *cnv) {
1954 0 : SCSUData *scsu=(SCSUData *)cnv->extraInfo;
1955 : /* only l_ja gets a locale suffix; all other locale values share the bare name */
1956 0 : switch(scsu->locale) {
1957 : case l_ja:
1958 0 : return "SCSU,locale=ja";
1959 : default:
1960 0 : return "SCSU";
1961 : }
1962 : }
1963 :
1964 : /* structure for SafeClone calculations: a UConverter followed by its SCSUData in one buffer; _SCSUSafeClone() reports sizeof(this struct) as the required clone size */
1965 : struct cloneSCSUStruct
1966 : {
1967 : UConverter cnv; /* the cloned converter; _SCSUSafeClone() returns the address of this member */
1968 : SCSUData mydata; /* the clone's own SCSU state; _SCSUSafeClone() points cnv.extraInfo at this member */
1969 : };
1970 :
/*
 * SafeClone callback for the SCSU converter (listed in the _SCSUImpl table).
 *
 * cnv         - converter being cloned; its extraInfo is copied as SCSUData
 * stackBuffer - destination memory, interpreted as a struct cloneSCSUStruct
 * pBufferSize - in: 0 requests preflighting; out (preflight only): receives
 *               sizeof(struct cloneSCSUStruct)
 * status      - if it already indicates failure, nothing is done
 *
 * Returns 0 (a null pointer) when status has failed or when preflighting,
 * otherwise the address of the UConverter member at the start of the clone
 * structure placed in stackBuffer.
 *
 * NOTE(review): a non-zero *pBufferSize is not compared against
 * bufferSizeNeeded here, and stackBuffer is not checked for NULL; presumably
 * the caller (ucnv_safeClone() in ucnv.c) guarantees both - confirm.
 */
1971 : static UConverter * U_CALLCONV
1972 0 : _SCSUSafeClone(const UConverter *cnv,
1973 : void *stackBuffer,
1974 : int32_t *pBufferSize,
1975 : UErrorCode *status)
1976 : {
1977 : struct cloneSCSUStruct * localClone;
1978 0 : int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
1979 : /* bail out without touching any output if an earlier step already failed */
1980 0 : if (U_FAILURE(*status)){
1981 0 : return 0;
1982 : }
1983 :
1984 0 : if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1985 0 : *pBufferSize = bufferSizeNeeded;
1986 0 : return 0;
1987 : }
1988 :
1989 0 : localClone = (struct cloneSCSUStruct *)stackBuffer;
1990 : /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1991 : /* so only the SCSU state is copied here, giving the clone a private copy instead of sharing the original's */
1992 0 : uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
1993 0 : localClone->cnv.extraInfo = &localClone->mydata;
1994 0 : localClone->cnv.isExtraLocal = TRUE; /* NOTE(review): presumably flags extraInfo as living inside the clone's own buffer rather than in a separate allocation - confirm where isExtraLocal is read */
1995 :
1996 0 : return &localClone->cnv;
1997 : }
1998 : U_CDECL_END
1999 :
/*
 * Implementation table for the SCSU converter: the converter type constant
 * followed by the callbacks defined in this file. The initializer is
 * positional, so the meaning of every slot - including what each NULL stands
 * for - is fixed by the field order of UConverterImpl, which is declared
 * outside this file (presumably in ucnv_cnv.h - confirm there before
 * reordering or inserting entries).
 * Referenced by the _SCSUData definition.
 */
2000 : static const UConverterImpl _SCSUImpl={
2001 : UCNV_SCSU,
2002 :
2003 : NULL,
2004 : NULL,
2005 :
2006 : _SCSUOpen,
2007 : _SCSUClose,
2008 : _SCSUReset,
2009 :
2010 : _SCSUToUnicode,
2011 : _SCSUToUnicodeWithOffsets,
2012 : _SCSUFromUnicode,
2013 : _SCSUFromUnicodeWithOffsets,
2014 : NULL,
2015 :
2016 : NULL,
2017 : _SCSUGetName,
2018 : NULL,
2019 : _SCSUSafeClone,
2020 : ucnv_getCompleteUnicodeSet, /* not defined in this file; NOTE(review): the name suggests SCSU reports the whole Unicode set as convertible - confirm */
2021 : NULL,
2022 : NULL
2023 : };
2024 :
/*
 * Static description of the SCSU converter, referenced by the _SCSUData
 * definition. The initializer is positional: entries without a comment
 * below take their meaning from the field order of UConverterStaticData,
 * which is declared outside this file - look it up there before editing.
 */
2025 : static const UConverterStaticData _SCSUStaticData={
2026 : sizeof(UConverterStaticData),
2027 : "SCSU", /* same literal that _SCSUGetName() returns for its default case */
2028 : 1212, /* CCSID for SCSU */
2029 : UCNV_IBM, UCNV_SCSU,
2030 : 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
2031 : /*
2032 : * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
2033 : * substitution string.
2034 : */
2035 : { 0x0e, 0xff, 0xfd, 0 }, 3, /* 0x0e is SQU (quote a single Unicode character), so the bytes spell a quoted U+fffd; presumably the trailing 3 is the subchar length - confirm */
2036 : FALSE, FALSE,
2037 : 0,
2038 : 0,
2039 : { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
2040 : };
2041 :
/*
 * Shared-data object for the SCSU converter, initialized from the static
 * description (_SCSUStaticData) and the implementation table (_SCSUImpl).
 * Unlike those two it is not declared static, so it is visible to other
 * translation units (presumably picked up by the converter registry - confirm).
 */
2042 : const UConverterSharedData _SCSUData=
2043 : UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
2044 :
2045 : #endif
|