Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *****************************************************************************
5 : *
6 : * Copyright (C) 1998-2016, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *****************************************************************************
10 : *
11 : * ucnv_err.c
12 : * Implements error behaviour functions called by T_UConverter_{from,to}Unicode
13 : *
14 : *
15 : * Change history:
16 : *
17 : * 06/29/2000 helena Major rewrite of the callback APIs.
18 : */
19 :
20 : #include "unicode/utypes.h"
21 :
22 : #if !UCONFIG_NO_CONVERSION
23 :
24 : #include "unicode/ucnv_err.h"
25 : #include "unicode/ucnv_cb.h"
26 : #include "ucnv_cnv.h"
27 : #include "cmemory.h"
28 : #include "unicode/ucnv.h"
29 : #include "ustrfmt.h"
30 :
/*
 * Capacity, in UChars, of the stack buffers that hold the escape text.
 * 48 = 6 (UChars in the longest per-unit escape written by the loops in this
 *         file, e.g. "%UXXXX" or "&#xFF;")
 *    * 8 (max number of bytes per char for any converter)
 */
#define VALUE_STRING_LENGTH 48

/* Code points of the ASCII characters used to assemble escape sequences. */
#define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025  /* % */
#define UNICODE_U_CODEPOINT             0x0055  /* U */
#define UNICODE_X_CODEPOINT             0x0058  /* X */
#define UNICODE_RS_CODEPOINT            0x005C  /* \ (reverse solidus) */
#define UNICODE_U_LOW_CODEPOINT         0x0075  /* u */
#define UNICODE_X_LOW_CODEPOINT         0x0078  /* x */
#define UNICODE_AMP_CODEPOINT           0x0026  /* & */
#define UNICODE_HASH_CODEPOINT          0x0023  /* # */
#define UNICODE_SEMICOLON_CODEPOINT     0x003B  /* ; */
#define UNICODE_PLUS_CODEPOINT          0x002B  /* + */
#define UNICODE_LEFT_CURLY_CODEPOINT    0x007B  /* { */
#define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D  /* } */
#define UNICODE_SPACE_CODEPOINT         0x0020  /* space */

/*
 * Escape styles. The escape callbacks compare the first char of their
 * context against these values to pick the output format.
 */
#define UCNV_PRV_ESCAPE_ICU      0
#define UCNV_PRV_ESCAPE_C        'C'
#define UCNV_PRV_ESCAPE_XML_DEC  'D'
#define UCNV_PRV_ESCAPE_XML_HEX  'X'
#define UCNV_PRV_ESCAPE_JAVA     'J'
#define UCNV_PRV_ESCAPE_UNICODE  'U'
#define UCNV_PRV_ESCAPE_CSS2     'S'

/*
 * Context char checked by the skip and substitute callbacks: when present,
 * only unassigned input is skipped/substituted and other errors are kept.
 */
#define UCNV_PRV_STOP_ON_ILLEGAL 'i'
54 :
/*
 * IS_DEFAULT_IGNORABLE_CODE_POINT
 * Checks whether a code point has the default ignorable Unicode property.
 * The list is hard coded here to avoid a dependency on other code, so it
 * needs to be updated if the set of default ignorable code points ever
 * changes.
 * When an unmappable code point is default ignorable, the default callbacks
 * in this file ignore it.
 * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
 *
 * This list should be kept in sync with the one in CharsetCallback.java
 *
 * The argument is fully parenthesized, so any integer expression may be
 * passed. It is evaluated several times and therefore must not have side
 * effects. Adjacent entries are written as single ranges.
 */
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
    ((c) == 0x00AD) || \
    ((c) == 0x034F) || \
    ((c) == 0x061C) || \
    (0x115F <= (c) && (c) <= 0x1160) || \
    (0x17B4 <= (c) && (c) <= 0x17B5) || \
    (0x180B <= (c) && (c) <= 0x180E) || \
    (0x200B <= (c) && (c) <= 0x200F) || \
    (0x202A <= (c) && (c) <= 0x202E) || \
    (0x2060 <= (c) && (c) <= 0x206F) || \
    ((c) == 0x3164) || \
    (0xFE00 <= (c) && (c) <= 0xFE0F) || \
    ((c) == 0xFEFF) || \
    ((c) == 0xFFA0) || \
    (0xFFF0 <= (c) && (c) <= 0xFFF8) || \
    (0x1BCA0 <= (c) && (c) <= 0x1BCA3) || \
    (0x1D173 <= (c) && (c) <= 0x1D17A) || \
    (0xE0000 <= (c) && (c) <= 0xE0FFF) \
    )
97 :
98 :
99 : /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
100 : U_CAPI void U_EXPORT2
101 0 : UCNV_FROM_U_CALLBACK_STOP (
102 : const void *context,
103 : UConverterFromUnicodeArgs *fromUArgs,
104 : const UChar* codeUnits,
105 : int32_t length,
106 : UChar32 codePoint,
107 : UConverterCallbackReason reason,
108 : UErrorCode * err)
109 : {
110 : (void)context;
111 : (void)fromUArgs;
112 : (void)codeUnits;
113 : (void)length;
114 0 : if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
115 : {
116 : /*
117 : * Skip if the codepoint has unicode property of default ignorable.
118 : */
119 0 : *err = U_ZERO_ERROR;
120 : }
121 : /* the caller must have set the error code accordingly */
122 0 : return;
123 : }
124 :
125 :
126 : /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
127 : U_CAPI void U_EXPORT2
128 0 : UCNV_TO_U_CALLBACK_STOP (
129 : const void *context,
130 : UConverterToUnicodeArgs *toUArgs,
131 : const char* codePoints,
132 : int32_t length,
133 : UConverterCallbackReason reason,
134 : UErrorCode * err)
135 : {
136 : /* the caller must have set the error code accordingly */
137 : (void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
138 0 : return;
139 : }
140 :
141 : U_CAPI void U_EXPORT2
142 0 : UCNV_FROM_U_CALLBACK_SKIP (
143 : const void *context,
144 : UConverterFromUnicodeArgs *fromUArgs,
145 : const UChar* codeUnits,
146 : int32_t length,
147 : UChar32 codePoint,
148 : UConverterCallbackReason reason,
149 : UErrorCode * err)
150 : {
151 : (void)fromUArgs;
152 : (void)codeUnits;
153 : (void)length;
154 0 : if (reason <= UCNV_IRREGULAR)
155 : {
156 0 : if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
157 : {
158 : /*
159 : * Skip if the codepoint has unicode property of default ignorable.
160 : */
161 0 : *err = U_ZERO_ERROR;
162 : }
163 0 : else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
164 : {
165 0 : *err = U_ZERO_ERROR;
166 : }
167 : /* else the caller must have set the error code accordingly. */
168 : }
169 : /* else ignore the reset, close and clone calls. */
170 0 : }
171 :
172 : U_CAPI void U_EXPORT2
173 0 : UCNV_FROM_U_CALLBACK_SUBSTITUTE (
174 : const void *context,
175 : UConverterFromUnicodeArgs *fromArgs,
176 : const UChar* codeUnits,
177 : int32_t length,
178 : UChar32 codePoint,
179 : UConverterCallbackReason reason,
180 : UErrorCode * err)
181 : {
182 : (void)codeUnits;
183 : (void)length;
184 0 : if (reason <= UCNV_IRREGULAR)
185 : {
186 0 : if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
187 : {
188 : /*
189 : * Skip if the codepoint has unicode property of default ignorable.
190 : */
191 0 : *err = U_ZERO_ERROR;
192 : }
193 0 : else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
194 : {
195 0 : *err = U_ZERO_ERROR;
196 0 : ucnv_cbFromUWriteSub(fromArgs, 0, err);
197 : }
198 : /* else the caller must have set the error code accordingly. */
199 : }
200 : /* else ignore the reset, close and clone calls. */
201 0 : }
202 :
203 : /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
204 : *uses a clean copy (resetted) of the converter, to convert that unicode
205 : *escape sequence to the target codepage (if conversion failure happens then
206 : *we revert to substituting with subchar)
207 : */
208 : U_CAPI void U_EXPORT2
209 0 : UCNV_FROM_U_CALLBACK_ESCAPE (
210 : const void *context,
211 : UConverterFromUnicodeArgs *fromArgs,
212 : const UChar *codeUnits,
213 : int32_t length,
214 : UChar32 codePoint,
215 : UConverterCallbackReason reason,
216 : UErrorCode * err)
217 : {
218 :
219 : UChar valueString[VALUE_STRING_LENGTH];
220 0 : int32_t valueStringLength = 0;
221 0 : int32_t i = 0;
222 :
223 0 : const UChar *myValueSource = NULL;
224 0 : UErrorCode err2 = U_ZERO_ERROR;
225 0 : UConverterFromUCallback original = NULL;
226 : const void *originalContext;
227 :
228 0 : UConverterFromUCallback ignoredCallback = NULL;
229 : const void *ignoredContext;
230 :
231 0 : if (reason > UCNV_IRREGULAR)
232 : {
233 0 : return;
234 : }
235 0 : else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
236 : {
237 : /*
238 : * Skip if the codepoint has unicode property of default ignorable.
239 : */
240 0 : *err = U_ZERO_ERROR;
241 0 : return;
242 : }
243 :
244 0 : ucnv_setFromUCallBack (fromArgs->converter,
245 : (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
246 : NULL,
247 : &original,
248 : &originalContext,
249 0 : &err2);
250 :
251 0 : if (U_FAILURE (err2))
252 : {
253 0 : *err = err2;
254 0 : return;
255 : }
256 0 : if(context==NULL)
257 : {
258 0 : while (i < length)
259 : {
260 0 : valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
261 0 : valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
262 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
263 : }
264 : }
265 : else
266 : {
267 0 : switch(*((char*)context))
268 : {
269 : case UCNV_PRV_ESCAPE_JAVA:
270 0 : while (i < length)
271 : {
272 0 : valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
273 0 : valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
274 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
275 : }
276 0 : break;
277 :
278 : case UCNV_PRV_ESCAPE_C:
279 0 : valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
280 :
281 0 : if(length==2){
282 0 : valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
283 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
284 :
285 : }
286 : else{
287 0 : valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
288 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
289 : }
290 0 : break;
291 :
292 : case UCNV_PRV_ESCAPE_XML_DEC:
293 :
294 0 : valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
295 0 : valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
296 0 : if(length==2){
297 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
298 : }
299 : else{
300 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
301 : }
302 0 : valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
303 0 : break;
304 :
305 : case UCNV_PRV_ESCAPE_XML_HEX:
306 :
307 0 : valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
308 0 : valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
309 0 : valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
310 0 : if(length==2){
311 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
312 : }
313 : else{
314 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
315 : }
316 0 : valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
317 0 : break;
318 :
319 : case UCNV_PRV_ESCAPE_UNICODE:
320 0 : valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
321 0 : valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
322 0 : valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
323 0 : if (length == 2) {
324 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
325 : } else {
326 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
327 : }
328 0 : valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
329 0 : break;
330 :
331 : case UCNV_PRV_ESCAPE_CSS2:
332 0 : valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
333 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
334 : /* Always add space character, becase the next character might be whitespace,
335 : which would erroneously be considered the termination of the escape sequence. */
336 0 : valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
337 0 : break;
338 :
339 : default:
340 0 : while (i < length)
341 : {
342 0 : valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
343 0 : valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
344 0 : valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
345 : }
346 : }
347 : }
348 0 : myValueSource = valueString;
349 :
350 : /* reset the error */
351 0 : *err = U_ZERO_ERROR;
352 :
353 0 : ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
354 :
355 0 : ucnv_setFromUCallBack (fromArgs->converter,
356 : original,
357 : originalContext,
358 : &ignoredCallback,
359 : &ignoredContext,
360 0 : &err2);
361 0 : if (U_FAILURE (err2))
362 : {
363 0 : *err = err2;
364 0 : return;
365 : }
366 :
367 0 : return;
368 : }
369 :
370 :
371 :
372 : U_CAPI void U_EXPORT2
373 0 : UCNV_TO_U_CALLBACK_SKIP (
374 : const void *context,
375 : UConverterToUnicodeArgs *toArgs,
376 : const char* codeUnits,
377 : int32_t length,
378 : UConverterCallbackReason reason,
379 : UErrorCode * err)
380 : {
381 : (void)toArgs;
382 : (void)codeUnits;
383 : (void)length;
384 0 : if (reason <= UCNV_IRREGULAR)
385 : {
386 0 : if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
387 : {
388 0 : *err = U_ZERO_ERROR;
389 : }
390 : /* else the caller must have set the error code accordingly. */
391 : }
392 : /* else ignore the reset, close and clone calls. */
393 0 : }
394 :
395 : U_CAPI void U_EXPORT2
396 0 : UCNV_TO_U_CALLBACK_SUBSTITUTE (
397 : const void *context,
398 : UConverterToUnicodeArgs *toArgs,
399 : const char* codeUnits,
400 : int32_t length,
401 : UConverterCallbackReason reason,
402 : UErrorCode * err)
403 : {
404 : (void)codeUnits;
405 : (void)length;
406 0 : if (reason <= UCNV_IRREGULAR)
407 : {
408 0 : if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
409 : {
410 0 : *err = U_ZERO_ERROR;
411 0 : ucnv_cbToUWriteSub(toArgs,0,err);
412 : }
413 : /* else the caller must have set the error code accordingly. */
414 : }
415 : /* else ignore the reset, close and clone calls. */
416 0 : }
417 :
418 : /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
419 : *and uses that as the substitution sequence
420 : */
421 : U_CAPI void U_EXPORT2
422 0 : UCNV_TO_U_CALLBACK_ESCAPE (
423 : const void *context,
424 : UConverterToUnicodeArgs *toArgs,
425 : const char* codeUnits,
426 : int32_t length,
427 : UConverterCallbackReason reason,
428 : UErrorCode * err)
429 : {
430 : UChar uniValueString[VALUE_STRING_LENGTH];
431 0 : int32_t valueStringLength = 0;
432 0 : int32_t i = 0;
433 :
434 0 : if (reason > UCNV_IRREGULAR)
435 : {
436 0 : return;
437 : }
438 :
439 0 : if(context==NULL)
440 : {
441 0 : while (i < length)
442 : {
443 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
444 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
445 0 : valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
446 : }
447 : }
448 : else
449 : {
450 0 : switch(*((char*)context))
451 : {
452 : case UCNV_PRV_ESCAPE_XML_DEC:
453 0 : while (i < length)
454 : {
455 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
456 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
457 0 : valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
458 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
459 : }
460 0 : break;
461 :
462 : case UCNV_PRV_ESCAPE_XML_HEX:
463 0 : while (i < length)
464 : {
465 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */
466 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */
467 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
468 0 : valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
469 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
470 : }
471 0 : break;
472 : case UCNV_PRV_ESCAPE_C:
473 0 : while (i < length)
474 : {
475 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */
476 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
477 0 : valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
478 : }
479 0 : break;
480 : default:
481 0 : while (i < length)
482 : {
483 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
484 0 : uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */
485 0 : uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
486 0 : valueStringLength += 2;
487 : }
488 : }
489 : }
490 : /* reset the error */
491 0 : *err = U_ZERO_ERROR;
492 :
493 0 : ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
494 : }
495 :
496 : #endif
|