Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (C) 2002-2016, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : * file name: ucnv_u7.c
9 : * encoding: UTF-8
10 : * tab size: 8 (not used)
11 : * indentation:4
12 : *
13 : * created on: 2002jul01
14 : * created by: Markus W. Scherer
15 : *
16 : * UTF-7 converter implementation. Used to be in ucnv_utf.c.
17 : */
18 :
19 : #include "unicode/utypes.h"
20 :
21 : #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
22 :
23 : #include "cmemory.h"
24 : #include "unicode/ucnv.h"
25 : #include "ucnv_bld.h"
26 : #include "ucnv_cnv.h"
27 : #include "uassert.h"
28 :
29 : /* UTF-7 -------------------------------------------------------------------- */
30 :
31 : /*
32 : * UTF-7 is a stateful encoding of Unicode.
33 : * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
34 : * It was intended for use in Internet email systems, using in its bytewise
35 : * encoding only a subset of 7-bit US-ASCII.
36 : * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
37 : * occasionally used.
38 : *
39 : * For converting Unicode to UTF-7, the RFC allows to encode some US-ASCII
40 : * characters directly or in base64. Especially, the characters in set O
41 : * as defined in the RFC (see below) may be encoded directly but are not
42 : * allowed in, e.g., email headers.
43 : * By default, the ICU UTF-7 converter encodes set O directly.
44 : * By choosing the option "version=1", set O will be escaped instead.
45 : * For example:
46 : * utf7Converter=ucnv_open("UTF-7,version=1");
47 : *
48 : * For details about email headers see RFC 2047.
49 : */
50 :
51 : /*
52 : * Tests for US-ASCII characters belonging to character classes
53 : * defined in UTF-7.
54 : *
55 : * Set D (directly encoded characters) consists of the following
56 : * characters: the upper and lower case letters A through Z
57 : * and a through z, the 10 digits 0-9, and the following nine special
58 : * characters (note that "+" and "=" are omitted):
59 : * '(),-./:?
60 : *
61 : * Set O (optional direct characters) consists of the following
62 : * characters (note that "\" and "~" are omitted):
63 : * !"#$%&*;<=>@[]^_`{|}
64 : *
65 : * According to the rules in RFC 2152, the byte values for the following
66 : * US-ASCII characters are not used in UTF-7 and are therefore illegal:
67 : * - all C0 control codes except for CR LF TAB
68 : * - BACKSLASH
69 : * - TILDE
70 : * - DEL
71 : * - all codes beyond US-ASCII, i.e. all >127
72 : */
/*
 * Character-class predicates for single US-ASCII code points.
 * Code points are written numerically (not as character literals) so that the
 * tests do not depend on the compiler's execution character set.
 * Each macro may evaluate its argument more than once.
 */

/* Set D: letters, digits and the nine specials '(),-./:? */
#define inSetD(c) \
    ((uint8_t)((c)-0x61)<26 || (uint8_t)((c)-0x41)<26 || /* a-z, A-Z */ \
     (uint8_t)((c)-0x30)<10 ||                           /* 0-9 */ \
     (uint8_t)((c)-0x27)<3 ||                            /* '() */ \
     (uint8_t)((c)-0x2c)<4 ||                            /* ,-./ */ \
     (c)==0x3a || (c)==0x3f                              /* :? */ \
    )

/* Set O: !"#$%&*;<=>@[]^_`{|} */
#define inSetO(c) \
    ((uint8_t)((c)-0x21)<6 ||                            /* !"#$%& */ \
     (uint8_t)((c)-0x3b)<4 ||                            /* ;<=> */ \
     (uint8_t)((c)-0x5d)<4 ||                            /* ]^_` */ \
     (uint8_t)((c)-0x7b)<3 ||                            /* {|} */ \
     (c)==0x2a || (c)==0x40 || (c)==0x5b                 /* *@[ */ \
    )

/* CR, LF or TAB */
#define isCRLFTAB(c) ((c)==0x0d || (c)==0x0a || (c)==0x09)
/* space, CR, LF or TAB */
#define isCRLFSPTAB(c) ((c)==0x20 || (c)==0x0d || (c)==0x0a || (c)==0x09)

#define PLUS 0x2b
#define MINUS 0x2d
#define BACKSLASH 0x5c
#define TILDE 0x7e

/*
 * Legal UTF-7 byte values: the 94 code points 0x20..0x7d (space up to, but not
 * including, tilde) except backslash, plus CR LF TAB.
 */
#define isLegalUTF7(c) (((uint8_t)((c)-0x20)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
99 :
100 : /* encode directly sets D and O and CR LF SP TAB */
101 : static const UBool encodeDirectlyMaximum[128]={
102 : /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
103 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
104 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 :
106 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
107 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108 :
109 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
111 :
112 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
114 : };
115 :
116 : /* encode directly set D and CR LF SP TAB but not set O */
117 : static const UBool encodeDirectlyRestricted[128]={
118 : /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
119 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
120 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 :
122 : 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
123 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
124 :
125 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
126 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
127 :
128 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
129 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
130 : };
131 :
/*
 * Maps a 6-bit value (0..63) to the US-ASCII code point of its base64 digit:
 * A-Z, a-z, 0-9, '+', '/'. Eight entries per row; the comment gives the index
 * of the first entry in the row.
 */
static const uint8_t
toBase64[64]={
    0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,     /*  0: A-H */
    0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,     /*  8: I-P */
    0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,     /* 16: Q-X */
    0x59, 0x5a, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,     /* 24: Y Z a-f */
    0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,     /* 32: g-n */
    0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76,     /* 40: o-v */
    0x77, 0x78, 0x79, 0x7a, 0x30, 0x31, 0x32, 0x33,     /* 48: w-z 0-3 */
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2b, 0x2f      /* 56: 4-9 + / */
};
145 :
/*
 * Classifies a US-ASCII byte (0..0x7f) for base64 decoding:
 *   0..63  value of a base64 digit
 *   -1     legal byte that is not a base64 digit
 *   -2     the minus sign
 *   -3     byte that is illegal in UTF-7
 */
static const int8_t
fromBase64[128]={
    /* 0x00: C0 controls; TAB LF CR are legal, the rest illegal */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
    /* 0x10: C0 controls, all illegal */
    -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

    /* 0x20: punctuation; + is 62, - is the special -2, / is 63 */
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
    /* 0x30: digits 0-9, then : ; < = > ? */
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

    /* 0x40: @ then A-O */
    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x50: P-Z, then [ \ ] ^ _ with backslash illegal */
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

    /* 0x60: ` then a-o */
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    /* 0x70: p-z, then { | } and the illegal ~ DEL */
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
};
165 :
166 : /*
167 : * converter status values:
168 : *
169 : * toUnicodeStatus:
170 : * 24 inDirectMode (boolean)
171 : * 23..16 base64Counter (-1..7)
172 : * 15..0 bits (up to 14 bits incoming base64)
173 : *
174 : * fromUnicodeStatus:
175 : * 31..28 version (0: set O direct 1: set O escaped)
176 : * 24 inDirectMode (boolean)
177 : * 23..16 base64Counter (0..2)
178 : * 7..0 bits (6 bits outgoing base64)
179 : *
180 : */
181 :
182 : U_CDECL_BEGIN
183 : static void U_CALLCONV
184 0 : _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
185 0 : if(choice<=UCNV_RESET_TO_UNICODE) {
186 : /* reset toUnicode */
187 0 : cnv->toUnicodeStatus=0x1000000; /* inDirectMode=TRUE */
188 0 : cnv->toULength=0;
189 : }
190 0 : if(choice!=UCNV_RESET_TO_UNICODE) {
191 : /* reset fromUnicode */
192 0 : cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
193 : }
194 0 : }
195 :
196 : static void U_CALLCONV
197 0 : _UTF7Open(UConverter *cnv,
198 : UConverterLoadArgs *pArgs,
199 : UErrorCode *pErrorCode) {
200 : (void)pArgs;
201 0 : if(UCNV_GET_VERSION(cnv)<=1) {
202 : /* TODO(markus): Should just use cnv->options rather than copying the version number. */
203 0 : cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
204 0 : _UTF7Reset(cnv, UCNV_RESET_BOTH);
205 : } else {
206 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
207 : }
208 0 : }
209 :
210 : static void U_CALLCONV
211 0 : _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
212 : UErrorCode *pErrorCode) {
213 : UConverter *cnv;
214 : const uint8_t *source, *sourceLimit;
215 : UChar *target;
216 : const UChar *targetLimit;
217 : int32_t *offsets;
218 :
219 : uint8_t *bytes;
220 : uint8_t byteIndex;
221 :
222 : int32_t length, targetCapacity;
223 :
224 : /* UTF-7 state */
225 : uint16_t bits;
226 : int8_t base64Counter;
227 : UBool inDirectMode;
228 :
229 : int8_t base64Value;
230 :
231 : int32_t sourceIndex, nextSourceIndex;
232 :
233 : uint8_t b;
234 : /* set up the local pointers */
235 0 : cnv=pArgs->converter;
236 :
237 0 : source=(const uint8_t *)pArgs->source;
238 0 : sourceLimit=(const uint8_t *)pArgs->sourceLimit;
239 0 : target=pArgs->target;
240 0 : targetLimit=pArgs->targetLimit;
241 0 : offsets=pArgs->offsets;
242 : /* get the state machine state */
243 : {
244 0 : uint32_t status=cnv->toUnicodeStatus;
245 0 : inDirectMode=(UBool)((status>>24)&1);
246 0 : base64Counter=(int8_t)(status>>16);
247 0 : bits=(uint16_t)status;
248 : }
249 0 : bytes=cnv->toUBytes;
250 0 : byteIndex=cnv->toULength;
251 :
252 : /* sourceIndex=-1 if the current character began in the previous buffer */
253 0 : sourceIndex=byteIndex==0 ? 0 : -1;
254 0 : nextSourceIndex=0;
255 :
256 0 : if(inDirectMode) {
257 : directMode:
258 : /*
259 : * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
260 : * with their US-ASCII byte values.
261 : * Backslash and Tilde and most control characters are not allowed in UTF-7.
262 : * A plus sign starts Unicode (or "escape") Mode.
263 : *
264 : * In Direct Mode, only the sourceIndex is used.
265 : */
266 0 : byteIndex=0;
267 0 : length=(int32_t)(sourceLimit-source);
268 0 : targetCapacity=(int32_t)(targetLimit-target);
269 0 : if(length>targetCapacity) {
270 0 : length=targetCapacity;
271 : }
272 0 : while(length>0) {
273 0 : b=*source++;
274 0 : if(!isLegalUTF7(b)) {
275 : /* illegal */
276 0 : bytes[0]=b;
277 0 : byteIndex=1;
278 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
279 0 : break;
280 0 : } else if(b!=PLUS) {
281 : /* write directly encoded character */
282 0 : *target++=b;
283 0 : if(offsets!=NULL) {
284 0 : *offsets++=sourceIndex++;
285 : }
286 : } else /* PLUS */ {
287 : /* switch to Unicode mode */
288 0 : nextSourceIndex=++sourceIndex;
289 0 : inDirectMode=FALSE;
290 0 : byteIndex=0;
291 0 : bits=0;
292 0 : base64Counter=-1;
293 0 : goto unicodeMode;
294 : }
295 0 : --length;
296 : }
297 0 : if(source<sourceLimit && target>=targetLimit) {
298 : /* target is full */
299 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
300 : }
301 : } else {
302 : unicodeMode:
303 : /*
304 : * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
305 : * The base64 sequence ends with any character that is not in the base64 alphabet.
306 : * A terminating minus sign is consumed.
307 : *
308 : * In Unicode Mode, the sourceIndex has the index to the start of the current
309 : * base64 bytes, while nextSourceIndex is precisely parallel to source,
310 : * keeping the index to the following byte.
311 : * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
312 : */
313 0 : while(source<sourceLimit) {
314 0 : if(target<targetLimit) {
315 0 : bytes[byteIndex++]=b=*source++;
316 0 : ++nextSourceIndex;
317 0 : base64Value = -3; /* initialize as illegal */
318 0 : if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
319 : /* either
320 : * base64Value==-1 for any legal character except base64 and minus sign, or
321 : * base64Value==-3 for illegal characters:
322 : * 1. In either case, leave Unicode mode.
323 : * 2.1. If we ended with an incomplete UChar or none after the +, then
324 : * generate an error for the preceding erroneous sequence and deal with
325 : * the current (possibly illegal) character next time through.
326 : * 2.2. Else the current char comes after a complete UChar, which was already
327 : * pushed to the output buf, so:
328 : * 2.2.1. If the current char is legal, just save it for processing next time.
329 : * It may be for example, a plus which we need to deal with in direct mode.
330 : * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
331 : */
332 0 : inDirectMode=TRUE;
333 0 : if(base64Counter==-1) {
334 : /* illegal: + immediately followed by something other than base64 or minus sign */
335 : /* include the plus sign in the reported sequence, but not the subsequent char */
336 0 : --source;
337 0 : bytes[0]=PLUS;
338 0 : byteIndex=1;
339 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
340 0 : break;
341 0 : } else if(bits!=0) {
342 : /* bits are illegally left over, a UChar is incomplete */
343 : /* don't include current char (legal or illegal) in error seq */
344 0 : --source;
345 0 : --byteIndex;
346 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
347 0 : break;
348 : } else {
349 : /* previous UChar was complete */
350 0 : if(base64Value==-3) {
351 : /* current character is illegal, deal with it here */
352 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
353 0 : break;
354 : } else {
355 : /* un-read the current character in case it is a plus sign */
356 0 : --source;
357 0 : sourceIndex=nextSourceIndex-1;
358 0 : goto directMode;
359 : }
360 : }
361 0 : } else if(base64Value>=0) {
362 : /* collect base64 bytes into UChars */
363 0 : switch(base64Counter) {
364 : case -1: /* -1 is immediately after the + */
365 : case 0:
366 0 : bits=base64Value;
367 0 : base64Counter=1;
368 0 : break;
369 : case 1:
370 : case 3:
371 : case 4:
372 : case 6:
373 0 : bits=(uint16_t)((bits<<6)|base64Value);
374 0 : ++base64Counter;
375 0 : break;
376 : case 2:
377 0 : *target++=(UChar)((bits<<4)|(base64Value>>2));
378 0 : if(offsets!=NULL) {
379 0 : *offsets++=sourceIndex;
380 0 : sourceIndex=nextSourceIndex-1;
381 : }
382 0 : bytes[0]=b; /* keep this byte in case an error occurs */
383 0 : byteIndex=1;
384 0 : bits=(uint16_t)(base64Value&3);
385 0 : base64Counter=3;
386 0 : break;
387 : case 5:
388 0 : *target++=(UChar)((bits<<2)|(base64Value>>4));
389 0 : if(offsets!=NULL) {
390 0 : *offsets++=sourceIndex;
391 0 : sourceIndex=nextSourceIndex-1;
392 : }
393 0 : bytes[0]=b; /* keep this byte in case an error occurs */
394 0 : byteIndex=1;
395 0 : bits=(uint16_t)(base64Value&15);
396 0 : base64Counter=6;
397 0 : break;
398 : case 7:
399 0 : *target++=(UChar)((bits<<6)|base64Value);
400 0 : if(offsets!=NULL) {
401 0 : *offsets++=sourceIndex;
402 0 : sourceIndex=nextSourceIndex;
403 : }
404 0 : byteIndex=0;
405 0 : bits=0;
406 0 : base64Counter=0;
407 0 : break;
408 : default:
409 : /* will never occur */
410 0 : break;
411 : }
412 : } else /*base64Value==-2*/ {
413 : /* minus sign terminates the base64 sequence */
414 0 : inDirectMode=TRUE;
415 0 : if(base64Counter==-1) {
416 : /* +- i.e. a minus immediately following a plus */
417 0 : *target++=PLUS;
418 0 : if(offsets!=NULL) {
419 0 : *offsets++=sourceIndex-1;
420 : }
421 : } else {
422 : /* absorb the minus and leave the Unicode Mode */
423 0 : if(bits!=0) {
424 : /* bits are illegally left over, a UChar is incomplete */
425 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
426 0 : break;
427 : }
428 : }
429 0 : sourceIndex=nextSourceIndex;
430 0 : goto directMode;
431 : }
432 : } else {
433 : /* target is full */
434 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
435 0 : break;
436 : }
437 : }
438 : }
439 :
440 0 : if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
441 : /*
442 : * if we are in Unicode mode, then the byteIndex might not be 0,
443 : * but that is ok if bits==0
444 : * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
445 : * (not true for IMAP-mailbox-name where we must end in direct mode)
446 : */
447 0 : byteIndex=0;
448 : }
449 :
450 : /* set the converter state back into UConverter */
451 0 : cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
452 0 : cnv->toULength=byteIndex;
453 :
454 : /* write back the updated pointers */
455 0 : pArgs->source=(const char *)source;
456 0 : pArgs->target=target;
457 0 : pArgs->offsets=offsets;
458 0 : return;
459 : }
460 :
461 : static void U_CALLCONV
462 0 : _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
463 : UErrorCode *pErrorCode) {
464 : UConverter *cnv;
465 : const UChar *source, *sourceLimit;
466 : uint8_t *target, *targetLimit;
467 : int32_t *offsets;
468 :
469 : int32_t length, targetCapacity, sourceIndex;
470 : UChar c;
471 :
472 : /* UTF-7 state */
473 : const UBool *encodeDirectly;
474 : uint8_t bits;
475 : int8_t base64Counter;
476 : UBool inDirectMode;
477 :
478 : /* set up the local pointers */
479 0 : cnv=pArgs->converter;
480 :
481 : /* set up the local pointers */
482 0 : source=pArgs->source;
483 0 : sourceLimit=pArgs->sourceLimit;
484 0 : target=(uint8_t *)pArgs->target;
485 0 : targetLimit=(uint8_t *)pArgs->targetLimit;
486 0 : offsets=pArgs->offsets;
487 :
488 : /* get the state machine state */
489 : {
490 0 : uint32_t status=cnv->fromUnicodeStatus;
491 0 : encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
492 0 : inDirectMode=(UBool)((status>>24)&1);
493 0 : base64Counter=(int8_t)(status>>16);
494 0 : bits=(uint8_t)status;
495 0 : U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
496 : }
497 :
498 : /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
499 0 : sourceIndex=0;
500 :
501 0 : if(inDirectMode) {
502 : directMode:
503 0 : length=(int32_t)(sourceLimit-source);
504 0 : targetCapacity=(int32_t)(targetLimit-target);
505 0 : if(length>targetCapacity) {
506 0 : length=targetCapacity;
507 : }
508 0 : while(length>0) {
509 0 : c=*source++;
510 : /* currently always encode CR LF SP TAB directly */
511 0 : if(c<=127 && encodeDirectly[c]) {
512 : /* encode directly */
513 0 : *target++=(uint8_t)c;
514 0 : if(offsets!=NULL) {
515 0 : *offsets++=sourceIndex++;
516 : }
517 0 : } else if(c==PLUS) {
518 : /* output +- for + */
519 0 : *target++=PLUS;
520 0 : if(target<targetLimit) {
521 0 : *target++=MINUS;
522 0 : if(offsets!=NULL) {
523 0 : *offsets++=sourceIndex;
524 0 : *offsets++=sourceIndex++;
525 : }
526 : /* realign length and targetCapacity */
527 0 : goto directMode;
528 : } else {
529 0 : if(offsets!=NULL) {
530 0 : *offsets++=sourceIndex++;
531 : }
532 0 : cnv->charErrorBuffer[0]=MINUS;
533 0 : cnv->charErrorBufferLength=1;
534 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
535 0 : break;
536 : }
537 : } else {
538 : /* un-read this character and switch to Unicode Mode */
539 0 : --source;
540 0 : *target++=PLUS;
541 0 : if(offsets!=NULL) {
542 0 : *offsets++=sourceIndex;
543 : }
544 0 : inDirectMode=FALSE;
545 0 : base64Counter=0;
546 0 : goto unicodeMode;
547 : }
548 0 : --length;
549 : }
550 0 : if(source<sourceLimit && target>=targetLimit) {
551 : /* target is full */
552 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
553 : }
554 : } else {
555 : unicodeMode:
556 0 : while(source<sourceLimit) {
557 0 : if(target<targetLimit) {
558 0 : c=*source++;
559 0 : if(c<=127 && encodeDirectly[c]) {
560 : /* encode directly */
561 0 : inDirectMode=TRUE;
562 :
563 : /* trick: back out this character to make this easier */
564 0 : --source;
565 :
566 : /* terminate the base64 sequence */
567 0 : if(base64Counter!=0) {
568 : /* write remaining bits for the previous character */
569 0 : *target++=toBase64[bits];
570 0 : if(offsets!=NULL) {
571 0 : *offsets++=sourceIndex-1;
572 : }
573 : }
574 0 : if(fromBase64[c]!=-1) {
575 : /* need to terminate with a minus */
576 0 : if(target<targetLimit) {
577 0 : *target++=MINUS;
578 0 : if(offsets!=NULL) {
579 0 : *offsets++=sourceIndex-1;
580 : }
581 : } else {
582 0 : cnv->charErrorBuffer[0]=MINUS;
583 0 : cnv->charErrorBufferLength=1;
584 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
585 0 : break;
586 : }
587 : }
588 0 : goto directMode;
589 : } else {
590 : /*
591 : * base64 this character:
592 : * Output 2 or 3 base64 bytes for the remaining bits of the previous character
593 : * and the bits of this character, each implicitly in UTF-16BE.
594 : *
595 : * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
596 : * character to the next. The actual 2 or 4 bits are shifted to the left edge
597 : * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
598 : */
599 0 : switch(base64Counter) {
600 : case 0:
601 0 : *target++=toBase64[c>>10];
602 0 : if(target<targetLimit) {
603 0 : *target++=toBase64[(c>>4)&0x3f];
604 0 : if(offsets!=NULL) {
605 0 : *offsets++=sourceIndex;
606 0 : *offsets++=sourceIndex++;
607 : }
608 : } else {
609 0 : if(offsets!=NULL) {
610 0 : *offsets++=sourceIndex++;
611 : }
612 0 : cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
613 0 : cnv->charErrorBufferLength=1;
614 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
615 : }
616 0 : bits=(uint8_t)((c&15)<<2);
617 0 : base64Counter=1;
618 0 : break;
619 : case 1:
620 0 : *target++=toBase64[bits|(c>>14)];
621 0 : if(target<targetLimit) {
622 0 : *target++=toBase64[(c>>8)&0x3f];
623 0 : if(target<targetLimit) {
624 0 : *target++=toBase64[(c>>2)&0x3f];
625 0 : if(offsets!=NULL) {
626 0 : *offsets++=sourceIndex;
627 0 : *offsets++=sourceIndex;
628 0 : *offsets++=sourceIndex++;
629 : }
630 : } else {
631 0 : if(offsets!=NULL) {
632 0 : *offsets++=sourceIndex;
633 0 : *offsets++=sourceIndex++;
634 : }
635 0 : cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
636 0 : cnv->charErrorBufferLength=1;
637 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
638 : }
639 : } else {
640 0 : if(offsets!=NULL) {
641 0 : *offsets++=sourceIndex++;
642 : }
643 0 : cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
644 0 : cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
645 0 : cnv->charErrorBufferLength=2;
646 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
647 : }
648 0 : bits=(uint8_t)((c&3)<<4);
649 0 : base64Counter=2;
650 0 : break;
651 : case 2:
652 0 : *target++=toBase64[bits|(c>>12)];
653 0 : if(target<targetLimit) {
654 0 : *target++=toBase64[(c>>6)&0x3f];
655 0 : if(target<targetLimit) {
656 0 : *target++=toBase64[c&0x3f];
657 0 : if(offsets!=NULL) {
658 0 : *offsets++=sourceIndex;
659 0 : *offsets++=sourceIndex;
660 0 : *offsets++=sourceIndex++;
661 : }
662 : } else {
663 0 : if(offsets!=NULL) {
664 0 : *offsets++=sourceIndex;
665 0 : *offsets++=sourceIndex++;
666 : }
667 0 : cnv->charErrorBuffer[0]=toBase64[c&0x3f];
668 0 : cnv->charErrorBufferLength=1;
669 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
670 : }
671 : } else {
672 0 : if(offsets!=NULL) {
673 0 : *offsets++=sourceIndex++;
674 : }
675 0 : cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
676 0 : cnv->charErrorBuffer[1]=toBase64[c&0x3f];
677 0 : cnv->charErrorBufferLength=2;
678 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
679 : }
680 0 : bits=0;
681 0 : base64Counter=0;
682 0 : break;
683 : default:
684 : /* will never occur */
685 0 : break;
686 : }
687 : }
688 : } else {
689 : /* target is full */
690 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
691 0 : break;
692 : }
693 : }
694 : }
695 :
696 0 : if(pArgs->flush && source>=sourceLimit) {
697 : /* flush remaining bits to the target */
698 0 : if(!inDirectMode) {
699 0 : if (base64Counter!=0) {
700 0 : if(target<targetLimit) {
701 0 : *target++=toBase64[bits];
702 0 : if(offsets!=NULL) {
703 0 : *offsets++=sourceIndex-1;
704 : }
705 : } else {
706 0 : cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
707 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
708 : }
709 : }
710 : /* Add final MINUS to terminate unicodeMode */
711 0 : if(target<targetLimit) {
712 0 : *target++=MINUS;
713 0 : if(offsets!=NULL) {
714 0 : *offsets++=sourceIndex-1;
715 : }
716 : } else {
717 0 : cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
718 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
719 : }
720 : }
721 : /* reset the state for the next conversion */
722 0 : cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
723 : } else {
724 : /* set the converter state back into UConverter */
725 0 : cnv->fromUnicodeStatus=
726 0 : (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
727 0 : ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
728 : }
729 :
730 : /* write back the updated pointers */
731 0 : pArgs->source=source;
732 0 : pArgs->target=(char *)target;
733 0 : pArgs->offsets=offsets;
734 0 : return;
735 : }
736 :
737 : static const char * U_CALLCONV
738 0 : _UTF7GetName(const UConverter *cnv) {
739 0 : switch(cnv->fromUnicodeStatus>>28) {
740 : case 1:
741 0 : return "UTF-7,version=1";
742 : default:
743 0 : return "UTF-7";
744 : }
745 : }
746 : U_CDECL_END
747 :
748 : static const UConverterImpl _UTF7Impl={
749 : UCNV_UTF7,
750 :
751 : NULL,
752 : NULL,
753 :
754 : _UTF7Open,
755 : NULL,
756 : _UTF7Reset,
757 :
758 : _UTF7ToUnicodeWithOffsets,
759 : _UTF7ToUnicodeWithOffsets,
760 : _UTF7FromUnicodeWithOffsets,
761 : _UTF7FromUnicodeWithOffsets,
762 : NULL,
763 :
764 : NULL,
765 : _UTF7GetName,
766 : NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
767 : NULL,
768 : ucnv_getCompleteUnicodeSet,
769 :
770 : NULL,
771 : NULL
772 : };
773 :
774 : static const UConverterStaticData _UTF7StaticData={
775 : sizeof(UConverterStaticData),
776 : "UTF-7",
777 : 0, /* TODO CCSID for UTF-7 */
778 : UCNV_IBM, UCNV_UTF7,
779 : 1, 4,
780 : { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
781 : FALSE, FALSE,
782 : 0,
783 : 0,
784 : { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
785 : };
786 :
787 : const UConverterSharedData _UTF7Data=
788 : UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
789 :
790 : /* IMAP mailbox name encoding ----------------------------------------------- */
791 :
792 : /*
793 : * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
794 : * http://www.ietf.org/rfc/rfc2060.txt
795 : *
796 : * 5.1.3. Mailbox International Naming Convention
797 : *
798 : * By convention, international mailbox names are specified using a
799 : * modified version of the UTF-7 encoding described in [UTF-7]. The
800 : * purpose of these modifications is to correct the following problems
801 : * with UTF-7:
802 : *
803 : * 1) UTF-7 uses the "+" character for shifting; this conflicts with
804 : * the common use of "+" in mailbox names, in particular USENET
805 : * newsgroup names.
806 : *
807 : * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
808 : * conflicts with the use of "/" as a popular hierarchy delimiter.
809 : *
810 : * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
811 : * the use of "\" as a popular hierarchy delimiter.
812 : *
813 : * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
814 : * the use of "~" in some servers as a home directory indicator.
815 : *
816 : * 5) UTF-7 permits multiple alternate forms to represent the same
817 : * string; in particular, printable US-ASCII chararacters can be
818 : * represented in encoded form.
819 : *
820 : * In modified UTF-7, printable US-ASCII characters except for "&"
821 : * represent themselves; that is, characters with octet values 0x20-0x25
822 : * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
823 : * octet sequence "&-".
824 : *
825 : * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
826 : * Unicode 16-bit octets) are represented in modified BASE64, with a
827 : * further modification from [UTF-7] that "," is used instead of "/".
828 : * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
829 : * character which can represent itself.
830 : *
831 : * "&" is used to shift to modified BASE64 and "-" to shift back to US-
832 : * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
833 : * is, a name that ends with a Unicode 16-bit octet MUST end with a "-
834 : * ").
835 : *
836 : * For example, here is a mailbox name which mixes English, Japanese,
837 : * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
838 : */
839 :
840 : /*
841 : * Tests for US-ASCII characters belonging to character classes
842 : * defined in UTF-7.
843 : *
844 : * Set D (directly encoded characters) consists of the following
845 : * characters: the upper and lower case letters A through Z
846 : * and a through z, the 10 digits 0-9, and the following nine special
847 : * characters (note that "+" and "=" are omitted):
848 : * '(),-./:?
849 : *
850 : * Set O (optional direct characters) consists of the following
851 : * characters (note that "\" and "~" are omitted):
852 : * !"#$%&*;<=>@[]^_`{|}
853 : *
854 : * According to the rules in RFC 2152, the byte values for the following
855 : * US-ASCII characters are not used in UTF-7 and are therefore illegal:
856 : * - all C0 control codes except for CR LF TAB
857 : * - BACKSLASH
858 : * - TILDE
859 : * - DEL
860 : * - all codes beyond US-ASCII, i.e. all >127
861 : */
862 :
863 : /* uses '&' not '+' to start a base64 sequence */
/*
 * Code points with a special role in the IMAP mailbox name encoding:
 * '&' starts a base64 sequence, ',' replaces '/' as the 64th base64 digit.
 */
#define AMPERSAND 0x26
#define COMMA 0x2c
#define SLASH 0x2f

/*
 * legal byte values: all US-ASCII graphic characters 0x20..0x7e
 * (the argument is evaluated twice)
 */
#define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

/*
 * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
 * The argument is parenthesized so that an expression argument such as x|y
 * is compared as a whole.
 */
#define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

/* 6-bit value -> base64 digit; value 63 maps to ',' instead of '/' */
#define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)
/* byte -> 6-bit value; ',' is digit 63 and '/' is an ordinary non-base64 byte (-1) */
#define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
876 :
877 : /*
878 : * converter status values:
879 : *
880 : * toUnicodeStatus:
881 : * 24 inDirectMode (boolean)
882 : * 23..16 base64Counter (-1..7)
883 : * 15..0 bits (up to 14 bits incoming base64)
884 : *
885 : * fromUnicodeStatus:
886 : * 24 inDirectMode (boolean)
887 : * 23..16 base64Counter (0..2)
888 : * 7..0 bits (6 bits outgoing base64)
889 : *
890 : * ignore bits 31..25
891 : */
892 :
893 : U_CDECL_BEGIN
894 : static void U_CALLCONV
895 0 : _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
896 : UErrorCode *pErrorCode) {
897 : UConverter *cnv;
898 : const uint8_t *source, *sourceLimit;
899 : UChar *target;
900 : const UChar *targetLimit;
901 : int32_t *offsets;
902 :
903 : uint8_t *bytes;
904 : uint8_t byteIndex;
905 :
906 : int32_t length, targetCapacity;
907 :
908 : /* UTF-7 state */
909 : uint16_t bits;
910 : int8_t base64Counter;
911 : UBool inDirectMode;
912 :
913 : int8_t base64Value;
914 :
915 : int32_t sourceIndex, nextSourceIndex;
916 :
917 : UChar c;
918 : uint8_t b;
919 :
920 : /* set up the local pointers */
921 0 : cnv=pArgs->converter;
922 :
923 0 : source=(const uint8_t *)pArgs->source;
924 0 : sourceLimit=(const uint8_t *)pArgs->sourceLimit;
925 0 : target=pArgs->target;
926 0 : targetLimit=pArgs->targetLimit;
927 0 : offsets=pArgs->offsets;
928 : /* get the state machine state */
929 : {
930 0 : uint32_t status=cnv->toUnicodeStatus;
931 0 : inDirectMode=(UBool)((status>>24)&1);
932 0 : base64Counter=(int8_t)(status>>16);
933 0 : bits=(uint16_t)status;
934 : }
935 0 : bytes=cnv->toUBytes;
936 0 : byteIndex=cnv->toULength;
937 :
938 : /* sourceIndex=-1 if the current character began in the previous buffer */
939 0 : sourceIndex=byteIndex==0 ? 0 : -1;
940 0 : nextSourceIndex=0;
941 :
942 0 : if(inDirectMode) {
943 : directMode:
944 : /*
945 : * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
946 : * with their US-ASCII byte values.
947 : * An ampersand starts Unicode (or "escape") Mode.
948 : *
949 : * In Direct Mode, only the sourceIndex is used.
950 : */
951 0 : byteIndex=0;
952 0 : length=(int32_t)(sourceLimit-source);
953 0 : targetCapacity=(int32_t)(targetLimit-target);
954 0 : if(length>targetCapacity) {
955 0 : length=targetCapacity;
956 : }
957 0 : while(length>0) {
958 0 : b=*source++;
959 0 : if(!isLegalIMAP(b)) {
960 : /* illegal */
961 0 : bytes[0]=b;
962 0 : byteIndex=1;
963 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
964 0 : break;
965 0 : } else if(b!=AMPERSAND) {
966 : /* write directly encoded character */
967 0 : *target++=b;
968 0 : if(offsets!=NULL) {
969 0 : *offsets++=sourceIndex++;
970 : }
971 : } else /* AMPERSAND */ {
972 : /* switch to Unicode mode */
973 0 : nextSourceIndex=++sourceIndex;
974 0 : inDirectMode=FALSE;
975 0 : byteIndex=0;
976 0 : bits=0;
977 0 : base64Counter=-1;
978 0 : goto unicodeMode;
979 : }
980 0 : --length;
981 : }
982 0 : if(source<sourceLimit && target>=targetLimit) {
983 : /* target is full */
984 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
985 : }
986 : } else {
987 : unicodeMode:
988 : /*
989 : * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
990 : * The base64 sequence ends with any character that is not in the base64 alphabet.
991 : * A terminating minus sign is consumed.
992 : * US-ASCII must not be base64-ed.
993 : *
994 : * In Unicode Mode, the sourceIndex has the index to the start of the current
995 : * base64 bytes, while nextSourceIndex is precisely parallel to source,
996 : * keeping the index to the following byte.
997 : * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
998 : */
999 0 : while(source<sourceLimit) {
1000 0 : if(target<targetLimit) {
1001 0 : bytes[byteIndex++]=b=*source++;
1002 0 : ++nextSourceIndex;
1003 0 : if(b>0x7e) {
1004 : /* illegal - test other illegal US-ASCII values by base64Value==-3 */
1005 0 : inDirectMode=TRUE;
1006 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1007 0 : break;
1008 0 : } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
1009 : /* collect base64 bytes into UChars */
1010 0 : switch(base64Counter) {
1011 : case -1: /* -1 is immediately after the & */
1012 : case 0:
1013 0 : bits=base64Value;
1014 0 : base64Counter=1;
1015 0 : break;
1016 : case 1:
1017 : case 3:
1018 : case 4:
1019 : case 6:
1020 0 : bits=(uint16_t)((bits<<6)|base64Value);
1021 0 : ++base64Counter;
1022 0 : break;
1023 : case 2:
1024 0 : c=(UChar)((bits<<4)|(base64Value>>2));
1025 0 : if(isLegalIMAP(c)) {
1026 : /* illegal */
1027 0 : inDirectMode=TRUE;
1028 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1029 0 : goto endloop;
1030 : }
1031 0 : *target++=c;
1032 0 : if(offsets!=NULL) {
1033 0 : *offsets++=sourceIndex;
1034 0 : sourceIndex=nextSourceIndex-1;
1035 : }
1036 0 : bytes[0]=b; /* keep this byte in case an error occurs */
1037 0 : byteIndex=1;
1038 0 : bits=(uint16_t)(base64Value&3);
1039 0 : base64Counter=3;
1040 0 : break;
1041 : case 5:
1042 0 : c=(UChar)((bits<<2)|(base64Value>>4));
1043 0 : if(isLegalIMAP(c)) {
1044 : /* illegal */
1045 0 : inDirectMode=TRUE;
1046 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1047 0 : goto endloop;
1048 : }
1049 0 : *target++=c;
1050 0 : if(offsets!=NULL) {
1051 0 : *offsets++=sourceIndex;
1052 0 : sourceIndex=nextSourceIndex-1;
1053 : }
1054 0 : bytes[0]=b; /* keep this byte in case an error occurs */
1055 0 : byteIndex=1;
1056 0 : bits=(uint16_t)(base64Value&15);
1057 0 : base64Counter=6;
1058 0 : break;
1059 : case 7:
1060 0 : c=(UChar)((bits<<6)|base64Value);
1061 0 : if(isLegalIMAP(c)) {
1062 : /* illegal */
1063 0 : inDirectMode=TRUE;
1064 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1065 0 : goto endloop;
1066 : }
1067 0 : *target++=c;
1068 0 : if(offsets!=NULL) {
1069 0 : *offsets++=sourceIndex;
1070 0 : sourceIndex=nextSourceIndex;
1071 : }
1072 0 : byteIndex=0;
1073 0 : bits=0;
1074 0 : base64Counter=0;
1075 0 : break;
1076 : default:
1077 : /* will never occur */
1078 0 : break;
1079 : }
1080 0 : } else if(base64Value==-2) {
1081 : /* minus sign terminates the base64 sequence */
1082 0 : inDirectMode=TRUE;
1083 0 : if(base64Counter==-1) {
1084 : /* &- i.e. a minus immediately following an ampersand */
1085 0 : *target++=AMPERSAND;
1086 0 : if(offsets!=NULL) {
1087 0 : *offsets++=sourceIndex-1;
1088 : }
1089 : } else {
1090 : /* absorb the minus and leave the Unicode Mode */
1091 0 : if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
1092 : /* bits are illegally left over, a UChar is incomplete */
1093 : /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
1094 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1095 0 : break;
1096 : }
1097 : }
1098 0 : sourceIndex=nextSourceIndex;
1099 0 : goto directMode;
1100 : } else {
1101 0 : if(base64Counter==-1) {
1102 : /* illegal: & immediately followed by something other than base64 or minus sign */
1103 : /* include the ampersand in the reported sequence */
1104 0 : --sourceIndex;
1105 0 : bytes[0]=AMPERSAND;
1106 0 : bytes[1]=b;
1107 0 : byteIndex=2;
1108 : }
1109 : /* base64Value==-1 for characters that are illegal only in Unicode mode */
1110 : /* base64Value==-3 for illegal characters */
1111 : /* illegal */
1112 0 : inDirectMode=TRUE;
1113 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
1114 0 : break;
1115 : }
1116 : } else {
1117 : /* target is full */
1118 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1119 0 : break;
1120 : }
1121 : }
1122 : }
1123 : endloop:
1124 :
1125 : /*
1126 : * the end of the input stream and detection of truncated input
1127 : * are handled by the framework, but here we must check if we are in Unicode
1128 : * mode and byteIndex==0 because we must end in direct mode
1129 : *
1130 : * conditions:
1131 : * successful
1132 : * in Unicode mode and byteIndex==0
1133 : * end of input and no truncated input
1134 : */
1135 0 : if( U_SUCCESS(*pErrorCode) &&
1136 0 : !inDirectMode && byteIndex==0 &&
1137 0 : pArgs->flush && source>=sourceLimit
1138 : ) {
1139 0 : if(base64Counter==-1) {
1140 : /* & at the very end of the input */
1141 : /* make the ampersand the reported sequence */
1142 0 : bytes[0]=AMPERSAND;
1143 0 : byteIndex=1;
1144 : }
1145 : /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
1146 :
1147 0 : inDirectMode=TRUE; /* avoid looping */
1148 0 : *pErrorCode=U_TRUNCATED_CHAR_FOUND;
1149 : }
1150 :
1151 : /* set the converter state back into UConverter */
1152 0 : cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
1153 0 : cnv->toULength=byteIndex;
1154 :
1155 : /* write back the updated pointers */
1156 0 : pArgs->source=(const char *)source;
1157 0 : pArgs->target=target;
1158 0 : pArgs->offsets=offsets;
1159 0 : return;
1160 : }
1161 :
1162 : static void U_CALLCONV
1163 0 : _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
1164 : UErrorCode *pErrorCode) {
1165 : UConverter *cnv;
1166 : const UChar *source, *sourceLimit;
1167 : uint8_t *target, *targetLimit;
1168 : int32_t *offsets;
1169 :
1170 : int32_t length, targetCapacity, sourceIndex;
1171 : UChar c;
1172 : uint8_t b;
1173 :
1174 : /* UTF-7 state */
1175 : uint8_t bits;
1176 : int8_t base64Counter;
1177 : UBool inDirectMode;
1178 :
1179 : /* set up the local pointers */
1180 0 : cnv=pArgs->converter;
1181 :
1182 : /* set up the local pointers */
1183 0 : source=pArgs->source;
1184 0 : sourceLimit=pArgs->sourceLimit;
1185 0 : target=(uint8_t *)pArgs->target;
1186 0 : targetLimit=(uint8_t *)pArgs->targetLimit;
1187 0 : offsets=pArgs->offsets;
1188 :
1189 : /* get the state machine state */
1190 : {
1191 0 : uint32_t status=cnv->fromUnicodeStatus;
1192 0 : inDirectMode=(UBool)((status>>24)&1);
1193 0 : base64Counter=(int8_t)(status>>16);
1194 0 : bits=(uint8_t)status;
1195 : }
1196 :
1197 : /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
1198 0 : sourceIndex=0;
1199 :
1200 0 : if(inDirectMode) {
1201 : directMode:
1202 0 : length=(int32_t)(sourceLimit-source);
1203 0 : targetCapacity=(int32_t)(targetLimit-target);
1204 0 : if(length>targetCapacity) {
1205 0 : length=targetCapacity;
1206 : }
1207 0 : while(length>0) {
1208 0 : c=*source++;
1209 : /* encode 0x20..0x7e except '&' directly */
1210 0 : if(inSetDIMAP(c)) {
1211 : /* encode directly */
1212 0 : *target++=(uint8_t)c;
1213 0 : if(offsets!=NULL) {
1214 0 : *offsets++=sourceIndex++;
1215 : }
1216 0 : } else if(c==AMPERSAND) {
1217 : /* output &- for & */
1218 0 : *target++=AMPERSAND;
1219 0 : if(target<targetLimit) {
1220 0 : *target++=MINUS;
1221 0 : if(offsets!=NULL) {
1222 0 : *offsets++=sourceIndex;
1223 0 : *offsets++=sourceIndex++;
1224 : }
1225 : /* realign length and targetCapacity */
1226 0 : goto directMode;
1227 : } else {
1228 0 : if(offsets!=NULL) {
1229 0 : *offsets++=sourceIndex++;
1230 : }
1231 0 : cnv->charErrorBuffer[0]=MINUS;
1232 0 : cnv->charErrorBufferLength=1;
1233 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1234 0 : break;
1235 : }
1236 : } else {
1237 : /* un-read this character and switch to Unicode Mode */
1238 0 : --source;
1239 0 : *target++=AMPERSAND;
1240 0 : if(offsets!=NULL) {
1241 0 : *offsets++=sourceIndex;
1242 : }
1243 0 : inDirectMode=FALSE;
1244 0 : base64Counter=0;
1245 0 : goto unicodeMode;
1246 : }
1247 0 : --length;
1248 : }
1249 0 : if(source<sourceLimit && target>=targetLimit) {
1250 : /* target is full */
1251 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1252 : }
1253 : } else {
1254 : unicodeMode:
1255 0 : while(source<sourceLimit) {
1256 0 : if(target<targetLimit) {
1257 0 : c=*source++;
1258 0 : if(isLegalIMAP(c)) {
1259 : /* encode directly */
1260 0 : inDirectMode=TRUE;
1261 :
1262 : /* trick: back out this character to make this easier */
1263 0 : --source;
1264 :
1265 : /* terminate the base64 sequence */
1266 0 : if(base64Counter!=0) {
1267 : /* write remaining bits for the previous character */
1268 0 : *target++=TO_BASE64_IMAP(bits);
1269 0 : if(offsets!=NULL) {
1270 0 : *offsets++=sourceIndex-1;
1271 : }
1272 : }
1273 : /* need to terminate with a minus */
1274 0 : if(target<targetLimit) {
1275 0 : *target++=MINUS;
1276 0 : if(offsets!=NULL) {
1277 0 : *offsets++=sourceIndex-1;
1278 : }
1279 : } else {
1280 0 : cnv->charErrorBuffer[0]=MINUS;
1281 0 : cnv->charErrorBufferLength=1;
1282 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1283 0 : break;
1284 : }
1285 0 : goto directMode;
1286 : } else {
1287 : /*
1288 : * base64 this character:
1289 : * Output 2 or 3 base64 bytes for the remaining bits of the previous character
1290 : * and the bits of this character, each implicitly in UTF-16BE.
1291 : *
1292 : * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
1293 : * character to the next. The actual 2 or 4 bits are shifted to the left edge
1294 : * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
1295 : */
1296 0 : switch(base64Counter) {
1297 : case 0:
1298 0 : b=(uint8_t)(c>>10);
1299 0 : *target++=TO_BASE64_IMAP(b);
1300 0 : if(target<targetLimit) {
1301 0 : b=(uint8_t)((c>>4)&0x3f);
1302 0 : *target++=TO_BASE64_IMAP(b);
1303 0 : if(offsets!=NULL) {
1304 0 : *offsets++=sourceIndex;
1305 0 : *offsets++=sourceIndex++;
1306 : }
1307 : } else {
1308 0 : if(offsets!=NULL) {
1309 0 : *offsets++=sourceIndex++;
1310 : }
1311 0 : b=(uint8_t)((c>>4)&0x3f);
1312 0 : cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1313 0 : cnv->charErrorBufferLength=1;
1314 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1315 : }
1316 0 : bits=(uint8_t)((c&15)<<2);
1317 0 : base64Counter=1;
1318 0 : break;
1319 : case 1:
1320 0 : b=(uint8_t)(bits|(c>>14));
1321 0 : *target++=TO_BASE64_IMAP(b);
1322 0 : if(target<targetLimit) {
1323 0 : b=(uint8_t)((c>>8)&0x3f);
1324 0 : *target++=TO_BASE64_IMAP(b);
1325 0 : if(target<targetLimit) {
1326 0 : b=(uint8_t)((c>>2)&0x3f);
1327 0 : *target++=TO_BASE64_IMAP(b);
1328 0 : if(offsets!=NULL) {
1329 0 : *offsets++=sourceIndex;
1330 0 : *offsets++=sourceIndex;
1331 0 : *offsets++=sourceIndex++;
1332 : }
1333 : } else {
1334 0 : if(offsets!=NULL) {
1335 0 : *offsets++=sourceIndex;
1336 0 : *offsets++=sourceIndex++;
1337 : }
1338 0 : b=(uint8_t)((c>>2)&0x3f);
1339 0 : cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1340 0 : cnv->charErrorBufferLength=1;
1341 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1342 : }
1343 : } else {
1344 0 : if(offsets!=NULL) {
1345 0 : *offsets++=sourceIndex++;
1346 : }
1347 0 : b=(uint8_t)((c>>8)&0x3f);
1348 0 : cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1349 0 : b=(uint8_t)((c>>2)&0x3f);
1350 0 : cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1351 0 : cnv->charErrorBufferLength=2;
1352 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1353 : }
1354 0 : bits=(uint8_t)((c&3)<<4);
1355 0 : base64Counter=2;
1356 0 : break;
1357 : case 2:
1358 0 : b=(uint8_t)(bits|(c>>12));
1359 0 : *target++=TO_BASE64_IMAP(b);
1360 0 : if(target<targetLimit) {
1361 0 : b=(uint8_t)((c>>6)&0x3f);
1362 0 : *target++=TO_BASE64_IMAP(b);
1363 0 : if(target<targetLimit) {
1364 0 : b=(uint8_t)(c&0x3f);
1365 0 : *target++=TO_BASE64_IMAP(b);
1366 0 : if(offsets!=NULL) {
1367 0 : *offsets++=sourceIndex;
1368 0 : *offsets++=sourceIndex;
1369 0 : *offsets++=sourceIndex++;
1370 : }
1371 : } else {
1372 0 : if(offsets!=NULL) {
1373 0 : *offsets++=sourceIndex;
1374 0 : *offsets++=sourceIndex++;
1375 : }
1376 0 : b=(uint8_t)(c&0x3f);
1377 0 : cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1378 0 : cnv->charErrorBufferLength=1;
1379 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1380 : }
1381 : } else {
1382 0 : if(offsets!=NULL) {
1383 0 : *offsets++=sourceIndex++;
1384 : }
1385 0 : b=(uint8_t)((c>>6)&0x3f);
1386 0 : cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
1387 0 : b=(uint8_t)(c&0x3f);
1388 0 : cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
1389 0 : cnv->charErrorBufferLength=2;
1390 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1391 : }
1392 0 : bits=0;
1393 0 : base64Counter=0;
1394 0 : break;
1395 : default:
1396 : /* will never occur */
1397 0 : break;
1398 : }
1399 : }
1400 : } else {
1401 : /* target is full */
1402 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1403 0 : break;
1404 : }
1405 : }
1406 : }
1407 :
1408 0 : if(pArgs->flush && source>=sourceLimit) {
1409 : /* flush remaining bits to the target */
1410 0 : if(!inDirectMode) {
1411 0 : if(base64Counter!=0) {
1412 0 : if(target<targetLimit) {
1413 0 : *target++=TO_BASE64_IMAP(bits);
1414 0 : if(offsets!=NULL) {
1415 0 : *offsets++=sourceIndex-1;
1416 : }
1417 : } else {
1418 0 : cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
1419 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1420 : }
1421 : }
1422 : /* need to terminate with a minus */
1423 0 : if(target<targetLimit) {
1424 0 : *target++=MINUS;
1425 0 : if(offsets!=NULL) {
1426 0 : *offsets++=sourceIndex-1;
1427 : }
1428 : } else {
1429 0 : cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
1430 0 : *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1431 : }
1432 : }
1433 : /* reset the state for the next conversion */
1434 0 : cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=TRUE */
1435 : } else {
1436 : /* set the converter state back into UConverter */
1437 0 : cnv->fromUnicodeStatus=
1438 0 : (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
1439 0 : ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
1440 : }
1441 :
1442 : /* write back the updated pointers */
1443 0 : pArgs->source=source;
1444 0 : pArgs->target=(char *)target;
1445 0 : pArgs->offsets=offsets;
1446 0 : return;
1447 : }
1448 : U_CDECL_END
1449 :
1450 : static const UConverterImpl _IMAPImpl={
1451 : UCNV_IMAP_MAILBOX,
1452 :
1453 : NULL,
1454 : NULL,
1455 :
1456 : _UTF7Open,
1457 : NULL,
1458 : _UTF7Reset,
1459 :
1460 : _IMAPToUnicodeWithOffsets,
1461 : _IMAPToUnicodeWithOffsets,
1462 : _IMAPFromUnicodeWithOffsets,
1463 : _IMAPFromUnicodeWithOffsets,
1464 : NULL,
1465 :
1466 : NULL,
1467 : NULL,
1468 : NULL, /* we don't need writeSub() because we never call a callback at fromUnicode() */
1469 : NULL,
1470 : ucnv_getCompleteUnicodeSet,
1471 : NULL,
1472 : NULL
1473 : };
1474 :
1475 : static const UConverterStaticData _IMAPStaticData={
1476 : sizeof(UConverterStaticData),
1477 : "IMAP-mailbox-name",
1478 : 0, /* TODO CCSID for IMAP-mailbox-name */
1479 : UCNV_IBM, UCNV_IMAP_MAILBOX,
1480 : 1, 4,
1481 : { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
1482 : FALSE, FALSE,
1483 : 0,
1484 : 0,
1485 : { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
1486 : };
1487 :
1488 : const UConverterSharedData _IMAPData=
1489 : UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
1490 :
1491 : #endif
|