Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : *******************************************************************************
5 : *
6 : * Copyright (C) 2002-2011, International Business Machines
7 : * Corporation and others. All Rights Reserved.
8 : *
9 : *******************************************************************************
10 : * file name: punycode.cpp
11 : * encoding: UTF-8
12 : * tab size: 8 (not used)
13 : * indentation:4
14 : *
15 : * created on: 2002jan31
16 : * created by: Markus W. Scherer
17 : */
18 :
19 :
20 : /* This ICU code derived from: */
21 : /*
22 : punycode.c 0.4.0 (2001-Nov-17-Sat)
23 : http://www.cs.berkeley.edu/~amc/idn/
24 : Adam M. Costello
25 : http://www.nicemice.net/amc/
26 :
27 : Disclaimer and license
28 :
29 : Regarding this entire document or any portion of it (including
30 : the pseudocode and C code), the author makes no guarantees and
31 : is not responsible for any damage resulting from its use. The
32 : author grants irrevocable permission to anyone to use, modify,
33 : and distribute it in any way that does not diminish the rights
34 : of anyone else to use, modify, and distribute it, provided that
35 : redistributed derivative works do not contain misleading author or
36 : version information. Derivative works need not be licensed under
37 : similar terms.
38 : */
39 : /*
40 : * ICU modifications:
41 : * - ICU data types and coding conventions
42 : * - ICU string buffer handling with implicit source lengths
43 : * and destination preflighting
44 : * - UTF-16 handling
45 : */
46 :
47 : #include "unicode/utypes.h"
48 :
49 : #if !UCONFIG_NO_IDNA
50 :
51 : #include "unicode/ustring.h"
52 : #include "unicode/utf.h"
53 : #include "unicode/utf16.h"
54 : #include "ustr_imp.h"
55 : #include "cstring.h"
56 : #include "cmemory.h"
57 : #include "punycode.h"
58 : #include "uassert.h"
59 :
60 :
61 : /* Punycode ----------------------------------------------------------------- */
62 :
63 : /* Punycode parameters for Bootstring */
64 : #define BASE 36
65 : #define TMIN 1
66 : #define TMAX 26
67 : #define SKEW 38
68 : #define DAMP 700
69 : #define INITIAL_BIAS 72
70 : #define INITIAL_N 0x80
71 :
72 : /* "Basic" Unicode/ASCII code points */
73 : #define _HYPHEN 0X2d
74 : #define DELIMITER _HYPHEN
75 :
76 : #define _ZERO_ 0X30
77 : #define _NINE 0x39
78 :
79 : #define _SMALL_A 0X61
80 : #define _SMALL_Z 0X7a
81 :
82 : #define _CAPITAL_A 0X41
83 : #define _CAPITAL_Z 0X5a
84 :
85 : #define IS_BASIC(c) ((c)<0x80)
86 : #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87 :
88 : /**
89 : * digitToBasic() returns the basic code point whose value
90 : * (when used for representing integers) is d, which must be in the
91 : * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92 : * nonzero, in which case the uppercase form is used.
93 : */
94 : static inline char
95 0 : digitToBasic(int32_t digit, UBool uppercase) {
96 : /* 0..25 map to ASCII a..z or A..Z */
97 : /* 26..35 map to ASCII 0..9 */
98 0 : if(digit<26) {
99 0 : if(uppercase) {
100 0 : return (char)(_CAPITAL_A+digit);
101 : } else {
102 0 : return (char)(_SMALL_A+digit);
103 : }
104 : } else {
105 0 : return (char)((_ZERO_-26)+digit);
106 : }
107 : }
108 :
109 : /**
110 : * basicToDigit[] contains the numeric value of a basic code
111 : * point (for use in representing integers) in the range 0 to
112 : * BASE-1, or -1 if b is does not represent a value.
113 : */
114 : static const int8_t
115 : basicToDigit[256]={
116 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
118 :
119 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
120 : 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
121 :
122 : -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
123 : 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
124 :
125 : -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
126 : 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
127 :
128 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
129 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130 :
131 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
132 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133 :
134 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
135 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
136 :
137 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
138 : -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
139 : };
140 :
141 : static inline char
142 0 : asciiCaseMap(char b, UBool uppercase) {
143 0 : if(uppercase) {
144 0 : if(_SMALL_A<=b && b<=_SMALL_Z) {
145 0 : b-=(_SMALL_A-_CAPITAL_A);
146 : }
147 : } else {
148 0 : if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
149 0 : b+=(_SMALL_A-_CAPITAL_A);
150 : }
151 : }
152 0 : return b;
153 : }
154 :
155 : /* Punycode-specific Bootstring code ---------------------------------------- */
156 :
157 : /*
158 : * The following code omits the {parts} of the pseudo-algorithm in the spec
159 : * that are not used with the Punycode parameter set.
160 : */
161 :
162 : /* Bias adaptation function. */
163 : static int32_t
164 0 : adaptBias(int32_t delta, int32_t length, UBool firstTime) {
165 : int32_t count;
166 :
167 0 : if(firstTime) {
168 0 : delta/=DAMP;
169 : } else {
170 0 : delta/=2;
171 : }
172 :
173 0 : delta+=delta/length;
174 0 : for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
175 0 : delta/=(BASE-TMIN);
176 : }
177 :
178 0 : return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
179 : }
180 :
181 : #define MAX_CP_COUNT 200
182 :
183 : U_CFUNC int32_t
184 0 : u_strToPunycode(const UChar *src, int32_t srcLength,
185 : UChar *dest, int32_t destCapacity,
186 : const UBool *caseFlags,
187 : UErrorCode *pErrorCode) {
188 :
189 : int32_t cpBuffer[MAX_CP_COUNT];
190 : int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
191 : UChar c, c2;
192 :
193 : /* argument checking */
194 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
195 0 : return 0;
196 : }
197 :
198 0 : if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
199 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
200 0 : return 0;
201 : }
202 :
203 : /*
204 : * Handle the basic code points and
205 : * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
206 : */
207 0 : srcCPCount=destLength=0;
208 0 : if(srcLength==-1) {
209 : /* NUL-terminated input */
210 0 : for(j=0; /* no condition */; ++j) {
211 0 : if((c=src[j])==0) {
212 0 : break;
213 : }
214 0 : if(srcCPCount==MAX_CP_COUNT) {
215 : /* too many input code points */
216 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
217 0 : return 0;
218 : }
219 0 : if(IS_BASIC(c)) {
220 0 : cpBuffer[srcCPCount++]=0;
221 0 : if(destLength<destCapacity) {
222 0 : dest[destLength]=
223 0 : caseFlags!=NULL ?
224 0 : asciiCaseMap((char)c, caseFlags[j]) :
225 : (char)c;
226 : }
227 0 : ++destLength;
228 : } else {
229 0 : n=(caseFlags!=NULL && caseFlags[j])<<31L;
230 0 : if(U16_IS_SINGLE(c)) {
231 0 : n|=c;
232 0 : } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
233 0 : ++j;
234 0 : n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
235 : } else {
236 : /* error: unmatched surrogate */
237 0 : *pErrorCode=U_INVALID_CHAR_FOUND;
238 0 : return 0;
239 : }
240 0 : cpBuffer[srcCPCount++]=n;
241 : }
242 : }
243 : } else {
244 : /* length-specified input */
245 0 : for(j=0; j<srcLength; ++j) {
246 0 : if(srcCPCount==MAX_CP_COUNT) {
247 : /* too many input code points */
248 0 : *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
249 0 : return 0;
250 : }
251 0 : c=src[j];
252 0 : if(IS_BASIC(c)) {
253 0 : cpBuffer[srcCPCount++]=0;
254 0 : if(destLength<destCapacity) {
255 0 : dest[destLength]=
256 0 : caseFlags!=NULL ?
257 0 : asciiCaseMap((char)c, caseFlags[j]) :
258 : (char)c;
259 : }
260 0 : ++destLength;
261 : } else {
262 0 : n=(caseFlags!=NULL && caseFlags[j])<<31L;
263 0 : if(U16_IS_SINGLE(c)) {
264 0 : n|=c;
265 0 : } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
266 0 : ++j;
267 0 : n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
268 : } else {
269 : /* error: unmatched surrogate */
270 0 : *pErrorCode=U_INVALID_CHAR_FOUND;
271 0 : return 0;
272 : }
273 0 : cpBuffer[srcCPCount++]=n;
274 : }
275 : }
276 : }
277 :
278 : /* Finish the basic string - if it is not empty - with a delimiter. */
279 0 : basicLength=destLength;
280 0 : if(basicLength>0) {
281 0 : if(destLength<destCapacity) {
282 0 : dest[destLength]=DELIMITER;
283 : }
284 0 : ++destLength;
285 : }
286 :
287 : /*
288 : * handledCPCount is the number of code points that have been handled
289 : * basicLength is the number of basic code points
290 : * destLength is the number of chars that have been output
291 : */
292 :
293 : /* Initialize the state: */
294 0 : n=INITIAL_N;
295 0 : delta=0;
296 0 : bias=INITIAL_BIAS;
297 :
298 : /* Main encoding loop: */
299 0 : for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
300 : /*
301 : * All non-basic code points < n have been handled already.
302 : * Find the next larger one:
303 : */
304 0 : for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
305 0 : q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
306 0 : if(n<=q && q<m) {
307 0 : m=q;
308 : }
309 : }
310 :
311 : /*
312 : * Increase delta enough to advance the decoder's
313 : * <n,i> state to <m,0>, but guard against overflow:
314 : */
315 0 : if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
316 0 : *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
317 0 : return 0;
318 : }
319 0 : delta+=(m-n)*(handledCPCount+1);
320 0 : n=m;
321 :
322 : /* Encode a sequence of same code points n */
323 0 : for(j=0; j<srcCPCount; ++j) {
324 0 : q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
325 0 : if(q<n) {
326 0 : ++delta;
327 0 : } else if(q==n) {
328 : /* Represent delta as a generalized variable-length integer: */
329 0 : for(q=delta, k=BASE; /* no condition */; k+=BASE) {
330 :
331 : /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
332 :
333 : t=k-bias;
334 : if(t<TMIN) {
335 : t=TMIN;
336 : } else if(t>TMAX) {
337 : t=TMAX;
338 : }
339 : */
340 :
341 0 : t=k-bias;
342 0 : if(t<TMIN) {
343 0 : t=TMIN;
344 0 : } else if(k>=(bias+TMAX)) {
345 0 : t=TMAX;
346 : }
347 :
348 0 : if(q<t) {
349 0 : break;
350 : }
351 :
352 0 : if(destLength<destCapacity) {
353 0 : dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
354 : }
355 0 : ++destLength;
356 0 : q=(q-t)/(BASE-t);
357 : }
358 :
359 0 : if(destLength<destCapacity) {
360 0 : dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
361 : }
362 0 : ++destLength;
363 0 : bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
364 0 : delta=0;
365 0 : ++handledCPCount;
366 : }
367 : }
368 :
369 0 : ++delta;
370 0 : ++n;
371 : }
372 :
373 0 : return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
374 : }
375 :
376 : U_CFUNC int32_t
377 0 : u_strFromPunycode(const UChar *src, int32_t srcLength,
378 : UChar *dest, int32_t destCapacity,
379 : UBool *caseFlags,
380 : UErrorCode *pErrorCode) {
381 : int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
382 : destCPCount, firstSupplementaryIndex, cpLength;
383 : UChar b;
384 :
385 : /* argument checking */
386 0 : if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
387 0 : return 0;
388 : }
389 :
390 0 : if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
391 0 : *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
392 0 : return 0;
393 : }
394 :
395 0 : if(srcLength==-1) {
396 0 : srcLength=u_strlen(src);
397 : }
398 :
399 : /*
400 : * Handle the basic code points:
401 : * Let basicLength be the number of input code points
402 : * before the last delimiter, or 0 if there is none,
403 : * then copy the first basicLength code points to the output.
404 : *
405 : * The two following loops iterate backward.
406 : */
407 0 : for(j=srcLength; j>0;) {
408 0 : if(src[--j]==DELIMITER) {
409 0 : break;
410 : }
411 : }
412 0 : destLength=basicLength=destCPCount=j;
413 0 : U_ASSERT(destLength>=0);
414 :
415 0 : while(j>0) {
416 0 : b=src[--j];
417 0 : if(!IS_BASIC(b)) {
418 0 : *pErrorCode=U_INVALID_CHAR_FOUND;
419 0 : return 0;
420 : }
421 :
422 0 : if(j<destCapacity) {
423 0 : dest[j]=(UChar)b;
424 :
425 0 : if(caseFlags!=NULL) {
426 0 : caseFlags[j]=IS_BASIC_UPPERCASE(b);
427 : }
428 : }
429 : }
430 :
431 : /* Initialize the state: */
432 0 : n=INITIAL_N;
433 0 : i=0;
434 0 : bias=INITIAL_BIAS;
435 0 : firstSupplementaryIndex=1000000000;
436 :
437 : /*
438 : * Main decoding loop:
439 : * Start just after the last delimiter if any
440 : * basic code points were copied; start at the beginning otherwise.
441 : */
442 0 : for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
443 : /*
444 : * in is the index of the next character to be consumed, and
445 : * destCPCount is the number of code points in the output array.
446 : *
447 : * Decode a generalized variable-length integer into delta,
448 : * which gets added to i. The overflow checking is easier
449 : * if we increase i as we go, then subtract off its starting
450 : * value at the end to obtain delta.
451 : */
452 0 : for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
453 0 : if(in>=srcLength) {
454 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
455 0 : return 0;
456 : }
457 :
458 0 : digit=basicToDigit[(uint8_t)src[in++]];
459 0 : if(digit<0) {
460 0 : *pErrorCode=U_INVALID_CHAR_FOUND;
461 0 : return 0;
462 : }
463 0 : if(digit>(0x7fffffff-i)/w) {
464 : /* integer overflow */
465 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
466 0 : return 0;
467 : }
468 :
469 0 : i+=digit*w;
470 : /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
471 : t=k-bias;
472 : if(t<TMIN) {
473 : t=TMIN;
474 : } else if(t>TMAX) {
475 : t=TMAX;
476 : }
477 : */
478 0 : t=k-bias;
479 0 : if(t<TMIN) {
480 0 : t=TMIN;
481 0 : } else if(k>=(bias+TMAX)) {
482 0 : t=TMAX;
483 : }
484 0 : if(digit<t) {
485 0 : break;
486 : }
487 :
488 0 : if(w>0x7fffffff/(BASE-t)) {
489 : /* integer overflow */
490 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
491 0 : return 0;
492 : }
493 0 : w*=BASE-t;
494 : }
495 :
496 : /*
497 : * Modification from sample code:
498 : * Increments destCPCount here,
499 : * where needed instead of in for() loop tail.
500 : */
501 0 : ++destCPCount;
502 0 : bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
503 :
504 : /*
505 : * i was supposed to wrap around from (incremented) destCPCount to 0,
506 : * incrementing n each time, so we'll fix that now:
507 : */
508 0 : if(i/destCPCount>(0x7fffffff-n)) {
509 : /* integer overflow */
510 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
511 0 : return 0;
512 : }
513 :
514 0 : n+=i/destCPCount;
515 0 : i%=destCPCount;
516 : /* not needed for Punycode: */
517 : /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
518 :
519 0 : if(n>0x10ffff || U_IS_SURROGATE(n)) {
520 : /* Unicode code point overflow */
521 0 : *pErrorCode=U_ILLEGAL_CHAR_FOUND;
522 0 : return 0;
523 : }
524 :
525 : /* Insert n at position i of the output: */
526 0 : cpLength=U16_LENGTH(n);
527 0 : if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
528 : int32_t codeUnitIndex;
529 :
530 : /*
531 : * Handle indexes when supplementary code points are present.
532 : *
533 : * In almost all cases, there will be only BMP code points before i
534 : * and even in the entire string.
535 : * This is handled with the same efficiency as with UTF-32.
536 : *
537 : * Only the rare cases with supplementary code points are handled
538 : * more slowly - but not too bad since this is an insertion anyway.
539 : */
540 0 : if(i<=firstSupplementaryIndex) {
541 0 : codeUnitIndex=i;
542 0 : if(cpLength>1) {
543 0 : firstSupplementaryIndex=codeUnitIndex;
544 : } else {
545 0 : ++firstSupplementaryIndex;
546 : }
547 : } else {
548 0 : codeUnitIndex=firstSupplementaryIndex;
549 0 : U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
550 : }
551 :
552 : /* use the UChar index codeUnitIndex instead of the code point index i */
553 0 : if(codeUnitIndex<destLength) {
554 0 : uprv_memmove(dest+codeUnitIndex+cpLength,
555 : dest+codeUnitIndex,
556 0 : (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
557 0 : if(caseFlags!=NULL) {
558 0 : uprv_memmove(caseFlags+codeUnitIndex+cpLength,
559 : caseFlags+codeUnitIndex,
560 0 : destLength-codeUnitIndex);
561 : }
562 : }
563 0 : if(cpLength==1) {
564 : /* BMP, insert one code unit */
565 0 : dest[codeUnitIndex]=(UChar)n;
566 : } else {
567 : /* supplementary character, insert two code units */
568 0 : dest[codeUnitIndex]=U16_LEAD(n);
569 0 : dest[codeUnitIndex+1]=U16_TRAIL(n);
570 : }
571 0 : if(caseFlags!=NULL) {
572 : /* Case of last character determines uppercase flag: */
573 0 : caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
574 0 : if(cpLength==2) {
575 0 : caseFlags[codeUnitIndex+1]=FALSE;
576 : }
577 : }
578 : }
579 0 : destLength+=cpLength;
580 0 : U_ASSERT(destLength>=0);
581 0 : ++i;
582 : }
583 :
584 0 : return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
585 : }
586 :
587 : /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
588 :
589 : #endif /* #if !UCONFIG_NO_IDNA */
|