Line data Source code
1 : /*
2 : * Copyright (C) 2005 The Android Open Source Project
3 : *
4 : * Licensed under the Apache License, Version 2.0 (the "License");
5 : * you may not use this file except in compliance with the License.
6 : * You may obtain a copy of the License at
7 : *
8 : * http://www.apache.org/licenses/LICENSE-2.0
9 : *
10 : * Unless required by applicable law or agreed to in writing, software
11 : * distributed under the License is distributed on an "AS IS" BASIS,
12 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 : * See the License for the specific language governing permissions and
14 : * limitations under the License.
15 : */
16 :
17 : #include <utils/Unicode.h>
18 :
19 : #include <stddef.h>
20 :
#ifdef HAVE_WINSOCK
// Remove any byte-order macros that are already defined so that the
// definitions below are the ones in effect for the rest of this file.
# undef ntohl
# undef htonl
# undef ntohs
# undef htons

# ifdef HAVE_LITTLE_ENDIAN
// Little-endian host: converting to or from network order is a byte swap.
# define ntohl(x) ( ((x) << 24) | (((x) >> 24) & 255) | (((x) << 8) & 0xff0000) | (((x) >> 8) & 0xff00) )
# define htonl(x) ntohl(x)
# define ntohs(x) ( (((x) << 8) & 0xff00) | (((x) >> 8) & 255) )
# define htons(x) ntohs(x)
# else
// Otherwise the conversions are the identity.
# define ntohl(x) (x)
# define htonl(x) (x)
# define ntohs(x) (x)
# define htons(x) (x)
# endif
#else
# include <netinet/in.h>
#endif
41 :
42 : extern "C" {
43 :
// UTF-8 continuation bytes look like 10xxxxxx: OR-ing a value with kByteMark
// sets the top bit, AND-ing with kByteMask clears bit 6.
static const char32_t kByteMask = 0xBF;
static const char32_t kByteMark = 0x80;

// The UTF-16 surrogate range. These values are not valid UTF-32 characters,
// so the constants below are used to screen them out.
static const char32_t kUnicodeSurrogateHighStart = 0xD800;
static const char32_t kUnicodeSurrogateHighEnd   = 0xDBFF;
static const char32_t kUnicodeSurrogateLowStart  = 0xDC00;
static const char32_t kUnicodeSurrogateLowEnd    = 0xDFFF;
// Whole surrogate block: first high surrogate through last low surrogate.
static const char32_t kUnicodeSurrogateStart     = kUnicodeSurrogateHighStart;
static const char32_t kUnicodeSurrogateEnd       = kUnicodeSurrogateLowEnd;
// Largest code point accepted by the converters in this file.
static const char32_t kUnicodeMaxCodepoint       = 0x10FFFF;
56 :
// Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the
// total number of bytes in that sequence (index 0 is unused padding):
//   1 byte : 0xxxxxxx                             (00-7f)                   7 bits
//   2 bytes: 110yyyyx 10xxxxxx                    (c0-df)(80-bf)           11 bits
//   3 bytes: 1110yyyy 10yxxxxx 10xxxxxx           (e0-ef)(80-bf)(80-bf)    16 bits
//   4 bytes: 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx  (f0-f7)(80-bf)x3         21 bits
static const char32_t kFirstByteMark[] = {
    0x00,   // unused
    0x00,   // 1-byte sequence
    0xC0,   // 2-byte sequence
    0xE0,   // 3-byte sequence
    0xF0    // 4-byte sequence
};
70 :
71 : // --------------------------------------------------------------------------
72 : // UTF-32
73 : // --------------------------------------------------------------------------
74 :
75 : /**
76 : * Return number of UTF-8 bytes required for the character. If the character
77 : * is invalid, return size of 0.
78 : */
79 0 : static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
80 : {
81 : // Figure out how many bytes the result will require.
82 0 : if (srcChar < 0x00000080) {
83 0 : return 1;
84 0 : } else if (srcChar < 0x00000800) {
85 0 : return 2;
86 0 : } else if (srcChar < 0x00010000) {
87 0 : if ((srcChar < kUnicodeSurrogateStart) || (srcChar > kUnicodeSurrogateEnd)) {
88 0 : return 3;
89 : } else {
90 : // Surrogates are invalid UTF-32 characters.
91 0 : return 0;
92 : }
93 : }
94 : // Max code point for Unicode is 0x0010FFFF.
95 0 : else if (srcChar <= kUnicodeMaxCodepoint) {
96 0 : return 4;
97 : } else {
98 : // Invalid UTF-32 character.
99 0 : return 0;
100 : }
101 : }
102 :
103 : // Write out the source character to <dstP>.
104 :
105 0 : static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
106 : {
107 0 : dstP += bytes;
108 0 : switch (bytes)
109 : { /* note: everything falls through. */
110 0 : case 4: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
111 0 : case 3: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
112 0 : case 2: *--dstP = (uint8_t)((srcChar | kByteMark) & kByteMask); srcChar >>= 6;
113 0 : case 1: *--dstP = (uint8_t)(srcChar | kFirstByteMark[bytes]);
114 : }
115 0 : }
116 :
// Number of char32_t units in <s> before the terminating zero unit.
size_t strlen32(const char32_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        ++count;
    }
    return count;
}
124 :
// Like strlen32(), but never looks at more than <maxlen> units. The bound is
// tested before each unit is read.
size_t strnlen32(const char32_t *s, size_t maxlen)
{
    size_t count = 0;
    for (; count < maxlen; ++count) {
        if (s[count] == 0) {
            break;
        }
    }
    return count;
}
134 :
// Decode the UTF-8 sequence starting at <cur> into a single value and store
// the number of bytes consumed in <*num_read>. The sequence length is taken
// from the run of set bits below the top bit of the first byte; no validation
// or bounds checking is performed on the bytes that follow.
static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
{
    const char lead = *cur;
    if (!(lead & 0x80)) {
        // Plain ASCII: the byte is the code point.
        *num_read = 1;
        return lead;
    }

    const char* next = cur + 1;
    char32_t bit = 0x40;                // length bit of the lead byte under test
    char32_t discard = 0xFFFFFF80;      // lead-byte bits that are not payload
    char32_t value = lead;
    size_t count = 1;
    while (lead & bit) {
        // Append the six payload bits (0x3F == 00111111) of the next byte.
        value = (value << 6) + (*next++ & 0x3F);
        ++count;
        discard |= bit;
        bit >>= 1;
    }
    // The first clear bit after the length prefix is not payload either.
    discard |= bit;
    // The lead byte has been shifted up once per trailing byte; strip its
    // marker bits (and any sign extension) at that position.
    value &= ~(discard << (6 * (count - 1)));

    *num_read = count;
    return static_cast<int32_t>(value);
}
158 :
159 0 : int32_t utf32_from_utf8_at(const char *src, size_t src_len, size_t index, size_t *next_index)
160 : {
161 0 : if (index >= src_len) {
162 0 : return -1;
163 : }
164 : size_t dummy_index;
165 0 : if (next_index == NULL) {
166 0 : next_index = &dummy_index;
167 : }
168 : size_t num_read;
169 0 : int32_t ret = utf32_at_internal(src + index, &num_read);
170 0 : if (ret >= 0) {
171 0 : *next_index = index + num_read;
172 : }
173 :
174 0 : return ret;
175 : }
176 :
177 0 : ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
178 : {
179 0 : if (src == NULL || src_len == 0) {
180 0 : return -1;
181 : }
182 :
183 0 : size_t ret = 0;
184 0 : const char32_t *end = src + src_len;
185 0 : while (src < end) {
186 0 : ret += utf32_codepoint_utf8_length(*src++);
187 : }
188 0 : return ret;
189 : }
190 :
191 0 : void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
192 : {
193 0 : if (src == NULL || src_len == 0 || dst == NULL) {
194 0 : return;
195 : }
196 :
197 0 : const char32_t *cur_utf32 = src;
198 0 : const char32_t *end_utf32 = src + src_len;
199 0 : char *cur = dst;
200 0 : while (cur_utf32 < end_utf32) {
201 0 : size_t len = utf32_codepoint_utf8_length(*cur_utf32);
202 0 : utf32_codepoint_to_utf8((uint8_t *)cur, *cur_utf32++, len);
203 0 : cur += len;
204 : }
205 0 : *cur = '\0';
206 : }
207 :
208 : // --------------------------------------------------------------------------
209 : // UTF-16
210 : // --------------------------------------------------------------------------
211 :
// Compare two zero-terminated char16_t strings unit by unit. Returns the
// difference of the first pair of units that differ, or 0 if both strings end
// together.
int strcmp16(const char16_t *s1, const char16_t *s2)
{
    for (;; ++s1, ++s2) {
        const int diff = (int)*s1 - (int)*s2;
        if (diff != 0 || *s1 == 0) {
            return diff;
        }
    }
}
225 :
// Compare at most <n> units of two zero-terminated char16_t strings. Returns
// the difference of the first differing pair, or 0 when the strings match up
// to the terminator or for all <n> units.
int strncmp16(const char16_t *s1, const char16_t *s2, size_t n)
{
    for (size_t i = 0; i < n; ++i) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0 || s1[i] == 0) {
            return diff;
        }
    }
    return 0;
}
239 :
// Copy the zero-terminated char16_t string <src>, terminator included, into
// <dst> and return <dst>.
char16_t *strcpy16(char16_t *dst, const char16_t *src)
{
    size_t i = 0;
    while ((dst[i] = src[i]) != 0) {
        ++i;
    }
    return dst;
}
252 :
// Number of char16_t units in <s> before the terminating zero unit.
size_t strlen16(const char16_t *s)
{
    size_t count = 0;
    while (s[count] != 0) {
        ++count;
    }
    return count;
}
260 :
261 :
// Copy at most <n> units of the zero-terminated char16_t string <src> into
// <dst>, stopping after the terminator if it is reached first, and return
// <dst>.
//
// A zero unit is always stored after the last unit copied, so <dst> must have
// room for up to n + 1 units. The remainder of <dst> is not zero-padded.
char16_t *strncpy16(char16_t *dst, const char16_t *src, size_t n)
{
    char16_t *q = dst;
    const char16_t *p = src;

    while (n) {
        n--;
        // Hold the unit at full 16-bit width: a narrower temporary would drop
        // the high byte and mistake units such as 0x4100 for the terminator.
        const char16_t ch = *p++;
        *q++ = ch;
        if (!ch) {
            break;
        }
    }

    *q = 0;

    return dst;
}
279 :
// Like strlen16(), but never looks at more than <maxlen> units.
size_t strnlen16(const char16_t *s, size_t maxlen)
{
    size_t count = 0;
    // The bound must be tested before s[count] is read: the unit just past
    // the limit may not be accessible.
    for (; count < maxlen; ++count) {
        if (s[count] == 0) {
            break;
        }
    }
    return count;
}
292 :
// Compare two char16_t buffers given by explicit lengths <n1> and <n2>.
// Returns the difference of the first differing pair within the shared
// prefix. If the prefix matches and the lengths differ, the first extra unit
// of the longer buffer decides: its value when <s1> is longer, its negation
// when <s2> is longer. Returns 0 when lengths and contents agree.
int strzcmp16(const char16_t *s1, size_t n1, const char16_t *s2, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; ++i) {
        const int diff = (int)s1[i] - (int)s2[i];
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)s2[common];
    }
    if (n1 > n2) {
        return (int)s1[common] - 0;
    }
    return 0;
}
311 :
// Same comparison as a length-delimited char16_t compare, except that the
// units of <s2N> are passed through ntohs() before use while the units of
// <s1H> are used as they are.
// Returns the difference of the first differing pair within the shared
// prefix; if the prefix matches and the lengths differ, the first extra unit
// of the longer buffer decides (negated when <s2N> is the longer one);
// otherwise 0.
int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2)
{
    const size_t common = (n1 < n2) ? n1 : n2;

    for (size_t i = 0; i < common; ++i) {
        const char16_t converted = ntohs(s2N[i]);
        const int diff = (int)s1H[i] - (int)converted;
        if (diff != 0) {
            return diff;
        }
    }

    if (n1 < n2) {
        return 0 - (int)ntohs(s2N[common]);
    }
    if (n1 > n2) {
        return (int)s1H[common] - 0;
    }
    return 0;
}
332 :
333 0 : void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
334 : {
335 0 : if (src == NULL || src_len == 0 || dst == NULL) {
336 0 : return;
337 : }
338 :
339 0 : const char16_t* cur_utf16 = src;
340 0 : const char16_t* const end_utf16 = src + src_len;
341 0 : char *cur = dst;
342 0 : while (cur_utf16 < end_utf16) {
343 : char32_t utf32;
344 : // surrogate pairs
345 0 : if ((*cur_utf16 & 0xFC00) == 0xD800) {
346 0 : utf32 = (*cur_utf16++ - 0xD800) << 10;
347 0 : utf32 |= *cur_utf16++ - 0xDC00;
348 0 : utf32 += 0x10000;
349 : } else {
350 0 : utf32 = (char32_t) *cur_utf16++;
351 : }
352 0 : const size_t len = utf32_codepoint_utf8_length(utf32);
353 0 : utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
354 0 : cur += len;
355 : }
356 0 : *cur = '\0';
357 : }
358 :
359 : // --------------------------------------------------------------------------
360 : // UTF-8
361 : // --------------------------------------------------------------------------
362 :
363 0 : ssize_t utf8_length(const char *src)
364 : {
365 0 : const char *cur = src;
366 0 : size_t ret = 0;
367 0 : while (*cur != '\0') {
368 0 : const char first_char = *cur++;
369 0 : if ((first_char & 0x80) == 0) { // ASCII
370 0 : ret += 1;
371 0 : continue;
372 : }
373 : // (UTF-8's character must not be like 10xxxxxx,
374 : // but 110xxxxx, 1110xxxx, ... or 1111110x)
375 0 : if ((first_char & 0x40) == 0) {
376 0 : return -1;
377 : }
378 :
379 : int32_t mask, to_ignore_mask;
380 0 : size_t num_to_read = 0;
381 0 : char32_t utf32 = 0;
382 0 : for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
383 0 : num_to_read < 5 && (first_char & mask);
384 0 : num_to_read++, to_ignore_mask |= mask, mask >>= 1) {
385 0 : if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
386 0 : return -1;
387 : }
388 : // 0x3F == 00111111
389 0 : utf32 = (utf32 << 6) + (*cur++ & 0x3F);
390 : }
391 : // "first_char" must be (110xxxxx - 11110xxx)
392 0 : if (num_to_read == 5) {
393 0 : return -1;
394 : }
395 0 : to_ignore_mask |= mask;
396 0 : utf32 |= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
397 0 : if (utf32 > kUnicodeMaxCodepoint) {
398 0 : return -1;
399 : }
400 :
401 0 : ret += num_to_read;
402 : }
403 0 : return ret;
404 : }
405 :
406 0 : ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
407 : {
408 0 : if (src == NULL || src_len == 0) {
409 0 : return -1;
410 : }
411 :
412 0 : size_t ret = 0;
413 0 : const char16_t* const end = src + src_len;
414 0 : while (src < end) {
415 0 : if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
416 0 : && (*++src & 0xFC00) == 0xDC00) {
417 : // surrogate pairs are always 4 bytes.
418 0 : ret += 4;
419 0 : src++;
420 : } else {
421 0 : ret += utf32_codepoint_utf8_length((char32_t) *src++);
422 : }
423 : }
424 0 : return ret;
425 : }
426 :
/**
 * Length in bytes (1-4) of the UTF-8 sequence announced by its first byte,
 * decided by the top four bits of that byte:
 *
 * 1111 -> 4
 * 1110 -> 3
 * 110x -> 2
 * 10xx -> 1
 * 0xxx -> 1
 */
static inline size_t utf8_codepoint_len(uint8_t ch)
{
    if (ch >= 0xF0) {
        return 4;
    }
    if (ch >= 0xE0) {
        return 3;
    }
    if (ch >= 0xC0) {
        return 2;
    }
    return 1;
}
440 :
// Make room for six more bits in <*codePoint> and fill them with the low six
// bits of <byte>.
static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
{
    *codePoint = (*codePoint << 6) | (byte & 0x3F);
}
446 :
// Count the characters in the first <src_len> bytes of the UTF-8 buffer
// <src>. Returns 0 when <src> is NULL or <src_len> is 0.
// Only lead bytes are inspected: each one is taken at its word about how many
// bytes its sequence spans, and the bytes in between are skipped unchecked.
size_t utf8_to_utf32_length(const char *src, size_t src_len)
{
    if (src == NULL || src_len == 0) {
        return 0;
    }

    const char* const end = src + src_len;
    const char* cur = src;
    size_t count = 0;
    while (cur < end) {
        const char lead = *cur;
        size_t width = 1;
        if (lead & 0x80) {
            // Non-ASCII: one extra byte per set bit following the top bit.
            for (int32_t bit = 0x40; (lead & bit); bit >>= 1) {
                ++width;
            }
        }
        cur += width;
        ++count;
    }
    return count;
}
471 :
472 0 : void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
473 : {
474 0 : if (src == NULL || src_len == 0 || dst == NULL) {
475 0 : return;
476 : }
477 :
478 0 : const char* cur = src;
479 0 : const char* const end = src + src_len;
480 0 : char32_t* cur_utf32 = dst;
481 0 : while (cur < end) {
482 : size_t num_read;
483 0 : *cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
484 0 : cur += num_read;
485 : }
486 0 : *cur_utf32 = 0;
487 : }
488 :
489 0 : static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
490 : {
491 : uint32_t unicode;
492 :
493 0 : switch (length)
494 : {
495 : case 1:
496 0 : return src[0];
497 : case 2:
498 0 : unicode = src[0] & 0x1f;
499 0 : utf8_shift_and_mask(&unicode, src[1]);
500 0 : return unicode;
501 : case 3:
502 0 : unicode = src[0] & 0x0f;
503 0 : utf8_shift_and_mask(&unicode, src[1]);
504 0 : utf8_shift_and_mask(&unicode, src[2]);
505 0 : return unicode;
506 : case 4:
507 0 : unicode = src[0] & 0x07;
508 0 : utf8_shift_and_mask(&unicode, src[1]);
509 0 : utf8_shift_and_mask(&unicode, src[2]);
510 0 : utf8_shift_and_mask(&unicode, src[3]);
511 0 : return unicode;
512 : default:
513 0 : return 0xffff;
514 : }
515 :
516 : //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
517 : }
518 :
519 0 : ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
520 : {
521 0 : const uint8_t* const u8end = u8str + u8len;
522 0 : const uint8_t* u8cur = u8str;
523 :
524 : /* Validate that the UTF-8 is the correct len */
525 0 : size_t u16measuredLen = 0;
526 0 : while (u8cur < u8end) {
527 0 : u16measuredLen++;
528 0 : int u8charLen = utf8_codepoint_len(*u8cur);
529 0 : uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
530 0 : if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
531 0 : u8cur += u8charLen;
532 : }
533 :
534 : /**
535 : * Make sure that we ended where we thought we would and the output UTF-16
536 : * will be exactly how long we were told it would be.
537 : */
538 0 : if (u8cur != u8end) {
539 0 : return -1;
540 : }
541 :
542 0 : return u16measuredLen;
543 : }
544 :
545 0 : char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str)
546 : {
547 0 : const uint8_t* const u8end = u8str + u8len;
548 0 : const uint8_t* u8cur = u8str;
549 0 : char16_t* u16cur = u16str;
550 :
551 0 : while (u8cur < u8end) {
552 0 : size_t u8len = utf8_codepoint_len(*u8cur);
553 0 : uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
554 :
555 : // Convert the UTF32 codepoint to one or more UTF16 codepoints
556 0 : if (codepoint <= 0xFFFF) {
557 : // Single UTF16 character
558 0 : *u16cur++ = (char16_t) codepoint;
559 : } else {
560 : // Multiple UTF16 characters with surrogates
561 0 : codepoint = codepoint - 0x10000;
562 0 : *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
563 0 : *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
564 : }
565 :
566 0 : u8cur += u8len;
567 : }
568 0 : return u16cur;
569 : }
570 :
571 0 : void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) {
572 0 : char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str);
573 0 : *end = 0;
574 0 : }
575 :
576 0 : char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
577 0 : const uint8_t* const u8end = src + srcLen;
578 0 : const uint8_t* u8cur = src;
579 0 : const uint16_t* const u16end = (const uint16_t* const) dst + dstLen;
580 0 : uint16_t* u16cur = (uint16_t*) dst;
581 :
582 0 : while (u8cur < u8end && u16cur < u16end) {
583 0 : size_t u8len = utf8_codepoint_len(*u8cur);
584 0 : uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
585 :
586 : // Convert the UTF32 codepoint to one or more UTF16 codepoints
587 0 : if (codepoint <= 0xFFFF) {
588 : // Single UTF16 character
589 0 : *u16cur++ = (char16_t) codepoint;
590 : } else {
591 : // Multiple UTF16 characters with surrogates
592 0 : codepoint = codepoint - 0x10000;
593 0 : *u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
594 0 : if (u16cur >= u16end) {
595 : // Ooops... not enough room for this surrogate pair.
596 0 : return (char16_t*) u16cur-1;
597 : }
598 0 : *u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
599 : }
600 :
601 0 : u8cur += u8len;
602 : }
603 0 : return (char16_t*) u16cur;
604 : }
605 :
606 9 : }
|