Line data Source code
1 : /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
2 : /* cairo - a vector graphics library with display and print output
3 : *
4 : * The code in this file is derived from GLib's gutf8.c and
5 : * ultimately from libunicode. It is relicensed under the
6 : * dual LGPL/MPL with permission of the original authors.
7 : *
8 : * Copyright © 1999 Tom Tromey
9 : * Copyright © 2005 Red Hat, Inc
10 : *
11 : * This library is free software; you can redistribute it and/or
12 : * modify it either under the terms of the GNU Lesser General Public
13 : * License version 2.1 as published by the Free Software Foundation
14 : * (the "LGPL") or, at your option, under the terms of the Mozilla
15 : * Public License Version 1.1 (the "MPL"). If you do not alter this
16 : * notice, a recipient may use your version of this file under either
17 : * the MPL or the LGPL.
18 : *
19 : * You should have received a copy of the LGPL along with this library
20 : * in the file COPYING-LGPL-2.1; if not, write to the Free Software
21 : * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
22 : * You should have received a copy of the MPL along with this library
23 : * in the file COPYING-MPL-1.1
24 : *
25 : * The contents of this file are subject to the Mozilla Public License
26 : * Version 1.1 (the "License"); you may not use this file except in
27 : * compliance with the License. You may obtain a copy of the License at
28 : * http://www.mozilla.org/MPL/
29 : *
30 : * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
31 : * OF ANY KIND, either express or implied. See the LGPL or the MPL for
32 : * the specific language governing rights and limitations.
33 : *
34 : * The Original Code is the cairo graphics library.
35 : *
36 : * The Initial Developer of the Original Code is Tom Tromey.
37 : * and Red Hat, Inc.
38 : *
39 : * Contributor(s):
40 : * Owen Taylor <otaylor@redhat.com>
41 : */
42 :
43 : #include "cairoint.h"
44 : #include "cairo-error-private.h"
45 :
/* Classify the lead byte of a UTF-8 sequence.
 *
 * Char: the lead byte (evaluated several times; pass a side-effect-free
 *       expression).
 * Mask: lvalue that receives the bit mask selecting the payload bits of
 *       the lead byte.
 * Len:  lvalue that receives the sequence length in bytes (1..6), or -1
 *       when Char cannot start a sequence (a continuation byte
 *       0x80..0xBF, or 0xFE/0xFF).  Mask is left untouched in that case.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: it is safe inside an unbraced if/else and cannot
 * capture an "else" that follows the invocation.
 */
#define UTF8_COMPUTE(Char, Mask, Len)               \
    do {                                            \
        if ((Char) < 128) {                         \
            (Len) = 1;                              \
            (Mask) = 0x7f;                          \
        } else if (((Char) & 0xe0) == 0xc0) {       \
            (Len) = 2;                              \
            (Mask) = 0x1f;                          \
        } else if (((Char) & 0xf0) == 0xe0) {       \
            (Len) = 3;                              \
            (Mask) = 0x0f;                          \
        } else if (((Char) & 0xf8) == 0xf0) {       \
            (Len) = 4;                              \
            (Mask) = 0x07;                          \
        } else if (((Char) & 0xfc) == 0xf8) {       \
            (Len) = 5;                              \
            (Mask) = 0x03;                          \
        } else if (((Char) & 0xfe) == 0xfc) {       \
            (Len) = 6;                              \
            (Mask) = 0x01;                          \
        } else {                                    \
            (Len) = -1;                             \
        }                                           \
    } while (0)
79 :
/* Number of bytes needed to encode Char in UTF-8 (including the legacy
 * 5- and 6-byte forms): one byte, plus one more for every encoding
 * threshold that Char reaches.  Char is evaluated several times.
 */
#define UTF8_LENGTH(Char)               \
    (1 +                                \
     ((Char) >= 0x80) +                 \
     ((Char) >= 0x800) +                \
     ((Char) >= 0x10000) +              \
     ((Char) >= 0x200000) +             \
     ((Char) >= 0x4000000))
86 :
/* Decode a UTF-8 sequence whose lead byte has already been classified.
 *
 * Result: lvalue that receives the decoded value; it is set to -1 as
 *         soon as a byte that is not a continuation byte (10xxxxxx) is
 *         met, and decoding stops there.
 * Chars:  indexable byte sequence, Chars[0] being the lead byte.
 * Count:  integer lvalue used as the loop index.
 * Mask:   payload mask for the lead byte (as produced by UTF8_COMPUTE).
 * Len:    total sequence length in bytes.
 *
 * The body is wrapped in do { } while (0) so the initial assignment and
 * the loop stay together when the macro is used as the body of an
 * unbraced if/else.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)               \
    do {                                                        \
        (Result) = (Chars)[0] & (Mask);                         \
        for ((Count) = 1; (Count) < (Len); ++(Count)) {         \
            if (((Chars)[(Count)] & 0xc0) != 0x80) {            \
                (Result) = -1;                                  \
                break;                                          \
            }                                                   \
            (Result) <<= 6;                                     \
            (Result) |= ((Chars)[(Count)] & 0x3f);              \
        }                                                       \
    } while (0)
99 :
/* True unless Char is one of the values this file refuses to convert:
 *  - anything at or above 0x110000,
 *  - the surrogate block 0xD800..0xDFFF,
 *  - 0xFDD0..0xFDEF,
 *  - any value whose low 16 bits are 0xFFFE or 0xFFFF.
 * Char is evaluated several times.
 */
#define UNICODE_VALID(Char)                             \
    (!((Char) >= 0x110000 ||                            \
       ((Char) & 0xFFFFF800) == 0xD800 ||               \
       ((Char) >= 0xFDD0 && (Char) <= 0xFDEF) ||        \
       ((Char) & 0xFFFE) == 0xFFFE))
105 :
/* Step, in bytes, to take from a byte with the given value in order to
 * reach the next character.  Indexed by the byte value; bytes that do
 * not start a multi-byte sequence (ASCII, continuation bytes, 0xFE and
 * 0xFF) map to 1.
 */
static const char utf8_skip_data[256] = {
    /* 0x00 - 0x0f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x10 - 0x1f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x20 - 0x2f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x30 - 0x3f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x40 - 0x4f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x50 - 0x5f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x60 - 0x6f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x70 - 0x7f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x80 - 0x8f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0x90 - 0x9f */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xa0 - 0xaf */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xb0 - 0xbf */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xc0 - 0xcf */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xd0 - 0xdf */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xe0 - 0xef */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* 0xf0 - 0xff */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
116 :
/* Pointer to the character following the one that starts at p; the step
 * is looked up in utf8_skip_data using the byte at p.  p is evaluated
 * twice. */
#define UTF8_NEXT_CHAR(p) (&(p)[utf8_skip_data[*(const unsigned char *)(p)]])
118 :
/* Decodes the UTF-8 sequence starting at @p into a single code value.
 *
 * No length limit is applied and overlong forms are not rejected, so
 * this is only meant for input that has already been validated.
 * (uint32_t)-1 is returned when the first byte cannot start a sequence
 * or when one of the following bytes is not a continuation byte.
 **/
static uint32_t
_utf8_get_char (const unsigned char *p)
{
    const unsigned int lead = p[0];
    unsigned int probe;
    uint32_t code;
    int n_bytes = 0;
    int k;

    /* Plain ASCII: the byte is the character. */
    if (lead < 0x80)
        return lead;

    /* The number of leading 1 bits in the lead byte is the sequence
     * length; when the loop stops, probe sits on the first 0 bit (or is
     * 0 for 0xff), so everything below it is payload. */
    for (probe = 0x80; lead & probe; probe >>= 1)
        n_bytes++;

    /* One leading 1 bit is a stray continuation byte; more than six
     * means 0xfe or 0xff. */
    if (n_bytes < 2 || n_bytes > 6)
        return (uint32_t) -1;

    code = lead & (probe - 1);
    for (k = 1; k < n_bytes; k++) {
        if ((p[k] & 0xc0) != 0x80)
            return (uint32_t) -1;

        code = (code << 6) | (p[k] & 0x3f);
    }

    return code;
}
137 :
/* Validating variant of _utf8_get_char.
 *
 * @p:       start of the sequence to decode; p[0] is always read.
 * @max_len: number of bytes available at @p, or a negative value when
 *           the data is nul-terminated.
 *
 * Returns the decoded value, or one of two sentinels:
 *   (uint32_t)-1  malformed input: bad lead byte, a non-nul byte where a
 *                 continuation byte is required, or an overlong encoding
 *   (uint32_t)-2  the sequence is cut short, either by @max_len or by a
 *                 nul byte
 */
static uint32_t
_utf8_get_char_extended (const unsigned char *p,
                         long max_len)
{
    const uint32_t malformed = (uint32_t) -1;
    const uint32_t incomplete = (uint32_t) -2;
    const uint32_t lead = p[0];
    uint32_t code;
    uint32_t smallest;  /* least value that genuinely needs `need` bytes */
    int need;           /* sequence length announced by the lead byte */
    int k;

    if (lead < 0x80)
        return lead;

    /* 0x80..0xbf are continuation bytes; 0xfe and 0xff never occur. */
    if (lead < 0xc0 || lead >= 0xfe)
        return malformed;

    if (lead < 0xe0) {
        need = 2;
        code = lead & 0x1f;
        smallest = 0x80;
    } else if (lead < 0xf0) {
        need = 3;
        code = lead & 0x0f;
        smallest = 0x800;
    } else if (lead < 0xf8) {
        need = 4;
        code = lead & 0x07;
        smallest = 0x10000;
    } else if (lead < 0xfc) {
        need = 5;
        code = lead & 0x03;
        smallest = 0x200000;
    } else {
        need = 6;
        code = lead & 0x01;
        smallest = 0x4000000;
    }

    /* Not enough bytes for the whole sequence: report it as incomplete,
     * unless the bytes that are present are already wrong. */
    if (max_len >= 0 && need > max_len) {
        for (k = 1; k < max_len; k++) {
            if ((p[k] & 0xc0) != 0x80)
                return malformed;
        }
        return incomplete;
    }

    for (k = 1; k < need; k++) {
        const uint32_t byte = p[k];

        /* A nul here means the string ended mid-sequence. */
        if ((byte & 0xc0) != 0x80)
            return byte == 0 ? incomplete : malformed;

        code = (code << 6) | (byte & 0x3f);
    }

    /* A value that would have fit in a shorter sequence is overlong. */
    if (code < smallest)
        return malformed;

    return code;
}
198 :
/**
 * _cairo_utf8_get_char_validated:
 * @p: a UTF-8 string
 * @unicode: location to store one Unicode character, or %NULL
 *
 * Decodes the character at the start of @p and reports how many bytes
 * it occupies.
 *
 * The caller is expected to have validated the string beforehand; no
 * bounds or overlong checks are made here. If the first byte cannot
 * start a sequence, (uint32_t)-1 is stored and 1 is returned. If a
 * later byte is not a continuation byte, (uint32_t)-1 is stored and the
 * length announced by the first byte is still returned.
 *
 * Returns: the number of bytes forming the character returned.
 **/
int
_cairo_utf8_get_char_validated (const char *p,
                                uint32_t   *unicode)
{
    const unsigned char *bytes = (const unsigned char *) p;
    const unsigned char lead = bytes[0];
    uint32_t code;
    int n_bytes;
    int k;

    if (lead < 0x80) {
        n_bytes = 1;
        code = lead;
    } else if ((lead & 0xe0) == 0xc0) {
        n_bytes = 2;
        code = lead & 0x1f;
    } else if ((lead & 0xf0) == 0xe0) {
        n_bytes = 3;
        code = lead & 0x0f;
    } else if ((lead & 0xf8) == 0xf0) {
        n_bytes = 4;
        code = lead & 0x07;
    } else if ((lead & 0xfc) == 0xf8) {
        n_bytes = 5;
        code = lead & 0x03;
    } else if ((lead & 0xfe) == 0xfc) {
        n_bytes = 6;
        code = lead & 0x01;
    } else {
        /* Continuation byte, 0xfe or 0xff: step over this one byte. */
        if (unicode != NULL)
            *unicode = (uint32_t) -1;
        return 1;
    }

    for (k = 1; k < n_bytes; k++) {
        if ((bytes[k] & 0xc0) != 0x80) {
            code = (uint32_t) -1;
            break;
        }
        code = (code << 6) | (bytes[k] & 0x3f);
    }

    if (unicode != NULL)
        *unicode = code;

    return n_bytes;
}
232 :
233 : /**
234 : * _cairo_utf8_to_ucs4:
235 : * @str: an UTF-8 string
236 : * @len: length of @str in bytes, or -1 if it is nul-terminated.
237 : * If @len is supplied and the string has an embedded nul
238 : * byte, only the portion before the nul byte is converted.
239 : * @result: location to store a pointer to a newly allocated UTF-32
240 : * string (always native endian), or %NULL. Free with free(). A 0
241 : * word will be written after the last character.
242 : * @items_written: location to store number of 32-bit words
243 : * written. (Not including the trailing 0)
244 : *
245 : * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
246 : * with 1 32-bit word per character. The string is validated to
247 : * consist entirely of valid Unicode characters.
248 : *
249 : * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
250 : * successfully converted. %CAIRO_STATUS_INVALID_STRING if an
251 : * invalid sequence was found.
252 : **/
253 : cairo_status_t
254 0 : _cairo_utf8_to_ucs4 (const char *str,
255 : int len,
256 : uint32_t **result,
257 : int *items_written)
258 : {
259 0 : uint32_t *str32 = NULL;
260 : int n_chars, i;
261 : const unsigned char *in;
262 0 : const unsigned char * const ustr = (const unsigned char *) str;
263 :
264 0 : in = ustr;
265 0 : n_chars = 0;
266 0 : while ((len < 0 || ustr + len - in > 0) && *in)
267 : {
268 0 : uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
269 0 : if (wc & 0x80000000 || !UNICODE_VALID (wc))
270 0 : return _cairo_error (CAIRO_STATUS_INVALID_STRING);
271 :
272 0 : n_chars++;
273 0 : if (n_chars == INT_MAX)
274 0 : return _cairo_error (CAIRO_STATUS_INVALID_STRING);
275 :
276 0 : in = UTF8_NEXT_CHAR (in);
277 : }
278 :
279 0 : if (result) {
280 0 : str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
281 0 : if (!str32)
282 0 : return _cairo_error (CAIRO_STATUS_NO_MEMORY);
283 :
284 0 : in = ustr;
285 0 : for (i=0; i < n_chars; i++) {
286 0 : str32[i] = _utf8_get_char (in);
287 0 : in = UTF8_NEXT_CHAR (in);
288 : }
289 0 : str32[i] = 0;
290 :
291 0 : *result = str32;
292 : }
293 :
294 0 : if (items_written)
295 0 : *items_written = n_chars;
296 :
297 0 : return CAIRO_STATUS_SUCCESS;
298 : }
299 :
/**
 * _cairo_ucs4_to_utf8:
 * @unicode: a UCS-4 character
 * @utf8: buffer to write utf8 string into. Must have at least 4 bytes
 *   space available. Or %NULL.
 *
 * Encodes @unicode as UTF-8. Nothing is written when @utf8 is %NULL,
 * which lets a caller obtain just the encoded length. No terminating
 * nul byte is written.
 *
 * Return value: Number of bytes in the utf8 string or 0 if @unicode
 *   does not fit in a 4 byte sequence (0x200000 and above).
 **/
int
_cairo_ucs4_to_utf8 (uint32_t  unicode,
                     char     *utf8)
{
    /* Lead byte marker, indexed by sequence length. */
    static const unsigned char lead_marker[5] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
    int n_bytes;
    int idx;

    if (unicode >= 0x200000)
        return 0;

    if (unicode < 0x80)
        n_bytes = 1;
    else if (unicode < 0x800)
        n_bytes = 2;
    else if (unicode < 0x10000)
        n_bytes = 3;
    else
        n_bytes = 4;

    if (utf8 == NULL)
        return n_bytes;

    if (n_bytes == 1) {
        utf8[0] = unicode;
        return 1;
    }

    /* Fill the continuation bytes from the last one backwards, taking
     * six bits of payload each time. */
    for (idx = n_bytes - 1; idx > 0; idx--) {
        utf8[idx] = 0x80 | (unicode & 0x3f);
        unicode >>= 6;
    }

    /* What is left of the value fits below the lead marker. */
    utf8[0] = lead_marker[n_bytes] | unicode;

    return n_bytes;
}
344 :
#if CAIRO_HAS_UTF8_TO_UTF16
/**
 * _cairo_utf8_to_utf16:
 * @str: an UTF-8 string
 * @len: length of @str in bytes, or -1 if it is nul-terminated.
 *   If @len is supplied and the string has an embedded nul
 *   byte, only the portion before the nul byte is converted.
 * @result: location to store a pointer to a newly allocated UTF-16
 *   string (always native endian), or %NULL. Free with free(). A 0
 *   word will be written after the last character.
 * @items_written: location to store number of 16-bit words
 *   written. (Not including the trailing 0) May be %NULL.
 *
 * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
 * where characters are represented either as a single 16-bit word, or
 * as a pair of 16-bit "surrogates". The string is validated to
 * consist entirely of valid Unicode characters.
 *
 * When @result is %NULL the string is only validated and measured,
 * matching _cairo_utf8_to_ucs4(). On failure neither @result nor
 * @items_written is written to.
 *
 * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
 *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
 *   invalid sequence was found or the output would need INT_MAX - 1
 *   or more 16-bit words. %CAIRO_STATUS_NO_MEMORY if the output
 *   buffer cannot be allocated.
 **/
cairo_status_t
_cairo_utf8_to_utf16 (const char *str,
                      int         len,
                      uint16_t  **result,
                      int        *items_written)
{
    uint16_t *str16 = NULL;
    int n16, i;
    const unsigned char *in;
    const unsigned char * const ustr = (const unsigned char *) str;

    /* First pass: validate every character and count output words. */
    in = ustr;
    n16 = 0;
    while ((len < 0 || in - ustr < len) && *in) {
        /* Bytes still available, or -1 for a nul-terminated string.
         * This is derived from the offset already consumed, so that a
         * negative @len never produces a pointer in front of @str. */
        long remaining = len < 0 ? -1 : (long) (len - (in - ustr));
        uint32_t wc = _utf8_get_char_extended (in, remaining);

        /* The top bit is set only on the -1 / -2 error sentinels. */
        if (wc & 0x80000000 || !UNICODE_VALID (wc))
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        /* Values at or above 0x10000 need a surrogate pair. */
        if (wc < 0x10000)
            n16 += 1;
        else
            n16 += 2;

        /* Stop while another pair plus the terminator still fits. */
        if (n16 == INT_MAX - 1 || n16 == INT_MAX)
            return _cairo_error (CAIRO_STATUS_INVALID_STRING);

        in = UTF8_NEXT_CHAR (in);
    }

    if (result) {
        str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
        if (!str16)
            return _cairo_error (CAIRO_STATUS_NO_MEMORY);

        /* Second pass: the input is known to be valid, so the
         * unchecked decoder can be used. */
        in = ustr;
        for (i = 0; i < n16;) {
            uint32_t wc = _utf8_get_char (in);

            if (wc < 0x10000) {
                str16[i++] = wc;
            } else {
                str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
                str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
            }

            in = UTF8_NEXT_CHAR (in);
        }

        str16[i] = 0;

        *result = str16;
    }

    if (items_written)
        *items_written = n16;

    return CAIRO_STATUS_SUCCESS;
}
#endif
|