LCOV - code coverage report
Current view: top level - gfx/cairo/cairo/src - cairo-unicode.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 135 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 6 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: c; c-basic-offset: 4; indent-tabs-mode: t; tab-width: 8; -*- */
       2             : /* cairo - a vector graphics library with display and print output
       3             :  *
       4             :  * The code in this file is derived from GLib's gutf8.c and
       5             :  *   ultimately from libunicode. It is relicensed under the
       6             :  *   dual LGPL/MPL with permission of the original authors.
       7             :  *
       8             :  * Copyright © 1999 Tom Tromey
       9             :  * Copyright © 2005 Red Hat, Inc
      10             :  *
      11             :  * This library is free software; you can redistribute it and/or
      12             :  * modify it either under the terms of the GNU Lesser General Public
      13             :  * License version 2.1 as published by the Free Software Foundation
      14             :  * (the "LGPL") or, at your option, under the terms of the Mozilla
      15             :  * Public License Version 1.1 (the "MPL"). If you do not alter this
      16             :  * notice, a recipient may use your version of this file under either
      17             :  * the MPL or the LGPL.
      18             :  *
      19             :  * You should have received a copy of the LGPL along with this library
      20             :  * in the file COPYING-LGPL-2.1; if not, write to the Free Software
      21             :  * Foundation, Inc., 51 Franklin Street, Suite 500, Boston, MA 02110-1335, USA
      22             :  * You should have received a copy of the MPL along with this library
      23             :  * in the file COPYING-MPL-1.1
      24             :  *
      25             :  * The contents of this file are subject to the Mozilla Public License
      26             :  * Version 1.1 (the "License"); you may not use this file except in
      27             :  * compliance with the License. You may obtain a copy of the License at
      28             :  * http://www.mozilla.org/MPL/
      29             :  *
      30             :  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY
      31             :  * OF ANY KIND, either express or implied. See the LGPL or the MPL for
      32             :  * the specific language governing rights and limitations.
      33             :  *
      34             :  * The Original Code is the cairo graphics library.
      35             :  *
      36             :  * The Initial Developer of the Original Code is Tom Tromey.
      37             :  *  and Red Hat, Inc.
      38             :  *
      39             :  * Contributor(s):
      40             :  *      Owen Taylor <otaylor@redhat.com>
      41             :  */
      42             : 
      43             : #include "cairoint.h"
      44             : #include "cairo-error-private.h"
      45             : 
      46             : #define UTF8_COMPUTE(Char, Mask, Len)                                         \
      47             :   if (Char < 128)                                                          \
      48             :     {                                                                         \
      49             :       Len = 1;                                                                \
      50             :       Mask = 0x7f;                                                            \
      51             :     }                                                                         \
      52             :   else if ((Char & 0xe0) == 0xc0)                                         \
      53             :     {                                                                         \
      54             :       Len = 2;                                                                \
      55             :       Mask = 0x1f;                                                            \
      56             :     }                                                                         \
      57             :   else if ((Char & 0xf0) == 0xe0)                                         \
      58             :     {                                                                         \
      59             :       Len = 3;                                                                \
      60             :       Mask = 0x0f;                                                            \
      61             :     }                                                                         \
      62             :   else if ((Char & 0xf8) == 0xf0)                                         \
      63             :     {                                                                         \
      64             :       Len = 4;                                                                \
      65             :       Mask = 0x07;                                                            \
      66             :     }                                                                         \
      67             :   else if ((Char & 0xfc) == 0xf8)                                         \
      68             :     {                                                                         \
      69             :       Len = 5;                                                                \
      70             :       Mask = 0x03;                                                            \
      71             :     }                                                                         \
      72             :   else if ((Char & 0xfe) == 0xfc)                                         \
      73             :     {                                                                         \
      74             :       Len = 6;                                                                \
      75             :       Mask = 0x01;                                                            \
      76             :     }                                                                         \
      77             :   else                                                                        \
      78             :     Len = -1;
      79             : 
      80             : #define UTF8_LENGTH(Char)              \
      81             :   ((Char) < 0x80 ? 1 :                 \
      82             :    ((Char) < 0x800 ? 2 :               \
      83             :     ((Char) < 0x10000 ? 3 :            \
      84             :      ((Char) < 0x200000 ? 4 :          \
      85             :       ((Char) < 0x4000000 ? 5 : 6)))))
      86             : 
      87             : #define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
      88             :   (Result) = (Chars)[0] & (Mask);                                         \
      89             :   for ((Count) = 1; (Count) < (Len); ++(Count))                                    \
      90             :     {                                                                         \
      91             :       if (((Chars)[(Count)] & 0xc0) != 0x80)                                      \
      92             :         {                                                                     \
      93             :           (Result) = -1;                                                      \
      94             :           break;                                                              \
      95             :         }                                                                     \
      96             :       (Result) <<= 6;                                                           \
      97             :       (Result) |= ((Chars)[(Count)] & 0x3f);                                      \
      98             :     }
      99             : 
     100             : #define UNICODE_VALID(Char)                   \
     101             :     ((Char) < 0x110000 &&                     \
     102             :      (((Char) & 0xFFFFF800) != 0xD800) &&     \
     103             :      ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     104             :      ((Char) & 0xFFFE) != 0xFFFE)
     105             : 
     106             : static const char utf8_skip_data[256] = {
     107             :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     108             :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     109             :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     110             :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     111             :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     112             :     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     113             :     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     114             :     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
     115             : };
     116             : 
     117             : #define UTF8_NEXT_CHAR(p) ((p) + utf8_skip_data[*(unsigned char *)(p)])
     118             : 
     119             : /* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
     120             :  * If @p does not point to a valid UTF-8 encoded character, results are
     121             :  * undefined.
     122             :  **/
     123             : static uint32_t
     124           0 : _utf8_get_char (const unsigned char *p)
     125             : {
     126           0 :     int i, mask = 0, len;
     127             :     uint32_t result;
     128           0 :     unsigned char c = (unsigned char) *p;
     129             : 
     130           0 :     UTF8_COMPUTE (c, mask, len);
     131           0 :     if (len == -1)
     132           0 :         return (uint32_t)-1;
     133           0 :     UTF8_GET (result, p, i, mask, len);
     134             : 
     135           0 :     return result;
     136             : }
     137             : 
     138             : /* Like _utf8_get_char, but takes a maximum length, returning (uint32_t)-1
     139             :  * on an invalid sequence and (uint32_t)-2 on an incomplete trailing character
     140             :  */
     141             : static uint32_t
     142           0 : _utf8_get_char_extended (const unsigned char *p,
     143             :                          long                 max_len)
     144             : {
     145             :     int i, len;
     146           0 :     uint32_t wc = (unsigned char) *p;
     147             : 
     148           0 :     if (wc < 0x80) {
     149           0 :         return wc;
     150           0 :     } else if (wc < 0xc0) {
     151           0 :         return (uint32_t)-1;
     152           0 :     } else if (wc < 0xe0) {
     153           0 :         len = 2;
     154           0 :         wc &= 0x1f;
     155           0 :     } else if (wc < 0xf0) {
     156           0 :         len = 3;
     157           0 :         wc &= 0x0f;
     158           0 :     } else if (wc < 0xf8) {
     159           0 :         len = 4;
     160           0 :         wc &= 0x07;
     161           0 :     } else if (wc < 0xfc) {
     162           0 :         len = 5;
     163           0 :         wc &= 0x03;
     164           0 :     } else if (wc < 0xfe) {
     165           0 :         len = 6;
     166           0 :         wc &= 0x01;
     167             :     } else {
     168           0 :         return (uint32_t)-1;
     169             :     }
     170             : 
     171           0 :     if (max_len >= 0 && len > max_len) {
     172           0 :         for (i = 1; i < max_len; i++) {
     173           0 :             if ((((unsigned char *)p)[i] & 0xc0) != 0x80)
     174           0 :                 return (uint32_t)-1;
     175             :         }
     176           0 :         return (uint32_t)-2;
     177             :     }
     178             : 
     179           0 :     for (i = 1; i < len; ++i) {
     180           0 :         uint32_t ch = ((unsigned char *)p)[i];
     181             : 
     182           0 :         if ((ch & 0xc0) != 0x80) {
     183           0 :             if (ch)
     184           0 :                 return (uint32_t)-1;
     185             :             else
     186           0 :                 return (uint32_t)-2;
     187             :         }
     188             : 
     189           0 :         wc <<= 6;
     190           0 :         wc |= (ch & 0x3f);
     191             :     }
     192             : 
     193           0 :     if (UTF8_LENGTH(wc) != len)
     194           0 :         return (uint32_t)-1;
     195             : 
     196           0 :     return wc;
     197             : }
     198             : 
     199             : /**
     200             :  * _cairo_utf8_get_char_validated:
     201             :  * @p: a UTF-8 string
     202             :  * @unicode: location to store one Unicode character
     203             :  *
     204             :  * Decodes the first character of a valid UTF-8 string, and returns
     205             :  * the number of bytes consumed.
     206             :  *
     207             :  * Note that the string should be valid.  Do not use this without
     208             :  * validating the string first.
     209             :  *
     210             :  * Returns: the number of bytes forming the character returned.
     211             :  **/
     212             : int
     213           0 : _cairo_utf8_get_char_validated (const char *p,
     214             :                                 uint32_t   *unicode)
     215             : {
     216           0 :     int i, mask = 0, len;
     217             :     uint32_t result;
     218           0 :     unsigned char c = (unsigned char) *p;
     219             : 
     220           0 :     UTF8_COMPUTE (c, mask, len);
     221           0 :     if (len == -1) {
     222           0 :         if (unicode)
     223           0 :             *unicode = (uint32_t)-1;
     224           0 :         return 1;
     225             :     }
     226           0 :     UTF8_GET (result, p, i, mask, len);
     227             : 
     228           0 :     if (unicode)
     229           0 :         *unicode = result;
     230           0 :     return len;
     231             : }
     232             : 
     233             : /**
     234             :  * _cairo_utf8_to_ucs4:
     235             :  * @str: a UTF-8 string
     236             :  * @len: length of @str in bytes, or -1 if it is nul-terminated.
     237             :  *   If @len is supplied and the string has an embedded nul
     238             :  *   byte, only the portion before the nul byte is converted.
     239             :  * @result: location to store a pointer to a newly allocated UTF-32
     240             :  *   string (always native endian), or %NULL. Free with free(). A 0
     241             :  *   word will be written after the last character.
     242             :  * @items_written: location to store number of 32-bit words
     243             :  *   written. (Not including the trailing 0)
     244             :  *
     245             :  * Converts a UTF-8 string to UCS-4. UCS-4 is an encoding of Unicode
     246             :  * with 1 32-bit word per character. The string is validated to
     247             :  * consist entirely of valid Unicode characters.
     248             :  *
     249             :  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
     250             :  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
     251             :  *   invalid sequence was found.
     252             :  **/
     253             : cairo_status_t
     254           0 : _cairo_utf8_to_ucs4 (const char *str,
     255             :                      int         len,
     256             :                      uint32_t  **result,
     257             :                      int        *items_written)
     258             : {
     259           0 :     uint32_t *str32 = NULL;
     260             :     int n_chars, i;
     261             :     const unsigned char *in;
     262           0 :     const unsigned char * const ustr = (const unsigned char *) str;
     263             : 
     264           0 :     in = ustr;
     265           0 :     n_chars = 0;
     266           0 :     while ((len < 0 || ustr + len - in > 0) && *in)
     267             :     {
     268           0 :         uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
     269           0 :         if (wc & 0x80000000 || !UNICODE_VALID (wc))
     270           0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     271             : 
     272           0 :         n_chars++;
     273           0 :         if (n_chars == INT_MAX)
     274           0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     275             : 
     276           0 :         in = UTF8_NEXT_CHAR (in);
     277             :     }
     278             : 
     279           0 :     if (result) {
     280           0 :         str32 = _cairo_malloc_ab (n_chars + 1, sizeof (uint32_t));
     281           0 :         if (!str32)
     282           0 :             return _cairo_error (CAIRO_STATUS_NO_MEMORY);
     283             : 
     284           0 :         in = ustr;
     285           0 :         for (i=0; i < n_chars; i++) {
     286           0 :             str32[i] = _utf8_get_char (in);
     287           0 :             in = UTF8_NEXT_CHAR (in);
     288             :         }
     289           0 :         str32[i] = 0;
     290             : 
     291           0 :         *result = str32;
     292             :     }
     293             : 
     294           0 :     if (items_written)
     295           0 :         *items_written = n_chars;
     296             : 
     297           0 :     return CAIRO_STATUS_SUCCESS;
     298             : }
     299             : 
     300             : /**
     301             :  * _cairo_ucs4_to_utf8:
     302             :  * @unicode: a UCS-4 character
     303             :  * @utf8: buffer to write the UTF-8 bytes into; must have at least 4 bytes
     304             :  * of space available. May be %NULL, in which case only the length is returned.
     305             :  *
     306             :  * Encodes @unicode as UTF-8 using 1 to 4 bytes. No trailing nul is written.
     307             :  *
     308             :  * Return value: number of bytes in the UTF-8 encoding, or 0 if @unicode is
     309             :  * 0x200000 or greater and therefore cannot be encoded in 4 bytes.
     310             :  **/
     311             : int
     312           0 : _cairo_ucs4_to_utf8 (uint32_t  unicode,
     313             :                      char     *utf8)
     314             : {
     315             :     int bytes;
     316             :     char *p;
     317             : 
     318           0 :     if (unicode < 0x80) {
     319           0 :         if (utf8)
     320           0 :             *utf8 = unicode;
     321           0 :         return 1;
     322           0 :     } else if (unicode < 0x800) {
     323           0 :         bytes = 2;
     324           0 :     } else if (unicode < 0x10000) {
     325           0 :         bytes = 3;
     326           0 :     } else if (unicode < 0x200000) {
     327           0 :         bytes = 4;
     328             :     } else {
     329           0 :         return 0;
     330             :     }
     331             : 
     332           0 :     if (!utf8)
     333           0 :         return bytes;
     334             : 
     335           0 :     p = utf8 + bytes;
     336           0 :     while (p > utf8) {
     337           0 :         *--p = 0x80 | (unicode & 0x3f);
     338           0 :         unicode >>= 6;
     339             :     }
     340           0 :     *p |= 0xf0 << (4 - bytes);
     341             : 
     342           0 :     return bytes;
     343             : }
     344             : 
     345             : #if CAIRO_HAS_UTF8_TO_UTF16
     346             : /**
     347             :  * _cairo_utf8_to_utf16:
     348             :  * @str: a UTF-8 string
     349             :  * @len: length of @str in bytes, or -1 if it is nul-terminated.
     350             :  *   If @len is supplied and the string has an embedded nul
     351             :  *   byte, only the portion before the nul byte is converted.
     352             :  * @result: location to store a pointer to a newly allocated UTF-16
     353             :  *   string (always native endian). Free with free(). A 0
     354             :  *   word will be written after the last character.
     355             :  * @items_written: location to store number of 16-bit words
     356             :  *   written. (Not including the trailing 0)
     357             :  *
     358             :  * Converts a UTF-8 string to UTF-16. UTF-16 is an encoding of Unicode
     359             :  * where characters are represented either as a single 16-bit word, or
     360             :  * as a pair of 16-bit "surrogates". The string is validated to
     361             :  * consist entirely of valid Unicode characters.
     362             :  *
     363             :  * Return value: %CAIRO_STATUS_SUCCESS if the entire string was
     364             :  *   successfully converted. %CAIRO_STATUS_INVALID_STRING if an
     365             :  *   invalid sequence was found.
     366             :  **/
     367             : cairo_status_t
     368           0 : _cairo_utf8_to_utf16 (const char *str,
     369             :                       int         len,
     370             :                       uint16_t **result,
     371             :                       int       *items_written)
     372             : {
     373           0 :     uint16_t *str16 = NULL;
     374             :     int n16, i;
     375             :     const unsigned char *in;
     376           0 :     const unsigned char * const ustr = (const unsigned char *) str;
     377             : 
     378           0 :     in = ustr;
     379           0 :     n16 = 0;
     380           0 :     while ((len < 0 || ustr + len - in > 0) && *in) {
     381           0 :         uint32_t wc = _utf8_get_char_extended (in, ustr + len - in);
     382           0 :         if (wc & 0x80000000 || !UNICODE_VALID (wc))
     383           0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     384             : 
     385           0 :         if (wc < 0x10000)
     386           0 :             n16 += 1;
     387             :         else
     388           0 :             n16 += 2;
     389             : 
     390           0 :         if (n16 == INT_MAX - 1 || n16 == INT_MAX)
     391           0 :             return _cairo_error (CAIRO_STATUS_INVALID_STRING);
     392             : 
     393           0 :         in = UTF8_NEXT_CHAR (in);
     394             :     }
     395             : 
     396           0 :     str16 = _cairo_malloc_ab (n16 + 1, sizeof (uint16_t));
     397           0 :     if (!str16)
     398           0 :         return _cairo_error (CAIRO_STATUS_NO_MEMORY);
     399             : 
     400           0 :     in = ustr;
     401           0 :     for (i = 0; i < n16;) {
     402           0 :         uint32_t wc = _utf8_get_char (in);
     403             : 
     404           0 :         if (wc < 0x10000) {
     405           0 :             str16[i++] = wc;
     406             :         } else {
     407           0 :             str16[i++] = (wc - 0x10000) / 0x400 + 0xd800;
     408           0 :             str16[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
     409             :         }
     410             : 
     411           0 :         in = UTF8_NEXT_CHAR (in);
     412             :     }
     413             : 
     414           0 :     str16[i] = 0;
     415             : 
     416           0 :     *result = str16;
     417           0 :     if (items_written)
     418           0 :         *items_written = n16;
     419             : 
     420           0 :     return CAIRO_STATUS_SUCCESS;
     421             : }
     422             : #endif

Generated by: LCOV version 1.13