LCOV - code coverage report
Current view: top level - toolkit/crashreporter/google-breakpad/src/common - convert_UTF.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 247 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 8 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
       3             :  * Distributed under the Terms of Use in 
       4             :  * http://www.unicode.org/copyright.html.
       5             :  *
       6             :  * Permission is hereby granted, free of charge, to any person obtaining
       7             :  * a copy of the Unicode data files and any associated documentation
       8             :  * (the "Data Files") or Unicode software and any associated documentation
       9             :  * (the "Software") to deal in the Data Files or Software
      10             :  * without restriction, including without limitation the rights to use,
      11             :  * copy, modify, merge, publish, distribute, and/or sell copies of
      12             :  * the Data Files or Software, and to permit persons to whom the Data Files
      13             :  * or Software are furnished to do so, provided that
      14             :  * (a) this copyright and permission notice appear with all copies 
      15             :  * of the Data Files or Software,
      16             :  * (b) this copyright and permission notice appear in associated 
      17             :  * documentation, and
      18             :  * (c) there is clear notice in each modified Data File or in the Software
      19             :  * as well as in the documentation associated with the Data File(s) or
      20             :  * Software that the data or software has been modified.
      21             :  *
      22             :  * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
      23             :  * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
      24             :  * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
      25             :  * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
      26             :  * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
      27             :  * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
      28             :  * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
      29             :  * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
      30             :  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
      31             :  * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
      32             :  *
      33             :  * Except as contained in this notice, the name of a copyright holder
      34             :  * shall not be used in advertising or otherwise to promote the sale,
      35             :  * use or other dealings in these Data Files or Software without prior
      36             :  * written authorization of the copyright holder.
      37             :  */
      38             : 
      39             : /* ---------------------------------------------------------------------
      40             : 
      41             : Conversions between UTF-32, UTF-16, and UTF-8. Source code file.
      42             : Author: Mark E. Davis, 1994.
      43             : Rev History: Rick McGowan, fixes & updates May 2001.
      44             : Sept 2001: fixed const & error conditions per
      45             : mods suggested by S. Parent & A. Lillich.
      46             : June 2002: Tim Dodd added detection and handling of incomplete
      47             : source sequences, enhanced error detection, added casts
      48             : to eliminate compiler warnings.
      49             : July 2003: slight mods to back out aggressive FFFE detection.
      50             : Jan 2004: updated switches in from-UTF8 conversions.
      51             : Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
      52             : 
      53             : See the header file "convert_UTF.h" for complete documentation.
      54             : 
      55             : ------------------------------------------------------------------------ */
      56             : 
      57             : 
      58             : #include "convert_UTF.h"
      59             : #ifdef CVTUTF_DEBUG
      60             : #include <stdio.h>
      61             : #endif
      62             : 
      63             : static const int halfShift  = 10; /* used for shifting by 10 bits */
      64             : 
      65             : static const UTF32 halfBase = 0x0010000UL;
      66             : static const UTF32 halfMask = 0x3FFUL;
      67             : 
      68             : #define UNI_SUR_HIGH_START  (UTF32)0xD800
      69             : #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
      70             : #define UNI_SUR_LOW_START   (UTF32)0xDC00
      71             : #define UNI_SUR_LOW_END     (UTF32)0xDFFF
      72             : 
      73             : #ifndef false
      74             : #define false      0
      75             : #endif
      76             : #ifndef true
      77             : #define true        1
      78             : #endif
      79             : 
      80             : /* --------------------------------------------------------------------- */
      81             : 
      82           0 : ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
      83             :                                       UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
      84           0 :   ConversionResult result = conversionOK;
      85           0 :   const UTF32* source = *sourceStart;
      86           0 :   UTF16* target = *targetStart;
      87           0 :   while (source < sourceEnd) {
      88             :     UTF32 ch;
      89           0 :     if (target >= targetEnd) {
      90           0 :             result = targetExhausted; break;
      91             :     }
      92           0 :     ch = *source++;
      93           0 :     if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
      94             :             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
      95           0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
      96           0 :         if (flags == strictConversion) {
      97           0 :           --source; /* return to the illegal value itself */
      98           0 :           result = sourceIllegal;
      99           0 :           break;
     100             :         } else {
     101           0 :           *target++ = UNI_REPLACEMENT_CHAR;
     102             :         }
     103             :             } else {
     104           0 :         *target++ = (UTF16)ch; /* normal case */
     105             :             }
     106           0 :     } else if (ch > UNI_MAX_LEGAL_UTF32) {
     107           0 :             if (flags == strictConversion) {
     108           0 :         result = sourceIllegal;
     109             :             } else {
     110           0 :         *target++ = UNI_REPLACEMENT_CHAR;
     111             :             }
     112             :     } else {
     113             :             /* target is a character in range 0x10000 - 0x10FFFF. */
     114           0 :             if (target + 1 >= targetEnd) {
     115           0 :         --source; /* Back up source pointer! */
     116           0 :         result = targetExhausted; break;
     117             :             }
     118           0 :             ch -= halfBase;
     119           0 :             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     120           0 :             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     121             :     }
     122             :   }
     123           0 : *sourceStart = source;
     124           0 : *targetStart = target;
     125           0 : return result;
     126             : }
     127             : 
     128             : /* --------------------------------------------------------------------- */
     129             : 
     130           0 : ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
     131             :                                       UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
     132           0 :   ConversionResult result = conversionOK;
     133           0 :   const UTF16* source = *sourceStart;
     134           0 :   UTF32* target = *targetStart;
     135             :   UTF32 ch, ch2;
     136           0 :   while (source < sourceEnd) {
     137           0 :     const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
     138           0 :     ch = *source++;
     139             :     /* If we have a surrogate pair, convert to UTF32 first. */
     140           0 :     if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     141             :             /* If the 16 bits following the high surrogate are in the source buffer... */
     142           0 :             if (source < sourceEnd) {
     143           0 :         ch2 = *source;
     144             :         /* If it's a low surrogate, convert to UTF32. */
     145           0 :         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     146           0 :           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     147           0 :           + (ch2 - UNI_SUR_LOW_START) + halfBase;
     148           0 :           ++source;
     149           0 :         } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     150           0 :           --source; /* return to the illegal value itself */
     151           0 :           result = sourceIllegal;
     152           0 :           break;
     153             :         }
     154             :             } else { /* We don't have the 16 bits following the high surrogate. */
     155           0 :         --source; /* return to the high surrogate */
     156           0 :         result = sourceExhausted;
     157           0 :         break;
     158             :             }
     159           0 :     } else if (flags == strictConversion) {
     160             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     161           0 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
     162           0 :         --source; /* return to the illegal value itself */
     163           0 :         result = sourceIllegal;
     164           0 :         break;
     165             :             }
     166             :     }
     167           0 :     if (target >= targetEnd) {
     168           0 :             source = oldSource; /* Back up source pointer! */
     169           0 :             result = targetExhausted; break;
     170             :     }
     171           0 :     *target++ = ch;
     172             :   }
     173           0 :   *sourceStart = source;
     174           0 :   *targetStart = target;
     175             : #ifdef CVTUTF_DEBUG
     176             :   if (result == sourceIllegal) {
     177             :     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
     178             :     fflush(stderr);
     179             :   }
     180             : #endif
     181           0 :   return result;
     182             : }
     183             : 
     184             : /* --------------------------------------------------------------------- */
     185             : 
     186             : /*
     187             :  * Index into the table below with the first byte of a UTF-8 sequence to
     188             :  * get the number of trailing bytes that are supposed to follow it.
     189             :  * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
     190             :  * left as-is for anyone who may want to do such conversion, which was
     191             :  * allowed in earlier algorithms.
     192             :  */
     193             : static const char trailingBytesForUTF8[256] = {
     194             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     195             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     196             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     197             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     198             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     199             :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     200             :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     201             :   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
     202             : };
     203             : 
     204             : /*
     205             :  * Magic values subtracted from a buffer value during UTF8 conversion.
     206             :  * This table contains as many values as there might be trailing bytes
     207             :  * in a UTF-8 sequence.
     208             :  */
     209             : static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
     210             :   0x03C82080UL, 0xFA082080UL, 0x82082080UL };
     211             : 
     212             : /*
     213             :  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
     214             :  * into the first byte, depending on how many bytes follow.  There are
     215             :  * as many entries in this table as there are UTF-8 sequence types.
     216             :  * (I.e., one byte sequence, two byte... etc.). Remember that sequences
     217             :  * for *legal* UTF-8 will be 4 or fewer bytes total.
     218             :  */
     219             : static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
     220             : 
     221             : /* --------------------------------------------------------------------- */
     222             : 
     223             : /* The interface converts a whole buffer to avoid function-call overhead.
     224             : * Constants have been gathered. Loops & conditionals have been removed as
     225             : * much as possible for efficiency, in favor of drop-through switches.
     226             : * (See "Note A" at the bottom of the file for equivalent code.)
     227             : * If your compiler supports it, the "isLegalUTF8" call can be turned
     228             : * into an inline function.
     229             : */
     230             : 
     231             : /* --------------------------------------------------------------------- */
     232             : 
     233           0 : ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
     234             :                                      UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
     235           0 :   ConversionResult result = conversionOK;
     236           0 :   const UTF16* source = *sourceStart;
     237           0 :   UTF8* target = *targetStart;
     238           0 :   while (source < sourceEnd) {
     239             :     UTF32 ch;
     240           0 :     unsigned short bytesToWrite = 0;
     241           0 :     const UTF32 byteMask = 0xBF;
     242           0 :     const UTF32 byteMark = 0x80;
     243           0 :     const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
     244           0 :     ch = *source++;
     245             :     /* If we have a surrogate pair, convert to UTF32 first. */
     246           0 :     if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
     247             :             /* If the 16 bits following the high surrogate are in the source buffer... */
     248           0 :             if (source < sourceEnd) {
     249           0 :         UTF32 ch2 = *source;
     250             :         /* If it's a low surrogate, convert to UTF32. */
     251           0 :         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
     252           0 :           ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
     253           0 :           + (ch2 - UNI_SUR_LOW_START) + halfBase;
     254           0 :           ++source;
     255           0 :         } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
     256           0 :           --source; /* return to the illegal value itself */
     257           0 :           result = sourceIllegal;
     258           0 :           break;
     259             :         }
     260             :             } else { /* We don't have the 16 bits following the high surrogate. */
     261           0 :         --source; /* return to the high surrogate */
     262           0 :         result = sourceExhausted;
     263           0 :         break;
     264             :             }
     265           0 :     } else if (flags == strictConversion) {
     266             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     267           0 :             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
     268           0 :         --source; /* return to the illegal value itself */
     269           0 :         result = sourceIllegal;
     270           0 :         break;
     271             :             }
     272             :     }
     273             :     /* Figure out how many bytes the result will require */
     274           0 :     if (ch < (UTF32)0x80) {       bytesToWrite = 1;
     275           0 :     } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     276           0 :     } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     277           0 :     } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
     278           0 :     } else {                        bytesToWrite = 3;
     279           0 :       ch = UNI_REPLACEMENT_CHAR;
     280             :     }
     281             : 
     282           0 :     target += bytesToWrite;
     283           0 :     if (target > targetEnd) {
     284           0 :             source = oldSource; /* Back up source pointer! */
     285           0 :             target -= bytesToWrite; result = targetExhausted; break;
     286             :     }
     287           0 :     switch (bytesToWrite) { /* note: everything falls through. */
     288           0 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     289           0 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     290           0 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     291           0 :             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
     292             :     }
     293           0 :     target += bytesToWrite;
     294             :   }
     295           0 : *sourceStart = source;
     296           0 : *targetStart = target;
     297           0 : return result;
     298             : }
     299             : 
     300             : /* --------------------------------------------------------------------- */
     301             : 
     302             : /*
     303             :  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
     304             :  * This must be called with the length pre-determined by the first byte.
     305             :  * If not calling this from ConvertUTF8to*, then the length can be set by:
     306             :  *  length = trailingBytesForUTF8[*source]+1;
     307             :  * and the sequence is illegal right away if there aren't that many bytes
     308             :  * available.
     309             :  * If presented with a length > 4, this returns false.  The Unicode
     310             :  * definition of UTF-8 goes up to 4-byte sequences.
     311             :  */
     312             : 
     313           0 : static Boolean isLegalUTF8(const UTF8 *source, int length) {
     314             :   UTF8 a;
     315           0 :   const UTF8 *srcptr = source+length;
     316           0 :   switch (length) {
     317           0 :     default: return false;
     318             :       /* Everything else falls through when "true"... */
     319           0 :     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     320           0 :     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
     321           0 :     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
     322             : 
     323           0 :       switch (*source) {
     324             :         /* no fall-through in this inner switch */
     325           0 :         case 0xE0: if (a < 0xA0) return false; break;
     326           0 :         case 0xED: if (a > 0x9F) return false; break;
     327           0 :         case 0xF0: if (a < 0x90) return false; break;
     328           0 :         case 0xF4: if (a > 0x8F) return false; break;
     329           0 :         default:   if (a < 0x80) return false;
     330             :       }
     331             : 
     332           0 :       case 1: if (*source >= 0x80 && *source < 0xC2) return false;
     333             :   }
     334           0 :   if (*source > 0xF4) return false;
     335           0 :   return true;
     336             : }
     337             : 
     338             : /* --------------------------------------------------------------------- */
     339             : 
     340             : /*
     341             :  * Exported function to return whether a UTF-8 sequence is legal or not.
     342             :  * This is not used here; it's just exported.
     343             :  */
     344           0 : Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
     345           0 :   int length = trailingBytesForUTF8[*source]+1;
     346           0 :   if (source+length > sourceEnd) {
     347           0 :     return false;
     348             :   }
     349           0 :   return isLegalUTF8(source, length);
     350             : }
     351             : 
     352             : /* --------------------------------------------------------------------- */
     353             : 
     354           0 : ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
     355             :                                      UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     356           0 :   ConversionResult result = conversionOK;
     357           0 :   const UTF8* source = *sourceStart;
     358           0 :   UTF16* target = *targetStart;
     359           0 :   while (source < sourceEnd) {
     360           0 :     UTF32 ch = 0;
     361           0 :     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     362           0 :     if (source + extraBytesToRead >= sourceEnd) {
     363           0 :             result = sourceExhausted; break;
     364             :     }
     365             :     /* Do this check whether lenient or strict */
     366           0 :     if (! isLegalUTF8(source, extraBytesToRead+1)) {
     367           0 :             result = sourceIllegal;
     368           0 :             break;
     369             :     }
     370             :     /*
     371             :      * The cases all fall through. See "Note A" below.
     372             :      */
     373           0 :     switch (extraBytesToRead) {
     374           0 :             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     375           0 :             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
     376           0 :             case 3: ch += *source++; ch <<= 6;
     377           0 :             case 2: ch += *source++; ch <<= 6;
     378           0 :             case 1: ch += *source++; ch <<= 6;
     379           0 :             case 0: ch += *source++;
     380             :     }
     381           0 :     ch -= offsetsFromUTF8[extraBytesToRead];
     382             : 
     383           0 :     if (target >= targetEnd) {
     384           0 :             source -= (extraBytesToRead+1); /* Back up source pointer! */
     385           0 :             result = targetExhausted; break;
     386             :     }
     387           0 :     if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     388             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     389           0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     390           0 :         if (flags == strictConversion) {
     391           0 :           source -= (extraBytesToRead+1); /* return to the illegal value itself */
     392           0 :           result = sourceIllegal;
     393           0 :           break;
     394             :         } else {
     395           0 :           *target++ = UNI_REPLACEMENT_CHAR;
     396             :         }
     397             :             } else {
     398           0 :         *target++ = (UTF16)ch; /* normal case */
     399             :             }
     400           0 :     } else if (ch > UNI_MAX_UTF16) {
     401           0 :             if (flags == strictConversion) {
     402           0 :         result = sourceIllegal;
     403           0 :         source -= (extraBytesToRead+1); /* return to the start */
     404           0 :         break; /* Bail out; shouldn't continue */
     405             :             } else {
     406           0 :         *target++ = UNI_REPLACEMENT_CHAR;
     407             :             }
     408             :     } else {
     409             :             /* target is a character in range 0x10000 - 0x10FFFF. */
     410           0 :             if (target + 1 >= targetEnd) {
     411           0 :         source -= (extraBytesToRead+1); /* Back up source pointer! */
     412           0 :         result = targetExhausted; break;
     413             :             }
     414           0 :             ch -= halfBase;
     415           0 :             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
     416           0 :             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
     417             :     }
     418             :   }
     419           0 : *sourceStart = source;
     420           0 : *targetStart = target;
     421           0 : return result;
     422             : }
     423             : 
     424             : /* --------------------------------------------------------------------- */
     425             : 
     426           0 : ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
     427             :                                      UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
     428           0 :   ConversionResult result = conversionOK;
     429           0 :   const UTF32* source = *sourceStart;
     430           0 :   UTF8* target = *targetStart;
     431           0 :   while (source < sourceEnd) {
     432             :     UTF32 ch;
     433           0 :     unsigned short bytesToWrite = 0;
     434           0 :     const UTF32 byteMask = 0xBF;
     435           0 :     const UTF32 byteMark = 0x80;
     436           0 :     ch = *source++;
     437           0 :     if (flags == strictConversion ) {
     438             :             /* UTF-16 surrogate values are illegal in UTF-32 */
     439           0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     440           0 :         --source; /* return to the illegal value itself */
     441           0 :         result = sourceIllegal;
     442           0 :         break;
     443             :             }
     444             :     }
     445             :     /*
     446             :      * Figure out how many bytes the result will require. Turn any
     447             :      * illegally large UTF32 things (> Plane 17) into replacement chars.
     448             :      */
     449           0 :     if (ch < (UTF32)0x80) {       bytesToWrite = 1;
     450           0 :     } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
     451           0 :     } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
     452           0 :     } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
     453           0 :     } else {                        bytesToWrite = 3;
     454           0 :       ch = UNI_REPLACEMENT_CHAR;
     455           0 :       result = sourceIllegal;
     456             :     }
     457             : 
     458           0 :     target += bytesToWrite;
     459           0 :     if (target > targetEnd) {
     460           0 :             --source; /* Back up source pointer! */
     461           0 :             target -= bytesToWrite; result = targetExhausted; break;
     462             :     }
     463           0 :     switch (bytesToWrite) { /* note: everything falls through. */
     464           0 :             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     465           0 :             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     466           0 :             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
     467           0 :             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
     468             :     }
     469           0 :     target += bytesToWrite;
     470             :   }
     471           0 : *sourceStart = source;
     472           0 : *targetStart = target;
     473           0 : return result;
     474             : }
     475             : 
     476             : /* --------------------------------------------------------------------- */
     477             : 
     478           0 : ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
     479             :                                      UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
     480           0 :   ConversionResult result = conversionOK;
     481           0 :   const UTF8* source = *sourceStart;
     482           0 :   UTF32* target = *targetStart;
     483           0 :   while (source < sourceEnd) {
     484           0 :     UTF32 ch = 0;
     485           0 :     unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
     486           0 :     if (source + extraBytesToRead >= sourceEnd) {
     487           0 :             result = sourceExhausted; break;
     488             :     }
     489             :     /* Do this check whether lenient or strict */
     490           0 :     if (! isLegalUTF8(source, extraBytesToRead+1)) {
     491           0 :             result = sourceIllegal;
     492           0 :             break;
     493             :     }
     494             :     /*
     495             :      * The cases all fall through. See "Note A" below.
     496             :      */
     497           0 :     switch (extraBytesToRead) {
     498           0 :             case 5: ch += *source++; ch <<= 6;
     499           0 :             case 4: ch += *source++; ch <<= 6;
     500           0 :             case 3: ch += *source++; ch <<= 6;
     501           0 :             case 2: ch += *source++; ch <<= 6;
     502           0 :             case 1: ch += *source++; ch <<= 6;
     503           0 :             case 0: ch += *source++;
     504             :     }
     505           0 :     ch -= offsetsFromUTF8[extraBytesToRead];
     506             : 
     507           0 :     if (target >= targetEnd) {
     508           0 :             source -= (extraBytesToRead+1); /* Back up the source pointer! */
     509           0 :             result = targetExhausted; break;
     510             :     }
     511           0 :     if (ch <= UNI_MAX_LEGAL_UTF32) {
     512             :             /*
     513             :              * UTF-16 surrogate values are illegal in UTF-32, and anything
     514             :              * over Plane 17 (> 0x10FFFF) is illegal.
     515             :              */
     516           0 :             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     517           0 :         if (flags == strictConversion) {
     518           0 :           source -= (extraBytesToRead+1); /* return to the illegal value itself */
     519           0 :           result = sourceIllegal;
     520           0 :           break;
     521             :         } else {
     522           0 :           *target++ = UNI_REPLACEMENT_CHAR;
     523             :         }
     524             :             } else {
     525           0 :         *target++ = ch;
     526             :             }
     527             :     } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
     528           0 :             result = sourceIllegal;
     529           0 :             *target++ = UNI_REPLACEMENT_CHAR;
     530             :     }
     531             :   }
     532           0 :   *sourceStart = source;
     533           0 :   *targetStart = target;
     534           0 :   return result;
     535             : }
     536             : 
     537             : /* ---------------------------------------------------------------------
     538             : 
     539             : Note A.
     540             : The fall-through switches in UTF-8 reading code save a
     541             : temp variable, some decrements & conditionals.  The switches
     542             : are equivalent to the following loop:
     543             : {
     544             :   int tmpBytesToRead = extraBytesToRead+1;
     545             :   do {
     546             :                 ch += *source++;
     547             :                 --tmpBytesToRead;
     548             :                 if (tmpBytesToRead) ch <<= 6;
     549             :   } while (tmpBytesToRead > 0);
     550             : }
     551             : In UTF-8 writing code, the switches on "bytesToWrite" are
     552             : similarly unrolled loops.
     553             : 
     554             : --------------------------------------------------------------------- */

Generated by: LCOV version 1.13