LCOV - code coverage report
Current view: top level - xpcom/string - nsUTF8Utils.h (source / functions) Hit Total Coverage
Test: output.info Lines: 141 281 50.2 %
Date: 2017-07-14 16:53:18 Functions: 28 33 84.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
       3             : /* This Source Code Form is subject to the terms of the Mozilla Public
       4             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       5             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       6             : #ifndef nsUTF8Utils_h_
       7             : #define nsUTF8Utils_h_
       8             : 
       9             : // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
      10             : // file will provide signatures for the Mozilla abstract string types. It will
      11             : // use XPCOM assertion/debugging macros, etc.
      12             : 
      13             : #include "nscore.h"
      14             : #include "mozilla/Assertions.h"
      15             : #include "mozilla/SSE.h"
      16             : #include "mozilla/TypeTraits.h"
      17             : 
      18             : #include "nsCharTraits.h"
      19             : 
      20             : #ifdef MOZILLA_INTERNAL_API
      21             : #define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
      22             : #else
      23             : #define UTF8UTILS_WARNING(msg)
      24             : #endif
      25             : 
      26             : class UTF8traits
      27             : {
      28             : public:
      29      774251 :   static bool isASCII(char aChar)
      30             :   {
      31      774251 :     return (aChar & 0x80) == 0x00;
      32             :   }
      33        2440 :   static bool isInSeq(char aChar)
      34             :   {
      35        2440 :     return (aChar & 0xC0) == 0x80;
      36             :   }
      37        2520 :   static bool is2byte(char aChar)
      38             :   {
      39        2520 :     return (aChar & 0xE0) == 0xC0;
      40             :   }
      41        2360 :   static bool is3byte(char aChar)
      42             :   {
      43        2360 :     return (aChar & 0xF0) == 0xE0;
      44             :   }
      45           0 :   static bool is4byte(char aChar)
      46             :   {
      47           0 :     return (aChar & 0xF8) == 0xF0;
      48             :   }
      49           0 :   static bool is5byte(char aChar)
      50             :   {
      51           0 :     return (aChar & 0xFC) == 0xF8;
      52             :   }
      53           0 :   static bool is6byte(char aChar)
      54             :   {
      55           0 :     return (aChar & 0xFE) == 0xFC;
      56             :   }
      57             : };
      58             : 
      59             : /**
      60             :  * Extract the next UCS-4 character from the buffer and return it.  On
      61             :  * success the pointer passed in is advanced to the start of the next
      62             :  * character.  aErr must be non-null: it is set to true, and 0 is returned
      63             :  * without advancing the pointer, on malformed or truncated input.  Overlong
      64             :  * sequences, surrogates and values >= UCS_END yield UCS2_REPLACEMENT_CHAR.
      65             :  */
      66             : 
      67             : class UTF8CharEnumerator
      68             : {
      69             : public:
      70      270438 :   static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
      71             :   {
      72      270438 :     NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
      73             : 
      74      270438 :     const char* p = *aBuffer;
      75      270438 :     *aErr = false;
      76             : 
      77      270438 :     if (p >= aEnd) {
      78           0 :       *aErr = true;
      79             : 
      80           0 :       return 0;
      81             :     }
      82             : 
      83      270438 :     char c = *p++;
      84             : 
      85      270438 :     if (UTF8traits::isASCII(c)) {
      86      269178 :       *aBuffer = p;
      87      269178 :       return c;
      88             :     }
      89             : 
      90             :     uint32_t ucs4;
      91             :     uint32_t minUcs4;
      92        1260 :     int32_t state = 0;
      93             : 
      94        1260 :     if (!CalcState(c, ucs4, minUcs4, state)) {
      95           0 :       NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
      96           0 :       *aErr = true;
      97             : 
      98           0 :       return 0;
      99             :     }
     100             : 
     101        6140 :     while (state--) {
     102        2440 :       if (p == aEnd) {
     103           0 :         *aErr = true;
     104             : 
     105           0 :         return 0;
     106             :       }
     107             : 
     108        2440 :       c = *p++;
     109             : 
     110        2440 :       if (!AddByte(c, state, ucs4)) {
     111           0 :         *aErr = true;
     112             : 
     113           0 :         return 0;
     114             :       }
     115             :     }
     116             : 
     117        1260 :     if (ucs4 < minUcs4) {
     118             :       // Overlong sequence
     119           0 :       ucs4 = UCS2_REPLACEMENT_CHAR;
     120        1320 :     } else if (ucs4 >= 0xD800 &&
     121         120 :                (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
     122             :       // Surrogates and code points outside the Unicode range.
     123           0 :       ucs4 = UCS2_REPLACEMENT_CHAR;
     124             :     }
     125             : 
     126        1260 :     *aBuffer = p;
     127        1260 :     return ucs4;
     128             :   }
     129             : 
     130             : private:
     131        1260 :   static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
     132             :                         int32_t& aState)
     133             :   {
     134        1260 :     if (UTF8traits::is2byte(aChar)) {
     135          80 :       aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
     136          80 :       aState = 1;
     137          80 :       aMinUcs4 = 0x00000080;
     138        1180 :     } else if (UTF8traits::is3byte(aChar)) {
     139        1180 :       aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
     140        1180 :       aState = 2;
     141        1180 :       aMinUcs4 = 0x00000800;
     142           0 :     } else if (UTF8traits::is4byte(aChar)) {
     143           0 :       aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
     144           0 :       aState = 3;
     145           0 :       aMinUcs4 = 0x00010000;
     146           0 :     } else if (UTF8traits::is5byte(aChar)) {
     147           0 :       aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
     148           0 :       aState = 4;
     149           0 :       aMinUcs4 = 0x00200000;
     150           0 :     } else if (UTF8traits::is6byte(aChar)) {
     151           0 :       aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
     152           0 :       aState = 5;
     153           0 :       aMinUcs4 = 0x04000000;
     154             :     } else {
     155           0 :       return false;
     156             :     }
     157             : 
     158        1260 :     return true;
     159             :   }
     160             : 
     161        2440 :   static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
     162             :   {
     163        2440 :     if (UTF8traits::isInSeq(aChar)) {
     164        2440 :       int32_t shift = aState * 6;
     165        2440 :       aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
     166        2440 :       return true;
     167             :     }
     168             : 
     169           0 :     return false;
     170             :   }
     171             : };
     172             : 
     173             : 
     174             : /**
     175             :  * Extract the next UCS-4 character from the buffer and return it.  The
     176             :  * pointer passed in is advanced to the start of the next character in the
     177             :  * buffer.  If non-null, the err parameter is filled in if an error occurs.
     178             :  *
     179             :  * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
     180             :  * the buffer will be updated to move only a single UCS-2 character.
     181             :  *
     182             :  * Any other error returns 0 and does not move the buffer position.
     183             :  */
     184             : 
     185             : 
     186             : class UTF16CharEnumerator
     187             : {
     188             : public:
     189         378 :   static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
     190             :                            bool* aErr = nullptr)
     191             :   {
     192         378 :     NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
     193             : 
     194         378 :     const char16_t* p = *aBuffer;
     195             : 
     196         378 :     if (p >= aEnd) {
     197           0 :       NS_ERROR("No input to work with");
     198           0 :       if (aErr) {
     199           0 :         *aErr = true;
     200             :       }
     201             : 
     202           0 :       return 0;
     203             :     }
     204             : 
     205         378 :     char16_t c = *p++;
     206             : 
     207         378 :     if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
     208         378 :       if (aErr) {
     209           0 :         *aErr = false;
     210             :       }
     211         378 :       *aBuffer = p;
     212         378 :       return c;
     213           0 :     } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
     214           0 :       if (p == aEnd) {
     215             :         // Found a high surrogate at the end of the buffer. Flag this
     216             :         // as an error and return the Unicode replacement
     217             :         // character 0xFFFD.
     218             : 
     219           0 :         UTF8UTILS_WARNING("Unexpected end of buffer after high surrogate");
     220             : 
     221           0 :         if (aErr) {
     222           0 :           *aErr = true;
     223             :         }
     224           0 :         *aBuffer = p;
     225           0 :         return 0xFFFD;
     226             :       }
     227             : 
     228             :       // D800- DBFF - High Surrogate
     229           0 :       char16_t h = c;
     230             : 
     231           0 :       c = *p++;
     232             : 
     233           0 :       if (NS_IS_LOW_SURROGATE(c)) {
     234             :         // DC00- DFFF - Low Surrogate
     235             :         // N = (H - D800) *400 + 10000 + (L - DC00)
     236           0 :         uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
     237           0 :         if (aErr) {
     238           0 :           *aErr = false;
     239             :         }
     240           0 :         *aBuffer = p;
     241           0 :         return ucs4;
     242             :       } else {
     243             :         // Found a high surrogate followed by something other than
     244             :         // a low surrogate. Flag this as an error and return the
     245             :         // Unicode replacement character 0xFFFD.  Note that the
     246             :         // pointer to the next character points to the second 16-bit
     247             :         // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
     248             :         // only the first code unit of an illegal sequence must be
     249             :         // treated as an illegally terminated code unit sequence
     250             :         // (also Chapter 3 D91, "isolated [not paired and ill-formed]
     251             :         // UTF-16 code units in the range D800..DFFF are ill-formed").
     252           0 :         UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
     253             : 
     254           0 :         if (aErr) {
     255           0 :           *aErr = true;
     256             :         }
     257           0 :         *aBuffer = p - 1;
     258           0 :         return 0xFFFD;
     259             :       }
     260             :     } else { // U+DC00 - U+DFFF
     261             :       // DC00- DFFF - Low Surrogate
     262             : 
     263             :       // Found a low surrogate w/o a preceding high surrogate. Flag
     264             :       // this as an error and return the Unicode replacement
     265             :       // character 0xFFFD.
     266             : 
     267           0 :       UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
     268           0 :       if (aErr) {
     269           0 :         *aErr = true;
     270             :       }
     271           0 :       *aBuffer = p;
     272           0 :       return 0xFFFD;
     273             :     }
     274             : 
     275             :     MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
     276             :   }
     277             : };
     278             : 
     279             : 
     280             : /**
     281             :  * A character sink (see |copy_string| in nsAlgorithm.h) for converting
     282             :  * UTF-8 to UTF-16
     283             :  */
     284             : class ConvertUTF8toUTF16
     285             : {
     286             : public:
     287             :   typedef char value_type;
     288             :   typedef char16_t buffer_type;
     289             : 
     290        6891 :   explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
     291        6891 :     : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
     292             :   {
     293        6891 :   }
     294             : 
     295        6891 :   size_t Length() const
     296             :   {
     297        6891 :     return mBuffer - mStart;
     298             :   }
     299             : 
     300       13416 :   bool ErrorEncountered() const
     301             :   {
     302       13416 :     return mErrorEncountered;
     303             :   }
     304             : 
     305        6891 :   void write(const value_type* aStart, uint32_t aN)
     306             :   {
     307        6891 :     if (mErrorEncountered) {
     308           0 :       return;
     309             :     }
     310             : 
     311             :     // algorithm assumes utf8 units won't
     312             :     // be spread across fragments
     313        6891 :     const value_type* p = aStart;
     314        6891 :     const value_type* end = aStart + aN;
     315        6891 :     buffer_type* out = mBuffer;
     316      517087 :     for (; p != end /* && *p */;) {
     317             :       bool err;
     318      255098 :       uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
     319             : 
     320      255098 :       if (err) {
     321           0 :         mErrorEncountered = true;
     322           0 :         mBuffer = out;
     323           0 :         return;
     324             :       }
     325             : 
     326      255098 :       if (ucs4 >= PLANE1_BASE) {
     327           0 :         *out++ = (buffer_type)H_SURROGATE(ucs4);
     328           0 :         *out++ = (buffer_type)L_SURROGATE(ucs4);
     329             :       } else {
     330      255098 :         *out++ = ucs4;
     331             :       }
     332             :     }
     333        6891 :     mBuffer = out;
     334             :   }
     335             : 
     336         154 :   void write_terminator()
     337             :   {
     338         154 :     *mBuffer = buffer_type(0);
     339         154 :   }
     340             : 
     341             : private:
     342             :   buffer_type* const mStart;
     343             :   buffer_type* mBuffer;
     344             :   bool mErrorEncountered;
     345             : };
     346             : 
     347             : /**
     348             :  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
     349             :  * the length of the UTF-16 string equivalent to a UTF-8 string.
     350             :  */
     351             : class CalculateUTF8Length
     352             : {
     353             : public:
     354             :   typedef char value_type;
     355             : 
     356        7266 :   CalculateUTF8Length()
     357        7266 :     : mLength(0), mErrorEncountered(false)
     358             :   {
     359        7266 :   }
     360             : 
     361        7266 :   size_t Length() const
     362             :   {
     363        7266 :     return mLength;
     364             :   }
     365             : 
     366        7266 :   void write(const value_type* aStart, uint32_t aN)
     367             :   {
     368             :     // ignore any further requests
     369        7266 :     if (mErrorEncountered) {
     370           0 :       return;
     371             :     }
     372             : 
     373             :     // algorithm assumes utf8 units won't
     374             :     // be spread across fragments
     375        7266 :     const value_type* p = aStart;
     376        7266 :     const value_type* end = aStart + aN;
     377      287478 :     for (; p < end /* && *p */; ++mLength) {
     378      140106 :       if (UTF8traits::isASCII(*p)) {
     379      139638 :         p += 1;
     380         468 :       } else if (UTF8traits::is2byte(*p)) {
     381          80 :         p += 2;
     382         388 :       } else if (UTF8traits::is3byte(*p)) {
     383         388 :         p += 3;
     384           0 :       } else if (UTF8traits::is4byte(*p)) {
     385             :         // Because a UTF-8 sequence of 4 bytes represents a codepoint
     386             :         // greater than 0xFFFF, it will become a surrogate pair in the
     387             :         // UTF-16 string, so add 1 more to mLength.
     388             :         // This doesn't happen with is5byte and is6byte because they
     389             :         // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
     390             :         // converted to a single replacement character.
     391             : 
     392             :         // However, there is one case when a 4 byte UTF-8 sequence will
     393             :         // only generate 2 UTF-16 bytes. If we have a properly encoded
     394             :         // sequence, but with an invalid value (too small or too big),
     395             :         // that will result in a replacement character being written.
     396             :         // This replacement character is encoded as just 1 single
     397             :         // UTF-16 character, which is 2 bytes.
     398             : 
     399             :         // The below code therefore only adds 1 to mLength if the UTF8
     400             :         // data will produce a decoded character which is greater than
     401             :         // or equal to 0x010000 and less than 0x0110000.
     402             : 
     403             :         // A 4byte UTF8 character is encoded as
     404             :         // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     405             :         // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
     406             :         // map to bit 17-21 in the final result. If these bits are
     407             :         // between 0x01 and 0x11, that means that the final result is
     408             :         // between 0x010000 and 0x110000. The below code reads these
     409             :         // bits out and assigns them to c, but shifted up 4 bits to
     410             :         // avoid having to shift twice.
     411             : 
     412             :         // It doesn't matter what we do in the case where p + 4 > end
     413             :         // since no UTF16 characters will be written in that case by
     414             :         // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
     415             :         // any of the surrogate bits are wrong since no UTF16
     416             :         // characters will be written in that case either.
     417             : 
     418           0 :         if (p + 4 <= end) {
     419           0 :           uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
     420           0 :                        ((uint32_t)(p[1] & 0x30));
     421           0 :           if (c >= 0x010 && c < 0x110) {
     422           0 :             ++mLength;
     423             :           }
     424             :         }
     425             : 
     426           0 :         p += 4;
     427           0 :       } else if (UTF8traits::is5byte(*p)) {
     428           0 :         p += 5;
     429           0 :       } else if (UTF8traits::is6byte(*p)) {
     430           0 :         p += 6;
     431             :       } else { // error
     432           0 :         ++mLength; // to account for the decrement below
     433           0 :         break;
     434             :       }
     435             :     }
     436        7266 :     if (p != end) {
     437           0 :       NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
     438           0 :       --mLength; // The last multi-byte char wasn't complete, discard it.
     439           0 :       mErrorEncountered = true;
     440             :     }
     441             :   }
     442             : 
     443             : private:
     444             :   size_t mLength;
     445             :   bool mErrorEncountered;
     446             : };
     447             : 
     448             : /**
     449             :  * A character sink (see |copy_string| in nsAlgorithm.h) for
     450             :  * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
     451             :  * (0xEFBFBD in UTF-8).
     452             :  */
     453             : class ConvertUTF16toUTF8
     454             : {
     455             : public:
     456             :   typedef char16_t value_type;
     457             :   typedef char buffer_type;
     458             : 
     459             :   // The error handling here is more lenient than that in
     460             :   // |ConvertUTF8toUTF16|, but it's that way for backwards
     461             :   // compatibility.
     462             : 
     463         919 :   explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
     464         919 :     : mStart(aBuffer), mBuffer(aBuffer)
     465             :   {
     466         919 :   }
     467             : 
     468         919 :   size_t Size() const
     469             :   {
     470         919 :     return mBuffer - mStart;
     471             :   }
     472             : 
     473         919 :   void write(const value_type* aStart, uint32_t aN)
     474             :   {
     475         919 :     buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
     476             : 
     477        6185 :     for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
     478        5266 :       value_type c = *p;
     479        5266 :       if (!(c & 0xFF80)) { // U+0000 - U+007F
     480        4915 :         *out++ = (char)c;
     481         351 :       } else if (!(c & 0xF800)) { // U+0080 - U+07FF
     482          60 :         *out++ = 0xC0 | (char)(c >> 6);
     483          60 :         *out++ = 0x80 | (char)(0x003F & c);
     484         291 :       } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
     485         291 :         *out++ = 0xE0 | (char)(c >> 12);
     486         291 :         *out++ = 0x80 | (char)(0x003F & (c >> 6));
     487         291 :         *out++ = 0x80 | (char)(0x003F & c);
     488           0 :       } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
     489             :         // D800- DBFF - High Surrogate
     490           0 :         value_type h = c;
     491             : 
     492           0 :         ++p;
     493           0 :         if (p == end) {
     494             :           // Treat broken characters as the Unicode
     495             :           // replacement character 0xFFFD (0xEFBFBD in
     496             :           // UTF-8)
     497           0 :           *out++ = '\xEF';
     498           0 :           *out++ = '\xBF';
     499           0 :           *out++ = '\xBD';
     500             : 
     501           0 :           UTF8UTILS_WARNING("String ending in half a surrogate pair!");
     502             : 
     503           0 :           break;
     504             :         }
     505           0 :         c = *p;
     506             : 
     507           0 :         if (NS_IS_LOW_SURROGATE(c)) {
     508             :           // DC00- DFFF - Low Surrogate
     509             :           // N = (H - D800) *400 + 10000 + ( L - DC00 )
     510           0 :           uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
     511             : 
     512             :           // 0001 0000-001F FFFF
     513           0 :           *out++ = 0xF0 | (char)(ucs4 >> 18);
     514           0 :           *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
     515           0 :           *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
     516           0 :           *out++ = 0x80 | (char)(0x003F & ucs4);
     517             :         } else {
     518             :           // Treat broken characters as the Unicode
     519             :           // replacement character 0xFFFD (0xEFBFBD in
     520             :           // UTF-8)
     521           0 :           *out++ = '\xEF';
     522           0 :           *out++ = '\xBF';
     523           0 :           *out++ = '\xBD';
     524             : 
     525             :           // The pointer to the next character points to the second
     526             :           // 16-bit value, not beyond it, as per Unicode 5.0.0
     527             :           // Chapter 3 C10, only the first code unit of an illegal
     528             :           // sequence must be treated as an illegally terminated
     529             :           // code unit sequence (also Chapter 3 D91, "isolated [not
     530             :           // paired and ill-formed] UTF-16 code units in the range
     531             :           // D800..DFFF are ill-formed").
     532           0 :           p--;
     533             : 
     534           0 :           UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
     535             :         }
     536             :       } else { // U+DC00 - U+DFFF
     537             :         // Treat broken characters as the Unicode replacement
     538             :         // character 0xFFFD (0xEFBFBD in UTF-8)
     539           0 :         *out++ = '\xEF';
     540           0 :         *out++ = '\xBF';
     541           0 :         *out++ = '\xBD';
     542             : 
     543             :         // DC00- DFFF - Low Surrogate
     544           0 :         UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
     545             :       }
     546             :     }
     547             : 
     548         919 :     mBuffer = out;
     549         919 :   }
     550             : 
     551           1 :   void write_terminator()
     552             :   {
     553           1 :     *mBuffer = buffer_type(0);
     554           1 :   }
     555             : 
     556             : private:
     557             :   buffer_type* const mStart;
     558             :   buffer_type* mBuffer;
     559             : };
     560             : 
     561             : /**
     562             :  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
     563             :  * the number of bytes a UTF-16 string would occupy in UTF-8. Treats invalid
     564             :  * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
     565             :  */
     566             : class CalculateUTF8Size
     567             : {
     568             : public:
     569             :   typedef char16_t value_type;
     570             : 
     571         644 :   CalculateUTF8Size()
     572         644 :     : mSize(0)
     573             :   {
     574         644 :   }
     575             : 
     576         645 :   size_t Size() const
     577             :   {
     578         645 :     return mSize;
     579             :   }
     580             : 
     581         644 :   void write(const value_type* aStart, uint32_t aN)
     582             :   {
     583             :     // Assume UCS2 surrogate pairs won't be spread across fragments.
     584        5556 :     for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
     585        4912 :       value_type c = *p;
     586        4912 :       if (!(c & 0xFF80)) { // U+0000 - U+007F
     587        4912 :         mSize += 1;
     588           0 :       } else if (!(c & 0xF800)) { // U+0080 - U+07FF
     589           0 :         mSize += 2;
     590           0 :       } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
     591           0 :         mSize += 3;
     592           0 :       } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
     593           0 :         ++p;
     594           0 :         if (p == end) {
     595             :           // Treat broken characters as the Unicode
     596             :           // replacement character 0xFFFD (0xEFBFBD in
     597             :           // UTF-8)
     598           0 :           mSize += 3;
     599             : 
     600           0 :           UTF8UTILS_WARNING("String ending in half a surrogate pair!");
     601             : 
     602           0 :           break;
     603             :         }
     604           0 :         c = *p;
     605             : 
     606           0 :         if (0xDC00 == (0xFC00 & c)) {
     607           0 :           mSize += 4;
     608             :         } else {
     609             :           // Treat broken characters as the Unicode
     610             :           // replacement character 0xFFFD (0xEFBFBD in
     611             :           // UTF-8)
     612           0 :           mSize += 3;
     613             : 
     614             :           // The next code unit is the second 16-bit value, not
     615             :           // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
     616             :           // only the first code unit of an illegal sequence must
     617             :           // be treated as an illegally terminated code unit
     618             :           // sequence (also Chapter 3 D91, "isolated [not paired and
     619             :           // ill-formed] UTF-16 code units in the range D800..DFFF
     620             :           // are ill-formed").
     621           0 :           p--;
     622             : 
     623           0 :           UTF8UTILS_WARNING("got a high Surrogate but no low surrogate");
     624             :         }
     625             :       } else { // U+DC00 - U+DFFF
     626             :         // Treat broken characters as the Unicode replacement
     627             :         // character 0xFFFD (0xEFBFBD in UTF-8)
     628           0 :         mSize += 3;
     629             : 
     630           0 :         UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
     631             :       }
     632             :     }
     633         644 :   }
     634             : 
     635             : private:
     636             :   size_t mSize;
     637             : };
     638             : 
     639             : #ifdef MOZILLA_INTERNAL_API
     640             : /**
     641             :  * A character sink that performs a |reinterpret_cast|-style conversion
     642             :  * from char to char16_t.
     643             :  */
     644             : class LossyConvertEncoding8to16
     645             : {
     646             : public:
     647             :   typedef char value_type;
     648             :   typedef char input_type;
     649             :   typedef char16_t output_type;
     650             : 
     651             : public:
     652        1274 :   explicit LossyConvertEncoding8to16(char16_t* aDestination) :
     653        1274 :     mDestination(aDestination)
     654             :   {
     655        1274 :   }
     656             : 
     657             :   void
     658        1274 :   write(const char* aSource, uint32_t aSourceLength)
     659             :   {
     660             : #ifdef MOZILLA_MAY_SUPPORT_SSE2
     661        1274 :     if (mozilla::supports_sse2()) {
     662        1274 :       write_sse2(aSource, aSourceLength);
     663        1274 :       return;
     664             :     }
     665             : #endif
     666           0 :     const char* done_writing = aSource + aSourceLength;
     667           0 :     while (aSource < done_writing) {
     668           0 :       *mDestination++ = (char16_t)(unsigned char)(*aSource++);
     669             :     }
     670             :   }
     671             : 
     672             :   void
     673             :   write_sse2(const char* aSource, uint32_t aSourceLength);
     674             : 
     675             :   void
     676           0 :   write_terminator()
     677             :   {
     678           0 :     *mDestination = (char16_t)(0);
     679           0 :   }
     680             : 
     681             : private:
     682             :   char16_t* mDestination;
     683             : };
     684             : 
     685             : /**
     686             :  * A character sink that performs a |reinterpret_cast|-style conversion
     687             :  * from char16_t to char.
     688             :  */
     689             : class LossyConvertEncoding16to8
     690             : {
     691             : public:
     692             :   typedef char16_t value_type;
     693             :   typedef char16_t input_type;
     694             :   typedef char output_type;
     695             : 
     696        4329 :   explicit LossyConvertEncoding16to8(char* aDestination)
     697        4329 :     : mDestination(aDestination)
     698             :   {
     699        4329 :   }
     700             : 
     701             :   void
     702        4329 :   write(const char16_t* aSource, uint32_t aSourceLength)
     703             :   {
     704             : #ifdef MOZILLA_MAY_SUPPORT_SSE2
     705        4329 :     if (mozilla::supports_sse2()) {
     706        4329 :       write_sse2(aSource, aSourceLength);
     707        4329 :       return;
     708             :     }
     709             : #endif
     710           0 :     const char16_t* done_writing = aSource + aSourceLength;
     711           0 :     while (aSource < done_writing) {
     712           0 :       *mDestination++ = (char)(*aSource++);
     713             :     }
     714             :   }
     715             : 
     716             : #ifdef MOZILLA_MAY_SUPPORT_SSE2
     717             :   void
     718             :   write_sse2(const char16_t* aSource, uint32_t aSourceLength);
     719             : #endif
     720             : 
     721             :   void
     722          10 :   write_terminator()
     723             :   {
     724          10 :     *mDestination = '\0';
     725          10 :   }
     726             : 
     727             : private:
     728             :   char* mDestination;
     729             : };
     730             : #endif // MOZILLA_INTERNAL_API
     731             : 
     732             : 
     733             : template<typename Char, typename UnsignedT>
     734             : inline UnsignedT
     735           0 : RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
     736             : {
     737             :   static_assert(mozilla::IsSame<Char, char>::value ||
     738             :                 mozilla::IsSame<Char, unsigned char>::value ||
     739             :                 mozilla::IsSame<Char, signed char>::value,
     740             :                 "UTF-8 data must be in 8-bit units");
     741             :   static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
     742           0 :   while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80)
     743           0 :     --index;
     744             : 
     745           0 :   return index;
     746             : }
     747             : 
     748             : #undef UTF8UTILS_WARNING
     749             : 
     750             : #endif /* !defined(nsUTF8Utils_h_) */

Generated by: LCOV version 1.13