LCOV - code coverage report
Current view: top level - js/public - CharacterEncoding.h (source / functions) Hit Total Coverage
Test: output.info Lines: 24 37 64.9 %
Date: 2017-07-14 16:53:18 Functions: 11 19 57.9 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
       2             :  * vim: set ts=8 sts=4 et sw=4 tw=99:
       3             :  * This Source Code Form is subject to the terms of the Mozilla Public
       4             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       5             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       6             : 
       7             : #ifndef js_CharacterEncoding_h
       8             : #define js_CharacterEncoding_h
       9             : 
      10             : #include "mozilla/Range.h"
      11             : 
      12             : #include "js/TypeDecls.h"
      13             : #include "js/Utility.h"
      14             : 
      15             : class JSFlatString;
      16             : 
      17             : namespace JS {
      18             : 
      19             : /*
      20             :  * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
      21             :  * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
      22             :  * byte is widened to a 2-byte character of the same value, and there is no
      23             :  * way to pass in a string containing characters beyond U+00FF.
      24             :  */
      25             : class Latin1Chars : public mozilla::Range<Latin1Char>
      26             : {
      27             :     typedef mozilla::Range<Latin1Char> Base;
      28             : 
      29             :   public:
      30             :     using CharT = Latin1Char;
      31             : 
      32             :     Latin1Chars() : Base() {}
      33           0 :     Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
      34             :     Latin1Chars(const Latin1Char* aBytes, size_t aLength)
      35             :       : Base(const_cast<Latin1Char*>(aBytes), aLength)
      36             :     {}
      37             :     Latin1Chars(const char* aBytes, size_t aLength)
      38             :       : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
      39             :     {}
      40             : };
      41             : 
      42             : /*
      43             :  * A Latin1Chars, but with \0 termination for C compatibility.
      44             :  */
      45             : class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
      46             : {
      47             :     typedef mozilla::RangedPtr<Latin1Char> Base;
      48             : 
      49             :   public:
      50             :     using CharT = Latin1Char;
      51             : 
      52           0 :     Latin1CharsZ() : Base(nullptr, 0) {}
      53             : 
      54             :     Latin1CharsZ(char* aBytes, size_t aLength)
      55             :       : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
      56             :     {
      57             :         MOZ_ASSERT(aBytes[aLength] == '\0');
      58             :     }
      59             : 
      60           0 :     Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
      61           0 :       : Base(aBytes, aLength)
      62             :     {
      63           0 :         MOZ_ASSERT(aBytes[aLength] == '\0');
      64           0 :     }
      65             : 
      66             :     using Base::operator=;
      67             : 
      68           0 :     char* c_str() { return reinterpret_cast<char*>(get()); }
      69             : };
      70             : 
      71             : class UTF8Chars : public mozilla::Range<unsigned char>
      72             : {
      73             :     typedef mozilla::Range<unsigned char> Base;
      74             : 
      75             :   public:
      76             :     using CharT = unsigned char;
      77             : 
      78             :     UTF8Chars() : Base() {}
      79           0 :     UTF8Chars(char* aBytes, size_t aLength)
      80           0 :       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
      81           0 :     {}
      82         323 :     UTF8Chars(const char* aBytes, size_t aLength)
      83         323 :       : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
      84         323 :     {}
      85             : };
      86             : 
      87             : /*
      88             :  * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
      89             :  */
      90             : class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
      91             : {
      92             :     typedef mozilla::RangedPtr<unsigned char> Base;
      93             : 
      94             :   public:
      95             :     using CharT = unsigned char;
      96             : 
      97           0 :     UTF8CharsZ() : Base(nullptr, 0) {}
      98             : 
      99         654 :     UTF8CharsZ(char* aBytes, size_t aLength)
     100         654 :       : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
     101             :     {
     102         654 :         MOZ_ASSERT(aBytes[aLength] == '\0');
     103         654 :     }
     104             : 
     105             :     UTF8CharsZ(unsigned char* aBytes, size_t aLength)
     106             :       : Base(aBytes, aLength)
     107             :     {
     108             :         MOZ_ASSERT(aBytes[aLength] == '\0');
     109             :     }
     110             : 
     111             :     using Base::operator=;
     112             : 
     113         654 :     char* c_str() { return reinterpret_cast<char*>(get()); }
     114             : };
     115             : 
     116             : /*
     117             :  * A wrapper for a "const char*" that is encoded using UTF-8.
     118             :  * This class does not manage ownership of the data; that is left
     119             :  * to others.  This differs from UTF8CharsZ in that the chars are
     120             :  * const and it allows assignment.
     121             :  */
     122             : class ConstUTF8CharsZ
     123             : {
     124             :     const char* data_;
     125             : 
     126             :   public:
     127             :     using CharT = unsigned char;
     128             : 
     129           6 :     ConstUTF8CharsZ() : data_(nullptr)
     130           6 :     {}
     131             : 
     132           4 :     ConstUTF8CharsZ(const char* aBytes, size_t aLength)
     133           4 :       : data_(aBytes)
     134             :     {
     135           4 :         MOZ_ASSERT(aBytes[aLength] == '\0');
     136             : #ifdef DEBUG
     137           4 :         validate(aLength);
     138             : #endif
     139           4 :     }
     140             : 
     141           2 :     const void* get() const { return data_; }
     142             : 
     143           8 :     const char* c_str() const { return data_; }
     144             : 
     145          12 :     explicit operator bool() const { return data_ != nullptr; }
     146             : 
     147             :   private:
     148             : #ifdef DEBUG
     149             :     void validate(size_t aLength);
     150             : #endif
     151             : };
     152             : 
     153             : /*
     154             :  * SpiderMonkey uses a 2-byte character representation: it is a
     155             :  * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
     156             :  * but unlike UCS-2, we do not strip UTF-16 surrogate code units. This allows
     157             :  * a sufficiently dedicated JavaScript program to be fully Unicode-aware by
     158             :  * manually interpreting UTF-16 surrogate pairs embedded in the JS
     159             :  * string.
     160             :  */
     161             : class TwoByteChars : public mozilla::Range<char16_t>
     162             : {
     163             :     typedef mozilla::Range<char16_t> Base;
     164             : 
     165             :   public:
     166             :     using CharT = char16_t;
     167             : 
     168             :     TwoByteChars() : Base() {}
     169         280 :     TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
     170           0 :     TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
     171             : };
     172             : 
     173             : /*
     174             :  * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
     175             :  */
     176             : class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
     177             : {
     178             :     typedef mozilla::RangedPtr<char16_t> Base;
     179             : 
     180             :   public:
     181             :     using CharT = char16_t;
     182             : 
     183           0 :     TwoByteCharsZ() : Base(nullptr, 0) {}
     184             : 
     185         317 :     TwoByteCharsZ(char16_t* chars, size_t length)
     186         317 :       : Base(chars, length)
     187             :     {
     188         317 :         MOZ_ASSERT(chars[length] == '\0');
     189         317 :     }
     190             : 
     191             :     using Base::operator=;
     192             : };
     193             : 
     194             : typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
     195             : 
     196             : /*
     197             :  * Like TwoByteChars, but the chars are const.
     198             :  */
     199             : class ConstTwoByteChars : public mozilla::Range<const char16_t>
     200             : {
     201             :     typedef mozilla::Range<const char16_t> Base;
     202             : 
     203             :   public:
     204             :     using CharT = char16_t;
     205             : 
     206             :     ConstTwoByteChars() : Base() {}
     207           2 :     ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
     208             : };
     209             : 
     210             : /*
     211             :  * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
     212             :  * truncating each 2-byte character in the sequence to a single byte. If the
     213             :  * source contains any UTF-16 surrogates, then this may give invalid Latin1
     214             :  * output. The returned string is zero terminated. The returned string or the
     215             :  * returned string's |get()| must be freed with JS_free or js_free,
     216             :  * respectively. If allocation fails, an OOM error will be set and the method
     217             :  * will return a null Latin1CharsZ (which can be tested for with the ! operator).
     218             :  * This method cannot trigger GC.
     219             :  */
     220             : extern Latin1CharsZ
     221             : LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
     222             :                                    const mozilla::Range<const char16_t> tbchars);
     223             : 
     224             : inline Latin1CharsZ
     225             : LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, const char16_t* begin, size_t length)
     226             : {
     227             :     const mozilla::Range<const char16_t> tbchars(begin, length);
     228             :     return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
     229             : }
     230             : 
     231             : template <typename CharT>
     232             : extern UTF8CharsZ
     233             : CharsToNewUTF8CharsZ(JSContext* maybeCx, const mozilla::Range<CharT> chars);
     234             : 
     235             : JS_PUBLIC_API(uint32_t)
     236             : Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
     237             : 
     238             : /*
     239             :  * Inflate bytes in UTF-8 encoding to char16_t.
     240             :  * - On error, returns an empty TwoByteCharsZ.
     241             :  * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
     242             :  *   its length;  the length value excludes the trailing null.
     243             :  */
     244             : extern JS_PUBLIC_API(TwoByteCharsZ)
     245             : UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
     246             : 
     247             : /*
     248             :  * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
     249             :  */
     250             : extern JS_PUBLIC_API(TwoByteCharsZ)
     251             : UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
     252             : 
     253             : /*
     254             :  * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
     255             :  * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
     256             :  * input.
     257             :  */
     258             : extern JS_PUBLIC_API(TwoByteCharsZ)
     259             : LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
     260             : 
     261             : extern JS_PUBLIC_API(TwoByteCharsZ)
     262             : LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
     263             : 
     264             : /*
     265             :  * Returns the length of the char buffer required to encode |s| as UTF8.
     266             :  * Does not include the null-terminator.
     267             :  */
     268             : JS_PUBLIC_API(size_t)
     269             : GetDeflatedUTF8StringLength(JSFlatString* s);
     270             : 
     271             : /*
     272             :  * Encode |src| as UTF8. The caller must either ensure |dst| has enough space
     273             :  * to encode the entire string or pass the length of the buffer as |dstlenp|,
     274             :  * in which case the function will encode characters from the string until
     275             :  * the buffer is exhausted. Does not write the null terminator.
     276             :  *
     277             :  * If |dstlenp| is provided, it will be updated to hold the number of bytes
     278             :  * written to the buffer. If |numcharsp| is provided, it will be updated to hold
     279             :  * the number of Unicode characters written to the buffer (which can be less
     280             :  * than the length of the string, if the buffer is exhausted before the string
     281             :  * is fully encoded).
     282             :  */
     283             : JS_PUBLIC_API(void)
     284             : DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
     285             :                           size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
     286             : 
     287             : /*
     288             :  * The smallest character encoding capable of fully representing a particular
     289             :  * string.
     290             :  */
     291             : enum class SmallestEncoding {
     292             :     ASCII,
     293             :     Latin1,
     294             :     UTF16
     295             : };
     296             : 
     297             : /*
     298             :  * Returns the smallest encoding possible for the given string: if all
     299             :  * codepoints are <128 then ASCII, otherwise if all codepoints are <256
     300             :  * Latin-1, else UTF16.
     301             :  */
     302             : JS_PUBLIC_API(SmallestEncoding)
     303             : FindSmallestEncoding(UTF8Chars utf8);
     304             : 
     305             : /*
     306             :  * Return a null-terminated Latin-1 string copied from the input string,
     307             :  * storing its length (excluding null terminator) in |*outlen|.  Fail and
     308             :  * report an error if the string contains non-Latin-1 codepoints.  Returns
     309             :  * Latin1CharsZ() on failure.
     310             :  */
     311             : extern JS_PUBLIC_API(Latin1CharsZ)
     312             : UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
     313             : 
     314             : /*
     315             :  * Return a null-terminated Latin-1 string copied from the input string,
     316             :  * storing its length (excluding null terminator) in |*outlen|.  Non-Latin-1
     317             :  * codepoints are replaced by '?'.  Returns Latin1CharsZ() on failure.
     318             :  */
     319             : extern JS_PUBLIC_API(Latin1CharsZ)
     320             : LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
     321             : 
     322             : /*
     323             :  * Returns true if all characters in the given null-terminated string are
     324             :  * ASCII, i.e. < 0x80, false otherwise.
     325             :  */
     326             : extern JS_PUBLIC_API(bool)
     327             : StringIsASCII(const char* s);
     328             : 
     329             : } // namespace JS
     330             : 
     331             : inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
     332             : inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
     333             : 
     334             : #endif /* js_CharacterEncoding_h */

Generated by: LCOV version 1.13