LCOV - code coverage report
Current view: top level - intl - Encoding.h (source / functions) Hit Total Coverage
Test: output.info Lines: 58 138 42.0 %
Date: 2017-07-14 16:53:18 Functions: 18 37 48.6 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
       2             : // file at the top-level directory of this distribution.
       3             : //
       4             : // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
       5             : // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
       6             : // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
       7             : // option. This file may not be copied, modified, or distributed
       8             : // except according to those terms.
       9             : 
      10             : // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
      11             : // "top-level directory" in the above notice refers to
      12             : // third_party/rust/encoding_c/.
      13             : 
      14             : #ifndef mozilla_Encoding_h
      15             : #define mozilla_Encoding_h
      16             : 
      17             : #include "mozilla/CheckedInt.h"
      18             : #include "mozilla/NotNull.h"
      19             : #include "mozilla/Span.h"
      20             : #include "mozilla/Tuple.h"
      21             : #include "nsString.h"
      22             : 
      23             : namespace mozilla {
      24             : class Encoding;
      25             : class Decoder;
      26             : class Encoder;
      27             : }; // namespace mozilla
      28             : 
      29             : #define ENCODING_RS_ENCODING mozilla::Encoding
      30             : #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR mozilla::NotNull<const mozilla::Encoding*>
      31             : #define ENCODING_RS_ENCODER mozilla::Encoder
      32             : #define ENCODING_RS_DECODER mozilla::Decoder
      33             : 
      34             : #include "encoding_rs.h"
      35             : 
      36             : extern "C" {
      37             : 
      38             : nsresult
      39             : mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
      40             :                                     uint8_t const* src,
      41             :                                     size_t src_len,
      42             :                                     nsAString* dst);
      43             : 
      44             : nsresult
      45             : mozilla_encoding_decode_to_nsstring_with_bom_removal(
      46             :   mozilla::Encoding const* encoding,
      47             :   uint8_t const* src,
      48             :   size_t src_len,
      49             :   nsAString* dst);
      50             : 
      51             : nsresult
      52             : mozilla_encoding_decode_to_nsstring_without_bom_handling(
      53             :   mozilla::Encoding const* encoding,
      54             :   uint8_t const* src,
      55             :   size_t src_len,
      56             :   nsAString* dst);
      57             : 
      58             : nsresult
      59             : mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
      60             :   mozilla::Encoding const* encoding,
      61             :   uint8_t const* src,
      62             :   size_t src_len,
      63             :   nsAString* dst);
      64             : 
      65             : nsresult
      66             : mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
      67             :                                    char16_t const* src,
      68             :                                    size_t src_len,
      69             :                                    nsACString* dst);
      70             : 
      71             : nsresult
      72             : mozilla_encoding_decode_to_nscstring(mozilla::Encoding const** encoding,
      73             :                                      nsACString const* src,
      74             :                                      nsACString* dst);
      75             : 
      76             : nsresult
      77             : mozilla_encoding_decode_to_nscstring_with_bom_removal(
      78             :   mozilla::Encoding const* encoding,
      79             :   nsACString const* src,
      80             :   nsACString* dst);
      81             : 
      82             : nsresult
      83             : mozilla_encoding_decode_to_nscstring_without_bom_handling(
      84             :   mozilla::Encoding const* encoding,
      85             :   nsACString const* src,
      86             :   nsACString* dst);
      87             : 
      88             : nsresult
      89             : mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
      90             :   mozilla::Encoding const* encoding,
      91             :   nsACString const* src,
      92             :   nsACString* dst);
      93             : 
      94             : nsresult
      95             : mozilla_encoding_encode_from_nscstring(mozilla::Encoding const** encoding,
      96             :                                        nsACString const* src,
      97             :                                        nsACString* dst);
      98             : 
      99             : } // extern "C"
     100             : 
     101             : namespace mozilla {
     102             : 
     103             : /**
     104             :  * Return value from `Decoder`/`Encoder` to indicate that input
     105             :  * was exhausted.
     106             :  */
     107             : const uint32_t kInputEmpty = INPUT_EMPTY;
     108             : 
     109             : /**
     110             :  * Return value from `Decoder`/`Encoder` to indicate that output
     111             :  * space was insufficient.
     112             :  */
     113             : const uint32_t kOutputFull = OUTPUT_FULL;
     114             : 
     115             : /**
     116             :  * An encoding as defined in the Encoding Standard
     117             :  * (https://encoding.spec.whatwg.org/).
     118             :  *
     119             :  * See https://docs.rs/encoding_rs/ for the Rust API docs.
     120             :  *
     121             :  * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
     122             :  * sequence and, in most cases, vice versa. Each encoding has a name, an output
     123             :  * encoding, and one or more labels.
     124             :  *
     125             :  * _Labels_ are ASCII-case-insensitive strings that are used to identify an
     126             :  * encoding in formats and protocols. The _name_ of the encoding is the
     127             :  * preferred label in the case appropriate for returning from the
     128             :  * `characterSet` property of the `Document` DOM interface, except for
     129             :  * the replacement encoding whose name is not one of its labels.
     130             :  *
     131             :  * The _output encoding_ is the encoding used for form submission and URL
     132             :  * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
     133             :  * UTF-16LE and UTF-16BE encodings and the encoding itself for other
     134             :  * encodings.
     135             :  *
     136             :  * # Streaming vs. Non-Streaming
     137             :  *
     138             :  * When you have the entire input in a single buffer, you can use the
     139             :  * methods `Decode()`, `DecodeWithBOMRemoval()`,
     140             :  * `DecodeWithoutBOMHandling()`,
     141             :  * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
     142             :  * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
     143             :  * `NewEncoder()` methods), these methods perform heap allocations. You should
     144             :  * use the `Decoder` and `Encoder` objects when your input is split into multiple
     145             :  * buffers or when you want to control the allocation of the output buffers.
     146             :  *
     147             :  * # Instances
     148             :  *
     149             :  * All instances of `Encoding` are statically allocated and have the process's
     150             :  * lifetime. There is precisely one unique `Encoding` instance for each
     151             :  * encoding defined in the Encoding Standard.
     152             :  *
     153             :  * To obtain a reference to a particular encoding whose identity you know at
     154             :  * compile time, use a `static` that refers to the encoding. There is a `static`
     155             :  * for each encoding. The `static`s are named in all caps with hyphens
     156             :  * replaced with underscores and with `_ENCODING` appended to the
     157             :  * name. For example, if you know at compile time that you will want to
     158             :  * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
     159             :  *
     160             :  * If you don't know what encoding you need at compile time and need to
     161             :  * dynamically get an encoding by label, use `Encoding::ForLabel()`.
     162             :  *
     163             :  * Pointers to `Encoding` can be compared with `==` to check for the sameness
     164             :  * of two encodings.
     165             :  *
     166             :  * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
     167             :  * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
     168             :  * `const mozilla::Encoding*` in the C signature and
     169             :  * `*const encoding_rs::Encoding` in the corresponding Rust signature.
     170             :  */
     171             : class Encoding final
     172             : {
     173             : public:
     174             :   /**
     175             :    * Implements the _get an encoding_ algorithm
     176             :    * (https://encoding.spec.whatwg.org/#concept-encoding-get).
     177             :    *
     178             :    * If, after ASCII-lowercasing and removing leading and trailing
     179             :    * whitespace, the argument matches a label defined in the Encoding
     180             :    * Standard, `const Encoding*` representing the corresponding
     181             :    * encoding is returned. If there is no match, `nullptr` is returned.
     182             :    *
     183             :    * This is the right method to use if the action upon the method returning
     184             :    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
     185             :    * instead. When the action upon the method returning `nullptr` is not to
     186             :    * proceed with a fallback but to refuse processing,
     187             :    * `ForLabelNoReplacement()` is more appropriate.
     188             :   */
     189         141 :   static inline const Encoding* ForLabel(Span<const char> aLabel)
     190             :   {
     191         141 :     return encoding_for_label(
     192         141 :       reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
     193             :   }
     194             : 
     195             :   /**
     196             :    * `nsAString` argument version. See above for docs.
     197             :    */
     198          20 :   static inline const Encoding* ForLabel(const nsAString& aLabel)
     199             :   {
     200          20 :     return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
     201             :   }
     202             : 
     203             :   /**
     204             :    * This method behaves the same as `ForLabel()`, except when `ForLabel()`
     205             :    * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
     206             :    *
     207             :    * This method is useful in scenarios where a fatal error is required
     208             :    * upon invalid label, because in those cases the caller typically wishes
     209             :    * to treat the labels that map to the replacement encoding as fatal
     210             :    * errors, too.
     211             :    *
     212             :    * It is not OK to use this method when the action upon the method returning
     213             :    * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
     214             :    * such a case, the `ForLabel()` method should be used instead in order to avoid
     215             :    * unsafe fallback for labels that `ForLabel()` maps to `REPLACEMENT_ENCODING`.
     216             :    */
     217        4805 :   static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel)
     218             :   {
     219        4805 :     return encoding_for_label_no_replacement(
     220        4805 :       reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
     221             :   }
     222             : 
     223             :   /**
     224             :    * `nsAString` argument version. See above for docs.
     225             :    */
     226           0 :   static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel)
     227             :   {
     228           0 :     return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
     229             :   }
     230             : 
     231             :   /**
     232             :    * Performs non-incremental BOM sniffing.
     233             :    *
     234             :    * The argument must either be a buffer representing the entire input
     235             :    * stream (non-streaming case) or a buffer representing at least the first
     236             :    * three bytes of the input stream (streaming case).
     237             :    *
     238             :    * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
     239             :    * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
     240             :    * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
     241             :    */
     242         116 :   static inline Tuple<const Encoding*, size_t> ForBOM(
     243             :     Span<const uint8_t> aBuffer)
     244             :   {
     245         116 :     size_t len = aBuffer.Length();
     246         116 :     const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
     247         116 :     return MakeTuple(encoding, len);
     248             :   }
     249             : 
     250             :   /**
     251             :    * If the argument matches exactly (case-sensitively; no whitespace
     252             :    * removal performed) the name of an encoding, returns
     253             :    * `const Encoding*` representing that encoding. Otherwise `MOZ_CRASH`es.
     254             :    *
     255             :    * The motivating use case for this method is interoperability with
     256             :    * legacy Gecko code that represents encodings as name strings instead of
     257             :    * type-safe `Encoding` objects. Using this method for other purposes is
     258             :    * most likely the wrong thing to do.
     259             :    */
     260           4 :   static inline NotNull<const mozilla::Encoding*> ForName(
     261             :     Span<const char> aName)
     262             :   {
     263             :     return WrapNotNull(encoding_for_name(
     264           4 :       reinterpret_cast<const uint8_t*>(aName.Elements()), aName.Length()));
     265             :   }
     266             : 
     267             :   /**
     268             :    * Writes the name of this encoding into `aName`.
     269             :    *
     270             :    * This name is appropriate to return as-is from the DOM
     271             :    * `document.characterSet` property.
     272             :    */
     273         312 :   inline void Name(nsACString& aName) const
     274             :   {
     275         312 :     aName.SetLength(ENCODING_NAME_MAX_LENGTH);
     276             :     size_t length =
     277         312 :       encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
     278         312 :     aName.SetLength(length); // truncation in the 64-bit case is OK
     279         312 :   }
     280             : 
     281             :   /**
     282             :    * Checks whether the _output encoding_ of this encoding can encode every
     283             :    * Unicode code point. (Only true if the output encoding is UTF-8.)
     284             :    */
     285           0 :   inline bool CanEncodeEverything() const
     286             :   {
     287           0 :     return encoding_can_encode_everything(this);
     288             :   }
     289             : 
     290             :   /**
     291             :    * Checks whether the bytes 0x00...0x7F map exclusively to the characters
     292             :    * U+0000...U+007F and vice versa.
     293             :    */
     294           9 :   inline bool IsAsciiCompatible() const
     295             :   {
     296           9 :     return encoding_is_ascii_compatible(this);
     297             :   }
     298             : 
     299             :   /**
     300             :    * Returns the _output encoding_ of this encoding. This is UTF-8 for
     301             :    * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
     302             :    */
     303           0 :   inline NotNull<const mozilla::Encoding*> OutputEncoding() const
     304             :   {
     305           0 :     return WrapNotNull(encoding_output_encoding(this));
     306             :   }
     307             : 
     308             :   /**
     309             :    * Decode complete input to `nsACString` _with BOM sniffing_ and with
     310             :    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
     311             :    * entire input is available as a single buffer (i.e. the end of the
     312             :    * buffer marks the end of the stream).
     313             :    *
     314             :    * This method implements the (non-streaming version of) the
     315             :    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
     316             :    *
     317             :    * The second item in the returned tuple is the encoding that was actually
     318             :    * used (which may differ from this encoding thanks to BOM sniffing).
     319             :    *
     320             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
     321             :    * if there were malformed sequences (that were replaced with the
     322             :    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
     323             :    * tuple.
     324             :    *
     325             :    * The backing buffer of the string isn't copied if the input buffer
     326             :    * is heap-allocated and decoding from UTF-8 and the input is valid
     327             :    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
     328             :    * the input is valid ASCII or decoding from ISO-2022-JP and the
     329             :    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
     330             :    * the same string as both arguments.
     331             :    *
     332             :    * _Note:_ It is wrong to use this when the input buffer represents only
     333             :    * a segment of the input instead of the whole input. Use `NewDecoder()`
     334             :    * when decoding segmented input.
     335             :    */
     336             :   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
     337             :     const nsACString& aBytes,
     338             :     nsACString& aOut) const
     339             :   {
     340             :     const Encoding* encoding = this;
     341             :     const nsACString* bytes = &aBytes;
     342             :     nsACString* out = &aOut;
     343             :     nsresult rv;
     344             :     if (bytes == out) {
     345             :       nsAutoCString temp(aBytes);
     346             :       rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
     347             :     } else {
     348             :       rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
     349             :     }
     350             :     return MakeTuple(rv, WrapNotNull(encoding));
     351             :   }
     352             : 
     353             :   /**
     354             :    * Decode complete input to `nsAString` _with BOM sniffing_ and with
     355             :    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
     356             :    * entire input is available as a single buffer (i.e. the end of the
     357             :    * buffer marks the end of the stream).
     358             :    *
     359             :    * This method implements the (non-streaming version of) the
     360             :    * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
     361             :    *
     362             :    * The second item in the returned tuple is the encoding that was actually
     363             :    * used (which may differ from this encoding thanks to BOM sniffing).
     364             :    *
     365             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
     366             :    * if there were malformed sequences (that were replaced with the
     367             :    * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
     368             :    * tuple.
     369             :    *
     370             :    * _Note:_ It is wrong to use this when the input buffer represents only
     371             :    * a segment of the input instead of the whole input. Use `NewDecoder()`
     372             :    * when decoding segmented input.
     373             :    */
     374           0 :   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
     375             :     Span<const uint8_t> aBytes,
     376             :     nsAString& aOut) const
     377             :   {
     378           0 :     const Encoding* encoding = this;
     379           0 :     nsresult rv = mozilla_encoding_decode_to_nsstring(
     380           0 :       &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
     381           0 :     return MakeTuple(rv, WrapNotNull(encoding));
     382             :   }
     383             : 
     384             :   /**
     385             :    * Decode complete input to `nsACString` _with BOM removal_ and with
     386             :    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
     387             :    * entire input is available as a single buffer (i.e. the end of the
     388             :    * buffer marks the end of the stream).
     389             :    *
     390             :    * When invoked on `UTF_8_ENCODING`, this method implements the (non-streaming
     391             :    * version of) the _UTF-8 decode_
     392             :    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
     393             :    *
     394             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
     395             :    * if there were malformed sequences (that were replaced with the
     396             :    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
     397             :    *
     398             :    * The backing buffer of the string isn't copied if the input buffer
     399             :    * is heap-allocated and decoding from UTF-8 and the input is valid
     400             :    * BOMless UTF-8, decoding from an ASCII-compatible encoding and
     401             :    * the input is valid ASCII or decoding from ISO-2022-JP and the
     402             :    * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
     403             :    * the same string as both arguments.
     404             :    *
     405             :    * _Note:_ It is wrong to use this when the input buffer represents only
     406             :    * a segment of the input instead of the whole input. Use
     407             :    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
     408             :    */
     409             :   inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
     410             :                                        nsACString& aOut) const
     411             :   {
     412             :     const nsACString* bytes = &aBytes;
     413             :     nsACString* out = &aOut;
     414             :     if (bytes == out) {
     415             :       nsAutoCString temp(aBytes);
     416             :       return mozilla_encoding_decode_to_nscstring_with_bom_removal(
     417             :         this, &temp, out);
     418             :     }
     419             :     return mozilla_encoding_decode_to_nscstring_with_bom_removal(
     420             :       this, bytes, out);
     421             :   }
     422             : 
     423             :   /**
     424             :    * Decode complete input to `nsAString` _with BOM removal_ and with
     425             :    * malformed sequences replaced with the REPLACEMENT CHARACTER when the
     426             :    * entire input is available as a single buffer (i.e. the end of the
     427             :    * buffer marks the end of the stream).
     428             :    *
     429             :    * When invoked on `UTF_8_ENCODING`, this method implements the (non-streaming
     430             :    * version of) the _UTF-8 decode_
     431             :    * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
     432             :    *
     433             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
     434             :    * if there were malformed sequences (that were replaced with the
     435             :    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
     436             :    *
     437             :    * _Note:_ It is wrong to use this when the input buffer represents only
     438             :    * a segment of the input instead of the whole input. Use
     439             :    * `NewDecoderWithBOMRemoval()` when decoding segmented input.
     440             :    */
     441           1 :   inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
     442             :                                        nsAString& aOut) const
     443             :   {
     444           1 :     return mozilla_encoding_decode_to_nsstring_with_bom_removal(
     445           1 :       this, aBytes.Elements(), aBytes.Length(), &aOut);
     446             :   }
     447             : 
     448             :   /**
     449             :    * Decode complete input to `nsACString` _without BOM handling_ and
     450             :    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
     451             :    * the entire input is available as a single buffer (i.e. the end of the
     452             :    * buffer marks the end of the stream).
     453             :    *
     454             :    * When invoked on `UTF_8_ENCODING`, this method implements the (non-streaming
     455             :    * version of) the _UTF-8 decode without BOM_
     456             :    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
     457             :    *
     458             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
     459             :    * if there were malformed sequences (that were replaced with the
     460             :    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
     461             :    *
     462             :    * The backing buffer of the string isn't copied if the input buffer
     463             :    * is heap-allocated and decoding from UTF-8 and the input is valid
     464             :    * UTF-8, decoding from an ASCII-compatible encoding and the input
     465             :    * is valid ASCII or decoding from ISO-2022-JP and the input stays
     466             :    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
     467             :    * as both arguments.
     468             :    *
     469             :    * _Note:_ It is wrong to use this when the input buffer represents only
     470             :    * a segment of the input instead of the whole input. Use
     471             :    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
     472             :    */
     473           0 :   inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
     474             :                                            nsACString& aOut) const
     475             :   {
     476           0 :     const nsACString* bytes = &aBytes;
     477           0 :     nsACString* out = &aOut;
     478           0 :     if (bytes == out) {
     479           0 :       nsAutoCString temp(aBytes);
     480             :       return mozilla_encoding_decode_to_nscstring_without_bom_handling(
     481           0 :         this, &temp, out);
     482             :     }
     483             :     return mozilla_encoding_decode_to_nscstring_without_bom_handling(
     484           0 :       this, bytes, out);
     485             :   }
     486             : 
     487             :   /**
     488             :    * Decode complete input to `nsAString` _without BOM handling_ and
     489             :    * with malformed sequences replaced with the REPLACEMENT CHARACTER when
     490             :    * the entire input is available as a single buffer (i.e. the end of the
     491             :    * buffer marks the end of the stream).
     492             :    *
     493             :    * When invoked on `UTF_8_ENCODING`, this method implements the (non-streaming
     494             :    * version of) the _UTF-8 decode without BOM_
     495             :    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
     496             :    *
     497             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
     498             :    * if there were malformed sequences (that were replaced with the
     499             :    * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
     500             :    *
     501             :    * _Note:_ It is wrong to use this when the input buffer represents only
     502             :    * a segment of the input instead of the whole input. Use
     503             :    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
     504             :    */
     505          25 :   inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
     506             :                                            nsAString& aOut) const
     507             :   {
     508          25 :     return mozilla_encoding_decode_to_nsstring_without_bom_handling(
     509          25 :       this, aBytes.Elements(), aBytes.Length(), &aOut);
     510             :   }
     511             : 
     512             :   /**
     513             :    * Decode complete input to `nsACString` _without BOM handling_ and
     514             :    * _with malformed sequences treated as fatal_ when the entire input is
     515             :    * available as a single buffer (i.e. the end of the buffer marks the end
     516             :    * of the stream).
     517             :    *
     518             :    * When invoked on `UTF_8`, this method implements the (non-streaming
     519             :    * version of) the _UTF-8 decode without BOM or fail_
     520             :    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
     521             :    * spec concept.
     522             :    *
     523             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
     524             :    * if a malformed sequence was encountered and `NS_OK` otherwise.
     525             :    *
     526             :    * The backing buffer of the string isn't copied if the input buffer
     527             :    * is heap-allocated and decoding from UTF-8 and the input is valid
     528             :    * UTF-8, decoding from an ASCII-compatible encoding and the input
     529             :    * is valid ASCII or decoding from ISO-2022-JP and the input stays
     530             :    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
     531             :    * as both arguments.
     532             :    *
     533             :    * _Note:_ It is wrong to use this when the input buffer represents only
     534             :    * a segment of the input instead of the whole input. Use
     535             :    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
     536             :    */
     537           0 :   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
     538             :     const nsACString& aBytes,
     539             :     nsACString& aOut) const
     540             :   {
     541           0 :     const nsACString* bytes = &aBytes;
     542           0 :     nsACString* out = &aOut;
     543           0 :     if (bytes == out) { // The caller passed one string object as both input and output.
     544           0 :       nsAutoCString temp(aBytes); // Separate string object, so the callee never sees input and output aliased.
     545             :       return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
     546           0 :         this, &temp, out);
     547             :     }
     548             :     return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
     549           0 :       this, bytes, out); // Distinct objects: the caller's strings are passed straight through.
     550             :   }
     551             : 
     552             :   /**
     553             :    * Decode complete input to `nsAString` _without BOM handling_ and
     554             :    * _with malformed sequences treated as fatal_ when the entire input is
     555             :    * available as a single buffer (i.e. the end of the buffer marks the end
     556             :    * of the stream).
     557             :    *
     558             :    * When invoked on `UTF_8`, this method implements the (non-streaming
     559             :    * version of) the _UTF-8 decode without BOM or fail_
     560             :    * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
     561             :    * spec concept.
     562             :    *
     563             :    * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
     564             :    * if a malformed sequence was encountered and `NS_OK` otherwise.
     565             :    *
     566             :    * _Note:_ It is wrong to use this when the input buffer represents only
     567             :    * a segment of the input instead of the whole input. Use
     568             :    * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
     569             :    */
     570           0 :   inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
     571             :     Span<const uint8_t> aBytes,
     572             :     nsAString& aOut) const
     573             :   { // Thin wrapper: the span is unpacked into a pointer and a length for the callee.
     574           0 :     return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
     575           0 :       this, aBytes.Elements(), aBytes.Length(), &aOut); // The callee's nsresult is returned unchanged.
     576             :   }
     577             : 
     578             :   /**
     579             :    * Encode complete input to `nsACString` with unmappable characters
     580             :    * replaced with decimal numeric character references when the entire input
     581             :    * is available as a single buffer (i.e. the end of the buffer marks the
     582             :    * end of the stream).
     583             :    *
     584             :    * This method implements the (non-streaming version of) the
     585             :    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
     586             :    *
     587             :    * The second item in the returned tuple is the encoding that was actually
     588             :    * used (which may differ from this encoding thanks to some encodings
     589             :    * having UTF-8 as their output encoding).
     590             :    *
     591             :    * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
     592             :    * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
     593             :    * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
     594             :    * replaced with numeric character references) and `NS_OK` otherwise.
     595             :    *
     596             :    * The backing buffer of the string isn't copied if the input buffer
     597             :    * is heap-allocated and encoding to UTF-8 and the input is valid
     598             :    * UTF-8, encoding to an ASCII-compatible encoding and the input
     599             :    * is valid ASCII or encoding to ISO-2022-JP and the input stays
     600             :    * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
     601             :    * as both arguments.
     602             :    *
     603             :    * _Note:_ It is wrong to use this when the input buffer represents only
     604             :    * a segment of the input instead of the whole input. Use `NewEncoder()`
     605             :    * when encoding segmented input.
     606             :    */
     607           0 :   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
     608             :     const nsACString& aString,
     609             :     nsACString& aOut) const
     610             :   {
     611           0 :     const Encoding* encoding = this; // Passed by address below; whatever it holds after the call is returned in the tuple.
     612           0 :     const nsACString* string = &aString;
     613           0 :     nsACString* out = &aOut;
     614             :     nsresult rv;
     615           0 :     if (string == out) { // The caller passed one string object as both input and output.
     616           0 :       nsAutoCString temp(aString); // Separate string object, so the callee never sees input and output aliased.
     617           0 :       rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
     618             :     } else {
     619           0 :       rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
     620             :     }
     621           0 :     return MakeTuple(rv, WrapNotNull(encoding)); // (status, encoding pointer as left by the callee)
     622             :   }
     623             : 
     624             :   /**
     625             :    * Encode complete input to `nsACString` with unmappable characters
     626             :    * replaced with decimal numeric character references when the entire input
     627             :    * is available as a single buffer (i.e. the end of the buffer marks the
     628             :    * end of the stream).
     629             :    *
     630             :    * This method implements the (non-streaming version of) the
     631             :    * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
     632             :    *
     633             :    * The second item in the returned tuple is the encoding that was actually
     634             :    * used (which may differ from this encoding thanks to some encodings
     635             :    * having UTF-8 as their output encoding).
     636             :    *
     637             :    * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
     638             :    * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
     639             :    * were replaced with numeric character references) and `NS_OK` otherwise.
     640             :    *
     641             :    * _Note:_ It is wrong to use this when the input buffer represents only
     642             :    * a segment of the input instead of the whole input. Use `NewEncoder()`
     643             :    * when encoding segmented input.
     644             :    */
     645           0 :   inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
     646             :     Span<const char16_t> aString,
     647             :     nsACString& aOut) const
     648             :   {
     649           0 :     const Encoding* encoding = this; // Passed by address below; whatever it holds after the call is returned in the tuple.
     650           0 :     nsresult rv = mozilla_encoding_encode_from_utf16(
     651           0 :       &encoding, aString.Elements(), aString.Length(), &aOut); // The span is unpacked into a pointer and a length.
     652           0 :     return MakeTuple(rv, WrapNotNull(encoding)); // (status, encoding pointer as left by the callee)
     653             :   }
     654             : 
     655             :   /**
     656             :    * Instantiates a new decoder for this encoding with BOM sniffing enabled.
     657             :    *
     658             :    * BOM sniffing may cause the returned decoder to morph into a decoder
     659             :    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
     660             :    */
     661           5 :   inline UniquePtr<Decoder> NewDecoder() const
     662             :   {
     663           5 :     UniquePtr<Decoder> decoder(encoding_new_decoder(this)); // The UniquePtr takes ownership of the returned Decoder*.
     664           5 :     return decoder; // Ownership moves to the caller.
     665             :   }
     666             : 
     667             :   /**
     668             :    * Instantiates a new decoder for this encoding with BOM sniffing enabled
     669             :    * into memory occupied by a previously-instantiated decoder.
     670             :    *
     671             :    * BOM sniffing may cause the returned decoder to morph into a decoder
     672             :    * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
     673             :    */
     674             :   inline void NewDecoderInto(Decoder& aDecoder) const
     675             :   {
     676             :     encoding_new_decoder_into(this, &aDecoder); // The caller-provided Decoder is passed by address; this wrapper returns nothing.
     677             :   }
     678             : 
     679             :   /**
     680             :    * Instantiates a new decoder for this encoding with BOM removal.
     681             :    *
     682             :    * If the input starts with bytes that are the BOM for this encoding,
     683             :    * those bytes are removed. However, the decoder never morphs into a
     684             :    * decoder for another encoding: A BOM for another encoding is treated as
     685             :    * (potentially malformed) input to the decoding algorithm for this
     686             :    * encoding.
     687             :    */
     688          82 :   inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const
     689             :   {
     690          82 :     UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this)); // The UniquePtr takes ownership of the returned Decoder*.
     691          82 :     return decoder; // Ownership moves to the caller.
     692             :   }
     693             : 
     694             :   /**
     695             :    * Instantiates a new decoder for this encoding with BOM removal
     696             :    * into memory occupied by a previously-instantiated decoder.
     697             :    *
     698             :    * If the input starts with bytes that are the BOM for this encoding,
     699             :    * those bytes are removed. However, the decoder never morphs into a
     700             :    * decoder for another encoding: A BOM for another encoding is treated as
     701             :    * (potentially malformed) input to the decoding algorithm for this
     702             :    * encoding.
     703             :    */
     704           0 :   inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const
     705             :   {
     706           0 :     encoding_new_decoder_with_bom_removal_into(this, &aDecoder); // The caller-provided Decoder is passed by address; this wrapper returns nothing.
     707           0 :   }
     708             : 
     709             :   /**
     710             :    * Instantiates a new decoder for this encoding with BOM handling disabled.
     711             :    *
     712             :    * If the input starts with bytes that look like a BOM, those bytes are
     713             :    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
     714             :    * for another encoding.)
     715             :    *
     716             :    * _Note:_ If the caller has performed BOM sniffing on its own but has not
     717             :    * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
     718             :    * instead of this method to cause the BOM to be removed.
     719             :    */
     720          40 :   inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const
     721             :   {
     722          40 :     UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this)); // The UniquePtr takes ownership of the returned Decoder*.
     723          40 :     return decoder; // Ownership moves to the caller.
     724             :   }
     725             : 
     726             :   /**
     727             :    * Instantiates a new decoder for this encoding with BOM handling disabled
     728             :    * into memory occupied by a previously-instantiated decoder.
     729             :    *
     730             :    * If the input starts with bytes that look like a BOM, those bytes are
     731             :    * not treated as a BOM. (Hence, the decoder never morphs into a decoder
     732             :    * for another encoding.)
     733             :    *
     734             :    * _Note:_ If the caller has performed BOM sniffing on its own but has not
     735             :    * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
     736             :    * instead of this method to cause the BOM to be removed.
     737             :    */
     738           0 :   inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const
     739             :   {
     740           0 :     encoding_new_decoder_without_bom_handling_into(this, &aDecoder); // The caller-provided Decoder is passed by address; this wrapper returns nothing.
     741           0 :   }
     742             : 
     743             :   /**
     744             :    * Instantiates a new encoder for the output encoding of this encoding.
     745             :    */
     746           0 :   inline UniquePtr<Encoder> NewEncoder() const
     747             :   {
     748           0 :     UniquePtr<Encoder> encoder(encoding_new_encoder(this)); // The UniquePtr takes ownership of the returned Encoder*.
     749           0 :     return encoder; // Ownership moves to the caller.
     750             :   }
     751             : 
     752             :   /**
     753             :    * Instantiates a new encoder for the output encoding of this encoding
     754             :    * into memory occupied by a previously-instantiated encoder.
     755             :    */
     756           0 :   inline void NewEncoderInto(Encoder& aEncoder) const
     757             :   {
     758           0 :     encoding_new_encoder_into(this, &aEncoder); // The caller-provided Encoder is passed by address; this wrapper returns nothing.
     759           0 :   }
     760             : 
     761             :   /**
     762             :    * Validates UTF-8.
     763             :    *
     764             :    * Returns the index of the first byte that makes the input malformed as
     765             :    * UTF-8 or the length of the input if the input is entirely valid.
     766             :    */
     767             :   static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer)
     768             :   { // Static: no Encoding instance is involved; the span is unpacked into a pointer and a length.
     769             :     return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
     770             :   }
     771             : 
     772             :   /**
     773             :    * Validates ASCII.
     774             :    *
     775             :    * Returns the index of the first byte that makes the input malformed as
     776             :    * ASCII or the length of the input if the input is entirely valid.
     777             :    */
     778             :   static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer)
     779             :   { // Static: no Encoding instance is involved; the span is unpacked into a pointer and a length.
     780             :     return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
     781             :   }
     782             : 
     783             :   /**
     784             :    * Validates ISO-2022-JP ASCII-state data.
     785             :    *
     786             :    * Returns the index of the first byte that makes the input not
     787             :    * representable in the ASCII state of ISO-2022-JP or the length of the
     788             :    * input if the input is entirely representable in the ASCII state of
     789             :    * ISO-2022-JP.
     790             :    */
     791             :   static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer)
     792             :   { // Static: no Encoding instance is involved; the span is unpacked into a pointer and a length.
     793             :     return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
     794             :                                                   aBuffer.Length());
     795             :   }
     796             : 
     797             : private:
     798             :   Encoding() = delete; // Cannot be constructed from C++.
     799             :   Encoding(const Encoding&) = delete; // Cannot be copy-constructed.
     800             :   Encoding& operator=(const Encoding&) = delete; // Cannot be copy-assigned.
     801             :   ~Encoding() = delete; // Cannot be destroyed from C++, so instances are only ever used through pointers or references.
     802             : 
     803             : };
     804             : 
     805             : /**
     806             :  * A converter that decodes a byte stream into Unicode according to a
     807             :  * character encoding in a streaming (incremental) manner.
     808             :  *
     809             :  * The various `Decode*` methods take an input buffer (`aSrc`) and an output
     810             :  * buffer `aDst` both of which are caller-allocated. There are variants for
     811             :  * both UTF-8 and UTF-16 output buffers.
     812             :  *
     813             :  * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
     814             :  * into `aDst` until one of the following three things happens:
     815             :  *
     816             :  * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
     817             :  *    variants only).
     818             :  *
     819             :  * 2. The output buffer has been filled so near capacity that the decoder
     820             :  *    cannot be sure that processing an additional byte of input wouldn't
     821             :  *    cause so much output that the output buffer would overflow.
     822             :  *
     823             :  * 3. All the input bytes have been processed.
     824             :  *
     825             :  * The `Decode*` method then returns a tuple of a status indicating which one
     826             :  * of the three reasons to return happened, how many input bytes were read,
     827             :  * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
     828             :  * when decoding to UTF-16) were written, and in the case of the
     829             :  * variants performing replacement, a boolean indicating whether an error was
     830             :  * replaced with the REPLACEMENT CHARACTER during the call.
     831             :  *
     832             :  * The number of bytes "written" is what's logically written. Garbage may be
     833             :  * written in the output buffer beyond the point logically written to.
     834             :  *
     835             :  * In the case of the `*WithoutReplacement` variants, the status is a
     836             :  * `uint32_t` whose possible values are packed info about a malformed byte
     837             :  * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
     838             :  * listed above).
     839             :  *
     840             :  * Packed info about malformed sequences has the following format:
     841             :  * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
     842             :  * indicate the number of bytes that were consumed after the malformed
     843             :  * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
     844             :  * the length of the malformed byte sequence (possible decimal values 1, 2,
     845             :  * 3 or 4). The maximum possible sum of the two is 6.
     846             :  *
     847             :  * In the case of methods whose name does not end with
     848             :  * `*WithoutReplacement`, malformed sequences are automatically replaced
     849             :  * with the REPLACEMENT CHARACTER and errors do not cause the methods to
     850             :  * return early.
     851             :  *
     852             :  * When decoding to UTF-8, the output buffer must have at least 4 bytes of
     853             :  * space. When decoding to UTF-16, the output buffer must have at least two
     854             :  * UTF-16 code units (`char16_t`) of space.
     855             :  *
     856             :  * When decoding to UTF-8 without replacement, the methods are guaranteed
     857             :  * not to return indicating that more output space is needed if the length
     858             :  * of the output buffer is at least the length returned by
     859             :  * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
     860             :  * with replacement, the length of the output buffer that guarantees the
     861             :  * methods not to return indicating that more output space is needed is given
     862             :  * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
     863             :  * or without replacement, the length of the output buffer that guarantees
     864             :  * the methods not to return indicating that more output space is needed is
     865             :  * given by `MaxUTF16BufferLength()`.
     866             :  *
     867             :  * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
     868             :  * and the output after each `Decode*` call is guaranteed to consist of
     869             :  * complete characters. (I.e. the code unit sequence for the last character is
     870             :  * guaranteed not to be split across output buffers.)
     871             :  *
     872             :  * The boolean argument `aLast` indicates that the end of the stream is reached
     873             :  * when all the bytes in `aSrc` have been consumed.
     874             :  *
     875             :  * A `Decoder` object can be used to incrementally decode a byte stream.
     876             :  *
     877             :  * During the processing of a single stream, the caller must call `Decode*`
     878             :  * zero or more times with `aLast` set to `false` and then call `Decode*` at
     879             :  * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
     880             :  * the processing of the stream has ended. Otherwise, the caller must call
     881             :  * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
     882             :  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
     883             :  *
     884             :  * Once the stream has ended, the `Decoder` object must not be used anymore.
     885             :  * That is, you need to create another one to process another stream.
     886             :  *
     887             :  * When the decoder returns `kOutputFull` or the decoder returns a malformed
     888             :  * result and the caller does not wish to treat it as a fatal error, the input
     889             :  * buffer `aSrc` may not have been completely consumed. In that case, the caller
     890             :  * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
     891             :  * call.
     892             :  *
     893             :  * # Infinite loops
     894             :  *
     895             :  * When converting with a fixed-size output buffer whose size is too small to
     896             :  * accommodate one character of output, an infinite loop ensues. When
     897             :  * converting with a fixed-size output buffer, it generally makes sense to
     898             :  * make the buffer fairly large (e.g. a couple of kilobytes).
     899             :  */
     900             : class Decoder final
     901             : {
     902             : public:
     903         124 :   ~Decoder() {} // Empty body; the class-specific operator delete below calls decoder_free().
     904         124 :   static void operator delete(void* aDecoder)
     905             :   { // Class-specific deallocation function, used when a Decoder is destroyed with `delete`.
     906         124 :     decoder_free(reinterpret_cast<Decoder*>(aDecoder)); // The memory is handed to decoder_free() rather than the global deallocator.
     907         124 :   }
     908             : 
     909             :   /**
     910             :    * The `Encoding` this `Decoder` is for.
     911             :    *
     912             :    * BOM sniffing can change the return value of this method during the life
     913             :    * of the decoder.
     914             :    */
     915          37 :   inline NotNull<const mozilla::Encoding*> Encoding() const
     916             :   {
     917          37 :     return WrapNotNull(decoder_encoding(this)); // The pointer from decoder_encoding() is wrapped as NotNull.
     918             :   }
     919             : 
     920             :   /**
     921             :    * Query the worst-case UTF-8 output size _with replacement_.
     922             :    *
     923             :    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
     924             :    * that will not overflow given the current state of the decoder and
     925             :    * `aByteLength` number of additional input bytes when decoding with
     926             :    * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
     927             :    * sequence.
     928             :    */
     929             :   inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const
     930             :   {
     931             :     CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
     932             :     if (max.value() == MaxValue<size_t>::value) { // The maximum size_t value is treated as a sentinel, not a usable length.
     933             :       // Mark invalid by overflowing
     934             :       max++;
     935             :       MOZ_ASSERT(!max.isValid());
     936             :     }
     937             :     return max; // Callers must check isValid() before using the value.
     938             :   }
     939             : 
     940             :   /**
     941             :    * Query the worst-case UTF-8 output size _without replacement_.
     942             :    *
     943             :    * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
     944             :    * that will not overflow given the current state of the decoder and
     945             :    * `aByteLength` number of additional input bytes when decoding without
     946             :    * replacement error handling.
     947             :    *
     948             :    * Note that this value may be too small for the `WithReplacement` case.
     949             :    * Use `MaxUTF8BufferLength()` for that case.
     950             :    */
     951             :   inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
     952             :     size_t aByteLength) const
     953             :   {
     954             :     CheckedInt<size_t> max(
     955             :       decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
     956             :     if (max.value() == MaxValue<size_t>::value) { // The maximum size_t value is treated as a sentinel, not a usable length.
     957             :       // Mark invalid by overflowing
     958             :       max++;
     959             :       MOZ_ASSERT(!max.isValid());
     960             :     }
     961             :     return max; // Callers must check isValid() before using the value.
     962             :   }
     963             : 
     964             :   /**
     965             :    * Incrementally decode a byte stream into UTF-8 with malformed sequences
     966             :    * replaced with the REPLACEMENT CHARACTER.
     967             :    *
     968             :    * See the documentation of the class for documentation for `Decode*`
     969             :    * methods collectively.
     970             :    */
     971             :   inline Tuple<uint32_t, size_t, size_t, bool>
     972             :   DecodeToUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
     973             :   {
     974             :     size_t srcRead = aSrc.Length(); // Starts as the input length; passed by address and returned as the second tuple item.
     975             :     size_t dstWritten = aDst.Length(); // Starts as the output capacity; passed by address and returned as the third tuple item.
     976             :     bool hadReplacements; // Left uninitialized here; presumably always set by the callee -- TODO confirm.
     977             :     uint32_t result = decoder_decode_to_utf8(this,
     978             :                                              aSrc.Elements(),
     979             :                                              &srcRead,
     980             :                                              aDst.Elements(),
     981             :                                              &dstWritten,
     982             :                                              aLast,
     983             :                                              &hadReplacements);
     984             :     return MakeTuple(result, srcRead, dstWritten, hadReplacements); // (status, srcRead, dstWritten, hadReplacements)
     985             :   }
     986             : 
     987             :   /**
     988             :    * Incrementally decode a byte stream into UTF-8 _without replacement_.
     989             :    *
     990             :    * See the documentation of the class for documentation for `Decode*`
     991             :    * methods collectively.
     992             :    */
     993             :   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
     994             :     Span<const uint8_t> aSrc,
     995             :     Span<uint8_t> aDst,
     996             :     bool aLast)
     997             :   {
     998             :     size_t srcRead = aSrc.Length(); // Starts as the input length; passed by address and returned as the second tuple item.
     999             :     size_t dstWritten = aDst.Length(); // Starts as the output capacity; passed by address and returned as the third tuple item.
    1000             :     uint32_t result = decoder_decode_to_utf8_without_replacement(
    1001             :       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
    1002             :     return MakeTuple(result, srcRead, dstWritten); // (status, srcRead, dstWritten)
    1003             :   }
    1004             : 
    1005             :   /**
    1006             :    * Query the worst-case UTF-16 output size (with or without replacement).
    1007             :    *
    1008             :    * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
    1009             :    * that will not overflow given the current state of the decoder and
    1010             :    * `aByteLength` number of additional input bytes.
    1011             :    *
    1012             :    * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
    1013             :    * return value of this method applies also in the
    1014             :    * `*WithoutReplacement` case.
    1015             :    */
    1016         176 :   inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aByteLength) const
    1017             :   { // aByteLength is a count of input bytes; the name now matches the doc comment and the MaxUTF8BufferLength*() methods.
    1018         176 :     CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aByteLength));
    1019         176 :     if (max.value() == MaxValue<size_t>::value) { // The maximum size_t value is treated as a sentinel, not a usable length.
    1020             :       // Mark invalid by overflowing
    1021           0 :       max++;
    1022           0 :       MOZ_ASSERT(!max.isValid());
    1023             :     }
    1024         176 :     return max; // Callers must check isValid() before using the value.
    1025             :   }
    1026             : 
    1027             :   /**
    1028             :    * Incrementally decode a byte stream into UTF-16 with malformed sequences
    1029             :    * replaced with the REPLACEMENT CHARACTER.
    1030             :    *
    1031             :    * See the documentation of the class for documentation for `Decode*`
    1032             :    * methods collectively.
    1033             :    */
    1034             :   inline Tuple<uint32_t, size_t, size_t, bool>
    1035         157 :   DecodeToUTF16(Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast)
    1036             :   {
    1037         157 :     size_t srcRead = aSrc.Length();
    1038         157 :     size_t dstWritten = aDst.Length();
    1039             :     bool hadReplacements;
    1040         157 :     uint32_t result = decoder_decode_to_utf16(this,
    1041             :                                               aSrc.Elements(),
    1042             :                                               &srcRead,
    1043             :                                               aDst.Elements(),
    1044             :                                               &dstWritten,
    1045             :                                               aLast,
    1046         157 :                                               &hadReplacements);
    1047         157 :     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
    1048             :   }
    1049             : 
    1050             :   /**
    1051             :    * Incrementally decode a byte stream into UTF-16 _without replacement_.
    1052             :    *
    1053             :    * See the documentation of the class for documentation for `Decode*`
    1054             :    * methods collectively.
    1055             :    */
    1056          25 :   inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
    1057             :     Span<const uint8_t> aSrc,
    1058             :     Span<char16_t> aDst,
    1059             :     bool aLast)
    1060             :   {
    1061          25 :     size_t srcRead = aSrc.Length();
    1062          25 :     size_t dstWritten = aDst.Length();
    1063          25 :     uint32_t result = decoder_decode_to_utf16_without_replacement(
    1064          25 :       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
    1065          25 :     return MakeTuple(result, srcRead, dstWritten);
    1066             :   }
    1067             : 
    1068             : private:
    1069             :   Decoder() = delete;
    1070             :   Decoder(const Decoder&) = delete;
    1071             :   Decoder& operator=(const Decoder&) = delete;
    1072             : };
    1073             : 
    1074             : /**
    1075             :  * A converter that encodes a Unicode stream into bytes according to a
    1076             :  * character encoding in a streaming (incremental) manner.
    1077             :  *
    1078             :  * The various `Encode*` methods take an input buffer (`aSrc`) and an output
    1079             :  * buffer `aDst` both of which are caller-allocated. There are variants for
    1080             :  * both UTF-8 and UTF-16 input buffers.
    1081             :  *
    1082             :  * An `Encode*` method encodes characters from `aSrc` into bytes
    1083             :  * stored into `aDst` until one of the following three things happens:
    1084             :  *
    1085             :  * 1. An unmappable character is encountered (`*WithoutReplacement` variants
    1086             :  *    only).
    1087             :  *
    1088             :  * 2. The output buffer has been filled so near capacity that the encoder
    1089             :  *    cannot be sure that processing an additional character of input wouldn't
    1090             :  *    cause so much output that the output buffer would overflow.
    1091             :  *
    1092             :  * 3. All the input characters have been processed.
    1093             :  *
    1094             :  * The `Encode*` method then returns a tuple of a status indicating which one
    1095             :  * of the three reasons to return happened, how many input code units (`uint8_t`
    1096             :  * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
    1097             :  * how many output bytes were written, and in the case of the variants that
    1098             :  * perform replacement, a boolean indicating whether an unmappable
    1099             :  * character was replaced with a numeric character reference during the call.
    1100             :  *
    1101             :  * The number of bytes "written" is what's logically written. Garbage may be
    1102             :  * written in the output buffer beyond the point logically written to.
    1103             :  *
    1104             :  * In the case of the methods whose name ends with
    1105             :  * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
    1106             :  * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
    1107             :  * to the three cases listed above).
    1108             :  *
    1109             :  * In the case of methods whose name does not end with
    1110             :  * `*WithoutReplacement`, unmappable characters are automatically replaced
    1111             :  * with the corresponding numeric character references and unmappable
    1112             :  * characters do not cause the methods to return early.
    1113             :  *
    1114             :  * When encoding from UTF-8 without replacement, the methods are guaranteed
    1115             :  * not to return indicating that more output space is needed if the length
    1116             :  * of the output buffer is at least the length returned by
    1117             :  * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
    1118             :  * UTF-8 with replacement, the length of the output buffer that guarantees the
    1119             :  * methods not to return indicating that more output space is needed in the
    1120             :  * absence of unmappable characters is given by
    1121             :  * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
    1122             :  * UTF-16 without replacement, the methods are guaranteed not to return
    1123             :  * indicating that more output space is needed if the length of the output
    1124             :  * buffer is at least the length returned by
    1125             :  * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
    1126             :  * from UTF-16 with replacement, the length of the output buffer that
    1127             :  * guarantees the methods not to return indicating that more output space is
    1128             :  * needed in the absence of unmappable characters is given by
    1129             :  * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
    1130             :  * When encoding with replacement, applications are not expected to size the
    1131             :  * buffer for the worst case ahead of time but to resize the buffer if there
    1132             :  * are unmappable characters. This is why max length queries are only available
    1133             :  * for the case where there are no unmappable characters.
    1134             :  *
    1135             :  * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
    1136             :  * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
    1137             :  * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
    1138             :  * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
    1139             :  * surrogate pairs are not split across input buffer boundaries.
    1140             :  *
    1141             :  * After an `Encode*` call returns, the output produced so far, taken as a
    1142             :  * whole from the start of the stream, is guaranteed to consist of a valid
    1143             :  * byte sequence in the target encoding. (I.e. the code unit sequence for a
    1144             :  * character is guaranteed not to be split across output buffers. However, due
    1145             :  * to the stateful nature of ISO-2022-JP, the stream needs to be considered
    1146             :  * from the start for it to be valid. For other encodings, the validity holds
    1147             :  * on a per-output buffer basis.)
    1148             :  *
    1149             :  * The boolean argument `aLast` indicates that the end of the stream is reached
    1150             :  * when all the characters in `aSrc` have been consumed. This argument is needed
    1151             :  * for ISO-2022-JP and is ignored for other encodings.
    1152             :  *
    1153             :  * An `Encoder` object can be used to incrementally encode a stream into bytes.
    1154             :  *
    1155             :  * During the processing of a single stream, the caller must call `Encode*`
    1156             :  * zero or more times with `aLast` set to `false` and then call `Encode*` at
    1157             :  * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
    1158             :  * the processing of the stream has ended. Otherwise, the caller must call
    1159             :  * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
    1160             :  * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
    1161             :  *
    1162             :  * Once the stream has ended, the `Encoder` object must not be used anymore.
    1163             :  * That is, you need to create another one to process another stream.
    1164             :  *
    1165             :  * When the encoder returns `kOutputFull` or the encoder returns an unmappable
    1166             :  * result and the caller does not wish to treat it as a fatal error, the input
    1167             :  * buffer `aSrc` may not have been completely consumed. In that case, the caller
    1168             :  * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
    1169             :  * call.
    1170             :  *
    1171             :  * # Infinite loops
    1172             :  *
    1173             :  * When converting with a fixed-size output buffer whose size is too small to
    1174             :  * accommodate one character of output, an infinite loop ensues. When
    1175             :  * converting with a fixed-size output buffer, it generally makes sense to
    1176             :  * make the buffer fairly large (e.g. a couple of kilobytes).
    1177             :  */
    1178             : class Encoder final
    1179             : {
    1180             : public:
    1181           0 :   ~Encoder() {}
    1182             : 
    1183           0 :   static void operator delete(void* aEncoder)
    1184             :   {
    1185           0 :     encoder_free(reinterpret_cast<Encoder*>(aEncoder));
    1186           0 :   }
    1187             : 
    1188             :   /**
    1189             :    * The `Encoding` this `Encoder` is for.
    1190             :    */
    1191           0 :   inline NotNull<const mozilla::Encoding*> Encoding() const
    1192             :   {
    1193           0 :     return WrapNotNull(encoder_encoding(this));
    1194             :   }
    1195             : 
    1196             :   /**
    1197             :    * Returns `true` if this is an ISO-2022-JP encoder that's not in the
    1198             :    * ASCII state and `false` otherwise.
    1199             :    */
    1200             :   inline bool HasPendingState() const
    1201             :   {
    1202             :     return encoder_has_pending_state(this);
    1203             :   }
    1204             : 
    1205             :   /**
    1206             :    * Query the worst-case output size when encoding from UTF-8 with
    1207             :    * replacement.
    1208             :    *
    1209             :    * Returns the size of the output buffer in bytes that will not overflow
    1210             :    * given the current state of the encoder and `aByteLength` number of
    1211             :    * additional input code units if there are no unmappable characters in
    1212             :    * the input.
    1213             :    */
    1214             :   inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
    1215             :     size_t aByteLength) const
    1216             :   {
    1217             :     CheckedInt<size_t> max(
    1218             :       encoder_max_buffer_length_from_utf8_if_no_unmappables(this, aByteLength));
    1219             :     if (max.value() == MaxValue<size_t>::value) {
    1220             :       // Mark invalid by overflowing
    1221             :       max++;
    1222             :       MOZ_ASSERT(!max.isValid());
    1223             :     }
    1224             :     return max;
    1225             :   }
    1226             : 
    1227             :   /**
    1228             :    * Query the worst-case output size when encoding from UTF-8 without
    1229             :    * replacement.
    1230             :    *
    1231             :    * Returns the size of the output buffer in bytes that will not overflow
    1232             :    * given the current state of the encoder and `aByteLength` number of
    1233             :    * additional input code units.
    1234             :    */
    1235             :   inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
    1236             :     size_t aByteLength) const
    1237             :   {
    1238             :     CheckedInt<size_t> max(
    1239             :       encoder_max_buffer_length_from_utf8_without_replacement(this,
    1240             :                                                               aByteLength));
    1241             :     if (max.value() == MaxValue<size_t>::value) {
    1242             :       // Mark invalid by overflowing
    1243             :       max++;
    1244             :       MOZ_ASSERT(!max.isValid());
    1245             :     }
    1246             :     return max;
    1247             :   }
    1248             : 
    1249             :   /**
    1250             :    * Incrementally encode into byte stream from UTF-8 with unmappable
    1251             :    * characters replaced with HTML (decimal) numeric character references.
    1252             :    *
    1253             :    * See the documentation of the class for documentation for `Encode*`
    1254             :    * methods collectively.
    1255             :    *
    1256             :    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
    1257             :    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
    1258             :    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
    1259             :    */
    1260             :   inline Tuple<uint32_t, size_t, size_t, bool>
    1261             :   EncodeFromUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
    1262             :   {
    1263             :     size_t srcRead = aSrc.Length();
    1264             :     size_t dstWritten = aDst.Length();
    1265             :     bool hadReplacements;
    1266             :     uint32_t result = encoder_encode_from_utf8(this,
    1267             :                                                aSrc.Elements(),
    1268             :                                                &srcRead,
    1269             :                                                aDst.Elements(),
    1270             :                                                &dstWritten,
    1271             :                                                aLast,
    1272             :                                                &hadReplacements);
    1273             :     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
    1274             :   }
    1275             : 
    1276             :   /**
    1277             :    * Incrementally encode into byte stream from UTF-8 _without replacement_.
    1278             :    *
    1279             :    * See the documentation of the class for documentation for `Encode*`
    1280             :    * methods collectively.
    1281             :    *
    1282             :    * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
    1283             :    * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
    1284             :    * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
    1285             :    */
    1286             :   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
    1287             :     Span<const uint8_t> aSrc,
    1288             :     Span<uint8_t> aDst,
    1289             :     bool aLast)
    1290             :   {
    1291             :     size_t srcRead = aSrc.Length();
    1292             :     size_t dstWritten = aDst.Length();
    1293             :     uint32_t result = encoder_encode_from_utf8_without_replacement(
    1294             :       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
    1295             :     return MakeTuple(result, srcRead, dstWritten);
    1296             :   }
    1297             : 
    1298             :   /**
    1299             :    * Query the worst-case output size when encoding from UTF-16 with
    1300             :    * replacement.
    1301             :    *
    1302             :    * Returns the size of the output buffer in bytes that will not overflow
    1303             :    * given the current state of the encoder and `aU16Length` number of
    1304             :    * additional input code units if there are no unmappable characters in
    1305             :    * the input.
    1306             :    */
    1307             :   inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
    1308             :     size_t aU16Length) const
    1309             :   {
    1310             :     CheckedInt<size_t> max(
    1311             :       encoder_max_buffer_length_from_utf16_if_no_unmappables(this, aU16Length));
    1312             :     if (max.value() == MaxValue<size_t>::value) {
    1313             :       // Mark invalid by overflowing
    1314             :       max++;
    1315             :       MOZ_ASSERT(!max.isValid());
    1316             :     }
    1317             :     return max;
    1318             :   }
    1319             : 
    1320             :   /**
    1321             :    * Query the worst-case output size when encoding from UTF-16 without
    1322             :    * replacement.
    1323             :    *
    1324             :    * Returns the size of the output buffer in bytes that will not overflow
    1325             :    * given the current state of the encoder and `aU16Length` number of
    1326             :    * additional input code units.
    1327             :    */
    1328           0 :   inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
    1329             :     size_t aU16Length) const
    1330             :   {
    1331             :     CheckedInt<size_t> max(
    1332             :       encoder_max_buffer_length_from_utf16_without_replacement(this,
    1333           0 :                                                                aU16Length));
    1334           0 :     if (max.value() == MaxValue<size_t>::value) {
    1335             :       // Mark invalid by overflowing
    1336           0 :       max++;
    1337           0 :       MOZ_ASSERT(!max.isValid());
    1338             :     }
    1339           0 :     return max;
    1340             :   }
    1341             : 
    1342             :   /**
    1343             :    * Incrementally encode into byte stream from UTF-16 with unmappable
    1344             :    * characters replaced with HTML (decimal) numeric character references.
    1345             :    *
    1346             :    * See the documentation of the class for documentation for `Encode*`
    1347             :    * methods collectively.
    1348             :    */
    1349             :   inline Tuple<uint32_t, size_t, size_t, bool>
    1350           0 :   EncodeFromUTF16(Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast)
    1351             :   {
    1352           0 :     size_t srcRead = aSrc.Length();
    1353           0 :     size_t dstWritten = aDst.Length();
    1354             :     bool hadReplacements;
    1355           0 :     uint32_t result = encoder_encode_from_utf16(this,
    1356             :                                                 aSrc.Elements(),
    1357             :                                                 &srcRead,
    1358             :                                                 aDst.Elements(),
    1359             :                                                 &dstWritten,
    1360             :                                                 aLast,
    1361           0 :                                                 &hadReplacements);
    1362           0 :     return MakeTuple(result, srcRead, dstWritten, hadReplacements);
    1363             :   }
    1364             : 
    1365             :   /**
    1366             :    * Incrementally encode into byte stream from UTF-16 _without replacement_.
    1367             :    *
    1368             :    * See the documentation of the class for documentation for `Encode*`
    1369             :    * methods collectively.
    1370             :    */
    1371           0 :   inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
    1372             :     Span<const char16_t> aSrc,
    1373             :     Span<uint8_t> aDst,
    1374             :     bool aLast)
    1375             :   {
    1376           0 :     size_t srcRead = aSrc.Length();
    1377           0 :     size_t dstWritten = aDst.Length();
    1378           0 :     uint32_t result = encoder_encode_from_utf16_without_replacement(
    1379           0 :       this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
    1380           0 :     return MakeTuple(result, srcRead, dstWritten);
    1381             :   }
    1382             : 
    1383             : private:
    1384             :   Encoder() = delete;
    1385             :   Encoder(const Encoder&) = delete;
    1386             :   Encoder& operator=(const Encoder&) = delete;
    1387             : };
    1388             : 
    1389             : }; // namespace mozilla
    1390             : 
    1391             : #endif // mozilla_Encoding_h

Generated by: LCOV version 1.13