Line data Source code
1 : // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 : // file at the top-level directory of this distribution.
3 : //
4 : // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 : // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 : // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 : // option. This file may not be copied, modified, or distributed
8 : // except according to those terms.
9 :
10 : // Adapted from third_party/rust/encoding_c/include/encoding_rs_cpp.h, so the
11 : // "top-level directory" in the above notice refers to
12 : // third_party/rust/encoding_c/.
13 :
14 : #ifndef mozilla_Encoding_h
15 : #define mozilla_Encoding_h
16 :
17 : #include "mozilla/CheckedInt.h"
18 : #include "mozilla/NotNull.h"
19 : #include "mozilla/Span.h"
20 : #include "mozilla/Tuple.h"
21 : #include "nsString.h"
22 :
namespace mozilla {
// Forward declarations only: the ENCODING_RS_* macros defined right after
// this block name these types before "encoding_rs.h" is included, and the
// full class definitions appear later in this header.
class Encoding;
class Decoder;
class Encoder;
} // namespace mozilla
28 :
29 : #define ENCODING_RS_ENCODING mozilla::Encoding
30 : #define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR mozilla::NotNull<const mozilla::Encoding*>
31 : #define ENCODING_RS_ENCODER mozilla::Encoder
32 : #define ENCODING_RS_DECODER mozilla::Decoder
33 :
34 : #include "encoding_rs.h"
35 :
36 : extern "C" {
37 :
38 : nsresult
39 : mozilla_encoding_decode_to_nsstring(mozilla::Encoding const** encoding,
40 : uint8_t const* src,
41 : size_t src_len,
42 : nsAString* dst);
43 :
44 : nsresult
45 : mozilla_encoding_decode_to_nsstring_with_bom_removal(
46 : mozilla::Encoding const* encoding,
47 : uint8_t const* src,
48 : size_t src_len,
49 : nsAString* dst);
50 :
51 : nsresult
52 : mozilla_encoding_decode_to_nsstring_without_bom_handling(
53 : mozilla::Encoding const* encoding,
54 : uint8_t const* src,
55 : size_t src_len,
56 : nsAString* dst);
57 :
58 : nsresult
59 : mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
60 : mozilla::Encoding const* encoding,
61 : uint8_t const* src,
62 : size_t src_len,
63 : nsAString* dst);
64 :
65 : nsresult
66 : mozilla_encoding_encode_from_utf16(mozilla::Encoding const** encoding,
67 : char16_t const* src,
68 : size_t src_len,
69 : nsACString* dst);
70 :
71 : nsresult
72 : mozilla_encoding_decode_to_nscstring(mozilla::Encoding const** encoding,
73 : nsACString const* src,
74 : nsACString* dst);
75 :
76 : nsresult
77 : mozilla_encoding_decode_to_nscstring_with_bom_removal(
78 : mozilla::Encoding const* encoding,
79 : nsACString const* src,
80 : nsACString* dst);
81 :
82 : nsresult
83 : mozilla_encoding_decode_to_nscstring_without_bom_handling(
84 : mozilla::Encoding const* encoding,
85 : nsACString const* src,
86 : nsACString* dst);
87 :
88 : nsresult
89 : mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
90 : mozilla::Encoding const* encoding,
91 : nsACString const* src,
92 : nsACString* dst);
93 :
94 : nsresult
95 : mozilla_encoding_encode_from_nscstring(mozilla::Encoding const** encoding,
96 : nsACString const* src,
97 : nsACString* dst);
98 :
99 : } // extern "C"
100 :
101 : namespace mozilla {
102 :
103 : /**
104 : * Return value from `Decoder`/`Encoder` methods indicating that the input
105 : * was exhausted. C++-named, `uint32_t`-typed alias of `INPUT_EMPTY`.
106 : */
107 : const uint32_t kInputEmpty = INPUT_EMPTY;
108 :
109 : /**
110 : * Return value from `Decoder`/`Encoder` methods indicating that the output
111 : * space was insufficient. C++-named, `uint32_t`-typed alias of `OUTPUT_FULL`.
112 : */
113 : const uint32_t kOutputFull = OUTPUT_FULL;
114 :
115 : /**
116 : * An encoding as defined in the Encoding Standard
117 : * (https://encoding.spec.whatwg.org/).
118 : *
119 : * See https://docs.rs/encoding_rs/ for the Rust API docs.
120 : *
121 : * An _encoding_ defines a mapping from a byte sequence to a Unicode code point
122 : * sequence and, in most cases, vice versa. Each encoding has a name, an output
123 : * encoding, and one or more labels.
124 : *
125 : * _Labels_ are ASCII-case-insensitive strings that are used to identify an
126 : * encoding in formats and protocols. The _name_ of the encoding is the
127 : * preferred label in the case appropriate for returning from the
128 : * `characterSet` property of the `Document` DOM interface, except for
129 : * the replacement encoding whose name is not one of its labels.
130 : *
131 : * The _output encoding_ is the encoding used for form submission and URL
132 : * parsing on Web pages in the encoding. This is UTF-8 for the replacement,
133 : * UTF-16LE and UTF-16BE encodings and the encoding itself for other
134 : * encodings.
135 : *
136 : * # Streaming vs. Non-Streaming
137 : *
138 : * When you have the entire input in a single buffer, you can use the
139 : * methods `Decode()`, `DecodeWithBOMRemoval()`,
140 : * `DecodeWithoutBOMHandling()`,
141 : * `DecodeWithoutBOMHandlingAndWithoutReplacement()` and
142 : * `Encode()`. Unlike the rest of the API (apart from the `NewDecoder()` and
143 : * `NewEncoder()` methods), these methods perform heap allocations. You should
144 : * use the `Decoder` and `Encoder` objects when your input is split into multiple
145 : * buffers or when you want to control the allocation of the output buffers.
146 : *
147 : * # Instances
148 : *
149 : * All instances of `Encoding` are statically allocated and have the process's
150 : * lifetime. There is precisely one unique `Encoding` instance for each
151 : * encoding defined in the Encoding Standard.
152 : *
153 : * To obtain a reference to a particular encoding whose identity you know at
154 : * compile time, use a `static` that refers to encoding. There is a `static`
155 : * for each encoding. The `static`s are named in all caps with hyphens
156 : * replaced with underscores and with `_ENCODING` appended to the
157 : * name. For example, if you know at compile time that you will want to
158 : * decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
159 : *
160 : * If you don't know what encoding you need at compile time and need to
161 : * dynamically get an encoding by label, use `Encoding::ForLabel()`.
162 : *
163 : * Pointers to `Encoding` can be compared with `==` to check for the sameness
164 : * of two encodings.
165 : *
166 : * A pointer to a `mozilla::Encoding` in C++ is the same thing as a pointer
167 : * to an `encoding_rs::Encoding` in Rust. When writing FFI code, use
168 : * `const mozilla::Encoding*` in the C signature and
169 : * `*const encoding_rs::Encoding` in the corresponding Rust signature.
170 : */
171 : class Encoding final
172 : {
173 : public:
174 : /**
175 : * Implements the _get an encoding_ algorithm
176 : * (https://encoding.spec.whatwg.org/#concept-encoding-get).
177 : *
178 : * If, after ASCII-lowercasing and removing leading and trailing
179 : * whitespace, the argument matches a label defined in the Encoding
180 : * Standard, `const Encoding*` representing the corresponding
181 : * encoding is returned. If there is no match, `nullptr` is returned.
182 : *
183 : * This is the right method to use if the action upon the method returning
184 : * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
185 : * instead. When the action upon the method returning `nullptr` is not to
186 : * proceed with a fallback but to refuse processing,
187 : * `ForLabelNoReplacement()` is more appropriate.
188 : */
189 141 : static inline const Encoding* ForLabel(Span<const char> aLabel)
190 : {
191 141 : return encoding_for_label(
192 141 : reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
193 : }
194 :
195 : /**
196 : * `nsAString` argument version. See above for docs.
197 : */
198 20 : static inline const Encoding* ForLabel(const nsAString& aLabel)
199 : {
200 20 : return Encoding::ForLabel(NS_ConvertUTF16toUTF8(aLabel));
201 : }
202 :
203 : /**
204 : * This method behaves the same as `ForLabel()`, except when `ForLabel()`
205 : * would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
206 : *
207 : * This method is useful in scenarios where a fatal error is required
208 : * upon invalid label, because in those cases the caller typically wishes
209 : * to treat the labels that map to the replacement encoding as fatal
210 : * errors, too.
211 : *
212 : * It is not OK to use this method when the action upon the method returning
213 : * `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
214 : * such a case, the `ForLabel()` method should be used instead in order to avoid
215 : * unsafe fallback for labels that `ForLabel()` maps to `REPLACEMENT_ENCODING`.
216 : */
217 4805 : static inline const Encoding* ForLabelNoReplacement(Span<const char> aLabel)
218 : {
219 4805 : return encoding_for_label_no_replacement(
220 4805 : reinterpret_cast<const uint8_t*>(aLabel.Elements()), aLabel.Length());
221 : }
222 :
223 : /**
224 : * `nsAString` argument version. See above for docs.
225 : */
226 0 : static inline const Encoding* ForLabelNoReplacement(const nsAString& aLabel)
227 : {
228 0 : return Encoding::ForLabelNoReplacement(NS_ConvertUTF16toUTF8(aLabel));
229 : }
230 :
231 : /**
232 : * Performs non-incremental BOM sniffing.
233 : *
234 : * The argument must either be a buffer representing the entire input
235 : * stream (non-streaming case) or a buffer representing at least the first
236 : * three bytes of the input stream (streaming case).
237 : *
238 : * Returns `MakeTuple(UTF_8_ENCODING, 3)`, `MakeTuple(UTF_16LE_ENCODING, 2)`
239 : * or `MakeTuple(UTF_16BE_ENCODING, 2)` if the argument starts with the
240 : * UTF-8, UTF-16LE or UTF-16BE BOM or `MakeTuple(nullptr, 0)` otherwise.
241 : */
242 116 : static inline Tuple<const Encoding*, size_t> ForBOM(
243 : Span<const uint8_t> aBuffer)
244 : {
245 116 : size_t len = aBuffer.Length();
246 116 : const Encoding* encoding = encoding_for_bom(aBuffer.Elements(), &len);
247 116 : return MakeTuple(encoding, len);
248 : }
249 :
250 : /**
251 : * If the argument matches exactly (case-sensitively; no whitespace
252 : * removal performed) the name of an encoding, returns
253 : * `const Encoding*` representing that encoding. Otherwise `MOZ_CRASH`es.
254 : *
255 : * The motivating use case for this method is interoperability with
256 : * legacy Gecko code that represents encodings as name string instead of
257 : * type-safe `Encoding` objects. Using this method for other purposes is
258 : * most likely the wrong thing to do.
259 : */
260 4 : static inline NotNull<const mozilla::Encoding*> ForName(
261 : Span<const char> aName)
262 : {
263 : return WrapNotNull(encoding_for_name(
264 4 : reinterpret_cast<const uint8_t*>(aName.Elements()), aName.Length()));
265 : }
266 :
267 : /**
268 : * Writes the name of this encoding into `aName`.
269 : *
270 : * This name is appropriate to return as-is from the DOM
271 : * `document.characterSet` property.
272 : */
273 312 : inline void Name(nsACString& aName) const
274 : {
275 312 : aName.SetLength(ENCODING_NAME_MAX_LENGTH);
276 : size_t length =
277 312 : encoding_name(this, reinterpret_cast<uint8_t*>(aName.BeginWriting()));
278 312 : aName.SetLength(length); // truncation is the 64-bit case is OK
279 312 : }
280 :
281 : /**
282 : * Checks whether the _output encoding_ of this encoding can encode every
283 : * Unicode code point. (Only true if the output encoding is UTF-8.)
284 : */
285 0 : inline bool CanEncodeEverything() const
286 : {
287 0 : return encoding_can_encode_everything(this);
288 : }
289 :
290 : /**
291 : * Checks whether the bytes 0x00...0x7F map exclusively to the characters
292 : * U+0000...U+007F and vice versa.
293 : */
294 9 : inline bool IsAsciiCompatible() const
295 : {
296 9 : return encoding_is_ascii_compatible(this);
297 : }
298 :
299 : /**
300 : * Returns the _output encoding_ of this encoding. This is UTF-8 for
301 : * UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
302 : */
303 0 : inline NotNull<const mozilla::Encoding*> OutputEncoding() const
304 : {
305 0 : return WrapNotNull(encoding_output_encoding(this));
306 : }
307 :
308 : /**
309 : * Decode complete input to `nsACString` _with BOM sniffing_ and with
310 : * malformed sequences replaced with the REPLACEMENT CHARACTER when the
311 : * entire input is available as a single buffer (i.e. the end of the
312 : * buffer marks the end of the stream).
313 : *
314 : * This method implements the (non-streaming version of) the
315 : * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
316 : *
317 : * The second item in the returned tuple is the encoding that was actually
318 : * used (which may differ from this encoding thanks to BOM sniffing).
319 : *
320 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
321 : * if there were malformed sequences (that were replaced with the
322 : * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
323 : * tuple.
324 : *
325 : * The backing buffer of the string isn't copied if the input buffer
326 : * is heap-allocated and decoding from UTF-8 and the input is valid
327 : * BOMless UTF-8, decoding from an ASCII-compatible encoding and
328 : * the input is valid ASCII or decoding from ISO-2022-JP and the
329 : * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
330 : * the same string as both arguments.
331 : *
332 : * _Note:_ It is wrong to use this when the input buffer represents only
333 : * a segment of the input instead of the whole input. Use `NewDecoder()`
334 : * when decoding segmented input.
335 : */
336 : inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
337 : const nsACString& aBytes,
338 : nsACString& aOut) const
339 : {
340 : const Encoding* encoding = this;
341 : const nsACString* bytes = &aBytes;
342 : nsACString* out = &aOut;
343 : nsresult rv;
344 : if (bytes == out) {
345 : nsAutoCString temp(aBytes);
346 : rv = mozilla_encoding_decode_to_nscstring(&encoding, &temp, out);
347 : } else {
348 : rv = mozilla_encoding_decode_to_nscstring(&encoding, bytes, out);
349 : }
350 : return MakeTuple(rv, WrapNotNull(encoding));
351 : }
352 :
353 : /**
354 : * Decode complete input to `nsAString` _with BOM sniffing_ and with
355 : * malformed sequences replaced with the REPLACEMENT CHARACTER when the
356 : * entire input is available as a single buffer (i.e. the end of the
357 : * buffer marks the end of the stream).
358 : *
359 : * This method implements the (non-streaming version of) the
360 : * _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
361 : *
362 : * The second item in the returned tuple is the encoding that was actually
363 : * used (which may differ from this encoding thanks to BOM sniffing).
364 : *
365 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
366 : * if there were malformed sequences (that were replaced with the
367 : * REPLACEMENT CHARACTER) and `NS_OK` otherwise as the first item of the
368 : * tuple.
369 : *
370 : * _Note:_ It is wrong to use this when the input buffer represents only
371 : * a segment of the input instead of the whole input. Use `NewDecoder()`
372 : * when decoding segmented input.
373 : */
374 0 : inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Decode(
375 : Span<const uint8_t> aBytes,
376 : nsAString& aOut) const
377 : {
378 0 : const Encoding* encoding = this;
379 0 : nsresult rv = mozilla_encoding_decode_to_nsstring(
380 0 : &encoding, aBytes.Elements(), aBytes.Length(), &aOut);
381 0 : return MakeTuple(rv, WrapNotNull(encoding));
382 : }
383 :
384 : /**
385 : * Decode complete input to `nsACString` _with BOM removal_ and with
386 : * malformed sequences replaced with the REPLACEMENT CHARACTER when the
387 : * entire input is available as a single buffer (i.e. the end of the
388 : * buffer marks the end of the stream).
389 : *
390 : * When invoked on `UTF_8`, this method implements the (non-streaming
391 : * version of) the _UTF-8 decode_
392 : * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
393 : *
394 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
395 : * if there were malformed sequences (that were replaced with the
396 : * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
397 : *
398 : * The backing buffer of the string isn't copied if the input buffer
399 : * is heap-allocated and decoding from UTF-8 and the input is valid
400 : * BOMless UTF-8, decoding from an ASCII-compatible encoding and
401 : * the input is valid ASCII or decoding from ISO-2022-JP and the
402 : * input stays in the ASCII state of ISO-2022-JP. It is OK to pass
403 : * the same string as both arguments.
404 : *
405 : * _Note:_ It is wrong to use this when the input buffer represents only
406 : * a segment of the input instead of the whole input. Use
407 : * `NewDecoderWithBOMRemoval()` when decoding segmented input.
408 : */
409 : inline nsresult DecodeWithBOMRemoval(const nsACString& aBytes,
410 : nsACString& aOut) const
411 : {
412 : const nsACString* bytes = &aBytes;
413 : nsACString* out = &aOut;
414 : if (bytes == out) {
415 : nsAutoCString temp(aBytes);
416 : return mozilla_encoding_decode_to_nscstring_with_bom_removal(
417 : this, &temp, out);
418 : }
419 : return mozilla_encoding_decode_to_nscstring_with_bom_removal(
420 : this, bytes, out);
421 : }
422 :
423 : /**
424 : * Decode complete input to `nsAString` _with BOM removal_ and with
425 : * malformed sequences replaced with the REPLACEMENT CHARACTER when the
426 : * entire input is available as a single buffer (i.e. the end of the
427 : * buffer marks the end of the stream).
428 : *
429 : * When invoked on `UTF_8`, this method implements the (non-streaming
430 : * version of) the _UTF-8 decode_
431 : * (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
432 : *
433 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
434 : * if there were malformed sequences (that were replaced with the
435 : * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
436 : *
437 : * _Note:_ It is wrong to use this when the input buffer represents only
438 : * a segment of the input instead of the whole input. Use
439 : * `NewDecoderWithBOMRemoval()` when decoding segmented input.
440 : */
441 1 : inline nsresult DecodeWithBOMRemoval(Span<const uint8_t> aBytes,
442 : nsAString& aOut) const
443 : {
444 1 : return mozilla_encoding_decode_to_nsstring_with_bom_removal(
445 1 : this, aBytes.Elements(), aBytes.Length(), &aOut);
446 : }
447 :
448 : /**
449 : * Decode complete input to `nsACString` _without BOM handling_ and
450 : * with malformed sequences replaced with the REPLACEMENT CHARACTER when
451 : * the entire input is available as a single buffer (i.e. the end of the
452 : * buffer marks the end of the stream).
453 : *
454 : * When invoked on `UTF_8`, this method implements the (non-streaming
455 : * version of) the _UTF-8 decode without BOM_
456 : * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
457 : *
458 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
459 : * if there were malformed sequences (that were replaced with the
460 : * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
461 : *
462 : * The backing buffer of the string isn't copied if the input buffer
463 : * is heap-allocated and decoding from UTF-8 and the input is valid
464 : * UTF-8, decoding from an ASCII-compatible encoding and the input
465 : * is valid ASCII or decoding from ISO-2022-JP and the input stays
466 : * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
467 : * as both arguments.
468 : *
469 : * _Note:_ It is wrong to use this when the input buffer represents only
470 : * a segment of the input instead of the whole input. Use
471 : * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
472 : */
473 0 : inline nsresult DecodeWithoutBOMHandling(const nsACString& aBytes,
474 : nsACString& aOut) const
475 : {
476 0 : const nsACString* bytes = &aBytes;
477 0 : nsACString* out = &aOut;
478 0 : if (bytes == out) {
479 0 : nsAutoCString temp(aBytes);
480 : return mozilla_encoding_decode_to_nscstring_without_bom_handling(
481 0 : this, &temp, out);
482 : }
483 : return mozilla_encoding_decode_to_nscstring_without_bom_handling(
484 0 : this, bytes, out);
485 : }
486 :
487 : /**
488 : * Decode complete input to `nsAString` _without BOM handling_ and
489 : * with malformed sequences replaced with the REPLACEMENT CHARACTER when
490 : * the entire input is available as a single buffer (i.e. the end of the
491 : * buffer marks the end of the stream).
492 : *
493 : * When invoked on `UTF_8`, this method implements the (non-streaming
494 : * version of) the _UTF-8 decode without BOM_
495 : * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
496 : *
497 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_OK_HAD_REPLACEMENTS`
498 : * if there were malformed sequences (that were replaced with the
499 : * REPLACEMENT CHARACTER) and `NS_OK` otherwise.
500 : *
501 : * _Note:_ It is wrong to use this when the input buffer represents only
502 : * a segment of the input instead of the whole input. Use
503 : * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
504 : */
505 25 : inline nsresult DecodeWithoutBOMHandling(Span<const uint8_t> aBytes,
506 : nsAString& aOut) const
507 : {
508 25 : return mozilla_encoding_decode_to_nsstring_without_bom_handling(
509 25 : this, aBytes.Elements(), aBytes.Length(), &aOut);
510 : }
511 :
512 : /**
513 : * Decode complete input to `nsACString` _without BOM handling_ and
514 : * _with malformed sequences treated as fatal_ when the entire input is
515 : * available as a single buffer (i.e. the end of the buffer marks the end
516 : * of the stream).
517 : *
518 : * When invoked on `UTF_8`, this method implements the (non-streaming
519 : * version of) the _UTF-8 decode without BOM or fail_
520 : * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
521 : * spec concept.
522 : *
523 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
524 : * if a malformed sequence was encountered and `NS_OK` otherwise.
525 : *
526 : * The backing buffer of the string isn't copied if the input buffer
527 : * is heap-allocated and decoding from UTF-8 and the input is valid
528 : * UTF-8, decoding from an ASCII-compatible encoding and the input
529 : * is valid ASCII or decoding from ISO-2022-JP and the input stays
530 : * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
531 : * as both arguments.
532 : *
533 : * _Note:_ It is wrong to use this when the input buffer represents only
534 : * a segment of the input instead of the whole input. Use
535 : * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
536 : */
537 0 : inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
538 : const nsACString& aBytes,
539 : nsACString& aOut) const
540 : {
541 0 : const nsACString* bytes = &aBytes;
542 0 : nsACString* out = &aOut;
543 0 : if (bytes == out) {
544 0 : nsAutoCString temp(aBytes);
545 : return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
546 0 : this, &temp, out);
547 : }
548 : return mozilla_encoding_decode_to_nscstring_without_bom_handling_and_without_replacement(
549 0 : this, bytes, out);
550 : }
551 :
552 : /**
553 : * Decode complete input to `nsAString` _without BOM handling_ and
554 : * _with malformed sequences treated as fatal_ when the entire input is
555 : * available as a single buffer (i.e. the end of the buffer marks the end
556 : * of the stream).
557 : *
558 : * When invoked on `UTF_8`, this method implements the (non-streaming
559 : * version of) the _UTF-8 decode without BOM or fail_
560 : * (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
561 : * spec concept.
562 : *
563 : * Returns `NS_ERROR_OUT_OF_MEMORY` upon OOM, `NS_ERROR_UDEC_ILLEGALINPUT`
564 : * if a malformed sequence was encountered and `NS_OK` otherwise.
565 : *
566 : * _Note:_ It is wrong to use this when the input buffer represents only
567 : * a segment of the input instead of the whole input. Use
568 : * `NewDecoderWithoutBOMHandling()` when decoding segmented input.
569 : */
570 0 : inline nsresult DecodeWithoutBOMHandlingAndWithoutReplacement(
571 : Span<const uint8_t> aBytes,
572 : nsAString& aOut) const
573 : {
574 0 : return mozilla_encoding_decode_to_nsstring_without_bom_handling_and_without_replacement(
575 0 : this, aBytes.Elements(), aBytes.Length(), &aOut);
576 : }
577 :
578 : /**
579 : * Encode complete input to `nsACString` with unmappable characters
580 : * replaced with decimal numeric character references when the entire input
581 : * is available as a single buffer (i.e. the end of the buffer marks the
582 : * end of the stream).
583 : *
584 : * This method implements the (non-streaming version of) the
585 : * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
586 : *
587 : * The second item in the returned tuple is the encoding that was actually
588 : * used (which may differ from this encoding thanks to some encodings
589 : * having UTF-8 as their output encoding).
590 : *
591 : * The first item of the returned tuple is `NS_ERROR_UDEC_ILLEGALINPUT` if
592 : * the input is not valid UTF-8, `NS_ERROR_OUT_OF_MEMORY` upon OOM,
593 : * `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that were
594 : * replaced with numeric character references) and `NS_OK` otherwise.
595 : *
596 : * The backing buffer of the string isn't copied if the input buffer
597 : * is heap-allocated and encoding to UTF-8 and the input is valid
598 : * UTF-8, encoding to an ASCII-compatible encoding and the input
599 : * is valid ASCII or encoding to ISO-2022-JP and the input stays
600 : * in the ASCII state of ISO-2022-JP. It is OK to pass the same string
601 : * as both arguments.
602 : *
603 : * _Note:_ It is wrong to use this when the input buffer represents only
604 : * a segment of the input instead of the whole input. Use `NewEncoder()`
605 : * when encoding segmented output.
606 : */
607 0 : inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
608 : const nsACString& aString,
609 : nsACString& aOut) const
610 : {
611 0 : const Encoding* encoding = this;
612 0 : const nsACString* string = &aString;
613 0 : nsACString* out = &aOut;
614 : nsresult rv;
615 0 : if (string == out) {
616 0 : nsAutoCString temp(aString);
617 0 : rv = mozilla_encoding_encode_from_nscstring(&encoding, &temp, out);
618 : } else {
619 0 : rv = mozilla_encoding_encode_from_nscstring(&encoding, string, out);
620 : }
621 0 : return MakeTuple(rv, WrapNotNull(encoding));
622 : }
623 :
624 : /**
625 : * Encode complete input to `nsACString` with unmappable characters
626 : * replaced with decimal numeric character references when the entire input
627 : * is available as a single buffer (i.e. the end of the buffer marks the
628 : * end of the stream).
629 : *
630 : * This method implements the (non-streaming version of) the
631 : * _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
632 : *
633 : * The second item in the returned tuple is the encoding that was actually
634 : * used (which may differ from this encoding thanks to some encodings
635 : * having UTF-8 as their output encoding).
636 : *
637 : * The first item of the returned tuple is `NS_ERROR_OUT_OF_MEMORY` upon
638 : * OOM, `NS_OK_HAD_REPLACEMENTS` if there were unmappable code points (that
639 : * were replaced with numeric character references) and `NS_OK` otherwise.
640 : *
641 : * _Note:_ It is wrong to use this when the input buffer represents only
642 : * a segment of the input instead of the whole input. Use `NewEncoder()`
643 : * when encoding segmented output.
644 : */
645 0 : inline Tuple<nsresult, NotNull<const mozilla::Encoding*>> Encode(
646 : Span<const char16_t> aString,
647 : nsACString& aOut) const
648 : {
649 0 : const Encoding* encoding = this;
650 0 : nsresult rv = mozilla_encoding_encode_from_utf16(
651 0 : &encoding, aString.Elements(), aString.Length(), &aOut);
652 0 : return MakeTuple(rv, WrapNotNull(encoding));
653 : }
654 :
655 : /**
656 : * Instantiates a new decoder for this encoding with BOM sniffing enabled.
657 : *
658 : * BOM sniffing may cause the returned decoder to morph into a decoder
659 : * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
660 : */
661 5 : inline UniquePtr<Decoder> NewDecoder() const
662 : {
663 5 : UniquePtr<Decoder> decoder(encoding_new_decoder(this));
664 5 : return decoder;
665 : }
666 :
667 : /**
668 : * Instantiates a new decoder for this encoding with BOM sniffing enabled
669 : * into memory occupied by a previously-instantiated decoder.
670 : *
671 : * BOM sniffing may cause the returned decoder to morph into a decoder
672 : * for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
673 : */
674 : inline void NewDecoderInto(Decoder& aDecoder) const
675 : {
676 : encoding_new_decoder_into(this, &aDecoder);
677 : }
678 :
679 : /**
680 : * Instantiates a new decoder for this encoding with BOM removal.
681 : *
682 : * If the input starts with bytes that are the BOM for this encoding,
683 : * those bytes are removed. However, the decoder never morphs into a
684 : * decoder for another encoding: A BOM for another encoding is treated as
685 : * (potentially malformed) input to the decoding algorithm for this
686 : * encoding.
687 : */
688 82 : inline UniquePtr<Decoder> NewDecoderWithBOMRemoval() const
689 : {
690 82 : UniquePtr<Decoder> decoder(encoding_new_decoder_with_bom_removal(this));
691 82 : return decoder;
692 : }
693 :
694 : /**
695 : * Instantiates a new decoder for this encoding with BOM removal
696 : * into memory occupied by a previously-instantiated decoder.
697 : *
698 : * If the input starts with bytes that are the BOM for this encoding,
699 : * those bytes are removed. However, the decoder never morphs into a
700 : * decoder for another encoding: A BOM for another encoding is treated as
701 : * (potentially malformed) input to the decoding algorithm for this
702 : * encoding.
703 : */
704 0 : inline void NewDecoderWithBOMRemovalInto(Decoder& aDecoder) const
705 : {
706 0 : encoding_new_decoder_with_bom_removal_into(this, &aDecoder);
707 0 : }
708 :
709 : /**
710 : * Instantiates a new decoder for this encoding with BOM handling disabled.
711 : *
712 : * If the input starts with bytes that look like a BOM, those bytes are
713 : * not treated as a BOM. (Hence, the decoder never morphs into a decoder
714 : * for another encoding.)
715 : *
716 : * _Note:_ If the caller has performed BOM sniffing on its own but has not
717 : * removed the BOM, the caller should use `NewDecoderWithBOMRemoval()`
718 : * instead of this method to cause the BOM to be removed.
719 : */
720 40 : inline UniquePtr<Decoder> NewDecoderWithoutBOMHandling() const
721 : {
722 40 : UniquePtr<Decoder> decoder(encoding_new_decoder_without_bom_handling(this));
723 40 : return decoder;
724 : }
725 :
726 : /**
727 : * Instantiates a new decoder for this encoding with BOM handling disabled
728 : * into memory occupied by a previously-instantiated decoder.
729 : *
730 : * If the input starts with bytes that look like a BOM, those bytes are
731 : * not treated as a BOM. (Hence, the decoder never morphs into a decoder
732 : * for another encoding.)
733 : *
734 : * _Note:_ If the caller has performed BOM sniffing on its own but has not
735 : * removed the BOM, the caller should use `NewDecoderWithBOMRemovalInto()`
736 : * instead of this method to cause the BOM to be removed.
737 : */
738 0 : inline void NewDecoderWithoutBOMHandlingInto(Decoder& aDecoder) const
739 : {
740 0 : encoding_new_decoder_without_bom_handling_into(this, &aDecoder);
741 0 : }
742 :
743 : /**
744 : * Instantiates a new encoder for the output encoding of this encoding.
745 : */
746 0 : inline UniquePtr<Encoder> NewEncoder() const
747 : {
748 0 : UniquePtr<Encoder> encoder(encoding_new_encoder(this));
749 0 : return encoder;
750 : }
751 :
752 : /**
753 : * Instantiates a new encoder for the output encoding of this encoding
754 : * into memory occupied by a previously-instantiated encoder.
755 : */
756 0 : inline void NewEncoderInto(Encoder& aEncoder) const
757 : {
758 0 : encoding_new_encoder_into(this, &aEncoder);
759 0 : }
760 :
761 : /**
762 : * Validates UTF-8.
763 : *
764 : * Returns the index of the first byte that makes the input malformed as
765 : * UTF-8 or the length of the input if the input is entirely valid.
766 : */
767 : static inline size_t UTF8ValidUpTo(Span<const uint8_t> aBuffer)
768 : {
769 : return encoding_utf8_valid_up_to(aBuffer.Elements(), aBuffer.Length());
770 : }
771 :
772 : /**
773 : * Validates ASCII.
774 : *
775 : * Returns the index of the first byte that makes the input malformed as
776 : * ASCII or the length of the input if the input is entirely valid.
777 : */
778 : static inline size_t ASCIIValidUpTo(Span<const uint8_t> aBuffer)
779 : {
780 : return encoding_ascii_valid_up_to(aBuffer.Elements(), aBuffer.Length());
781 : }
782 :
783 : /**
784 : * Validates ISO-2022-JP ASCII-state data.
785 : *
786 : * Returns the index of the first byte that makes the input not
787 : * representable in the ASCII state of ISO-2022-JP or the length of the
788 : * input if the input is entirely representable in the ASCII state of
789 : * ISO-2022-JP.
790 : */
791 : static inline size_t ISO2022JPASCIIValidUpTo(Span<const uint8_t> aBuffer)
792 : {
793 : return encoding_iso_2022_jp_ascii_valid_up_to(aBuffer.Elements(),
794 : aBuffer.Length());
795 : }
796 :
797 : private:
798 : Encoding() = delete;
799 : Encoding(const Encoding&) = delete;
800 : Encoding& operator=(const Encoding&) = delete;
801 : ~Encoding() = delete;
802 :
803 : };
804 :
805 : /**
806 : * A converter that decodes a byte stream into Unicode according to a
807 : * character encoding in a streaming (incremental) manner.
808 : *
809 : * The various `Decode*` methods take an input buffer (`aSrc`) and an output
810 : * buffer `aDst` both of which are caller-allocated. There are variants for
811 : * both UTF-8 and UTF-16 output buffers.
812 : *
813 : * A `Decode*` method decodes bytes from `aSrc` into Unicode characters stored
814 : * into `aDst` until one of the following three things happens:
815 : *
816 : * 1. A malformed byte sequence is encountered (`*WithoutReplacement`
817 : * variants only).
818 : *
819 : * 2. The output buffer has been filled so near capacity that the decoder
820 : * cannot be sure that processing an additional byte of input wouldn't
821 : * cause so much output that the output buffer would overflow.
822 : *
823 : * 3. All the input bytes have been processed.
824 : *
825 : * The `Decode*` method then returns a tuple of a status indicating which one
826 : * of the three reasons to return happened, how many input bytes were read,
827 : * how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
828 : * when decoding to UTF-16) were written, and in the case of the
829 : * variants performing replacement, a boolean indicating whether an error was
830 : * replaced with the REPLACEMENT CHARACTER during the call.
831 : *
832 : * The number of bytes "written" is what's logically written. Garbage may be
833 : * written in the output buffer beyond the point logically written to.
834 : *
835 : * In the case of the `*WithoutReplacement` variants, the status is a
836 : * `uint32_t` whose possible values are packed info about a malformed byte
837 : * sequence, `kOutputFull` and `kInputEmpty` (corresponding to the three cases
838 : * listed above).
839 : *
840 : * Packed info about malformed sequences has the following format:
841 : * The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
842 : * indicate the number of bytes that were consumed after the malformed
843 : * sequence. The next-lowest 8 bits, when shifted right by 8, indicate
844 : * the length of the malformed byte sequence (possible decimal values 1, 2,
845 : * 3 or 4). The maximum possible sum of the two is 6.
846 : *
847 : * In the case of methods whose name does not end with
848 : * `*WithoutReplacement`, malformed sequences are automatically replaced
849 : * with the REPLACEMENT CHARACTER and errors do not cause the methods to
850 : * return early.
851 : *
852 : * When decoding to UTF-8, the output buffer must have at least 4 bytes of
853 : * space. When decoding to UTF-16, the output buffer must have at least two
854 : * UTF-16 code units (`char16_t`) of space.
855 : *
856 : * When decoding to UTF-8 without replacement, the methods are guaranteed
857 : * not to return indicating that more output space is needed if the length
858 : * of the output buffer is at least the length returned by
859 : * `MaxUTF8BufferLengthWithoutReplacement()`. When decoding to UTF-8
860 : * with replacement, the length of the output buffer that guarantees the
861 : * methods not to return indicating that more output space is needed is given
862 : * by `MaxUTF8BufferLength()`. When decoding to UTF-16 with
863 : * or without replacement, the length of the output buffer that guarantees
864 : * the methods not to return indicating that more output space is needed is
865 : * given by `MaxUTF16BufferLength()`.
866 : *
867 : * The output written into `aDst` is guaranteed to be valid UTF-8 or UTF-16,
868 : * and the output after each `Decode*` call is guaranteed to consist of
869 : * complete characters. (I.e. the code unit sequence for the last character is
870 : * guaranteed not to be split across output buffers.)
871 : *
872 : * The boolean argument `aLast` indicates that the end of the stream is reached
873 : * when all the bytes in `aSrc` have been consumed.
874 : *
875 : * A `Decoder` object can be used to incrementally decode a byte stream.
876 : *
877 : * During the processing of a single stream, the caller must call `Decode*`
878 : * zero or more times with `aLast` set to `false` and then call `Decode*` at
879 : * least once with `aLast` set to `true`. If `Decode*` returns `kInputEmpty`,
880 : * the processing of the stream has ended. Otherwise, the caller must call
881 : * `Decode*` again with `aLast` set to `true` (or treat a malformed result,
882 : * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
883 : *
884 : * Once the stream has ended, the `Decoder` object must not be used anymore.
885 : * That is, you need to create another one to process another stream.
886 : *
887 : * When the decoder returns `kOutputFull` or the decoder returns a malformed
888 : * result and the caller does not wish to treat it as a fatal error, the input
889 : * buffer `aSrc` may not have been completely consumed. In that case, the caller
890 : * must pass the unconsumed contents of `aSrc` to `Decode*` again upon the next
891 : * call.
892 : *
893 : * # Infinite loops
894 : *
895 : * When converting with a fixed-size output buffer whose size is too small to
896 : * accommodate one character of output, an infinite loop ensues. When
897 : * converting with a fixed-size output buffer, it generally makes sense to
898 : * make the buffer fairly large (e.g. couple of kilobytes).
899 : */
900 : class Decoder final
901 : {
902 : public:
903 124 : ~Decoder() {}
904 124 : static void operator delete(void* aDecoder)
905 : {
906 124 : decoder_free(reinterpret_cast<Decoder*>(aDecoder));
907 124 : }
908 :
909 : /**
910 : * The `Encoding` this `Decoder` is for.
911 : *
912 : * BOM sniffing can change the return value of this method during the life
913 : * of the decoder.
914 : */
915 37 : inline NotNull<const mozilla::Encoding*> Encoding() const
916 : {
917 37 : return WrapNotNull(decoder_encoding(this));
918 : }
919 :
920 : /**
921 : * Query the worst-case UTF-8 output size _with replacement_.
922 : *
923 : * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
924 : * that will not overflow given the current state of the decoder and
925 : * `aByteLength` number of additional input bytes when decoding with
926 : * errors handled by outputting a REPLACEMENT CHARACTER for each malformed
927 : * sequence.
928 : */
929 : inline CheckedInt<size_t> MaxUTF8BufferLength(size_t aByteLength) const
930 : {
931 : CheckedInt<size_t> max(decoder_max_utf8_buffer_length(this, aByteLength));
932 : if (max.value() == MaxValue<size_t>::value) {
933 : // Mark invalid by overflowing
934 : max++;
935 : MOZ_ASSERT(!max.isValid());
936 : }
937 : return max;
938 : }
939 :
940 : /**
941 : * Query the worst-case UTF-8 output size _without replacement_.
942 : *
943 : * Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
944 : * that will not overflow given the current state of the decoder and
945 : * `aByteLength` number of additional input bytes when decoding without
946 : * replacement error handling.
947 : *
948 : * Note that this value may be too small for the `WithReplacement` case.
949 : * Use `MaxUTF8BufferLength()` for that case.
950 : */
951 : inline CheckedInt<size_t> MaxUTF8BufferLengthWithoutReplacement(
952 : size_t aByteLength) const
953 : {
954 : CheckedInt<size_t> max(
955 : decoder_max_utf8_buffer_length_without_replacement(this, aByteLength));
956 : if (max.value() == MaxValue<size_t>::value) {
957 : // Mark invalid by overflowing
958 : max++;
959 : MOZ_ASSERT(!max.isValid());
960 : }
961 : return max;
962 : }
963 :
964 : /**
965 : * Incrementally decode a byte stream into UTF-8 with malformed sequences
966 : * replaced with the REPLACEMENT CHARACTER.
967 : *
968 : * See the documentation of the class for documentation for `Decode*`
969 : * methods collectively.
970 : */
971 : inline Tuple<uint32_t, size_t, size_t, bool>
972 : DecodeToUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
973 : {
974 : size_t srcRead = aSrc.Length();
975 : size_t dstWritten = aDst.Length();
976 : bool hadReplacements;
977 : uint32_t result = decoder_decode_to_utf8(this,
978 : aSrc.Elements(),
979 : &srcRead,
980 : aDst.Elements(),
981 : &dstWritten,
982 : aLast,
983 : &hadReplacements);
984 : return MakeTuple(result, srcRead, dstWritten, hadReplacements);
985 : }
986 :
987 : /**
988 : * Incrementally decode a byte stream into UTF-8 _without replacement_.
989 : *
990 : * See the documentation of the class for documentation for `Decode*`
991 : * methods collectively.
992 : */
993 : inline Tuple<uint32_t, size_t, size_t> DecodeToUTF8WithoutReplacement(
994 : Span<const uint8_t> aSrc,
995 : Span<uint8_t> aDst,
996 : bool aLast)
997 : {
998 : size_t srcRead = aSrc.Length();
999 : size_t dstWritten = aDst.Length();
1000 : uint32_t result = decoder_decode_to_utf8_without_replacement(
1001 : this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1002 : return MakeTuple(result, srcRead, dstWritten);
1003 : }
1004 :
1005 : /**
1006 : * Query the worst-case UTF-16 output size (with or without replacement).
1007 : *
1008 : * Returns the size of the output buffer in UTF-16 code units (`char16_t`)
1009 : * that will not overflow given the current state of the decoder and
1010 : * `aByteLength` number of additional input bytes.
1011 : *
1012 : * Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
1013 : * return value of this method applies also in the
1014 : * `_without_replacement` case.
1015 : */
1016 176 : inline CheckedInt<size_t> MaxUTF16BufferLength(size_t aU16Length) const
1017 : {
1018 176 : CheckedInt<size_t> max(decoder_max_utf16_buffer_length(this, aU16Length));
1019 176 : if (max.value() == MaxValue<size_t>::value) {
1020 : // Mark invalid by overflowing
1021 0 : max++;
1022 0 : MOZ_ASSERT(!max.isValid());
1023 : }
1024 176 : return max;
1025 : }
1026 :
1027 : /**
1028 : * Incrementally decode a byte stream into UTF-16 with malformed sequences
1029 : * replaced with the REPLACEMENT CHARACTER.
1030 : *
1031 : * See the documentation of the class for documentation for `Decode*`
1032 : * methods collectively.
1033 : */
1034 : inline Tuple<uint32_t, size_t, size_t, bool>
1035 157 : DecodeToUTF16(Span<const uint8_t> aSrc, Span<char16_t> aDst, bool aLast)
1036 : {
1037 157 : size_t srcRead = aSrc.Length();
1038 157 : size_t dstWritten = aDst.Length();
1039 : bool hadReplacements;
1040 157 : uint32_t result = decoder_decode_to_utf16(this,
1041 : aSrc.Elements(),
1042 : &srcRead,
1043 : aDst.Elements(),
1044 : &dstWritten,
1045 : aLast,
1046 157 : &hadReplacements);
1047 157 : return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1048 : }
1049 :
1050 : /**
1051 : * Incrementally decode a byte stream into UTF-16 _without replacement_.
1052 : *
1053 : * See the documentation of the class for documentation for `Decode*`
1054 : * methods collectively.
1055 : */
1056 25 : inline Tuple<uint32_t, size_t, size_t> DecodeToUTF16WithoutReplacement(
1057 : Span<const uint8_t> aSrc,
1058 : Span<char16_t> aDst,
1059 : bool aLast)
1060 : {
1061 25 : size_t srcRead = aSrc.Length();
1062 25 : size_t dstWritten = aDst.Length();
1063 25 : uint32_t result = decoder_decode_to_utf16_without_replacement(
1064 25 : this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1065 25 : return MakeTuple(result, srcRead, dstWritten);
1066 : }
1067 :
1068 : private:
1069 : Decoder() = delete;
1070 : Decoder(const Decoder&) = delete;
1071 : Decoder& operator=(const Decoder&) = delete;
1072 : };
1073 :
1074 : /**
1075 : * A converter that encodes a Unicode stream into bytes according to a
1076 : * character encoding in a streaming (incremental) manner.
1077 : *
1078 : * The various `Encode*` methods take an input buffer (`aSrc`) and an output
1079 : * buffer `aDst` both of which are caller-allocated. There are variants for
1080 : * both UTF-8 and UTF-16 input buffers.
1081 : *
1082 : * An `Encode*` method encodes characters from `aSrc` into bytes
1083 : * stored into `aDst` until one of the following three things happens:
1084 : *
1085 : * 1. An unmappable character is encountered (`*WithoutReplacement` variants
1086 : * only).
1087 : *
1088 : * 2. The output buffer has been filled so near capacity that the encoder
1089 : * cannot be sure that processing an additional character of input wouldn't
1090 : * cause so much output that the output buffer would overflow.
1091 : *
1092 : * 3. All the input characters have been processed.
1093 : *
1094 : * The `Encode*` method then returns a tuple of a status indicating which one
1095 : * of the three reasons to return happened, how many input code units (`uint8_t`
1096 : * when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
1097 : * how many output bytes were written, and in the case of the variants that
1098 : * perform replacement, a boolean indicating whether an unmappable
1099 : * character was replaced with a numeric character reference during the call.
1100 : *
1101 : * The number of bytes "written" is what's logically written. Garbage may be
1102 : * written in the output buffer beyond the point logically written to.
1103 : *
1104 : * In the case of the methods whose name ends with
1105 : * `*WithoutReplacement`, the status is a `uint32_t` whose possible values
1106 : * are an unmappable code point, `kOutputFull` and `kInputEmpty` (corresponding
1107 : * to the three cases listed above).
1108 : *
1109 : * In the case of methods whose name does not end with
1110 : * `*WithoutReplacement`, unmappable characters are automatically replaced
1111 : * with the corresponding numeric character references and unmappable
1112 : * characters do not cause the methods to return early.
1113 : *
1114 : * When encoding from UTF-8 without replacement, the methods are guaranteed
1115 : * not to return indicating that more output space is needed if the length
1116 : * of the output buffer is at least the length returned by
1117 : * `MaxBufferLengthFromUTF8WithoutReplacement()`. When encoding from
1118 : * UTF-8 with replacement, the length of the output buffer that guarantees the
1119 : * methods not to return indicating that more output space is needed in the
1120 : * absence of unmappable characters is given by
1121 : * `MaxBufferLengthFromUTF8IfNoUnmappables()`. When encoding from
1122 : * UTF-16 without replacement, the methods are guaranteed not to return
1123 : * indicating that more output space is needed if the length of the output
1124 : * buffer is at least the length returned by
1125 : * `MaxBufferLengthFromUTF16WithoutReplacement()`. When encoding
1126 : * from UTF-16 with replacement, the length of the output buffer that
1127 : * guarantees the methods not to return indicating that more output space is
1128 : * needed in the absence of unmappable characters is given by
1129 : * `MaxBufferLengthFromUTF16IfNoUnmappables()`.
1130 : * When encoding with replacement, applications are not expected to size the
1131 : * buffer for the worst case ahead of time but to resize the buffer if there
1132 : * are unmappable characters. This is why max length queries are only available
1133 : * for the case where there are no unmappable characters.
1134 : *
1135 : * When encoding from UTF-8, each `aSrc` buffer _must_ be valid UTF-8. When
1136 : * encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
1137 : * REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
1138 : * turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
1139 : * surrogate pairs are not split across input buffer boundaries.
1140 : *
1141 : * After an `Encode*` call returns, the output produced so far, taken as a
1142 : * whole from the start of the stream, is guaranteed to consist of a valid
1143 : * byte sequence in the target encoding. (I.e. the code unit sequence for a
1144 : * character is guaranteed not to be split across output buffers. However, due
1145 : * to the stateful nature of ISO-2022-JP, the stream needs to be considered
1146 : * from the start for it to be valid. For other encodings, the validity holds
1147 : * on a per-output buffer basis.)
1148 : *
1149 : * The boolean argument `aLast` indicates that the end of the stream is reached
1150 : * when all the characters in `aSrc` have been consumed. This argument is needed
1151 : * for ISO-2022-JP and is ignored for other encodings.
1152 : *
1153 : * An `Encoder` object can be used to incrementally encode a Unicode stream.
1154 : *
1155 : * During the processing of a single stream, the caller must call `Encode*`
1156 : * zero or more times with `aLast` set to `false` and then call `Encode*` at
1157 : * least once with `aLast` set to `true`. If `Encode*` returns `kInputEmpty`,
1158 : * the processing of the stream has ended. Otherwise, the caller must call
1159 : * `Encode*` again with `aLast` set to `true` (or treat an unmappable result,
1160 : * i.e. neither `kInputEmpty` nor `kOutputFull`, as a fatal error).
1161 : *
1162 : * Once the stream has ended, the `Encoder` object must not be used anymore.
1163 : * That is, you need to create another one to process another stream.
1164 : *
1165 : * When the encoder returns `kOutputFull` or the encoder returns an unmappable
1166 : * result and the caller does not wish to treat it as a fatal error, the input
1167 : * buffer `aSrc` may not have been completely consumed. In that case, the caller
1168 : * must pass the unconsumed contents of `aSrc` to `Encode*` again upon the next
1169 : * call.
1170 : *
1171 : * # Infinite loops
1172 : *
1173 : * When converting with a fixed-size output buffer whose size is too small to
1174 : * accommodate one character of output, an infinite loop ensues. When
1175 : * converting with a fixed-size output buffer, it generally makes sense to
1176 : * make the buffer fairly large (e.g. couple of kilobytes).
1177 : */
1178 : class Encoder final
1179 : {
1180 : public:
1181 0 : ~Encoder() {}
1182 :
1183 0 : static void operator delete(void* aEncoder)
1184 : {
1185 0 : encoder_free(reinterpret_cast<Encoder*>(aEncoder));
1186 0 : }
1187 :
1188 : /**
1189 : * The `Encoding` this `Encoder` is for.
1190 : */
1191 0 : inline NotNull<const mozilla::Encoding*> Encoding() const
1192 : {
1193 0 : return WrapNotNull(encoder_encoding(this));
1194 : }
1195 :
1196 : /**
1197 : * Returns `true` if this is an ISO-2022-JP encoder that's not in the
1198 : * ASCII state and `false` otherwise.
1199 : */
1200 : inline bool HasPendingState() const
1201 : {
1202 : return encoder_has_pending_state(this);
1203 : }
1204 :
1205 : /**
1206 : * Query the worst-case output size when encoding from UTF-8 with
1207 : * replacement.
1208 : *
1209 : * Returns the size of the output buffer in bytes that will not overflow
1210 : * given the current state of the encoder and `aByteLength` number of
1211 : * additional input code units if there are no unmappable characters in
1212 : * the input.
1213 : */
1214 : inline CheckedInt<size_t> MaxBufferLengthFromUTF8IfNoUnmappables(
1215 : size_t aByteLength) const
1216 : {
1217 : CheckedInt<size_t> max(
1218 : encoder_max_buffer_length_from_utf8_if_no_unmappables(this, aByteLength));
1219 : if (max.value() == MaxValue<size_t>::value) {
1220 : // Mark invalid by overflowing
1221 : max++;
1222 : MOZ_ASSERT(!max.isValid());
1223 : }
1224 : return max;
1225 : }
1226 :
1227 : /**
1228 : * Query the worst-case output size when encoding from UTF-8 without
1229 : * replacement.
1230 : *
1231 : * Returns the size of the output buffer in bytes that will not overflow
1232 : * given the current state of the encoder and `aByteLength` number of
1233 : * additional input code units.
1234 : */
1235 : inline CheckedInt<size_t> MaxBufferLengthFromUTF8WithoutReplacement(
1236 : size_t aByteLength) const
1237 : {
1238 : CheckedInt<size_t> max(
1239 : encoder_max_buffer_length_from_utf8_without_replacement(this,
1240 : aByteLength));
1241 : if (max.value() == MaxValue<size_t>::value) {
1242 : // Mark invalid by overflowing
1243 : max++;
1244 : MOZ_ASSERT(!max.isValid());
1245 : }
1246 : return max;
1247 : }
1248 :
1249 : /**
1250 : * Incrementally encode into byte stream from UTF-8 with unmappable
1251 : * characters replaced with HTML (decimal) numeric character references.
1252 : *
1253 : * See the documentation of the class for documentation for `Encode*`
1254 : * methods collectively.
1255 : *
1256 : * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1257 : * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1258 : * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1259 : */
1260 : inline Tuple<uint32_t, size_t, size_t, bool>
1261 : EncodeFromUTF8(Span<const uint8_t> aSrc, Span<uint8_t> aDst, bool aLast)
1262 : {
1263 : size_t srcRead = aSrc.Length();
1264 : size_t dstWritten = aDst.Length();
1265 : bool hadReplacements;
1266 : uint32_t result = encoder_encode_from_utf8(this,
1267 : aSrc.Elements(),
1268 : &srcRead,
1269 : aDst.Elements(),
1270 : &dstWritten,
1271 : aLast,
1272 : &hadReplacements);
1273 : return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1274 : }
1275 :
1276 : /**
1277 : * Incrementally encode into byte stream from UTF-8 _without replacement_.
1278 : *
1279 : * See the documentation of the class for documentation for `Encode*`
1280 : * methods collectively.
1281 : *
1282 : * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING:
1283 : * The input ***MUST*** be valid UTF-8 or bad things happen! Unless
1284 : * absolutely sure, use `Encoding::UTF8ValidUpTo()` to check.
1285 : */
1286 : inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF8WithoutReplacement(
1287 : Span<const uint8_t> aSrc,
1288 : Span<uint8_t> aDst,
1289 : bool aLast)
1290 : {
1291 : size_t srcRead = aSrc.Length();
1292 : size_t dstWritten = aDst.Length();
1293 : uint32_t result = encoder_encode_from_utf8_without_replacement(
1294 : this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1295 : return MakeTuple(result, srcRead, dstWritten);
1296 : }
1297 :
1298 : /**
1299 : * Query the worst-case output size when encoding from UTF-16 with
1300 : * replacement.
1301 : *
1302 : * Returns the size of the output buffer in bytes that will not overflow
1303 : * given the current state of the encoder and `aU16Length` number of
1304 : * additional input code units if there are no unmappable characters in
1305 : * the input.
1306 : */
1307 : inline CheckedInt<size_t> MaxBufferLengthFromUTF16IfNoUnmappables(
1308 : size_t aU16Length) const
1309 : {
1310 : CheckedInt<size_t> max(
1311 : encoder_max_buffer_length_from_utf16_if_no_unmappables(this, aU16Length));
1312 : if (max.value() == MaxValue<size_t>::value) {
1313 : // Mark invalid by overflowing
1314 : max++;
1315 : MOZ_ASSERT(!max.isValid());
1316 : }
1317 : return max;
1318 : }
1319 :
1320 : /**
1321 : * Query the worst-case output size when encoding from UTF-16 without
1322 : * replacement.
1323 : *
1324 : * Returns the size of the output buffer in bytes that will not overflow
1325 : * given the current state of the encoder and `aU16Length` number of
1326 : * additional input code units.
1327 : */
1328 0 : inline CheckedInt<size_t> MaxBufferLengthFromUTF16WithoutReplacement(
1329 : size_t aU16Length) const
1330 : {
1331 : CheckedInt<size_t> max(
1332 : encoder_max_buffer_length_from_utf16_without_replacement(this,
1333 0 : aU16Length));
1334 0 : if (max.value() == MaxValue<size_t>::value) {
1335 : // Mark invalid by overflowing
1336 0 : max++;
1337 0 : MOZ_ASSERT(!max.isValid());
1338 : }
1339 0 : return max;
1340 : }
1341 :
1342 : /**
1343 : * Incrementally encode into byte stream from UTF-16 with unmappable
1344 : * characters replaced with HTML (decimal) numeric character references.
1345 : *
1346 : * See the documentation of the class for documentation for `Encode*`
1347 : * methods collectively.
1348 : */
1349 : inline Tuple<uint32_t, size_t, size_t, bool>
1350 0 : EncodeFromUTF16(Span<const char16_t> aSrc, Span<uint8_t> aDst, bool aLast)
1351 : {
1352 0 : size_t srcRead = aSrc.Length();
1353 0 : size_t dstWritten = aDst.Length();
1354 : bool hadReplacements;
1355 0 : uint32_t result = encoder_encode_from_utf16(this,
1356 : aSrc.Elements(),
1357 : &srcRead,
1358 : aDst.Elements(),
1359 : &dstWritten,
1360 : aLast,
1361 0 : &hadReplacements);
1362 0 : return MakeTuple(result, srcRead, dstWritten, hadReplacements);
1363 : }
1364 :
1365 : /**
1366 : * Incrementally encode into byte stream from UTF-16 _without replacement_.
1367 : *
1368 : * See the documentation of the class for documentation for `Encode*`
1369 : * methods collectively.
1370 : */
1371 0 : inline Tuple<uint32_t, size_t, size_t> EncodeFromUTF16WithoutReplacement(
1372 : Span<const char16_t> aSrc,
1373 : Span<uint8_t> aDst,
1374 : bool aLast)
1375 : {
1376 0 : size_t srcRead = aSrc.Length();
1377 0 : size_t dstWritten = aDst.Length();
1378 0 : uint32_t result = encoder_encode_from_utf16_without_replacement(
1379 0 : this, aSrc.Elements(), &srcRead, aDst.Elements(), &dstWritten, aLast);
1380 0 : return MakeTuple(result, srcRead, dstWritten);
1381 : }
1382 :
1383 : private:
1384 : Encoder() = delete;
1385 : Encoder(const Encoder&) = delete;
1386 : Encoder& operator=(const Encoder&) = delete;
1387 : };
1388 :
1389 : }; // namespace mozilla
1390 :
1391 : #endif // mozilla_Encoding_h
|