Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99:
3 : * This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #include "js/CharacterEncoding.h"
8 :
9 : #include "mozilla/Range.h"
10 : #include "mozilla/Sprintf.h"
11 :
12 : #include <algorithm>
13 : #include <type_traits>
14 :
15 : #include "jscntxt.h"
16 : #include "jsprf.h"
17 :
18 : using namespace js;
19 :
20 : Latin1CharsZ
21 0 : JS::LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
22 : const mozilla::Range<const char16_t> tbchars)
23 : {
24 0 : MOZ_ASSERT(cx);
25 0 : size_t len = tbchars.length();
26 0 : unsigned char* latin1 = cx->pod_malloc<unsigned char>(len + 1);
27 0 : if (!latin1)
28 0 : return Latin1CharsZ();
29 0 : for (size_t i = 0; i < len; ++i)
30 0 : latin1[i] = static_cast<unsigned char>(tbchars[i]);
31 0 : latin1[len] = '\0';
32 0 : return Latin1CharsZ(latin1, len);
33 : }
34 :
/*
 * Return the number of bytes needed to encode |chars[0 .. nchars)| as UTF-8
 * (no terminator included).
 *
 * A well-formed surrogate pair is counted as one four-byte sequence. An
 * unpaired surrogate is counted as three bytes.
 */
template <typename CharT>
static size_t
GetDeflatedUTF8StringLength(const CharT* chars, size_t nchars)
{
    // Start from one byte per code unit and add what non-ASCII units need.
    size_t total = nchars;
    const CharT* const limit = chars + nchars;

    for (const CharT* cur = chars; cur < limit; cur++) {
        const char16_t unit = *cur;
        if (unit < 0x80)
            continue;

        uint32_t codePoint = unit;
        if (unit >= 0xD800 && unit <= 0xDFFF) {
            // A trail surrogate in lead position, or a lead surrogate with
            // nothing after it, cannot form a pair.
            const bool noPartner = unit >= 0xDC00 || cur + 1 == limit;
            if (noPartner || cur[1] < 0xDC00 || cur[1] > 0xDFFF) {
                total += 2; /* Bad surrogate: three bytes in total. */
                continue;
            }

            codePoint = ((unit - 0xD800) << 10) + (cur[1] - 0xDC00) + 0x10000;

            // Both halves were pre-counted as one byte each; take one back so
            // the additions below come out at four bytes for the pair.
            total--;
            cur++;
        }

        // A second byte covers code points of up to 11 bits; every further
        // byte extends the reach by another 5 bits.
        total++;
        for (uint32_t rest = codePoint >> 11; rest != 0; rest >>= 5)
            total++;
    }

    return total;
}
71 :
72 : JS_PUBLIC_API(size_t)
73 1120 : JS::GetDeflatedUTF8StringLength(JSFlatString* s)
74 : {
75 2240 : JS::AutoCheckCannotGC nogc;
76 1120 : return s->hasLatin1Chars()
77 1127 : ? ::GetDeflatedUTF8StringLength(s->latin1Chars(nogc), s->length())
78 2247 : : ::GetDeflatedUTF8StringLength(s->twoByteChars(nogc), s->length());
79 : }
80 :
81 : static const char16_t UTF8_REPLACEMENT_CHAR = 0xFFFD;
82 :
83 : template <typename CharT>
84 : static void
85 1774 : DeflateStringToUTF8Buffer(const CharT* src, size_t srclen, mozilla::RangedPtr<char> dst,
86 : size_t* dstlenp = nullptr, size_t* numcharsp = nullptr)
87 : {
88 1774 : size_t capacity = 0;
89 1774 : if (dstlenp) {
90 0 : capacity = *dstlenp;
91 0 : *dstlenp = 0;
92 : }
93 1774 : if (numcharsp)
94 0 : *numcharsp = 0;
95 :
96 124556 : while (srclen) {
97 : uint32_t v;
98 61391 : char16_t c = *src++;
99 61391 : srclen--;
100 61391 : if (c >= 0xDC00 && c <= 0xDFFF) {
101 0 : v = UTF8_REPLACEMENT_CHAR;
102 61391 : } else if (c < 0xD800 || c > 0xDBFF) {
103 61391 : v = c;
104 : } else {
105 0 : if (srclen < 1) {
106 0 : v = UTF8_REPLACEMENT_CHAR;
107 : } else {
108 0 : char16_t c2 = *src;
109 0 : if (c2 < 0xDC00 || c2 > 0xDFFF) {
110 0 : v = UTF8_REPLACEMENT_CHAR;
111 : } else {
112 0 : src++;
113 0 : srclen--;
114 0 : v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
115 : }
116 : }
117 : }
118 :
119 : size_t utf8Len;
120 61391 : if (v < 0x0080) {
121 : /* no encoding necessary - performance hack */
122 61391 : if (dstlenp && *dstlenp + 1 > capacity)
123 0 : return;
124 61391 : *dst++ = char(v);
125 61391 : utf8Len = 1;
126 : } else {
127 : uint8_t utf8buf[4];
128 0 : utf8Len = OneUcs4ToUtf8Char(utf8buf, v);
129 0 : if (dstlenp && *dstlenp + utf8Len > capacity)
130 0 : return;
131 0 : for (size_t i = 0; i < utf8Len; i++)
132 0 : *dst++ = char(utf8buf[i]);
133 : }
134 :
135 61391 : if (dstlenp)
136 0 : *dstlenp += utf8Len;
137 61391 : if (numcharsp)
138 0 : (*numcharsp)++;
139 : }
140 : }
141 :
142 : JS_PUBLIC_API(void)
143 1120 : JS::DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
144 : size_t* dstlenp, size_t* numcharsp)
145 : {
146 2240 : JS::AutoCheckCannotGC nogc;
147 1120 : return src->hasLatin1Chars()
148 1113 : ? ::DeflateStringToUTF8Buffer(src->latin1Chars(nogc), src->length(), dst,
149 : dstlenp, numcharsp)
150 7 : : ::DeflateStringToUTF8Buffer(src->twoByteChars(nogc), src->length(), dst,
151 3360 : dstlenp, numcharsp);
152 : }
153 :
154 : template <typename CharT>
155 : UTF8CharsZ
156 654 : JS::CharsToNewUTF8CharsZ(JSContext* maybeCx, const mozilla::Range<CharT> chars)
157 : {
158 : /* Get required buffer size. */
159 654 : const CharT* str = chars.begin().get();
160 654 : size_t len = ::GetDeflatedUTF8StringLength(str, chars.length());
161 :
162 : /* Allocate buffer. */
163 : char* utf8;
164 654 : if (maybeCx)
165 654 : utf8 = maybeCx->pod_malloc<char>(len + 1);
166 : else
167 0 : utf8 = js_pod_malloc<char>(len + 1);
168 654 : if (!utf8)
169 0 : return UTF8CharsZ();
170 :
171 : /* Encode to UTF8. */
172 654 : ::DeflateStringToUTF8Buffer(str, chars.length(), mozilla::RangedPtr<char>(utf8, len));
173 654 : utf8[len] = '\0';
174 :
175 654 : return UTF8CharsZ(utf8, len);
176 : }
177 :
178 : template UTF8CharsZ
179 : JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
180 : const mozilla::Range<Latin1Char> chars);
181 :
182 : template UTF8CharsZ
183 : JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
184 : const mozilla::Range<char16_t> chars);
185 :
186 : template UTF8CharsZ
187 : JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
188 : const mozilla::Range<const Latin1Char> chars);
189 :
190 : template UTF8CharsZ
191 : JS::CharsToNewUTF8CharsZ(JSContext* maybeCx,
192 : const mozilla::Range<const char16_t> chars);
193 :
194 : static const uint32_t INVALID_UTF8 = UINT32_MAX;
195 :
196 : /*
197 : * Convert a utf8 character sequence into a UCS-4 character and return that
198 : * character. It is assumed that the caller already checked that the sequence
199 : * is valid.
200 : */
201 : uint32_t
202 0 : JS::Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length)
203 : {
204 0 : MOZ_ASSERT(1 <= utf8Length && utf8Length <= 4);
205 :
206 0 : if (utf8Length == 1) {
207 0 : MOZ_ASSERT(!(*utf8Buffer & 0x80));
208 0 : return *utf8Buffer;
209 : }
210 :
211 : /* from Unicode 3.1, non-shortest form is illegal */
212 : static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 };
213 :
214 0 : MOZ_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
215 : (0x100 - (1 << (8 - utf8Length))));
216 0 : uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
217 0 : uint32_t minucs4Char = minucs4Table[utf8Length - 2];
218 0 : while (--utf8Length) {
219 0 : MOZ_ASSERT((*utf8Buffer & 0xC0) == 0x80);
220 0 : ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
221 : }
222 :
223 0 : if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF)))
224 0 : return INVALID_UTF8;
225 :
226 0 : return ucs4Char;
227 : }
228 :
229 : static void
230 0 : ReportInvalidCharacter(JSContext* cx, uint32_t offset)
231 : {
232 : char buffer[10];
233 0 : SprintfLiteral(buffer, "%u", offset);
234 : JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
235 0 : JSMSG_MALFORMED_UTF8_CHAR, buffer);
236 0 : }
237 :
238 : static void
239 0 : ReportBufferTooSmall(JSContext* cx, uint32_t dummy)
240 : {
241 0 : JS_ReportErrorNumberASCII(cx, GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
242 0 : }
243 :
244 : static void
245 0 : ReportTooBigCharacter(JSContext* cx, uint32_t v)
246 : {
247 : char buffer[10];
248 0 : SprintfLiteral(buffer, "0x%x", v + 0x10000);
249 : JS_ReportErrorFlagsAndNumberASCII(cx, JSREPORT_ERROR, GetErrorMessage, nullptr,
250 0 : JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
251 0 : }
252 :
// Selects what InflateUTF8StringToBuffer does with its input and how it
// reacts to malformed UTF-8.
enum InflateUTF8Action {
    // Count output code units; on malformed input report an error and fail.
    CountAndReportInvalids,
    // Count output code units; malformed input counts as one replacement unit.
    CountAndIgnoreInvalids,
    // Validate only; malformed input triggers MOZ_CRASH.
    AssertNoInvalids,
    // Write output code units to the destination; malformed input is written
    // as a replacement character.
    Copy,
    // Only determine the smallest encoding; stops early once UTF-16 is needed.
    FindEncoding
};
260 :
261 : static const char16_t REPLACE_UTF8 = 0xFFFD;
262 : static const Latin1Char REPLACE_UTF8_LATIN1 = '?';
263 :
264 : // If making changes to this algorithm, make sure to also update
265 : // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
266 : template <InflateUTF8Action Action, typename CharT, class ContextT>
267 : static bool
268 323 : InflateUTF8StringToBuffer(ContextT* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
269 : JS::SmallestEncoding *smallestEncoding)
270 : {
271 : if (Action != AssertNoInvalids)
272 319 : *smallestEncoding = JS::SmallestEncoding::ASCII;
273 0 : auto RequireLatin1 = [&smallestEncoding]{
274 0 : *smallestEncoding = std::max(JS::SmallestEncoding::Latin1, *smallestEncoding);
275 323 : };
276 0 : auto RequireUTF16 = [&smallestEncoding]{
277 0 : *smallestEncoding = JS::SmallestEncoding::UTF16;
278 323 : };
279 :
280 : // Count how many code units need to be in the inflated string.
281 : // |i| is the index into |src|, and |j| is the the index into |dst|.
282 323 : size_t srclen = src.length();
283 323 : uint32_t j = 0;
284 4543 : for (uint32_t i = 0; i < srclen; i++, j++) {
285 4220 : uint32_t v = uint32_t(src[i]);
286 4220 : if (!(v & 0x80)) {
287 : // ASCII code unit. Simple copy.
288 : if (Action == Copy)
289 0 : dst[j] = CharT(v);
290 :
291 : } else {
292 : // Non-ASCII code unit. Determine its length in bytes (n).
293 0 : uint32_t n = 1;
294 0 : while (v & (0x80 >> n))
295 0 : n++;
296 :
297 : #define INVALID(report, arg, n2) \
298 : do { \
299 : if (Action == CountAndReportInvalids) { \
300 : report(cx, arg); \
301 : return false; \
302 : } else if (Action == AssertNoInvalids) { \
303 : MOZ_CRASH("invalid UTF-8 string: " # report); \
304 : } else { \
305 : if (Action == Copy) { \
306 : if (std::is_same<decltype(dst[0]), Latin1Char>::value) \
307 : dst[j] = CharT(REPLACE_UTF8_LATIN1); \
308 : else \
309 : dst[j] = CharT(REPLACE_UTF8); \
310 : } else { \
311 : MOZ_ASSERT(Action == CountAndIgnoreInvalids || \
312 : Action == FindEncoding); \
313 : } \
314 : n = n2; \
315 : goto invalidMultiByteCodeUnit; \
316 : } \
317 : } while (0)
318 :
319 : // Check the leading byte.
320 0 : if (n < 2 || n > 4)
321 0 : INVALID(ReportInvalidCharacter, i, 1);
322 :
323 : // Check that |src| is large enough to hold an n-byte code unit.
324 0 : if (i + n > srclen)
325 0 : INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
326 :
327 : // Check the second byte. From Unicode Standard v6.2, Table 3-7
328 : // Well-Formed UTF-8 Byte Sequences.
329 0 : if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF
330 0 : (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F
331 0 : (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF
332 0 : (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F
333 : {
334 0 : INVALID(ReportInvalidCharacter, i, 1);
335 : }
336 :
337 : // Check the continuation bytes.
338 0 : for (uint32_t m = 1; m < n; m++) {
339 0 : if ((src[i + m] & 0xC0) != 0x80)
340 0 : INVALID(ReportInvalidCharacter, i, m);
341 : }
342 :
343 : // Determine the code unit's length in CharT and act accordingly.
344 0 : v = JS::Utf8ToOneUcs4Char((uint8_t*)&src[i], n);
345 : if (Action != AssertNoInvalids) {
346 0 : if (v > 0xff) {
347 0 : RequireUTF16();
348 : if (Action == FindEncoding) {
349 0 : MOZ_ASSERT(dst == nullptr);
350 0 : return true;
351 : }
352 : } else {
353 0 : RequireLatin1();
354 : }
355 : }
356 0 : if (v < 0x10000) {
357 : // The n-byte UTF8 code unit will fit in a single CharT.
358 : if (Action == Copy)
359 0 : dst[j] = CharT(v);
360 : } else {
361 0 : v -= 0x10000;
362 0 : if (v <= 0xFFFFF) {
363 : // The n-byte UTF8 code unit will fit in two CharT units.
364 : if (Action == Copy)
365 0 : dst[j] = CharT((v >> 10) + 0xD800);
366 0 : j++;
367 : if (Action == Copy)
368 0 : dst[j] = CharT((v & 0x3FF) + 0xDC00);
369 :
370 : } else {
371 : // The n-byte UTF8 code unit won't fit in two CharT units.
372 0 : INVALID(ReportTooBigCharacter, v, 1);
373 : }
374 : }
375 :
376 : invalidMultiByteCodeUnit:
377 : // Move i to the last byte of the multi-byte code unit; the loop
378 : // header will do the final i++ to move to the start of the next
379 : // code unit.
380 0 : i += n - 1;
381 : if (Action != AssertNoInvalids)
382 0 : RequireUTF16();
383 : }
384 : }
385 :
386 : if (Action != AssertNoInvalids && Action != FindEncoding)
387 317 : *dstlenp = j;
388 :
389 323 : return true;
390 : }
391 :
392 : template <InflateUTF8Action Action, typename CharsT, class ContextT>
393 : static CharsT
394 317 : InflateUTF8StringHelper(ContextT* cx, const UTF8Chars src, size_t* outlen)
395 : {
396 : using CharT = typename CharsT::CharT;
397 317 : *outlen = 0;
398 :
399 : JS::SmallestEncoding encoding;
400 317 : if (!InflateUTF8StringToBuffer<Action, CharT>(cx, src, /* dst = */ nullptr, outlen, &encoding))
401 0 : return CharsT();
402 :
403 317 : CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1); // +1 for NUL
404 317 : if (!dst) {
405 0 : ReportOutOfMemory(cx);
406 0 : return CharsT();
407 : }
408 :
409 317 : if (encoding == JS::SmallestEncoding::ASCII) {
410 317 : size_t srclen = src.length();
411 317 : MOZ_ASSERT(*outlen == srclen);
412 4391 : for (uint32_t i = 0; i < srclen; i++)
413 4074 : dst[i] = CharT(src[i]);
414 : } else {
415 0 : MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<Copy, CharT>(cx, src, dst, outlen, &encoding)));
416 : }
417 :
418 317 : dst[*outlen] = 0; // NUL char
419 :
420 317 : return CharsT(dst, *outlen);
421 : }
422 :
423 : TwoByteCharsZ
424 317 : JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
425 : {
426 317 : return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, utf8, outlen);
427 : }
428 :
429 : TwoByteCharsZ
430 0 : JS::UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen)
431 : {
432 0 : UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
433 0 : return InflateUTF8StringHelper<CountAndReportInvalids, TwoByteCharsZ>(cx, chars, outlen);
434 : }
435 :
436 : TwoByteCharsZ
437 0 : JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const JS::UTF8Chars utf8, size_t* outlen)
438 : {
439 0 : return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, utf8, outlen);
440 : }
441 :
442 : TwoByteCharsZ
443 0 : JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const JS::ConstUTF8CharsZ& utf8, size_t* outlen)
444 : {
445 0 : UTF8Chars chars(utf8.c_str(), strlen(utf8.c_str()));
446 0 : return InflateUTF8StringHelper<CountAndIgnoreInvalids, TwoByteCharsZ>(cx, chars, outlen);
447 : }
448 :
449 : JS::SmallestEncoding
450 2 : JS::FindSmallestEncoding(UTF8Chars utf8)
451 : {
452 : JS::SmallestEncoding encoding;
453 2 : MOZ_ALWAYS_TRUE((InflateUTF8StringToBuffer<FindEncoding, char16_t, JSContext>(
454 : /* cx = */ nullptr,
455 : utf8,
456 : /* dst = */ nullptr,
457 : /* dstlen = */ nullptr,
458 : &encoding)));
459 2 : return encoding;
460 : }
461 :
462 : Latin1CharsZ
463 0 : JS::UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
464 : {
465 0 : return InflateUTF8StringHelper<CountAndReportInvalids, Latin1CharsZ>(cx, utf8, outlen);
466 : }
467 :
468 : Latin1CharsZ
469 0 : JS::LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen)
470 : {
471 0 : return InflateUTF8StringHelper<CountAndIgnoreInvalids, Latin1CharsZ>(cx, utf8, outlen);
472 : }
473 :
#ifdef DEBUG
/*
 * Debug-only check that the first |aLength| bytes of this string are
 * well-formed UTF-8. Runs the AssertNoInvalids action, which crashes on
 * malformed input and writes to none of its output parameters.
 */
void
JS::ConstUTF8CharsZ::validate(size_t aLength)
{
    MOZ_ASSERT(data_);

    InflateUTF8StringToBuffer<AssertNoInvalids, char16_t, JSContext>(
        /* cx = */ nullptr,
        UTF8Chars(data_, aLength),
        /* dst = */ nullptr,
        /* dstlen = */ nullptr,
        /* smallestEncoding = */ nullptr);
}
#endif
488 :
489 : bool
490 549247 : JS::StringIsASCII(const char* s)
491 : {
492 1051731 : while (*s) {
493 502484 : if (*s & 0x80)
494 0 : return false;
495 502484 : s++;
496 : }
497 46763 : return true;
498 : }
|