Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 : /* This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 : #ifndef nsUTF8Utils_h_
7 : #define nsUTF8Utils_h_
8 :
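// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
// file will provide signatures for the Mozilla abstract string types and will
// use XPCOM assertion/debugging macros, etc. If it is not defined, the
// UTF8UTILS_WARNING diagnostics compile away to nothing and the
// LossyConvertEncoding8to16/LossyConvertEncoding16to8 sinks are not declared.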
12 :
13 : #include "nscore.h"
14 : #include "mozilla/Assertions.h"
15 : #include "mozilla/SSE.h"
16 : #include "mozilla/TypeTraits.h"
17 :
18 : #include "nsCharTraits.h"
19 :
20 : #ifdef MOZILLA_INTERNAL_API
21 : #define UTF8UTILS_WARNING(msg) NS_WARNING(msg)
22 : #else
23 : #define UTF8UTILS_WARNING(msg)
24 : #endif
25 :
/**
 * Classifies a single UTF-8 code unit by the bit pattern of its high-order
 * bits.
 *
 * Every predicate inspects only the leading bits of the byte it is given; no
 * further validation is performed (the 5- and 6-byte lead patterns are
 * recognised just like the shorter ones).
 */
class UTF8traits
{
public:
  // 0xxxxxxx
  static bool isASCII(char aChar)
  {
    return LeadingBits(aChar, 1) == 0x00;
  }

  // 10xxxxxx -- a continuation byte inside a multi-byte sequence.
  static bool isInSeq(char aChar)
  {
    return LeadingBits(aChar, 2) == 0x02;
  }

  // 110xxxxx
  static bool is2byte(char aChar)
  {
    return LeadingBits(aChar, 3) == 0x06;
  }

  // 1110xxxx
  static bool is3byte(char aChar)
  {
    return LeadingBits(aChar, 4) == 0x0E;
  }

  // 11110xxx
  static bool is4byte(char aChar)
  {
    return LeadingBits(aChar, 5) == 0x1E;
  }

  // 111110xx
  static bool is5byte(char aChar)
  {
    return LeadingBits(aChar, 6) == 0x3E;
  }

  // 1111110x
  static bool is6byte(char aChar)
  {
    return LeadingBits(aChar, 7) == 0x7E;
  }

private:
  // Returns the top aCount bits of aChar. The byte is viewed as an unsigned
  // octet first so the signedness of plain |char| cannot affect the result.
  static unsigned LeadingBits(char aChar, unsigned aCount)
  {
    return static_cast<unsigned char>(aChar) >> (8 - aCount);
  }
};
58 :
59 : /**
60 : * Extract the next UCS-4 character from the buffer and return it. The
61 : * pointer passed in is advanced to the start of the next character in the
62 : * buffer. If non-null, the parameters err and overlong are filled in to
63 : * indicate that the character was represented by an overlong sequence, or
64 : * that an error occurred.
65 : */
66 :
67 : class UTF8CharEnumerator
68 : {
69 : public:
70 270438 : static uint32_t NextChar(const char** aBuffer, const char* aEnd, bool* aErr)
71 : {
72 270438 : NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
73 :
74 270438 : const char* p = *aBuffer;
75 270438 : *aErr = false;
76 :
77 270438 : if (p >= aEnd) {
78 0 : *aErr = true;
79 :
80 0 : return 0;
81 : }
82 :
83 270438 : char c = *p++;
84 :
85 270438 : if (UTF8traits::isASCII(c)) {
86 269178 : *aBuffer = p;
87 269178 : return c;
88 : }
89 :
90 : uint32_t ucs4;
91 : uint32_t minUcs4;
92 1260 : int32_t state = 0;
93 :
94 1260 : if (!CalcState(c, ucs4, minUcs4, state)) {
95 0 : NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
96 0 : *aErr = true;
97 :
98 0 : return 0;
99 : }
100 :
101 6140 : while (state--) {
102 2440 : if (p == aEnd) {
103 0 : *aErr = true;
104 :
105 0 : return 0;
106 : }
107 :
108 2440 : c = *p++;
109 :
110 2440 : if (!AddByte(c, state, ucs4)) {
111 0 : *aErr = true;
112 :
113 0 : return 0;
114 : }
115 : }
116 :
117 1260 : if (ucs4 < minUcs4) {
118 : // Overlong sequence
119 0 : ucs4 = UCS2_REPLACEMENT_CHAR;
120 1320 : } else if (ucs4 >= 0xD800 &&
121 120 : (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) {
122 : // Surrogates and code points outside the Unicode range.
123 0 : ucs4 = UCS2_REPLACEMENT_CHAR;
124 : }
125 :
126 1260 : *aBuffer = p;
127 1260 : return ucs4;
128 : }
129 :
130 : private:
131 1260 : static bool CalcState(char aChar, uint32_t& aUcs4, uint32_t& aMinUcs4,
132 : int32_t& aState)
133 : {
134 1260 : if (UTF8traits::is2byte(aChar)) {
135 80 : aUcs4 = (uint32_t(aChar) << 6) & 0x000007C0L;
136 80 : aState = 1;
137 80 : aMinUcs4 = 0x00000080;
138 1180 : } else if (UTF8traits::is3byte(aChar)) {
139 1180 : aUcs4 = (uint32_t(aChar) << 12) & 0x0000F000L;
140 1180 : aState = 2;
141 1180 : aMinUcs4 = 0x00000800;
142 0 : } else if (UTF8traits::is4byte(aChar)) {
143 0 : aUcs4 = (uint32_t(aChar) << 18) & 0x001F0000L;
144 0 : aState = 3;
145 0 : aMinUcs4 = 0x00010000;
146 0 : } else if (UTF8traits::is5byte(aChar)) {
147 0 : aUcs4 = (uint32_t(aChar) << 24) & 0x03000000L;
148 0 : aState = 4;
149 0 : aMinUcs4 = 0x00200000;
150 0 : } else if (UTF8traits::is6byte(aChar)) {
151 0 : aUcs4 = (uint32_t(aChar) << 30) & 0x40000000L;
152 0 : aState = 5;
153 0 : aMinUcs4 = 0x04000000;
154 : } else {
155 0 : return false;
156 : }
157 :
158 1260 : return true;
159 : }
160 :
161 2440 : static bool AddByte(char aChar, int32_t aState, uint32_t& aUcs4)
162 : {
163 2440 : if (UTF8traits::isInSeq(aChar)) {
164 2440 : int32_t shift = aState * 6;
165 2440 : aUcs4 |= (uint32_t(aChar) & 0x3F) << shift;
166 2440 : return true;
167 : }
168 :
169 0 : return false;
170 : }
171 : };
172 :
173 :
174 : /**
175 : * Extract the next UCS-4 character from the buffer and return it. The
176 : * pointer passed in is advanced to the start of the next character in the
177 : * buffer. If non-null, the err parameter is filled in if an error occurs.
178 : *
179 : * If an error occurs that causes UCS2_REPLACEMENT_CHAR to be returned, then
180 : * the buffer will be updated to move only a single UCS-2 character.
181 : *
182 : * Any other error returns 0 and does not move the buffer position.
183 : */
184 :
185 :
186 : class UTF16CharEnumerator
187 : {
188 : public:
189 378 : static uint32_t NextChar(const char16_t** aBuffer, const char16_t* aEnd,
190 : bool* aErr = nullptr)
191 : {
192 378 : NS_ASSERTION(aBuffer && *aBuffer, "null buffer!");
193 :
194 378 : const char16_t* p = *aBuffer;
195 :
196 378 : if (p >= aEnd) {
197 0 : NS_ERROR("No input to work with");
198 0 : if (aErr) {
199 0 : *aErr = true;
200 : }
201 :
202 0 : return 0;
203 : }
204 :
205 378 : char16_t c = *p++;
206 :
207 378 : if (!IS_SURROGATE(c)) { // U+0000 - U+D7FF,U+E000 - U+FFFF
208 378 : if (aErr) {
209 0 : *aErr = false;
210 : }
211 378 : *aBuffer = p;
212 378 : return c;
213 0 : } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
214 0 : if (p == aEnd) {
215 : // Found a high surrogate at the end of the buffer. Flag this
216 : // as an error and return the Unicode replacement
217 : // character 0xFFFD.
218 :
219 0 : UTF8UTILS_WARNING("Unexpected end of buffer after high surrogate");
220 :
221 0 : if (aErr) {
222 0 : *aErr = true;
223 : }
224 0 : *aBuffer = p;
225 0 : return 0xFFFD;
226 : }
227 :
228 : // D800- DBFF - High Surrogate
229 0 : char16_t h = c;
230 :
231 0 : c = *p++;
232 :
233 0 : if (NS_IS_LOW_SURROGATE(c)) {
234 : // DC00- DFFF - Low Surrogate
235 : // N = (H - D800) *400 + 10000 + (L - DC00)
236 0 : uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
237 0 : if (aErr) {
238 0 : *aErr = false;
239 : }
240 0 : *aBuffer = p;
241 0 : return ucs4;
242 : } else {
243 : // Found a high surrogate followed by something other than
244 : // a low surrogate. Flag this as an error and return the
245 : // Unicode replacement character 0xFFFD. Note that the
246 : // pointer to the next character points to the second 16-bit
247 : // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
248 : // only the first code unit of an illegal sequence must be
249 : // treated as an illegally terminated code unit sequence
250 : // (also Chapter 3 D91, "isolated [not paired and ill-formed]
251 : // UTF-16 code units in the range D800..DFFF are ill-formed").
252 0 : UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
253 :
254 0 : if (aErr) {
255 0 : *aErr = true;
256 : }
257 0 : *aBuffer = p - 1;
258 0 : return 0xFFFD;
259 : }
260 : } else { // U+DC00 - U+DFFF
261 : // DC00- DFFF - Low Surrogate
262 :
263 : // Found a low surrogate w/o a preceding high surrogate. Flag
264 : // this as an error and return the Unicode replacement
265 : // character 0xFFFD.
266 :
267 0 : UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
268 0 : if (aErr) {
269 0 : *aErr = true;
270 : }
271 0 : *aBuffer = p;
272 0 : return 0xFFFD;
273 : }
274 :
275 : MOZ_ASSERT_UNREACHABLE("Impossible UCS-2 character value.");
276 : }
277 : };
278 :
279 :
280 : /**
281 : * A character sink (see |copy_string| in nsAlgorithm.h) for converting
282 : * UTF-8 to UTF-16
283 : */
284 : class ConvertUTF8toUTF16
285 : {
286 : public:
287 : typedef char value_type;
288 : typedef char16_t buffer_type;
289 :
290 6891 : explicit ConvertUTF8toUTF16(buffer_type* aBuffer)
291 6891 : : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false)
292 : {
293 6891 : }
294 :
295 6891 : size_t Length() const
296 : {
297 6891 : return mBuffer - mStart;
298 : }
299 :
300 13416 : bool ErrorEncountered() const
301 : {
302 13416 : return mErrorEncountered;
303 : }
304 :
305 6891 : void write(const value_type* aStart, uint32_t aN)
306 : {
307 6891 : if (mErrorEncountered) {
308 0 : return;
309 : }
310 :
311 : // algorithm assumes utf8 units won't
312 : // be spread across fragments
313 6891 : const value_type* p = aStart;
314 6891 : const value_type* end = aStart + aN;
315 6891 : buffer_type* out = mBuffer;
316 517087 : for (; p != end /* && *p */;) {
317 : bool err;
318 255098 : uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
319 :
320 255098 : if (err) {
321 0 : mErrorEncountered = true;
322 0 : mBuffer = out;
323 0 : return;
324 : }
325 :
326 255098 : if (ucs4 >= PLANE1_BASE) {
327 0 : *out++ = (buffer_type)H_SURROGATE(ucs4);
328 0 : *out++ = (buffer_type)L_SURROGATE(ucs4);
329 : } else {
330 255098 : *out++ = ucs4;
331 : }
332 : }
333 6891 : mBuffer = out;
334 : }
335 :
336 154 : void write_terminator()
337 : {
338 154 : *mBuffer = buffer_type(0);
339 154 : }
340 :
341 : private:
342 : buffer_type* const mStart;
343 : buffer_type* mBuffer;
344 : bool mErrorEncountered;
345 : };
346 :
347 : /**
348 : * A character sink (see |copy_string| in nsAlgorithm.h) for computing
349 : * the length of the UTF-16 string equivalent to a UTF-8 string.
350 : */
351 : class CalculateUTF8Length
352 : {
353 : public:
354 : typedef char value_type;
355 :
356 7266 : CalculateUTF8Length()
357 7266 : : mLength(0), mErrorEncountered(false)
358 : {
359 7266 : }
360 :
361 7266 : size_t Length() const
362 : {
363 7266 : return mLength;
364 : }
365 :
366 7266 : void write(const value_type* aStart, uint32_t aN)
367 : {
368 : // ignore any further requests
369 7266 : if (mErrorEncountered) {
370 0 : return;
371 : }
372 :
373 : // algorithm assumes utf8 units won't
374 : // be spread across fragments
375 7266 : const value_type* p = aStart;
376 7266 : const value_type* end = aStart + aN;
377 287478 : for (; p < end /* && *p */; ++mLength) {
378 140106 : if (UTF8traits::isASCII(*p)) {
379 139638 : p += 1;
380 468 : } else if (UTF8traits::is2byte(*p)) {
381 80 : p += 2;
382 388 : } else if (UTF8traits::is3byte(*p)) {
383 388 : p += 3;
384 0 : } else if (UTF8traits::is4byte(*p)) {
385 : // Because a UTF-8 sequence of 4 bytes represents a codepoint
386 : // greater than 0xFFFF, it will become a surrogate pair in the
387 : // UTF-16 string, so add 1 more to mLength.
388 : // This doesn't happen with is5byte and is6byte because they
389 : // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
390 : // converted to a single replacement character.
391 :
392 : // However, there is one case when a 4 byte UTF-8 sequence will
393 : // only generate 2 UTF-16 bytes. If we have a properly encoded
394 : // sequence, but with an invalid value (too small or too big),
395 : // that will result in a replacement character being written
396 : // This replacement character is encoded as just 1 single
397 : // UTF-16 character, which is 2 bytes.
398 :
399 : // The below code therefore only adds 1 to mLength if the UTF8
400 : // data will produce a decoded character which is greater than
401 : // or equal to 0x010000 and less than 0x0110000.
402 :
403 : // A 4byte UTF8 character is encoded as
404 : // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
405 : // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
406 : // map to bit 17-21 in the final result. If these bits are
407 : // between 0x01 and 0x11, that means that the final result is
408 : // between 0x010000 and 0x110000. The below code reads these
409 : // bits out and assigns them to c, but shifted up 4 bits to
410 : // avoid having to shift twice.
411 :
412 : // It doesn't matter what to do in the case where p + 4 > end
413 : // since no UTF16 characters will be written in that case by
414 : // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
415 : // any of the surrogate bits are wrong since no UTF16
416 : // characters will be written in that case either.
417 :
418 0 : if (p + 4 <= end) {
419 0 : uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
420 0 : ((uint32_t)(p[1] & 0x30));
421 0 : if (c >= 0x010 && c < 0x110) {
422 0 : ++mLength;
423 : }
424 : }
425 :
426 0 : p += 4;
427 0 : } else if (UTF8traits::is5byte(*p)) {
428 0 : p += 5;
429 0 : } else if (UTF8traits::is6byte(*p)) {
430 0 : p += 6;
431 : } else { // error
432 0 : ++mLength; // to account for the decrement below
433 0 : break;
434 : }
435 : }
436 7266 : if (p != end) {
437 0 : NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
438 0 : --mLength; // The last multi-byte char wasn't complete, discard it.
439 0 : mErrorEncountered = true;
440 : }
441 : }
442 :
443 : private:
444 : size_t mLength;
445 : bool mErrorEncountered;
446 : };
447 :
448 : /**
449 : * A character sink (see |copy_string| in nsAlgorithm.h) for
450 : * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
451 : * (0xEFBFBD in UTF-8).
452 : */
453 : class ConvertUTF16toUTF8
454 : {
455 : public:
456 : typedef char16_t value_type;
457 : typedef char buffer_type;
458 :
459 : // The error handling here is more lenient than that in
460 : // |ConvertUTF8toUTF16|, but it's that way for backwards
461 : // compatibility.
462 :
463 919 : explicit ConvertUTF16toUTF8(buffer_type* aBuffer)
464 919 : : mStart(aBuffer), mBuffer(aBuffer)
465 : {
466 919 : }
467 :
468 919 : size_t Size() const
469 : {
470 919 : return mBuffer - mStart;
471 : }
472 :
473 919 : void write(const value_type* aStart, uint32_t aN)
474 : {
475 919 : buffer_type* out = mBuffer; // gcc isn't smart enough to do this!
476 :
477 6185 : for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
478 5266 : value_type c = *p;
479 5266 : if (!(c & 0xFF80)) { // U+0000 - U+007F
480 4915 : *out++ = (char)c;
481 351 : } else if (!(c & 0xF800)) { // U+0100 - U+07FF
482 60 : *out++ = 0xC0 | (char)(c >> 6);
483 60 : *out++ = 0x80 | (char)(0x003F & c);
484 291 : } else if (!IS_SURROGATE(c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
485 291 : *out++ = 0xE0 | (char)(c >> 12);
486 291 : *out++ = 0x80 | (char)(0x003F & (c >> 6));
487 291 : *out++ = 0x80 | (char)(0x003F & c);
488 0 : } else if (NS_IS_HIGH_SURROGATE(c)) { // U+D800 - U+DBFF
489 : // D800- DBFF - High Surrogate
490 0 : value_type h = c;
491 :
492 0 : ++p;
493 0 : if (p == end) {
494 : // Treat broken characters as the Unicode
495 : // replacement character 0xFFFD (0xEFBFBD in
496 : // UTF-8)
497 0 : *out++ = '\xEF';
498 0 : *out++ = '\xBF';
499 0 : *out++ = '\xBD';
500 :
501 0 : UTF8UTILS_WARNING("String ending in half a surrogate pair!");
502 :
503 0 : break;
504 : }
505 0 : c = *p;
506 :
507 0 : if (NS_IS_LOW_SURROGATE(c)) {
508 : // DC00- DFFF - Low Surrogate
509 : // N = (H - D800) *400 + 10000 + ( L - DC00 )
510 0 : uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
511 :
512 : // 0001 0000-001F FFFF
513 0 : *out++ = 0xF0 | (char)(ucs4 >> 18);
514 0 : *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
515 0 : *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
516 0 : *out++ = 0x80 | (char)(0x003F & ucs4);
517 : } else {
518 : // Treat broken characters as the Unicode
519 : // replacement character 0xFFFD (0xEFBFBD in
520 : // UTF-8)
521 0 : *out++ = '\xEF';
522 0 : *out++ = '\xBF';
523 0 : *out++ = '\xBD';
524 :
525 : // The pointer to the next character points to the second
526 : // 16-bit value, not beyond it, as per Unicode 5.0.0
527 : // Chapter 3 C10, only the first code unit of an illegal
528 : // sequence must be treated as an illegally terminated
529 : // code unit sequence (also Chapter 3 D91, "isolated [not
530 : // paired and ill-formed] UTF-16 code units in the range
531 : // D800..DFFF are ill-formed").
532 0 : p--;
533 :
534 0 : UTF8UTILS_WARNING("got a High Surrogate but no low surrogate");
535 : }
536 : } else { // U+DC00 - U+DFFF
537 : // Treat broken characters as the Unicode replacement
538 : // character 0xFFFD (0xEFBFBD in UTF-8)
539 0 : *out++ = '\xEF';
540 0 : *out++ = '\xBF';
541 0 : *out++ = '\xBD';
542 :
543 : // DC00- DFFF - Low Surrogate
544 0 : UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
545 : }
546 : }
547 :
548 919 : mBuffer = out;
549 919 : }
550 :
551 1 : void write_terminator()
552 : {
553 1 : *mBuffer = buffer_type(0);
554 1 : }
555 :
556 : private:
557 : buffer_type* const mStart;
558 : buffer_type* mBuffer;
559 : };
560 :
561 : /**
562 : * A character sink (see |copy_string| in nsAlgorithm.h) for computing
563 : * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
564 : * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
565 : */
566 : class CalculateUTF8Size
567 : {
568 : public:
569 : typedef char16_t value_type;
570 :
571 644 : CalculateUTF8Size()
572 644 : : mSize(0)
573 : {
574 644 : }
575 :
576 645 : size_t Size() const
577 : {
578 645 : return mSize;
579 : }
580 :
581 644 : void write(const value_type* aStart, uint32_t aN)
582 : {
583 : // Assume UCS2 surrogate pairs won't be spread across fragments.
584 5556 : for (const value_type* p = aStart, *end = aStart + aN; p < end; ++p) {
585 4912 : value_type c = *p;
586 4912 : if (!(c & 0xFF80)) { // U+0000 - U+007F
587 4912 : mSize += 1;
588 0 : } else if (!(c & 0xF800)) { // U+0100 - U+07FF
589 0 : mSize += 2;
590 0 : } else if (0xD800 != (0xF800 & c)) { // U+0800 - U+D7FF,U+E000 - U+FFFF
591 0 : mSize += 3;
592 0 : } else if (0xD800 == (0xFC00 & c)) { // U+D800 - U+DBFF
593 0 : ++p;
594 0 : if (p == end) {
595 : // Treat broken characters as the Unicode
596 : // replacement character 0xFFFD (0xEFBFBD in
597 : // UTF-8)
598 0 : mSize += 3;
599 :
600 0 : UTF8UTILS_WARNING("String ending in half a surrogate pair!");
601 :
602 0 : break;
603 : }
604 0 : c = *p;
605 :
606 0 : if (0xDC00 == (0xFC00 & c)) {
607 0 : mSize += 4;
608 : } else {
609 : // Treat broken characters as the Unicode
610 : // replacement character 0xFFFD (0xEFBFBD in
611 : // UTF-8)
612 0 : mSize += 3;
613 :
614 : // The next code unit is the second 16-bit value, not
615 : // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
616 : // only the first code unit of an illegal sequence must
617 : // be treated as an illegally terminated code unit
618 : // sequence (also Chapter 3 D91, "isolated [not paired and
619 : // ill-formed] UTF-16 code units in the range D800..DFFF
620 : // are ill-formed").
621 0 : p--;
622 :
623 0 : UTF8UTILS_WARNING("got a high Surrogate but no low surrogate");
624 : }
625 : } else { // U+DC00 - U+DFFF
626 : // Treat broken characters as the Unicode replacement
627 : // character 0xFFFD (0xEFBFBD in UTF-8)
628 0 : mSize += 3;
629 :
630 0 : UTF8UTILS_WARNING("got a low Surrogate but no high surrogate");
631 : }
632 : }
633 644 : }
634 :
635 : private:
636 : size_t mSize;
637 : };
638 :
639 : #ifdef MOZILLA_INTERNAL_API
/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char to char16_t: every input byte is zero-extended to one 16-bit
 * code unit, with no decoding of any kind.
 *
 * No bounds checking is done on the destination buffer.
 */
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

  explicit LossyConvertEncoding8to16(char16_t* aDestination)
    : mDestination(aDestination)
  {
  }

  // Widens aSourceLength bytes starting at aSource and appends them to the
  // destination, advancing the write position.
  void
  write(const char* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    const char* done_writing = aSource + aSourceLength;
    while (aSource < done_writing) {
      // Go through unsigned char so that bytes >= 0x80 are zero-extended
      // rather than sign-extended where plain |char| is signed.
      *mDestination++ =
        static_cast<char16_t>(static_cast<unsigned char>(*aSource++));
    }
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  // SSE2 variant of write(). Only declared here (its definition is not in
  // this header) and only when SSE2 may be available, matching the
  // declaration in LossyConvertEncoding16to8.
  void
  write_sse2(const char* aSource, uint32_t aSourceLength);
#endif

  // Stores a terminating zero at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = static_cast<char16_t>(0);
  }

private:
  char16_t* mDestination;
};
684 :
/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char16_t to char: every 16-bit input code unit is narrowed to a
 * single char by a plain cast, with no encoding of any kind.
 *
 * No bounds checking is done on the destination buffer.
 */
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  explicit LossyConvertEncoding16to8(char* aDestination)
    : mDestination(aDestination)
  {
  }

  // Narrows aSourceLength code units starting at aSource and appends them to
  // the destination, advancing the write position.
  void
  write(const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2()) {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for (uint32_t i = 0; i < aSourceLength; ++i) {
      *mDestination++ = static_cast<char>(aSource[i]);
    }
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  // SSE2 variant of write(); only declared here.
  void
  write_sse2(const char16_t* aSource, uint32_t aSourceLength);
#endif

  // Stores a terminating NUL at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char* mDestination;
};
730 : #endif // MOZILLA_INTERNAL_API
731 :
732 :
733 : template<typename Char, typename UnsignedT>
734 : inline UnsignedT
735 0 : RewindToPriorUTF8Codepoint(const Char* utf8Chars, UnsignedT index)
736 : {
737 : static_assert(mozilla::IsSame<Char, char>::value ||
738 : mozilla::IsSame<Char, unsigned char>::value ||
739 : mozilla::IsSame<Char, signed char>::value,
740 : "UTF-8 data must be in 8-bit units");
741 : static_assert(mozilla::IsUnsigned<UnsignedT>::value, "index type must be unsigned");
742 0 : while (index > 0 && (utf8Chars[index] & 0xC0) == 0x80)
743 0 : --index;
744 :
745 0 : return index;
746 : }
747 :
748 : #undef UTF8UTILS_WARNING
749 :
750 : #endif /* !defined(nsUTF8Utils_h_) */
|