Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 : /* This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #include "xpcom-private.h"
8 :
9 : //-----------------------------------------------------------------------------
10 : // XP_MACOSX or ANDROID
11 : //-----------------------------------------------------------------------------
12 : #if defined(XP_MACOSX) || defined(ANDROID)
13 :
14 : #include "nsAString.h"
15 : #include "nsReadableUtils.h"
16 : #include "nsString.h"
17 :
18 : nsresult
19 : NS_CopyNativeToUnicode(const nsACString& aInput, nsAString& aOutput)
20 : {
21 : CopyUTF8toUTF16(aInput, aOutput);
22 : return NS_OK;
23 : }
24 :
25 : nsresult
26 : NS_CopyUnicodeToNative(const nsAString& aInput, nsACString& aOutput)
27 : {
28 : CopyUTF16toUTF8(aInput, aOutput);
29 : return NS_OK;
30 : }
31 :
// Nothing to set up in this configuration; the conversions above keep no
// global state.
void NS_StartupNativeCharsetUtils()
{
}
36 :
// Nothing to tear down in this configuration.
void NS_ShutdownNativeCharsetUtils()
{
}
41 :
42 :
43 : //-----------------------------------------------------------------------------
44 : // XP_UNIX
45 : //-----------------------------------------------------------------------------
46 : #elif defined(XP_UNIX)
47 :
48 : #include <stdlib.h> // mbtowc, wctomb
49 : #include <locale.h> // setlocale
50 : #include "mozilla/Mutex.h"
51 : #include "nscore.h"
52 : #include "nsAString.h"
53 : #include "nsReadableUtils.h"
54 :
55 : using namespace mozilla;
56 :
//
// choose a conversion library. we used to use mbrtowc/wcrtomb under Linux,
// but that doesn't work for non-BMP characters whether we use '-fshort-wchar'
// or not (see bug 206811 and
// news://news.mozilla.org:119/bajml3$fvr1@ripley.netscape.com). we now use
// iconv on all platforms where nl_types.h and langinfo.h (providing
// nl_langinfo(CODESET)) are present along with iconv.
//
65 : #if defined(HAVE_ICONV) && defined(HAVE_NL_TYPES_H) && defined(HAVE_LANGINFO_CODESET)
66 : #define USE_ICONV 1
67 : #else
68 : #define USE_STDCONV 1
69 : #endif
70 :
// Zero-extends bytes into UTF-16 code units, treating every input byte as
// an unsigned value. Copies min(*aInputLeft, *aOutputLeft) elements, then
// advances both cursors and decrements both counters by that amount.
static void
isolatin1_to_utf16(const char** aInput, uint32_t* aInputLeft,
                   char16_t** aOutput, uint32_t* aOutputLeft)
{
  const uint32_t count =
    (*aInputLeft < *aOutputLeft) ? *aInputLeft : *aOutputLeft;

  const char* src = *aInput;
  char16_t* dst = *aOutput;
  for (uint32_t i = 0; i < count; ++i) {
    dst[i] = static_cast<unsigned char>(src[i]);
  }

  *aInput = src + count;
  *aOutput = dst + count;
  *aInputLeft -= count;
  *aOutputLeft -= count;
}
83 :
// Narrows UTF-16 code units to single bytes by keeping only the low eight
// bits of each unit (lossy). Copies min(*aInputLeft, *aOutputLeft)
// elements, then advances both cursors and decrements both counters.
static void
utf16_to_isolatin1(const char16_t** aInput, uint32_t* aInputLeft,
                   char** aOutput, uint32_t* aOutputLeft)
{
  const uint32_t count =
    (*aInputLeft < *aOutputLeft) ? *aInputLeft : *aOutputLeft;

  const char16_t* src = *aInput;
  char* dst = *aOutput;
  for (uint32_t i = 0; i < count; ++i) {
    dst[i] = static_cast<unsigned char>(src[i]);
  }

  *aInput = src + count;
  *aOutput = dst + count;
  *aInputLeft -= count;
  *aOutputLeft -= count;
}
96 :
97 : //-----------------------------------------------------------------------------
98 : // conversion using iconv
99 : //-----------------------------------------------------------------------------
100 : #if defined(USE_ICONV)
101 : #include <nl_types.h> // CODESET
102 : #include <langinfo.h> // nl_langinfo
103 : #include <iconv.h> // iconv_open, iconv, iconv_close
104 : #include <errno.h>
105 : #include "plstr.h"
106 :
107 : #if defined(HAVE_ICONV_WITH_CONST_INPUT)
108 : #define ICONV_INPUT(x) (x)
109 : #else
110 : #define ICONV_INPUT(x) ((char **)x)
111 : #endif
112 :
113 : // solaris definitely needs this, but we'll enable it by default
114 : // just in case... but we know for sure that iconv(3) in glibc
115 : // doesn't need this.
116 : #if !defined(__GLIBC__)
117 : #define ENABLE_UTF8_FALLBACK_SUPPORT
118 : #endif
119 :
120 : #define INVALID_ICONV_T ((iconv_t)-1)
121 :
122 : static inline size_t
123 17076 : xp_iconv(iconv_t converter,
124 : const char** aInput, size_t* aInputLeft,
125 : char** aOutput, size_t* aOutputLeft)
126 : {
127 17076 : size_t res, outputAvail = *aOutputLeft;
128 17076 : res = iconv(converter, ICONV_INPUT(aInput), aInputLeft, aOutput, aOutputLeft);
129 17076 : if (res == (size_t)-1) {
130 : // on some platforms (e.g., linux) iconv will fail with
131 : // E2BIG if it cannot convert _all_ of its input. it'll
132 : // still adjust all of the in/out params correctly, so we
133 : // can ignore this error. the assumption is that we will
134 : // be called again to complete the conversion.
135 0 : if ((errno == E2BIG) && (*aOutputLeft < outputAvail)) {
136 0 : res = 0;
137 : }
138 : }
139 17076 : return res;
140 : }
141 :
142 : static inline void
143 11382 : xp_iconv_reset(iconv_t converter)
144 : {
145 : // NOTE: the man pages on Solaris claim that you can pass nullptr
146 : // for all parameter to reset the converter, but beware the
147 : // evil Solaris crash if you go down this route >:-)
148 :
149 11382 : const char* zero_char_in_ptr = nullptr;
150 11382 : char* zero_char_out_ptr = nullptr;
151 11382 : size_t zero_size_in = 0;
152 11382 : size_t zero_size_out = 0;
153 :
154 : xp_iconv(converter,
155 : &zero_char_in_ptr,
156 : &zero_size_in,
157 : &zero_char_out_ptr,
158 11382 : &zero_size_out);
159 11382 : }
160 :
161 : static inline iconv_t
162 6 : xp_iconv_open(const char** to_list, const char** from_list)
163 : {
164 : iconv_t res;
165 : const char** from_name;
166 : const char** to_name;
167 :
168 : // try all possible combinations to locate a converter.
169 6 : to_name = to_list;
170 6 : while (*to_name) {
171 6 : if (**to_name) {
172 6 : from_name = from_list;
173 6 : while (*from_name) {
174 6 : if (**from_name) {
175 6 : res = iconv_open(*to_name, *from_name);
176 6 : if (res != INVALID_ICONV_T) {
177 6 : return res;
178 : }
179 : }
180 0 : from_name++;
181 : }
182 : }
183 0 : to_name++;
184 : }
185 :
186 0 : return INVALID_ICONV_T;
187 : }
188 :
189 : /*
190 : * char16_t[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
191 : * have to use UTF-16 with iconv(3) on platforms where it's supported.
192 : * However, the way UTF-16 and UCS-2 are interpreted varies across platforms
193 : * and implementations of iconv(3). On Tru64, it also depends on the environment
194 : * variable. To avoid the trouble arising from byte-swapping
195 : * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling
196 : * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2
197 : * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
198 : * which isn't the case of glibc 2.1.x, for which we use 'UNICODELITTLE'
199 : * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
200 : * variable ICONV_BYTEORDER is set to 'big-endian', about which not much
201 : * can be done other than adding a note in the release notes. (bug 206811)
202 : */
// Candidate iconv names for UTF-16, tried in order. Names with an explicit
// byte order matching the build's endianness come first; the unqualified
// UTF-16 / UCS-2 spellings follow as a last resort.
static const char* UTF_16_NAMES[] = {
#if defined(IS_LITTLE_ENDIAN)
  // explicit little-endian names
  "UTF-16LE",
#if defined(__GLIBC__)
  "UNICODELITTLE",
#endif
  "UCS-2LE",
#else
  // explicit big-endian names
  "UTF-16BE",
#if defined(__GLIBC__)
  "UNICODEBIG",
#endif
  "UCS-2BE",
#endif
  // names without an explicit byte order
  "UTF-16",
  "UCS-2",
  "UCS2",
  "UCS_2",
  "ucs-2",
  "ucs2",
  "ucs_2",
  nullptr
};
226 :
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
// Candidate iconv names for UTF-8, tried in order. Only compiled in when
// the UTF-8 fallback path is enabled.
static const char* UTF_8_NAMES[] = {
  // upper-case spellings
  "UTF-8",
  "UTF8",
  "UTF_8",
  // lower-case spellings
  "utf-8",
  "utf8",
  "utf_8",
  nullptr
};
#endif
238 :
// Candidate iconv names for ISO-8859-1, tried in order. With glibc only
// the canonical spelling is listed; elsewhere the variants are tried too.
static const char* ISO_8859_1_NAMES[] = {
  "ISO-8859-1",
#if !defined(__GLIBC__)
  // upper-case variants
  "ISO8859-1",
  "ISO88591",
  "ISO_8859_1",
  "ISO8859_1",
  // lower-case variants
  "iso-8859-1",
  "iso8859-1",
  "iso88591",
  "iso_8859_1",
  "iso8859_1",
#endif
  nullptr
};
254 :
255 : class nsNativeCharsetConverter
256 : {
257 : public:
258 : nsNativeCharsetConverter();
259 : ~nsNativeCharsetConverter();
260 :
261 : nsresult NativeToUnicode(const char** aInput, uint32_t* aInputLeft,
262 : char16_t** aOutput, uint32_t* aOutputLeft);
263 : nsresult UnicodeToNative(const char16_t** aInput, uint32_t* aInputLeft,
264 : char** aOutput, uint32_t* aOutputLeft);
265 :
266 : static void GlobalInit();
267 : static void GlobalShutdown();
268 : static bool IsNativeUTF8();
269 :
270 : private:
271 : static iconv_t gNativeToUnicode;
272 : static iconv_t gUnicodeToNative;
273 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
274 : static iconv_t gNativeToUTF8;
275 : static iconv_t gUTF8ToNative;
276 : static iconv_t gUnicodeToUTF8;
277 : static iconv_t gUTF8ToUnicode;
278 : #endif
279 : static Mutex* gLock;
280 : static bool gInitialized;
281 : static bool gIsNativeUTF8;
282 :
283 : static void LazyInit();
284 :
285 5691 : static void Lock()
286 : {
287 5691 : if (gLock) {
288 5678 : gLock->Lock();
289 : }
290 5691 : }
291 5691 : static void Unlock()
292 : {
293 5691 : if (gLock) {
294 5678 : gLock->Unlock();
295 : }
296 5691 : }
297 : };
298 :
299 : iconv_t nsNativeCharsetConverter::gNativeToUnicode = INVALID_ICONV_T;
300 : iconv_t nsNativeCharsetConverter::gUnicodeToNative = INVALID_ICONV_T;
301 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
302 : iconv_t nsNativeCharsetConverter::gNativeToUTF8 = INVALID_ICONV_T;
303 : iconv_t nsNativeCharsetConverter::gUTF8ToNative = INVALID_ICONV_T;
304 : iconv_t nsNativeCharsetConverter::gUnicodeToUTF8 = INVALID_ICONV_T;
305 : iconv_t nsNativeCharsetConverter::gUTF8ToUnicode = INVALID_ICONV_T;
306 : #endif
307 : Mutex* nsNativeCharsetConverter::gLock = nullptr;
308 : bool nsNativeCharsetConverter::gInitialized = false;
309 : bool nsNativeCharsetConverter::gIsNativeUTF8 = false;
310 :
311 : void
312 3 : nsNativeCharsetConverter::LazyInit()
313 : {
314 : // LazyInit may be called before NS_StartupNativeCharsetUtils, but
315 : // the setlocale it does has to be called before nl_langinfo. Like in
316 : // NS_StartupNativeCharsetUtils, assume we are called early enough that
317 : // we are the first to care about the locale's charset.
318 3 : if (!gLock) {
319 1 : setlocale(LC_CTYPE, "");
320 : }
321 3 : const char* blank_list[] = { "", nullptr };
322 3 : const char** native_charset_list = blank_list;
323 3 : const char* native_charset = nl_langinfo(CODESET);
324 3 : if (!native_charset) {
325 0 : NS_ERROR("native charset is unknown");
326 : // fallback to ISO-8859-1
327 0 : native_charset_list = ISO_8859_1_NAMES;
328 : } else {
329 3 : native_charset_list[0] = native_charset;
330 : }
331 :
332 : // Most, if not all, Unixen supporting UTF-8 and nl_langinfo(CODESET)
333 : // return 'UTF-8' (or 'utf-8')
334 3 : if (!PL_strcasecmp(native_charset, "UTF-8")) {
335 3 : gIsNativeUTF8 = true;
336 : }
337 :
338 3 : gNativeToUnicode = xp_iconv_open(UTF_16_NAMES, native_charset_list);
339 3 : gUnicodeToNative = xp_iconv_open(native_charset_list, UTF_16_NAMES);
340 :
341 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
342 : if (gNativeToUnicode == INVALID_ICONV_T) {
343 : gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
344 : gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
345 : NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
346 : NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
347 : }
348 : if (gUnicodeToNative == INVALID_ICONV_T) {
349 : gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
350 : gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
351 : NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
352 : NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
353 : }
354 : #else
355 3 : NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
356 3 : NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
357 : #endif
358 :
359 : /*
360 : * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
361 : * prepend a byte order mark unicode character (BOM, u+FEFF) during
362 : * the first use of the iconv converter. The same is the case of
363 : * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
364 : * However, we use 'UTF-16LE/BE' in both cases, instead so that we
365 : * should be safe. But just in case...
366 : *
367 : * This dummy conversion gets rid of the BOMs and fixes bug 153562.
368 : */
369 3 : char dummy_input[1] = { ' ' };
370 : char dummy_output[4];
371 :
372 3 : if (gNativeToUnicode != INVALID_ICONV_T) {
373 3 : const char* input = dummy_input;
374 3 : size_t input_left = sizeof(dummy_input);
375 3 : char* output = dummy_output;
376 3 : size_t output_left = sizeof(dummy_output);
377 :
378 3 : xp_iconv(gNativeToUnicode, &input, &input_left, &output, &output_left);
379 : }
380 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
381 : if (gUTF8ToUnicode != INVALID_ICONV_T) {
382 : const char* input = dummy_input;
383 : size_t input_left = sizeof(dummy_input);
384 : char* output = dummy_output;
385 : size_t output_left = sizeof(dummy_output);
386 :
387 : xp_iconv(gUTF8ToUnicode, &input, &input_left, &output, &output_left);
388 : }
389 : #endif
390 :
391 3 : gInitialized = true;
392 3 : }
393 :
394 : void
395 3 : nsNativeCharsetConverter::GlobalInit()
396 : {
397 3 : gLock = new Mutex("nsNativeCharsetConverter.gLock");
398 3 : }
399 :
400 : void
401 0 : nsNativeCharsetConverter::GlobalShutdown()
402 : {
403 0 : delete gLock;
404 0 : gLock = nullptr;
405 :
406 0 : if (gNativeToUnicode != INVALID_ICONV_T) {
407 0 : iconv_close(gNativeToUnicode);
408 0 : gNativeToUnicode = INVALID_ICONV_T;
409 : }
410 :
411 0 : if (gUnicodeToNative != INVALID_ICONV_T) {
412 0 : iconv_close(gUnicodeToNative);
413 0 : gUnicodeToNative = INVALID_ICONV_T;
414 : }
415 :
416 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
417 : if (gNativeToUTF8 != INVALID_ICONV_T) {
418 : iconv_close(gNativeToUTF8);
419 : gNativeToUTF8 = INVALID_ICONV_T;
420 : }
421 : if (gUTF8ToNative != INVALID_ICONV_T) {
422 : iconv_close(gUTF8ToNative);
423 : gUTF8ToNative = INVALID_ICONV_T;
424 : }
425 : if (gUnicodeToUTF8 != INVALID_ICONV_T) {
426 : iconv_close(gUnicodeToUTF8);
427 : gUnicodeToUTF8 = INVALID_ICONV_T;
428 : }
429 : if (gUTF8ToUnicode != INVALID_ICONV_T) {
430 : iconv_close(gUTF8ToUnicode);
431 : gUTF8ToUnicode = INVALID_ICONV_T;
432 : }
433 : #endif
434 :
435 0 : gInitialized = false;
436 0 : }
437 :
438 5691 : nsNativeCharsetConverter::nsNativeCharsetConverter()
439 : {
440 5691 : Lock();
441 5691 : if (!gInitialized) {
442 3 : LazyInit();
443 : }
444 5691 : }
445 :
446 5691 : nsNativeCharsetConverter::~nsNativeCharsetConverter()
447 : {
448 : // reset converters for next time
449 5691 : if (gNativeToUnicode != INVALID_ICONV_T) {
450 5691 : xp_iconv_reset(gNativeToUnicode);
451 : }
452 5691 : if (gUnicodeToNative != INVALID_ICONV_T) {
453 5691 : xp_iconv_reset(gUnicodeToNative);
454 : }
455 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
456 : if (gNativeToUTF8 != INVALID_ICONV_T) {
457 : xp_iconv_reset(gNativeToUTF8);
458 : }
459 : if (gUTF8ToNative != INVALID_ICONV_T) {
460 : xp_iconv_reset(gUTF8ToNative);
461 : }
462 : if (gUnicodeToUTF8 != INVALID_ICONV_T) {
463 : xp_iconv_reset(gUnicodeToUTF8);
464 : }
465 : if (gUTF8ToUnicode != INVALID_ICONV_T) {
466 : xp_iconv_reset(gUTF8ToUnicode);
467 : }
468 : #endif
469 5691 : Unlock();
470 5691 : }
471 :
472 : nsresult
473 3003 : nsNativeCharsetConverter::NativeToUnicode(const char** aInput,
474 : uint32_t* aInputLeft,
475 : char16_t** aOutput,
476 : uint32_t* aOutputLeft)
477 : {
478 3003 : size_t res = 0;
479 3003 : size_t inLeft = (size_t)*aInputLeft;
480 3003 : size_t outLeft = (size_t)*aOutputLeft * 2;
481 :
482 3003 : if (gNativeToUnicode != INVALID_ICONV_T) {
483 :
484 3003 : res = xp_iconv(gNativeToUnicode, aInput, &inLeft, (char**)aOutput, &outLeft);
485 :
486 3003 : *aInputLeft = inLeft;
487 3003 : *aOutputLeft = outLeft / 2;
488 3003 : if (res != (size_t)-1) {
489 3003 : return NS_OK;
490 : }
491 :
492 0 : NS_WARNING("conversion from native to utf-16 failed");
493 :
494 : // reset converter
495 0 : xp_iconv_reset(gNativeToUnicode);
496 : }
497 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
498 : else if ((gNativeToUTF8 != INVALID_ICONV_T) &&
499 : (gUTF8ToUnicode != INVALID_ICONV_T)) {
500 : // convert first to UTF8, then from UTF8 to UCS2
501 : const char* in = *aInput;
502 :
503 : char ubuf[1024];
504 :
505 : // we assume we're always called with enough space in |aOutput|,
506 : // so convert many chars at a time...
507 : while (inLeft) {
508 : char* p = ubuf;
509 : size_t n = sizeof(ubuf);
510 : res = xp_iconv(gNativeToUTF8, &in, &inLeft, &p, &n);
511 : if (res == (size_t)-1) {
512 : NS_ERROR("conversion from native to utf-8 failed");
513 : break;
514 : }
515 : NS_ASSERTION(outLeft > 0, "bad assumption");
516 : p = ubuf;
517 : n = sizeof(ubuf) - n;
518 : res = xp_iconv(gUTF8ToUnicode, (const char**)&p, &n,
519 : (char**)aOutput, &outLeft);
520 : if (res == (size_t)-1) {
521 : NS_ERROR("conversion from utf-8 to utf-16 failed");
522 : break;
523 : }
524 : }
525 :
526 : (*aInput) += (*aInputLeft - inLeft);
527 : *aInputLeft = inLeft;
528 : *aOutputLeft = outLeft / 2;
529 :
530 : if (res != (size_t)-1) {
531 : return NS_OK;
532 : }
533 :
534 : // reset converters
535 : xp_iconv_reset(gNativeToUTF8);
536 : xp_iconv_reset(gUTF8ToUnicode);
537 : }
538 : #endif
539 :
540 : // fallback: zero-pad and hope for the best
541 : // XXX This is lame and we have to do better.
542 0 : isolatin1_to_utf16(aInput, aInputLeft, aOutput, aOutputLeft);
543 :
544 0 : return NS_OK;
545 : }
546 :
547 : nsresult
548 2688 : nsNativeCharsetConverter::UnicodeToNative(const char16_t** aInput,
549 : uint32_t* aInputLeft,
550 : char** aOutput,
551 : uint32_t* aOutputLeft)
552 : {
553 2688 : size_t res = 0;
554 2688 : size_t inLeft = (size_t)*aInputLeft * 2;
555 2688 : size_t outLeft = (size_t)*aOutputLeft;
556 :
557 2688 : if (gUnicodeToNative != INVALID_ICONV_T) {
558 2688 : res = xp_iconv(gUnicodeToNative, (const char**)aInput, &inLeft,
559 2688 : aOutput, &outLeft);
560 :
561 2688 : *aInputLeft = inLeft / 2;
562 2688 : *aOutputLeft = outLeft;
563 2688 : if (res != (size_t)-1) {
564 2688 : return NS_OK;
565 : }
566 :
567 0 : NS_ERROR("iconv failed");
568 :
569 : // reset converter
570 0 : xp_iconv_reset(gUnicodeToNative);
571 : }
572 : #if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
573 : else if ((gUnicodeToUTF8 != INVALID_ICONV_T) &&
574 : (gUTF8ToNative != INVALID_ICONV_T)) {
575 : const char* in = (const char*)*aInput;
576 :
577 : char ubuf[6]; // max utf-8 char length (really only needs to be 4 bytes)
578 :
579 : // convert one uchar at a time...
580 : while (inLeft && outLeft) {
581 : char* p = ubuf;
582 : size_t n = sizeof(ubuf), one_uchar = sizeof(char16_t);
583 : res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
584 : if (res == (size_t)-1) {
585 : NS_ERROR("conversion from utf-16 to utf-8 failed");
586 : break;
587 : }
588 : p = ubuf;
589 : n = sizeof(ubuf) - n;
590 : res = xp_iconv(gUTF8ToNative, (const char**)&p, &n, aOutput, &outLeft);
591 : if (res == (size_t)-1) {
592 : if (errno == E2BIG) {
593 : // not enough room for last uchar... back up and return.
594 : in -= sizeof(char16_t);
595 : res = 0;
596 : } else {
597 : NS_ERROR("conversion from utf-8 to native failed");
598 : }
599 : break;
600 : }
601 : inLeft -= sizeof(char16_t);
602 : }
603 :
604 : (*aInput) += (*aInputLeft - inLeft / 2);
605 : *aInputLeft = inLeft / 2;
606 : *aOutputLeft = outLeft;
607 : if (res != (size_t)-1) {
608 : return NS_OK;
609 : }
610 :
611 : // reset converters
612 : xp_iconv_reset(gUnicodeToUTF8);
613 : xp_iconv_reset(gUTF8ToNative);
614 : }
615 : #endif
616 :
617 : // fallback: truncate and hope for the best
618 : // XXX This is lame and we have to do better.
619 0 : utf16_to_isolatin1(aInput, aInputLeft, aOutput, aOutputLeft);
620 :
621 0 : return NS_OK;
622 : }
623 :
624 : bool
625 2655 : nsNativeCharsetConverter::IsNativeUTF8()
626 : {
627 2655 : if (!gInitialized) {
628 0 : Lock();
629 0 : if (!gInitialized) {
630 0 : LazyInit();
631 : }
632 0 : Unlock();
633 : }
634 2655 : return gIsNativeUTF8;
635 : }
636 :
637 : #endif // USE_ICONV
638 :
639 : //-----------------------------------------------------------------------------
640 : // conversion using mb[r]towc/wc[r]tomb
641 : //-----------------------------------------------------------------------------
642 : #if defined(USE_STDCONV)
643 : #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
644 : #include <wchar.h> // mbrtowc, wcrtomb
645 : #endif
646 :
647 : class nsNativeCharsetConverter
648 : {
649 : public:
650 : nsNativeCharsetConverter();
651 :
652 : nsresult NativeToUnicode(const char** aInput, uint32_t* aInputLeft,
653 : char16_t** aOutput, uint32_t* aOutputLeft);
654 : nsresult UnicodeToNative(const char16_t** aInput, uint32_t* aInputLeft,
655 : char** aOutput, uint32_t* aOutputLeft);
656 :
657 : static void GlobalInit();
658 : static void GlobalShutdown() { }
659 : static bool IsNativeUTF8();
660 :
661 : private:
662 : static bool gWCharIsUnicode;
663 :
664 : #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
665 : mbstate_t ps;
666 : #endif
667 : };
668 :
669 : bool nsNativeCharsetConverter::gWCharIsUnicode = false;
670 :
671 : nsNativeCharsetConverter::nsNativeCharsetConverter()
672 : {
673 : #if defined(HAVE_WCRTOMB) || defined(HAVE_MBRTOWC)
674 : memset(&ps, 0, sizeof(ps));
675 : #endif
676 : }
677 :
678 : void
679 : nsNativeCharsetConverter::GlobalInit()
680 : {
681 : // verify that wchar_t for the current locale is actually unicode.
682 : // if it is not, then we should avoid calling mbtowc/wctomb and
683 : // just fallback on zero-pad/truncation conversion.
684 : //
685 : // this test cannot be done at build time because the encoding of
686 : // wchar_t may depend on the runtime locale. sad, but true!!
687 : //
688 : // so, if wchar_t is unicode then converting an ASCII character
689 : // to wchar_t should not change its numeric value. we'll just
690 : // check what happens with the ASCII 'a' character.
691 : //
692 : // this test is not perfect... obviously, it could yield false
693 : // positives, but then at least ASCII text would be converted
694 : // properly (or maybe just the 'a' character) -- oh well :(
695 :
696 : char a = 'a';
697 : unsigned int w = 0;
698 :
699 : int res = mbtowc((wchar_t*)&w, &a, 1);
700 :
701 : gWCharIsUnicode = (res != -1 && w == 'a');
702 :
703 : #ifdef DEBUG
704 : if (!gWCharIsUnicode) {
705 : NS_WARNING("wchar_t is not unicode (unicode conversion will be lossy)");
706 : }
707 : #endif
708 : }
709 :
710 : nsresult
711 : nsNativeCharsetConverter::NativeToUnicode(const char** aInput,
712 : uint32_t* aInputLeft,
713 : char16_t** aOutput,
714 : uint32_t* aOutputLeft)
715 : {
716 : if (gWCharIsUnicode) {
717 : int incr;
718 :
719 : // cannot use wchar_t here since it may have been redefined (e.g.,
720 : // via -fshort-wchar). hopefully, sizeof(tmp) is sufficient XP.
721 : unsigned int tmp = 0;
722 : while (*aInputLeft && *aOutputLeft) {
723 : #ifdef HAVE_MBRTOWC
724 : incr = (int)mbrtowc((wchar_t*)&tmp, *aInput, *aInputLeft, &ps);
725 : #else
726 : // XXX is this thread-safe?
727 : incr = (int)mbtowc((wchar_t*)&tmp, *aInput, *aInputLeft);
728 : #endif
729 : if (incr < 0) {
730 : NS_WARNING("mbtowc failed: possible charset mismatch");
731 : // zero-pad and hope for the best
732 : tmp = (unsigned char)**aInput;
733 : incr = 1;
734 : }
735 : ** aOutput = (char16_t)tmp;
736 : (*aInput) += incr;
737 : (*aInputLeft) -= incr;
738 : (*aOutput)++;
739 : (*aOutputLeft)--;
740 : }
741 : } else {
742 : // wchar_t isn't unicode, so the best we can do is treat the
743 : // input as if it is isolatin1 :(
744 : isolatin1_to_utf16(aInput, aInputLeft, aOutput, aOutputLeft);
745 : }
746 :
747 : return NS_OK;
748 : }
749 :
750 : nsresult
751 : nsNativeCharsetConverter::UnicodeToNative(const char16_t** aInput,
752 : uint32_t* aInputLeft,
753 : char** aOutput,
754 : uint32_t* aOutputLeft)
755 : {
756 : if (gWCharIsUnicode) {
757 : int incr;
758 :
759 : while (*aInputLeft && *aOutputLeft >= MB_CUR_MAX) {
760 : #ifdef HAVE_WCRTOMB
761 : incr = (int)wcrtomb(*aOutput, (wchar_t)**aInput, &ps);
762 : #else
763 : // XXX is this thread-safe?
764 : incr = (int)wctomb(*aOutput, (wchar_t)**aInput);
765 : #endif
766 : if (incr < 0) {
767 : NS_WARNING("mbtowc failed: possible charset mismatch");
768 : ** aOutput = (unsigned char)**aInput; // truncate
769 : incr = 1;
770 : }
771 : // most likely we're dead anyways if this assertion should fire
772 : NS_ASSERTION(uint32_t(incr) <= *aOutputLeft, "wrote beyond end of string");
773 : (*aOutput) += incr;
774 : (*aOutputLeft) -= incr;
775 : (*aInput)++;
776 : (*aInputLeft)--;
777 : }
778 : } else {
779 : // wchar_t isn't unicode, so the best we can do is treat the
780 : // input as if it is isolatin1 :(
781 : utf16_to_isolatin1(aInput, aInputLeft, aOutput, aOutputLeft);
782 : }
783 :
784 : return NS_OK;
785 : }
786 :
787 : // XXX : for now, return false
788 : bool
789 : nsNativeCharsetConverter::IsNativeUTF8()
790 : {
791 : return false;
792 : }
793 :
794 : #endif // USE_STDCONV
795 :
796 : //-----------------------------------------------------------------------------
797 : // API implementation
798 : //-----------------------------------------------------------------------------
799 :
800 : nsresult
801 3003 : NS_CopyNativeToUnicode(const nsACString& aInput, nsAString& aOutput)
802 : {
803 3003 : aOutput.Truncate();
804 :
805 3003 : uint32_t inputLen = aInput.Length();
806 :
807 3003 : nsACString::const_iterator iter;
808 3003 : aInput.BeginReading(iter);
809 :
810 : //
811 : // OPTIMIZATION: preallocate space for largest possible result; convert
812 : // directly into the result buffer to avoid intermediate buffer copy.
813 : //
814 : // this will generally result in a larger allocation, but that seems
815 : // better than an extra buffer copy.
816 : //
817 3003 : if (!aOutput.SetLength(inputLen, fallible)) {
818 0 : return NS_ERROR_OUT_OF_MEMORY;
819 : }
820 3003 : nsAString::iterator out_iter;
821 3003 : aOutput.BeginWriting(out_iter);
822 :
823 3003 : char16_t* result = out_iter.get();
824 3003 : uint32_t resultLeft = inputLen;
825 :
826 3003 : const char* buf = iter.get();
827 3003 : uint32_t bufLeft = inputLen;
828 :
829 6006 : nsNativeCharsetConverter conv;
830 3003 : nsresult rv = conv.NativeToUnicode(&buf, &bufLeft, &result, &resultLeft);
831 3003 : if (NS_SUCCEEDED(rv)) {
832 3003 : NS_ASSERTION(bufLeft == 0, "did not consume entire input buffer");
833 3003 : aOutput.SetLength(inputLen - resultLeft);
834 : }
835 3003 : return rv;
836 : }
837 :
838 : nsresult
839 2688 : NS_CopyUnicodeToNative(const nsAString& aInput, nsACString& aOutput)
840 : {
841 2688 : aOutput.Truncate();
842 :
843 2688 : nsAString::const_iterator iter, end;
844 2688 : aInput.BeginReading(iter);
845 2688 : aInput.EndReading(end);
846 :
847 : // cannot easily avoid intermediate buffer copy.
848 : char temp[4096];
849 :
850 5376 : nsNativeCharsetConverter conv;
851 :
852 2688 : const char16_t* buf = iter.get();
853 2688 : uint32_t bufLeft = Distance(iter, end);
854 8064 : while (bufLeft) {
855 2688 : char* p = temp;
856 2688 : uint32_t tempLeft = sizeof(temp);
857 :
858 2688 : nsresult rv = conv.UnicodeToNative(&buf, &bufLeft, &p, &tempLeft);
859 2688 : if (NS_FAILED(rv)) {
860 0 : return rv;
861 : }
862 :
863 2688 : if (tempLeft < sizeof(temp)) {
864 2688 : aOutput.Append(temp, sizeof(temp) - tempLeft);
865 : }
866 : }
867 2688 : return NS_OK;
868 : }
869 :
870 : bool
871 2655 : NS_IsNativeUTF8()
872 : {
873 2655 : return nsNativeCharsetConverter::IsNativeUTF8();
874 : }
875 :
876 : void
877 3 : NS_StartupNativeCharsetUtils()
878 : {
879 : //
880 : // need to initialize the locale or else charset conversion will fail.
881 : // better not delay this in case some other component alters the locale
882 : // settings.
883 : //
884 : // XXX we assume that we are called early enough that we should
885 : // always be the first to care about the locale's charset.
886 : //
887 3 : setlocale(LC_CTYPE, "");
888 :
889 3 : nsNativeCharsetConverter::GlobalInit();
890 3 : }
891 :
892 : void
893 0 : NS_ShutdownNativeCharsetUtils()
894 : {
895 0 : nsNativeCharsetConverter::GlobalShutdown();
896 0 : }
897 :
898 : //-----------------------------------------------------------------------------
899 : // XP_WIN
900 : //-----------------------------------------------------------------------------
901 : #elif defined(XP_WIN)
902 :
903 : #include <windows.h>
904 : #include "nsString.h"
905 : #include "nsAString.h"
906 : #include "nsReadableUtils.h"
907 :
908 : using namespace mozilla;
909 :
910 : nsresult
911 : NS_CopyNativeToUnicode(const nsACString& aInput, nsAString& aOutput)
912 : {
913 : uint32_t inputLen = aInput.Length();
914 :
915 : nsACString::const_iterator iter;
916 : aInput.BeginReading(iter);
917 :
918 : const char* buf = iter.get();
919 :
920 : // determine length of result
921 : uint32_t resultLen = 0;
922 : int n = ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, nullptr, 0);
923 : if (n > 0) {
924 : resultLen += n;
925 : }
926 :
927 : // allocate sufficient space
928 : if (!aOutput.SetLength(resultLen, fallible)) {
929 : return NS_ERROR_OUT_OF_MEMORY;
930 : }
931 : if (resultLen > 0) {
932 : char16ptr_t result = aOutput.BeginWriting();
933 : ::MultiByteToWideChar(CP_ACP, 0, buf, inputLen, result, resultLen);
934 : }
935 : return NS_OK;
936 : }
937 :
938 : nsresult
939 : NS_CopyUnicodeToNative(const nsAString& aInput, nsACString& aOutput)
940 : {
941 : uint32_t inputLen = aInput.Length();
942 :
943 : nsAString::const_iterator iter;
944 : aInput.BeginReading(iter);
945 :
946 : char16ptr_t buf = iter.get();
947 :
948 : // determine length of result
949 : uint32_t resultLen = 0;
950 :
951 : int n = ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, nullptr, 0,
952 : nullptr, nullptr);
953 : if (n > 0) {
954 : resultLen += n;
955 : }
956 :
957 : // allocate sufficient space
958 : if (!aOutput.SetLength(resultLen, fallible)) {
959 : return NS_ERROR_OUT_OF_MEMORY;
960 : }
961 : if (resultLen > 0) {
962 : nsACString::iterator out_iter;
963 : aOutput.BeginWriting(out_iter);
964 :
965 : // default "defaultChar" is '?', which is an illegal character on windows
966 : // file system. That will cause file uncreatable. Change it to '_'
967 : const char defaultChar = '_';
968 :
969 : char* result = out_iter.get();
970 :
971 : ::WideCharToMultiByte(CP_ACP, 0, buf, inputLen, result, resultLen,
972 : &defaultChar, nullptr);
973 : }
974 : return NS_OK;
975 : }
976 :
977 : #else
978 :
979 : #include "nsReadableUtils.h"
980 :
981 : nsresult
982 : NS_CopyNativeToUnicode(const nsACString& aInput, nsAString& aOutput)
983 : {
984 : CopyASCIItoUTF16(aInput, aOutput);
985 : return NS_OK;
986 : }
987 :
988 : nsresult
989 : NS_CopyUnicodeToNative(const nsAString& aInput, nsACString& aOutput)
990 : {
991 : LossyCopyUTF16toASCII(aInput, aOutput);
992 : return NS_OK;
993 : }
994 :
// Nothing to set up for the fallback implementation.
void NS_StartupNativeCharsetUtils()
{
}
999 :
// Nothing to tear down for the fallback implementation.
void NS_ShutdownNativeCharsetUtils()
{
}
1004 :
1005 : #endif
|