Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99:
3 : * This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #ifndef js_CharacterEncoding_h
8 : #define js_CharacterEncoding_h
9 :
10 : #include "mozilla/Range.h"
11 :
12 : #include "js/TypeDecls.h"
13 : #include "js/Utility.h"
14 :
15 : class JSFlatString;
16 :
17 : namespace JS {
18 :
19 : /*
20 : * By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
21 : * are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
22 : * byte is treated as a 2-byte character, and there is no way to pass in a
23 : * string containing characters beyond U+00FF.
24 : */
25 : class Latin1Chars : public mozilla::Range<Latin1Char>
26 : {
27 : typedef mozilla::Range<Latin1Char> Base;
28 :
29 : public:
30 : using CharT = Latin1Char;
31 :
32 : Latin1Chars() : Base() {}
33 0 : Latin1Chars(char* aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength) {}
34 : Latin1Chars(const Latin1Char* aBytes, size_t aLength)
35 : : Base(const_cast<Latin1Char*>(aBytes), aLength)
36 : {}
37 : Latin1Chars(const char* aBytes, size_t aLength)
38 : : Base(reinterpret_cast<Latin1Char*>(const_cast<char*>(aBytes)), aLength)
39 : {}
40 : };
41 :
42 : /*
43 : * A Latin1Chars, but with \0 termination for C compatibility.
44 : */
45 : class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
46 : {
47 : typedef mozilla::RangedPtr<Latin1Char> Base;
48 :
49 : public:
50 : using CharT = Latin1Char;
51 :
52 0 : Latin1CharsZ() : Base(nullptr, 0) {}
53 :
54 : Latin1CharsZ(char* aBytes, size_t aLength)
55 : : Base(reinterpret_cast<Latin1Char*>(aBytes), aLength)
56 : {
57 : MOZ_ASSERT(aBytes[aLength] == '\0');
58 : }
59 :
60 0 : Latin1CharsZ(Latin1Char* aBytes, size_t aLength)
61 0 : : Base(aBytes, aLength)
62 : {
63 0 : MOZ_ASSERT(aBytes[aLength] == '\0');
64 0 : }
65 :
66 : using Base::operator=;
67 :
68 0 : char* c_str() { return reinterpret_cast<char*>(get()); }
69 : };
70 :
71 : class UTF8Chars : public mozilla::Range<unsigned char>
72 : {
73 : typedef mozilla::Range<unsigned char> Base;
74 :
75 : public:
76 : using CharT = unsigned char;
77 :
78 : UTF8Chars() : Base() {}
79 0 : UTF8Chars(char* aBytes, size_t aLength)
80 0 : : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
81 0 : {}
82 323 : UTF8Chars(const char* aBytes, size_t aLength)
83 323 : : Base(reinterpret_cast<unsigned char*>(const_cast<char*>(aBytes)), aLength)
84 323 : {}
85 : };
86 :
87 : /*
88 : * SpiderMonkey also deals directly with UTF-8 encoded text in some places.
89 : */
90 : class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
91 : {
92 : typedef mozilla::RangedPtr<unsigned char> Base;
93 :
94 : public:
95 : using CharT = unsigned char;
96 :
97 0 : UTF8CharsZ() : Base(nullptr, 0) {}
98 :
99 654 : UTF8CharsZ(char* aBytes, size_t aLength)
100 654 : : Base(reinterpret_cast<unsigned char*>(aBytes), aLength)
101 : {
102 654 : MOZ_ASSERT(aBytes[aLength] == '\0');
103 654 : }
104 :
105 : UTF8CharsZ(unsigned char* aBytes, size_t aLength)
106 : : Base(aBytes, aLength)
107 : {
108 : MOZ_ASSERT(aBytes[aLength] == '\0');
109 : }
110 :
111 : using Base::operator=;
112 :
113 654 : char* c_str() { return reinterpret_cast<char*>(get()); }
114 : };
115 :
116 : /*
117 : * A wrapper for a "const char*" that is encoded using UTF-8.
118 : * This class does not manage ownership of the data; that is left
119 : * to others. This differs from UTF8CharsZ in that the chars are
120 : * const and it allows assignment.
121 : */
122 : class ConstUTF8CharsZ
123 : {
124 : const char* data_;
125 :
126 : public:
127 : using CharT = unsigned char;
128 :
129 6 : ConstUTF8CharsZ() : data_(nullptr)
130 6 : {}
131 :
132 4 : ConstUTF8CharsZ(const char* aBytes, size_t aLength)
133 4 : : data_(aBytes)
134 : {
135 4 : MOZ_ASSERT(aBytes[aLength] == '\0');
136 : #ifdef DEBUG
137 4 : validate(aLength);
138 : #endif
139 4 : }
140 :
141 2 : const void* get() const { return data_; }
142 :
143 8 : const char* c_str() const { return data_; }
144 :
145 12 : explicit operator bool() const { return data_ != nullptr; }
146 :
147 : private:
148 : #ifdef DEBUG
149 : void validate(size_t aLength);
150 : #endif
151 : };
152 :
153 : /*
154 : * SpiderMonkey uses a 2-byte character representation: it is a
155 : * 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
156 : * but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
157 : * sufficiently dedicated JavaScript program to be fully unicode-aware by
158 : * manually interpreting UTF-16 extension characters embedded in the JS
159 : * string.
160 : */
161 : class TwoByteChars : public mozilla::Range<char16_t>
162 : {
163 : typedef mozilla::Range<char16_t> Base;
164 :
165 : public:
166 : using CharT = char16_t;
167 :
168 : TwoByteChars() : Base() {}
169 280 : TwoByteChars(char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
170 0 : TwoByteChars(const char16_t* aChars, size_t aLength) : Base(const_cast<char16_t*>(aChars), aLength) {}
171 : };
172 :
173 : /*
174 : * A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
175 : */
176 : class TwoByteCharsZ : public mozilla::RangedPtr<char16_t>
177 : {
178 : typedef mozilla::RangedPtr<char16_t> Base;
179 :
180 : public:
181 : using CharT = char16_t;
182 :
183 0 : TwoByteCharsZ() : Base(nullptr, 0) {}
184 :
185 317 : TwoByteCharsZ(char16_t* chars, size_t length)
186 317 : : Base(chars, length)
187 : {
188 317 : MOZ_ASSERT(chars[length] == '\0');
189 317 : }
190 :
191 : using Base::operator=;
192 : };
193 :
194 : typedef mozilla::RangedPtr<const char16_t> ConstCharPtr;
195 :
196 : /*
197 : * Like TwoByteChars, but the chars are const.
198 : */
199 : class ConstTwoByteChars : public mozilla::Range<const char16_t>
200 : {
201 : typedef mozilla::Range<const char16_t> Base;
202 :
203 : public:
204 : using CharT = char16_t;
205 :
206 : ConstTwoByteChars() : Base() {}
207 2 : ConstTwoByteChars(const char16_t* aChars, size_t aLength) : Base(aChars, aLength) {}
208 : };
209 :
210 : /*
211 : * Convert a 2-byte character sequence to "ISO-Latin-1". This works by
212 : * truncating each 2-byte character in the sequence to a single byte. If the
213 : * source contains any UTF-16 extension characters, then this may give invalid
214 : * Latin1 output. The returned string is zero terminated. The returned string
215 : * or the returned string's |get()| must be freed with JS_free or js_free,
216 : * respectively. If allocation fails, an OOM error will be set and the method
217 : * will return null chars (which can be tested for with the ! operator).
218 : * This method cannot trigger GC.
219 : */
220 : extern Latin1CharsZ
221 : LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx,
222 : const mozilla::Range<const char16_t> tbchars);
223 :
224 : inline Latin1CharsZ
225 : LossyTwoByteCharsToNewLatin1CharsZ(JSContext* cx, const char16_t* begin, size_t length)
226 : {
227 : const mozilla::Range<const char16_t> tbchars(begin, length);
228 : return JS::LossyTwoByteCharsToNewLatin1CharsZ(cx, tbchars);
229 : }
230 :
231 : template <typename CharT>
232 : extern UTF8CharsZ
233 : CharsToNewUTF8CharsZ(JSContext* maybeCx, const mozilla::Range<CharT> chars);
234 :
235 : JS_PUBLIC_API(uint32_t)
236 : Utf8ToOneUcs4Char(const uint8_t* utf8Buffer, int utf8Length);
237 :
238 : /*
239 : * Inflate bytes in UTF-8 encoding to char16_t.
240 : * - On error, returns an empty TwoByteCharsZ.
241 : * - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
242 : * its length; the length value excludes the trailing null.
243 : */
244 : extern JS_PUBLIC_API(TwoByteCharsZ)
245 : UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
246 :
247 : /*
248 : * Like UTF8CharsToNewTwoByteCharsZ, but for ConstUTF8CharsZ.
249 : */
250 : extern JS_PUBLIC_API(TwoByteCharsZ)
251 : UTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
252 :
253 : /*
254 : * The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
255 : * will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
256 : * input.
257 : */
258 : extern JS_PUBLIC_API(TwoByteCharsZ)
259 : LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
260 :
261 : extern JS_PUBLIC_API(TwoByteCharsZ)
262 : LossyUTF8CharsToNewTwoByteCharsZ(JSContext* cx, const ConstUTF8CharsZ& utf8, size_t* outlen);
263 :
264 : /*
265 : * Returns the length of the char buffer required to encode |s| as UTF8.
266 : * Does not include the null-terminator.
267 : */
268 : JS_PUBLIC_API(size_t)
269 : GetDeflatedUTF8StringLength(JSFlatString* s);
270 :
271 : /*
272 : * Encode |src| as UTF8. The caller must either ensure |dst| has enough space
273 : * to encode the entire string or pass the length of the buffer as |dstlenp|,
274 : * in which case the function will encode characters from the string until
275 : * the buffer is exhausted. Does not write the null terminator.
276 : *
277 : * If |dstlenp| is provided, it will be updated to hold the number of bytes
278 : * written to the buffer. If |numcharsp| is provided, it will be updated to hold
279 : * the number of Unicode characters written to the buffer (which can be less
280 : * than the length of the string, if the buffer is exhausted before the string
281 : * is fully encoded).
282 : */
283 : JS_PUBLIC_API(void)
284 : DeflateStringToUTF8Buffer(JSFlatString* src, mozilla::RangedPtr<char> dst,
285 : size_t* dstlenp = nullptr, size_t* numcharsp = nullptr);
286 :
/*
 * Names the smallest character encoding that can fully represent a given
 * string. The enumerators are listed from narrowest to widest.
 */
enum class SmallestEncoding {
    // Every code point is below 128.
    ASCII,

    // Not pure ASCII, but every code point is below 256.
    Latin1,

    // At least one code point needs more than Latin-1 can hold.
    UTF16,
};
296 :
297 : /*
298 : * Returns the smallest encoding possible for the given string: if all
299 : * codepoints are <128 then ASCII, otherwise if all codepoints are <256
300 : * Latin-1, else UTF16.
301 : */
302 : JS_PUBLIC_API(SmallestEncoding)
303 : FindSmallestEncoding(UTF8Chars utf8);
304 :
305 : /*
306 : * Return a null-terminated Latin-1 string copied from the input string,
307 : * storing its length (excluding null terminator) in |*outlen|. Fail and
308 : * report an error if the string contains non-Latin-1 codepoints. Returns
309 : * Latin1CharsZ() on failure.
310 : */
311 : extern JS_PUBLIC_API(Latin1CharsZ)
312 : UTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
313 :
314 : /*
315 : * Return a null-terminated Latin-1 string copied from the input string,
316 : * storing its length (excluding null terminator) in |*outlen|. Non-Latin-1
317 : * codepoints are replaced by '?'. Returns Latin1CharsZ() on failure.
318 : */
319 : extern JS_PUBLIC_API(Latin1CharsZ)
320 : LossyUTF8CharsToNewLatin1CharsZ(JSContext* cx, const UTF8Chars utf8, size_t* outlen);
321 :
322 : /*
323 : * Returns true if all characters in the given null-terminated string are
324 : * ASCII, i.e. < 0x80, false otherwise.
325 : */
326 : extern JS_PUBLIC_API(bool)
327 : StringIsASCII(const char* s);
328 :
329 : } // namespace JS
330 :
331 : inline void JS_free(JS::Latin1CharsZ& ptr) { js_free((void*)ptr.get()); }
332 : inline void JS_free(JS::UTF8CharsZ& ptr) { js_free((void*)ptr.get()); }
333 :
334 : #endif /* js_CharacterEncoding_h */
|