Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 : * vim: set ts=8 sts=4 et sw=4 tw=99:
3 : * This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #ifndef vm_Unicode_h
8 : #define vm_Unicode_h
9 :
10 : #include "jspubtd.h"
11 : #include "vm/UnicodeNonBMP.h"
12 :
// Boolean lookup tables defined elsewhere. The inline helpers in this header
// only index them with character codes below 128.
extern const bool js_isidstart[];   // consulted by js::unicode::IsIdentifierStart
extern const bool js_isident[];     // consulted by js::unicode::IsIdentifierPart
extern const bool js_isspace[];     // consulted by js::unicode::IsSpace and IsSpaceOrBOM2
16 :
17 : namespace js {
18 : namespace unicode {
19 :
20 : /*
21 : * This namespace contains all the knowledge required to handle Unicode
22 : * characters in JavaScript.
23 : *
24 : * SPACE
25 : * Every character that is either in the ECMAScript class WhiteSpace
26 : * (ES2016, § 11.2) or in LineTerminator (ES2016, § 11.3).
27 : *
28 : * WhiteSpace
29 : * \u0009, \u000B, \u000C, \u0020, \u00A0 and \uFEFF
30 : * and every other Unicode character with the General Category "Zs".
31 : * See <http://www.unicode.org/reports/tr44/#UnicodeData.txt> for more
32 : * information about General Categories and the UnicodeData.txt file.
33 : *
34 : * LineTerminator
35 : * \u000A, \u000D, \u2028, \u2029
36 : *
37 : * UNICODE_ID_START
38 : * These are all characters with the Unicode property «ID_Start».
39 : *
40 : * UNICODE_ID_CONTINUE_ONLY
41 : * These are all characters with the Unicode property «ID_Continue» minus all
42 : * characters with the Unicode property «ID_Start».
43 : * And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6)
44 : *
45 : * UNICODE_ID_CONTINUE
46 : * These are all characters with the Unicode property «ID_Continue».
47 : * And additionally <ZWNJ> and <ZWJ>. (ES2016, § 11.6)
48 : *
 * Attention: UNICODE_ID_START is _not_ IdentifierStart and UNICODE_ID_CONTINUE
 * is _not_ IdentifierPart ('$' and '_' are covered by neither flag), but you
 * could build a matcher for the real IdentifierPart like this:
51 : *
52 : * if char in ['$', '_']:
53 : * return True
54 : * if GetFlag(char) & UNICODE_ID_CONTINUE:
55 : * return True
56 : *
57 : */
58 :
// Bit masks stored in CharacterInfo::flags.
namespace CharFlag {
    const uint8_t SPACE                    = 0x01;
    const uint8_t UNICODE_ID_START         = 0x02;
    const uint8_t UNICODE_ID_CONTINUE_ONLY = 0x04;
    // Union of the two identifier bits: testing against it succeeds when
    // either one is set.
    const uint8_t UNICODE_ID_CONTINUE      = UNICODE_ID_START | UNICODE_ID_CONTINUE_ONLY;
}
65 :
// Individual BMP characters that callers need to name explicitly.
constexpr char16_t NO_BREAK_SPACE = 0x00A0;
constexpr char16_t MICRO_SIGN = 0x00B5;
constexpr char16_t LATIN_SMALL_LETTER_SHARP_S = 0x00DF;
constexpr char16_t LATIN_SMALL_LETTER_Y_WITH_DIAERESIS = 0x00FF;
constexpr char16_t LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE = 0x0130;
constexpr char16_t COMBINING_DOT_ABOVE = 0x0307;
constexpr char16_t GREEK_CAPITAL_LETTER_SIGMA = 0x03A3;
constexpr char16_t GREEK_SMALL_LETTER_FINAL_SIGMA = 0x03C2;
constexpr char16_t GREEK_SMALL_LETTER_SIGMA = 0x03C3;
constexpr char16_t LINE_SEPARATOR = 0x2028;
constexpr char16_t PARA_SEPARATOR = 0x2029;
// 0xFFFE is additionally treated as white space by IsSpaceOrBOM2().
constexpr char16_t BYTE_ORDER_MARK2 = 0xFFFE;

// Inclusive bounds of the UTF-16 lead (high) and trail (low) surrogate ranges.
constexpr char16_t LeadSurrogateMin = 0xD800;
constexpr char16_t LeadSurrogateMax = 0xDBFF;
constexpr char16_t TrailSurrogateMin = 0xDC00;
constexpr char16_t TrailSurrogateMax = 0xDFFF;

// Largest value representable in a single UTF-16 code unit, and the inclusive
// bounds of the supplementary (non-BMP) code point range.
constexpr uint32_t UTF16Max = 0xFFFF;
constexpr uint32_t NonBMPMin = 0x10000;
constexpr uint32_t NonBMPMax = 0x10FFFF;
87 :
88 : class CharacterInfo {
89 : /*
90 : * upperCase and lowerCase normally store the delta between two
91 : * letters. For example the lower case alpha (a) has the char code
92 : * 97, and the upper case alpha (A) has 65. So for "a" we would
93 : * store -32 in upperCase (97 + (-32) = 65) and 0 in lowerCase,
94 : * because this char is already in lower case.
95 : * Well, not -32 exactly, but (2**16 - 32) to induce
96 : * unsigned overflow with identical mathematical behavior.
97 : * For upper case alpha, we would store 0 in upperCase and 32 in
98 : * lowerCase (65 + 32 = 97).
99 : *
100 : * We use deltas to reuse information for multiple characters. For
101 : * example the whole lower case latin alphabet fits into one entry,
102 : * because it's always a UnicodeLetter and upperCase contains
103 : * -32.
104 : */
105 : public:
106 : uint16_t upperCase;
107 : uint16_t lowerCase;
108 : uint8_t flags;
109 :
110 0 : inline bool isSpace() const {
111 0 : return flags & CharFlag::SPACE;
112 : }
113 :
114 0 : inline bool isUnicodeIDStart() const {
115 0 : return flags & CharFlag::UNICODE_ID_START;
116 : }
117 :
118 0 : inline bool isUnicodeIDContinue() const {
119 : // Also matches <ZWNJ> and <ZWJ>!
120 0 : return flags & CharFlag::UNICODE_ID_CONTINUE;
121 : }
122 : };
123 :
124 : extern const uint8_t index1[];
125 : extern const uint8_t index2[];
126 : extern const CharacterInfo js_charinfo[];
127 :
128 : inline const CharacterInfo&
129 103 : CharInfo(char16_t code)
130 : {
131 103 : const size_t shift = 6;
132 103 : size_t index = index1[code >> shift];
133 103 : index = index2[(index << shift) + (code & ((1 << shift) - 1))];
134 :
135 103 : return js_charinfo[index];
136 : }
137 :
138 : inline bool
139 10156 : IsIdentifierStart(char16_t ch)
140 : {
141 : /*
142 : * ES2016 11.6 IdentifierStart
143 : * $ (dollar sign)
144 : * _ (underscore)
145 : * or any character with the Unicode property «ID_Start».
146 : *
147 : * We use a lookup table for small and thus common characters for speed.
148 : */
149 :
150 10156 : if (ch < 128)
151 10156 : return js_isidstart[ch];
152 :
153 0 : return CharInfo(ch).isUnicodeIDStart();
154 : }
155 :
156 : bool
157 : IsIdentifierStartNonBMP(uint32_t codePoint);
158 :
159 : inline bool
160 0 : IsIdentifierStart(uint32_t codePoint)
161 : {
162 0 : if (MOZ_UNLIKELY(codePoint > UTF16Max))
163 0 : return IsIdentifierStartNonBMP(codePoint);
164 0 : return IsIdentifierStart(char16_t(codePoint));
165 : }
166 :
167 : inline bool
168 1628281 : IsIdentifierPart(char16_t ch)
169 : {
170 : /*
171 : * ES2016 11.6 IdentifierPart
172 : * $ (dollar sign)
173 : * _ (underscore)
174 : * <ZWNJ>
175 : * <ZWJ>
176 : * or any character with the Unicode property «ID_Continue».
177 : *
178 : * We use a lookup table for small and thus common characters for speed.
179 : */
180 :
181 1628281 : if (ch < 128)
182 1628281 : return js_isident[ch];
183 :
184 0 : return CharInfo(ch).isUnicodeIDContinue();
185 : }
186 :
187 : bool
188 : IsIdentifierPartNonBMP(uint32_t codePoint);
189 :
190 : inline bool
191 0 : IsIdentifierPart(uint32_t codePoint)
192 : {
193 0 : if (MOZ_UNLIKELY(codePoint > UTF16Max))
194 0 : return IsIdentifierPartNonBMP(codePoint);
195 0 : return IsIdentifierPart(char16_t(codePoint));
196 : }
197 :
198 : inline bool
199 0 : IsUnicodeIDStart(char16_t ch)
200 : {
201 0 : return CharInfo(ch).isUnicodeIDStart();
202 : }
203 :
204 : bool
205 : IsUnicodeIDStartNonBMP(uint32_t codePoint);
206 :
207 : inline bool
208 0 : IsUnicodeIDStart(uint32_t codePoint)
209 : {
210 0 : if (MOZ_UNLIKELY(codePoint > UTF16Max))
211 0 : return IsIdentifierStartNonBMP(codePoint);
212 0 : return IsUnicodeIDStart(char16_t(codePoint));
213 : }
214 :
215 : inline bool
216 395 : IsSpace(char16_t ch)
217 : {
218 : /*
219 : * IsSpace checks if some character is included in the merged set
220 : * of WhiteSpace and LineTerminator, specified by ES2016 11.2 and 11.3.
221 : * We combined them, because in practice nearly every
222 : * calling function wants this, except some code in the tokenizer.
223 : *
224 : * We use a lookup table for ASCII-7 characters, because they are
225 : * very common and must be handled quickly in the tokenizer.
226 : * NO-BREAK SPACE is supposed to be the most common character not in
227 : * this range, so we inline this case, too.
228 : */
229 :
230 395 : if (ch < 128)
231 395 : return js_isspace[ch];
232 :
233 0 : if (ch == NO_BREAK_SPACE)
234 0 : return true;
235 :
236 0 : return CharInfo(ch).isSpace();
237 : }
238 :
239 : inline bool
240 107 : IsSpaceOrBOM2(char16_t ch)
241 : {
242 107 : if (ch < 128)
243 107 : return js_isspace[ch];
244 :
245 : /* We accept BOM2 (0xFFFE) for compatibility reasons in the parser. */
246 0 : if (ch == NO_BREAK_SPACE || ch == BYTE_ORDER_MARK2)
247 0 : return true;
248 :
249 0 : return CharInfo(ch).isSpace();
250 : }
251 :
252 : /*
253 : * Returns the simple upper case mapping (see CanUpperCaseSpecialCasing for
254 : * details) of the given UTF-16 code unit.
255 : */
256 : inline char16_t
257 679 : ToUpperCase(char16_t ch)
258 : {
259 679 : if (ch < 128) {
260 679 : if (ch >= 'a' && ch <= 'z')
261 435 : return ch - ('a' - 'A');
262 244 : return ch;
263 : }
264 :
265 0 : const CharacterInfo& info = CharInfo(ch);
266 :
267 0 : return uint16_t(ch) + info.upperCase;
268 : }
269 :
270 : /*
271 : * Returns the simple lower case mapping (see CanUpperCaseSpecialCasing for
272 : * details) of the given UTF-16 code unit.
273 : */
274 : inline char16_t
275 218 : ToLowerCase(char16_t ch)
276 : {
277 218 : if (ch < 128) {
278 218 : if (ch >= 'A' && ch <= 'Z')
279 43 : return ch + ('a' - 'A');
280 175 : return ch;
281 : }
282 :
283 0 : const CharacterInfo& info = CharInfo(ch);
284 :
285 0 : return uint16_t(ch) + info.lowerCase;
286 : }
287 :
288 : // Returns true iff ToUpperCase(ch) != ch.
289 : inline bool
290 9 : CanUpperCase(char16_t ch)
291 : {
292 9 : if (ch < 128)
293 9 : return ch >= 'a' && ch <= 'z';
294 0 : return CharInfo(ch).upperCase != 0;
295 : }
296 :
297 : // Returns true iff ToLowerCase(ch) != ch.
298 : inline bool
299 163 : CanLowerCase(char16_t ch)
300 : {
301 163 : if (ch < 128)
302 60 : return ch >= 'A' && ch <= 'Z';
303 103 : return CharInfo(ch).lowerCase != 0;
304 : }
305 :
306 : #define CHECK_RANGE(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
307 : if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \
308 : return true;
309 :
310 : inline bool
311 0 : CanUpperCaseNonBMP(char16_t lead, char16_t trail)
312 : {
313 0 : FOR_EACH_NON_BMP_UPPERCASE(CHECK_RANGE)
314 0 : return false;
315 : }
316 :
317 : inline bool
318 0 : CanLowerCaseNonBMP(char16_t lead, char16_t trail)
319 : {
320 0 : FOR_EACH_NON_BMP_LOWERCASE(CHECK_RANGE)
321 0 : return false;
322 : }
323 :
324 : #undef CHECK_RANGE
325 :
326 : inline char16_t
327 0 : ToUpperCaseNonBMPTrail(char16_t lead, char16_t trail)
328 : {
329 : #define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
330 : if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \
331 : return trail + DIFF;
332 0 : FOR_EACH_NON_BMP_UPPERCASE(CALC_TRAIL)
333 : #undef CALL_TRAIL
334 :
335 0 : return trail;
336 : }
337 :
338 : inline char16_t
339 0 : ToLowerCaseNonBMPTrail(char16_t lead, char16_t trail)
340 : {
341 : #define CALC_TRAIL(FROM, TO, LEAD, TRAIL_FROM, TRAIL_TO, DIFF) \
342 : if (lead == LEAD && trail >= TRAIL_FROM && trail <= TRAIL_TO) \
343 : return trail + DIFF;
344 0 : FOR_EACH_NON_BMP_LOWERCASE(CALC_TRAIL)
345 : #undef CALL_TRAIL
346 :
347 0 : return trail;
348 : }
349 :
/*
 * Returns true if the given UTF-16 code unit has a language-independent,
 * unconditional or conditional special upper case mapping.
 *
 * Unicode defines two case mapping modes:
 * 1. "simple case mappings" for one-to-one mappings which are independent of
 *    context and language (defined in UnicodeData.txt).
 * 2. "special case mappings" for mappings which can increase or decrease the
 *    string length; or are dependent on context or locale (defined in
 *    SpecialCasing.txt).
 *
 * The CanUpperCase() function defined above only supports simple case
 * mappings. In order to support the full case mappings of all Unicode
 * characters, callers need to check this function in addition to
 * CanUpperCase().
 *
 * NOTE: All special upper case mappings are unconditional in Unicode 9.
 */
bool
CanUpperCaseSpecialCasing(char16_t ch);

/*
 * Returns the length of the special upper case mapping of |ch|.
 * (Presumably counted in UTF-16 code units — confirm in the definition.)
 *
 * This function asserts if |ch| doesn't have a special upper case mapping,
 * so callers should check CanUpperCaseSpecialCasing(ch) first.
 */
size_t
LengthUpperCaseSpecialCasing(char16_t ch);

/*
 * Appends the special upper case mapping of |ch| to the output buffer
 * |elements|, starting at position |*index|.
 * (Presumably |*index| is advanced past the appended code units and the
 * caller must provide enough room, see LengthUpperCaseSpecialCasing — confirm
 * in the definition.)
 *
 * This function asserts if |ch| doesn't have a special upper case mapping,
 * so callers should check CanUpperCaseSpecialCasing(ch) first.
 */
void
AppendUpperCaseSpecialCasing(char16_t ch, char16_t* elements, size_t* index);
386 :
387 : /*
388 : * For a codepoint C, CodepointsWithSameUpperCaseInfo stores three offsets
389 : * from C to up to three codepoints with same uppercase (no codepoint in
390 : * UnicodeData.txt has more than three such codepoints).
391 : *
392 : * To illustrate, consider the codepoint U+0399 GREEK CAPITAL LETTER IOTA, the
393 : * uppercased form of these three codepoints:
394 : *
395 : * U+03B9 GREEK SMALL LETTER IOTA
396 : * U+1FBE GREEK PROSGEGRAMMENI
397 : * U+0345 COMBINING GREEK YPOGEGRAMMENI
398 : *
399 : * For the CodepointsWithSameUpperCaseInfo corresponding to this codepoint,
400 : * delta{1,2,3} are 16-bit modular deltas from 0x0399 to each respective
401 : * codepoint:
402 : * uint16_t(0x03B9 - 0x0399),
403 : * uint16_t(0x1FBE - 0x0399),
404 : * uint16_t(0x0345 - 0x0399)
405 : * in an unimportant order.
406 : *
407 : * If there are fewer than three other codepoints, some fields are zero.
408 : * Consider the codepoint U+03B9 above, the other two codepoints U+1FBE and
409 : * U+0345 have same uppercase (U+0399 is not). For the
410 : * CodepointsWithSameUpperCaseInfo corresponding to this codepoint,
411 : * delta{1,2,3} are:
412 : * uint16_t(0x1FBE - 0x03B9),
413 : * uint16_t(0x0345 - 0x03B9),
414 : * uint16_t(0)
415 : * in an unimportant order.
416 : *
417 : * Because multiple codepoints map to a single CodepointsWithSameUpperCaseInfo,
418 : * a CodepointsWithSameUpperCaseInfo and its delta{1,2,3} have no meaning
419 : * standing alone: they have meaning only with respect to a codepoint mapping
420 : * to that CodepointsWithSameUpperCaseInfo.
421 : */
// Shared record of up to three 16-bit modular deltas; a delta of zero stands
// for "no further codepoint". Only meaningful relative to a codepoint that
// maps to the record.
class CodepointsWithSameUpperCaseInfo
{
  public:
    uint16_t delta1;
    uint16_t delta2;
    uint16_t delta3;
};

// Two-level lookup tables (defined elsewhere) leading to the shared records.
extern const uint8_t codepoints_with_same_upper_index1[];
extern const uint8_t codepoints_with_same_upper_index2[];
extern const CodepointsWithSameUpperCaseInfo js_codepoints_with_same_upper_info[];

/*
 * Pairs a UTF-16 code unit with its CodepointsWithSameUpperCaseInfo record
 * and exposes the (up to three) other codepoints sharing its upper case.
 * An accessor whose delta is zero returns the original code unit itself.
 */
class CodepointsWithSameUpperCase
{
    const CodepointsWithSameUpperCaseInfo& info_;
    const char16_t code_;

    // Resolves |code| through the two index tables: the upper bits pick a
    // block, the low 6 bits pick the entry inside that block.
    static const CodepointsWithSameUpperCaseInfo& computeInfo(char16_t code) {
        const size_t shift = 6;
        const size_t lowBits = code & ((1 << shift) - 1);
        const size_t block = codepoints_with_same_upper_index1[code >> shift];
        const size_t record = codepoints_with_same_upper_index2[(block << shift) + lowBits];
        return js_codepoints_with_same_upper_info[record];
    }

    // Applies a 16-bit modular delta to the stored code unit.
    char16_t applyDelta(uint16_t delta) const {
        return uint16_t(code_) + delta;
    }

  public:
    explicit CodepointsWithSameUpperCase(char16_t code)
      : info_(computeInfo(code)),
        code_(code)
    {}

    char16_t other1() const { return applyDelta(info_.delta1); }
    char16_t other2() const { return applyDelta(info_.delta2); }
    char16_t other3() const { return applyDelta(info_.delta3); }
};
456 :
// Shared record of 16-bit modular deltas, each added to a code unit by
// FoldCase (folding) and ReverseFoldCase1/2/3 (reverse1/2/3) respectively.
class FoldingInfo {
  public:
    uint16_t folding;
    uint16_t reverse1;
    uint16_t reverse2;
    uint16_t reverse3;
};

// Two-level lookup tables (defined elsewhere) leading to the shared records.
extern const uint8_t folding_index1[];
extern const uint8_t folding_index2[];
extern const FoldingInfo js_foldinfo[];

/*
 * Returns the FoldingInfo record for the UTF-16 code unit |code|: the upper
 * bits select a block via folding_index1, the low 6 bits select the entry of
 * folding_index2 inside that block, and that entry indexes js_foldinfo.
 */
inline const FoldingInfo&
CaseFoldInfo(char16_t code)
{
    const size_t shift = 6;
    const size_t lowBits = code & ((1 << shift) - 1);
    const size_t block = folding_index1[code >> shift];
    const size_t record = folding_index2[(block << shift) + lowBits];
    return js_foldinfo[record];
}
477 :
478 : inline char16_t
479 0 : FoldCase(char16_t ch)
480 : {
481 0 : const FoldingInfo& info = CaseFoldInfo(ch);
482 0 : return uint16_t(ch) + info.folding;
483 : }
484 :
485 : inline char16_t
486 0 : ReverseFoldCase1(char16_t ch)
487 : {
488 0 : const FoldingInfo& info = CaseFoldInfo(ch);
489 0 : return uint16_t(ch) + info.reverse1;
490 : }
491 :
492 : inline char16_t
493 0 : ReverseFoldCase2(char16_t ch)
494 : {
495 0 : const FoldingInfo& info = CaseFoldInfo(ch);
496 0 : return uint16_t(ch) + info.reverse2;
497 : }
498 :
499 : inline char16_t
500 0 : ReverseFoldCase3(char16_t ch)
501 : {
502 0 : const FoldingInfo& info = CaseFoldInfo(ch);
503 0 : return uint16_t(ch) + info.reverse3;
504 : }
505 :
506 : inline bool
507 0 : IsSupplementary(uint32_t codePoint)
508 : {
509 0 : return codePoint >= NonBMPMin && codePoint <= NonBMPMax;
510 : }
511 :
512 : inline bool
513 1637958 : IsLeadSurrogate(uint32_t codePoint)
514 : {
515 1637958 : return codePoint >= LeadSurrogateMin && codePoint <= LeadSurrogateMax;
516 : }
517 :
518 : inline bool
519 270 : IsTrailSurrogate(uint32_t codePoint)
520 : {
521 270 : return codePoint >= TrailSurrogateMin && codePoint <= TrailSurrogateMax;
522 : }
523 :
524 : inline char16_t
525 0 : LeadSurrogate(uint32_t codePoint)
526 : {
527 0 : MOZ_ASSERT(IsSupplementary(codePoint));
528 :
529 0 : return char16_t((codePoint >> 10) + (LeadSurrogateMin - (NonBMPMin >> 10)));
530 : }
531 :
532 : inline char16_t
533 0 : TrailSurrogate(uint32_t codePoint)
534 : {
535 0 : MOZ_ASSERT(IsSupplementary(codePoint));
536 :
537 0 : return char16_t((codePoint & 0x3FF) | TrailSurrogateMin);
538 : }
539 :
540 : inline void
541 0 : UTF16Encode(uint32_t codePoint, char16_t* lead, char16_t* trail)
542 : {
543 0 : MOZ_ASSERT(IsSupplementary(codePoint));
544 :
545 0 : *lead = LeadSurrogate(codePoint);
546 0 : *trail = TrailSurrogate(codePoint);
547 0 : }
548 :
549 : inline void
550 0 : UTF16Encode(uint32_t codePoint, char16_t* elements, unsigned* index)
551 : {
552 0 : if (!IsSupplementary(codePoint)) {
553 0 : elements[(*index)++] = char16_t(codePoint);
554 : } else {
555 0 : elements[(*index)++] = LeadSurrogate(codePoint);
556 0 : elements[(*index)++] = TrailSurrogate(codePoint);
557 : }
558 0 : }
559 :
560 : inline uint32_t
561 0 : UTF16Decode(char16_t lead, char16_t trail)
562 : {
563 0 : MOZ_ASSERT(IsLeadSurrogate(lead));
564 0 : MOZ_ASSERT(IsTrailSurrogate(trail));
565 :
566 0 : return (lead << 10) + trail + (NonBMPMin - (LeadSurrogateMin << 10) - TrailSurrogateMin);
567 : }
568 :
569 : } /* namespace unicode */
570 : } /* namespace js */
571 :
572 : #endif /* vm_Unicode_h */
|