Line data Source code
1 : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 : /* vim:set ts=4 sw=4 sts=4 et cindent: */
3 : /* This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #include "nsUnicodeProperties.h"
8 : #include "nsUnicodePropertyData.cpp"
9 :
10 : #include "mozilla/ArrayUtils.h"
11 : #include "nsCharTraits.h"
12 :
13 : #define UNICODE_BMP_LIMIT 0x10000
14 : #define UNICODE_LIMIT 0x110000
15 :
16 : #ifndef ENABLE_INTL_API
17 : static const nsCharProps1&
18 : GetCharProps1(uint32_t aCh)
19 : {
20 : if (aCh < UNICODE_BMP_LIMIT) {
21 : return sCharProp1Values[sCharProp1Pages[0][aCh >> kCharProp1CharBits]]
22 : [aCh & ((1 << kCharProp1CharBits) - 1)];
23 : }
24 : if (aCh < (kCharProp1MaxPlane + 1) * 0x10000) {
25 : return sCharProp1Values[sCharProp1Pages[sCharProp1Planes[(aCh >> 16) - 1]]
26 : [(aCh & 0xffff) >> kCharProp1CharBits]]
27 : [aCh & ((1 << kCharProp1CharBits) - 1)];
28 : }
29 :
30 : // Default values for unassigned
31 : static const nsCharProps1 undefined = {
32 : 0, // Index to mirrored char offsets
33 : 0, // Hangul Syllable type
34 : 0 // Combining class
35 : };
36 : return undefined;
37 : }
38 : #endif
39 :
40 : const nsCharProps2&
41 0 : GetCharProps2(uint32_t aCh)
42 : {
43 0 : if (aCh < UNICODE_BMP_LIMIT) {
44 0 : return sCharProp2Values[sCharProp2Pages[0][aCh >> kCharProp2CharBits]]
45 0 : [aCh & ((1 << kCharProp2CharBits) - 1)];
46 : }
47 0 : if (aCh < (kCharProp2MaxPlane + 1) * 0x10000) {
48 0 : return sCharProp2Values[sCharProp2Pages[sCharProp2Planes[(aCh >> 16) - 1]]
49 0 : [(aCh & 0xffff) >> kCharProp2CharBits]]
50 0 : [aCh & ((1 << kCharProp2CharBits) - 1)];
51 : }
52 :
53 0 : NS_NOTREACHED("Getting CharProps for codepoint outside Unicode range");
54 : // Default values for unassigned
55 : using namespace mozilla::unicode;
56 : static const nsCharProps2 undefined = {
57 : #if ENABLE_INTL_API
58 : VERTICAL_ORIENTATION_R,
59 : 0 // IdentifierType
60 : #else
61 : uint8_t(Script::UNKNOWN),
62 : PAIRED_BRACKET_TYPE_NONE,
63 : 0, // EastAsianWidthFWH
64 : HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED,
65 : 0, // IdentifierType
66 : 0, // DefaultIgnorable
67 : eCharType_LeftToRight,
68 : VERTICAL_ORIENTATION_R,
69 : 0, // LineBreak
70 : -1 // Numeric Value
71 : #endif
72 : };
73 0 : return undefined;
74 : }
75 :
76 : namespace mozilla {
77 :
78 : namespace unicode {
79 :
80 : /*
81 : To store properties for a million Unicode codepoints compactly, we use
82 : a three-level array structure, with the Unicode values considered as
83 : three elements: Plane, Page, and Char.
84 :
85 : Space optimization happens because multiple Planes can refer to the same
86 : Page array, and multiple Pages can refer to the same Char array holding
87 : the actual values. In practice, most of the higher planes are empty and
88 : thus share the same data; and within the BMP, there are also many pages
89 : that repeat the same data for any given property.
90 :
91 : Plane is usually zero, so we skip a lookup in this case, and require
92 : that the Plane 0 pages are always the first set of entries in the Page
93 : array.
94 :
95 : The division of the remaining 16 bits into Page and Char fields is
96 : adjusted for each property (by experiment using the generation tool)
97 : to provide the most compact storage, depending on the distribution
98 : of values.
99 : */
100 :
// Collapses a detailed general category to its coarse nsUGenCategory class.
// Entries are listed in HB_UNICODE_GENERAL_CATEGORY_* order (see below).
101 : const nsUGenCategory sDetailedToGeneralCategory[] = {
102 : /*
103 : * The order here corresponds to the HB_UNICODE_GENERAL_CATEGORY_* constants
104 : * of the hb_unicode_general_category_t enum in gfx/harfbuzz/src/hb-unicode.h.
105 : */
106 : /* CONTROL */ nsUGenCategory::kOther,
107 : /* FORMAT */ nsUGenCategory::kOther,
108 : /* UNASSIGNED */ nsUGenCategory::kOther,
109 : /* PRIVATE_USE */ nsUGenCategory::kOther,
110 : /* SURROGATE */ nsUGenCategory::kOther,
111 : /* LOWERCASE_LETTER */ nsUGenCategory::kLetter,
112 : /* MODIFIER_LETTER */ nsUGenCategory::kLetter,
113 : /* OTHER_LETTER */ nsUGenCategory::kLetter,
114 : /* TITLECASE_LETTER */ nsUGenCategory::kLetter,
115 : /* UPPERCASE_LETTER */ nsUGenCategory::kLetter,
116 : /* SPACING_MARK */ nsUGenCategory::kMark,
117 : /* ENCLOSING_MARK */ nsUGenCategory::kMark,
118 : /* NON_SPACING_MARK */ nsUGenCategory::kMark,
119 : /* DECIMAL_NUMBER */ nsUGenCategory::kNumber,
120 : /* LETTER_NUMBER */ nsUGenCategory::kNumber,
121 : /* OTHER_NUMBER */ nsUGenCategory::kNumber,
122 : /* CONNECT_PUNCTUATION */ nsUGenCategory::kPunctuation,
123 : /* DASH_PUNCTUATION */ nsUGenCategory::kPunctuation,
124 : /* CLOSE_PUNCTUATION */ nsUGenCategory::kPunctuation,
125 : /* FINAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
126 : /* INITIAL_PUNCTUATION */ nsUGenCategory::kPunctuation,
127 : /* OTHER_PUNCTUATION */ nsUGenCategory::kPunctuation,
128 : /* OPEN_PUNCTUATION */ nsUGenCategory::kPunctuation,
129 : /* CURRENCY_SYMBOL */ nsUGenCategory::kSymbol,
130 : /* MODIFIER_SYMBOL */ nsUGenCategory::kSymbol,
131 : /* MATH_SYMBOL */ nsUGenCategory::kSymbol,
132 : /* OTHER_SYMBOL */ nsUGenCategory::kSymbol,
133 : /* LINE_SEPARATOR */ nsUGenCategory::kSeparator,
134 : /* PARAGRAPH_SEPARATOR */ nsUGenCategory::kSeparator,
135 : /* SPACE_SEPARATOR */ nsUGenCategory::kSeparator
136 : };
137 :
138 : #ifdef ENABLE_INTL_API
// Translation table from ICU general-category values (the array index, as
// noted in each entry's trailing comment) to the matching HarfBuzz category.
139 : const hb_unicode_general_category_t sICUtoHBcategory[U_CHAR_CATEGORY_COUNT] = {
140 : HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED, // U_GENERAL_OTHER_TYPES = 0,
141 : HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER, // U_UPPERCASE_LETTER = 1,
142 : HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER, // U_LOWERCASE_LETTER = 2,
143 : HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER, // U_TITLECASE_LETTER = 3,
144 : HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER, // U_MODIFIER_LETTER = 4,
145 : HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER, // U_OTHER_LETTER = 5,
146 : HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK, // U_NON_SPACING_MARK = 6,
147 : HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK, // U_ENCLOSING_MARK = 7,
148 : HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK, // U_COMBINING_SPACING_MARK = 8,
149 : HB_UNICODE_GENERAL_CATEGORY_DECIMAL_NUMBER, // U_DECIMAL_DIGIT_NUMBER = 9,
150 : HB_UNICODE_GENERAL_CATEGORY_LETTER_NUMBER, // U_LETTER_NUMBER = 10,
151 : HB_UNICODE_GENERAL_CATEGORY_OTHER_NUMBER, // U_OTHER_NUMBER = 11,
152 : HB_UNICODE_GENERAL_CATEGORY_SPACE_SEPARATOR, // U_SPACE_SEPARATOR = 12,
153 : HB_UNICODE_GENERAL_CATEGORY_LINE_SEPARATOR, // U_LINE_SEPARATOR = 13,
154 : HB_UNICODE_GENERAL_CATEGORY_PARAGRAPH_SEPARATOR, // U_PARAGRAPH_SEPARATOR = 14,
155 : HB_UNICODE_GENERAL_CATEGORY_CONTROL, // U_CONTROL_CHAR = 15,
156 : HB_UNICODE_GENERAL_CATEGORY_FORMAT, // U_FORMAT_CHAR = 16,
157 : HB_UNICODE_GENERAL_CATEGORY_PRIVATE_USE, // U_PRIVATE_USE_CHAR = 17,
158 : HB_UNICODE_GENERAL_CATEGORY_SURROGATE, // U_SURROGATE = 18,
159 : HB_UNICODE_GENERAL_CATEGORY_DASH_PUNCTUATION, // U_DASH_PUNCTUATION = 19,
160 : HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION, // U_START_PUNCTUATION = 20,
161 : HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION, // U_END_PUNCTUATION = 21,
162 : HB_UNICODE_GENERAL_CATEGORY_CONNECT_PUNCTUATION, // U_CONNECTOR_PUNCTUATION = 22,
163 : HB_UNICODE_GENERAL_CATEGORY_OTHER_PUNCTUATION, // U_OTHER_PUNCTUATION = 23,
164 : HB_UNICODE_GENERAL_CATEGORY_MATH_SYMBOL, // U_MATH_SYMBOL = 24,
165 : HB_UNICODE_GENERAL_CATEGORY_CURRENCY_SYMBOL, // U_CURRENCY_SYMBOL = 25,
166 : HB_UNICODE_GENERAL_CATEGORY_MODIFIER_SYMBOL, // U_MODIFIER_SYMBOL = 26,
167 : HB_UNICODE_GENERAL_CATEGORY_OTHER_SYMBOL, // U_OTHER_SYMBOL = 27,
168 : HB_UNICODE_GENERAL_CATEGORY_INITIAL_PUNCTUATION, // U_INITIAL_PUNCTUATION = 28,
169 : HB_UNICODE_GENERAL_CATEGORY_FINAL_PUNCTUATION, // U_FINAL_PUNCTUATION = 29,
170 : };
171 : #endif
172 :
173 : #if !ENABLE_INTL_API
174 : uint8_t GetGeneralCategory(uint32_t aCh) {
175 : return GetCharProps2(aCh).mCategory;
176 : }
177 :
178 : nsCharType GetBidiCat(uint32_t aCh) {
179 : return nsCharType(GetCharProps2(aCh).mBidiCategory);
180 : }
181 :
182 : int8_t GetNumericValue(uint32_t aCh) {
183 : return GetCharProps2(aCh).mNumericValue;
184 : }
185 :
186 : uint32_t
187 : GetMirroredChar(uint32_t aCh)
188 : {
189 : return aCh + sMirrorOffsets[GetCharProps1(aCh).mMirrorOffsetIndex];
190 : }
191 :
192 : bool
193 : HasMirroredChar(uint32_t aCh)
194 : {
195 : return GetCharProps1(aCh).mMirrorOffsetIndex != 0;
196 : }
197 :
198 : uint8_t
199 : GetCombiningClass(uint32_t aCh)
200 : {
201 : return GetCharProps1(aCh).mCombiningClass;
202 : }
203 :
204 : uint8_t
205 : GetLineBreakClass(uint32_t aCh)
206 : {
207 : return GetCharProps2(aCh).mLineBreak;
208 : }
209 :
210 : Script
211 : GetScriptCode(uint32_t aCh)
212 : {
213 : return Script(GetCharProps2(aCh).mScriptCode);
214 : }
215 :
216 : uint32_t
217 : GetScriptTagForCode(Script aScriptCode)
218 : {
219 : // this will safely return 0 for negative script codes, too :)
220 : if (static_cast<uint32_t>(aScriptCode) > ArrayLength(sScriptCodeToTag)) {
221 : return 0;
222 : }
223 : return sScriptCodeToTag[static_cast<uint32_t>(aScriptCode)];
224 : }
225 :
226 : PairedBracketType GetPairedBracketType(uint32_t aCh)
227 : {
228 : return PairedBracketType(GetCharProps2(aCh).mPairedBracketType);
229 : }
230 :
231 : uint32_t GetPairedBracket(uint32_t aCh)
232 : {
233 : return GetPairedBracketType(aCh) != PAIRED_BRACKET_TYPE_NONE
234 : ? GetMirroredChar(aCh) : aCh;
235 : }
236 :
237 : static inline uint32_t
238 : GetCaseMapValue(uint32_t aCh)
239 : {
240 : if (aCh < UNICODE_BMP_LIMIT) {
241 : return sCaseMapValues[sCaseMapPages[0][aCh >> kCaseMapCharBits]]
242 : [aCh & ((1 << kCaseMapCharBits) - 1)];
243 : }
244 : if (aCh < (kCaseMapMaxPlane + 1) * 0x10000) {
245 : return sCaseMapValues[sCaseMapPages[sCaseMapPlanes[(aCh >> 16) - 1]]
246 : [(aCh & 0xffff) >> kCaseMapCharBits]]
247 : [aCh & ((1 << kCaseMapCharBits) - 1)];
248 : }
249 : return 0;
250 : }
251 :
252 : uint32_t
253 : GetUppercase(uint32_t aCh)
254 : {
255 : uint32_t mapValue = GetCaseMapValue(aCh);
256 : if (mapValue & (kLowerToUpper | kTitleToUpper)) {
257 : return aCh ^ (mapValue & kCaseMapCharMask);
258 : }
259 : if (mapValue & kLowerToTitle) {
260 : return GetUppercase(aCh ^ (mapValue & kCaseMapCharMask));
261 : }
262 : return aCh;
263 : }
264 :
265 : uint32_t
266 : GetLowercase(uint32_t aCh)
267 : {
268 : uint32_t mapValue = GetCaseMapValue(aCh);
269 : if (mapValue & kUpperToLower) {
270 : return aCh ^ (mapValue & kCaseMapCharMask);
271 : }
272 : if (mapValue & kTitleToUpper) {
273 : return GetLowercase(aCh ^ (mapValue & kCaseMapCharMask));
274 : }
275 : return aCh;
276 : }
277 :
278 : uint32_t
279 : GetTitlecaseForLower(uint32_t aCh)
280 : {
281 : uint32_t mapValue = GetCaseMapValue(aCh);
282 : if (mapValue & (kLowerToTitle | kLowerToUpper)) {
283 : return aCh ^ (mapValue & kCaseMapCharMask);
284 : }
285 : return aCh;
286 : }
287 :
288 : uint32_t
289 : GetTitlecaseForAll(uint32_t aCh)
290 : {
291 : uint32_t mapValue = GetCaseMapValue(aCh);
292 : if (mapValue & (kLowerToTitle | kLowerToUpper)) {
293 : return aCh ^ (mapValue & kCaseMapCharMask);
294 : }
295 : if (mapValue & kUpperToLower) {
296 : return GetTitlecaseForLower(aCh ^ (mapValue & kCaseMapCharMask));
297 : }
298 : return aCh;
299 : }
300 :
301 : bool IsEastAsianWidthFWH(uint32_t aCh)
302 : {
303 : return GetCharProps2(aCh).mEastAsianWidthFWH;
304 : }
305 :
306 : #endif
307 :
/*
 * Defines uint32_t Get<prefix_>(uint32_t aCh), a two-level (Page, Char)
 * table lookup using s<prefix_>Pages, s<prefix_>Values and
 * k<prefix_>CharBits. Codepoints at or above UNICODE_BMP_LIMIT, and
 * codepoints whose table entry is zero, are returned unchanged.
 */
#define DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(prefix_)                          \
    uint32_t Get##prefix_(uint32_t aCh)                                      \
    {                                                                        \
        if (aCh < UNICODE_BMP_LIMIT) {                                       \
            auto pageIdx = s##prefix_##Pages[aCh >> k##prefix_##CharBits];   \
            auto charIdx = aCh & ((1 << k##prefix_##CharBits) - 1);          \
            uint32_t mapped = s##prefix_##Values[pageIdx][charIdx];          \
            if (mapped) {                                                    \
                return mapped;                                               \
            }                                                                \
        }                                                                    \
        return aCh;                                                          \
    }
319 :
320 : // full-width mappings only exist for BMP characters; all others are
321 : // returned unchanged
322 0 : DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidth)
323 0 : DEFINE_BMP_1PLANE_MAPPING_GET_FUNC(FullWidthInverse)
324 :
325 : bool
326 727 : IsClusterExtender(uint32_t aCh, uint8_t aCategory)
327 : {
328 786 : return ((aCategory >= HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK &&
329 786 : aCategory <= HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK) ||
330 738 : (aCh >= 0x200c && aCh <= 0x200d) || // ZWJ, ZWNJ
331 727 : (aCh >= 0xff9e && aCh <= 0xff9f)); // katakana sound marks
332 : }
333 :
// Hangul syllable type of a codepoint, as returned by
// GetHangulSyllableType and consumed by ClusterIterator::Next.
334 : enum HSType {
335 : #if ENABLE_INTL_API
// ICU build: reuse ICU's U_HST_* values, since GetHangulSyllableType casts
// the result of u_getIntPropertyValue straight to HSType.
336 : HST_NONE = U_HST_NOT_APPLICABLE,
337 : HST_L = U_HST_LEADING_JAMO,
338 : HST_V = U_HST_VOWEL_JAMO,
339 : HST_T = U_HST_TRAILING_JAMO,
340 : HST_LV = U_HST_LV_SYLLABLE,
341 : HST_LVT = U_HST_LVT_SYLLABLE
342 : #else
// Non-ICU build: values match nsCharProps1::mHangulType. Numerically,
// HST_LV == (HST_L | HST_V) and HST_LVT == (HST_L | HST_V | HST_T).
343 : HST_NONE = 0x00,
344 : HST_L = 0x01,
345 : HST_V = 0x02,
346 : HST_T = 0x04,
347 : HST_LV = 0x03,
348 : HST_LVT = 0x07
349 : #endif
350 : };
351 :
352 : static HSType
353 0 : GetHangulSyllableType(uint32_t aCh)
354 : {
355 : #if ENABLE_INTL_API
356 0 : return HSType(u_getIntPropertyValue(aCh, UCHAR_HANGUL_SYLLABLE_TYPE));
357 : #else
358 : return HSType(GetCharProps1(aCh).mHangulType);
359 : #endif
360 : }
361 :
362 : void
363 11 : ClusterIterator::Next()
364 : {
365 11 : if (AtEnd()) {
366 0 : NS_WARNING("ClusterIterator has already reached the end");
367 0 : return;
368 : }
369 :
370 11 : uint32_t ch = *mPos++;
371 :
372 11 : if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit &&
373 0 : NS_IS_LOW_SURROGATE(*mPos)) {
374 0 : ch = SURROGATE_TO_UCS4(ch, *mPos++);
375 11 : } else if ((ch & ~0xff) == 0x1100 ||
376 0 : (ch >= 0xa960 && ch <= 0xa97f) ||
377 0 : (ch >= 0xac00 && ch <= 0xd7ff)) {
378 : // Handle conjoining Jamo that make Hangul syllables
379 0 : HSType hangulState = GetHangulSyllableType(ch);
380 0 : while (mPos < mLimit) {
381 0 : ch = *mPos;
382 0 : HSType hangulType = GetHangulSyllableType(ch);
383 0 : switch (hangulType) {
384 : case HST_L:
385 : case HST_LV:
386 : case HST_LVT:
387 0 : if (hangulState == HST_L) {
388 0 : hangulState = hangulType;
389 0 : mPos++;
390 0 : continue;
391 : }
392 0 : break;
393 : case HST_V:
394 0 : if ((hangulState != HST_NONE) && (hangulState != HST_T) &&
395 : (hangulState != HST_LVT)) {
396 0 : hangulState = hangulType;
397 0 : mPos++;
398 0 : continue;
399 : }
400 0 : break;
401 : case HST_T:
402 0 : if (hangulState != HST_NONE && hangulState != HST_L) {
403 0 : hangulState = hangulType;
404 0 : mPos++;
405 0 : continue;
406 : }
407 0 : break;
408 : default:
409 0 : break;
410 : }
411 0 : break;
412 : }
413 : }
414 :
415 11 : while (mPos < mLimit) {
416 9 : ch = *mPos;
417 :
418 : // Check for surrogate pairs; note that isolated surrogates will just
419 : // be treated as generic (non-cluster-extending) characters here,
420 : // which is fine for cluster-iterating purposes
421 9 : if (NS_IS_HIGH_SURROGATE(ch) && mPos < mLimit - 1 &&
422 0 : NS_IS_LOW_SURROGATE(*(mPos + 1))) {
423 0 : ch = SURROGATE_TO_UCS4(ch, *(mPos + 1));
424 : }
425 :
426 9 : if (!IsClusterExtender(ch)) {
427 9 : break;
428 : }
429 :
430 0 : mPos++;
431 0 : if (!IS_IN_BMP(ch)) {
432 0 : mPos++;
433 : }
434 : }
435 :
436 11 : NS_ASSERTION(mText < mPos && mPos <= mLimit,
437 : "ClusterIterator::Next has overshot the string!");
438 : }
439 :
440 : void
441 0 : ClusterReverseIterator::Next()
442 : {
443 0 : if (AtEnd()) {
444 0 : NS_WARNING("ClusterReverseIterator has already reached the end");
445 0 : return;
446 : }
447 :
448 : uint32_t ch;
449 0 : do {
450 0 : ch = *--mPos;
451 :
452 0 : if (NS_IS_LOW_SURROGATE(ch) && mPos > mLimit &&
453 0 : NS_IS_HIGH_SURROGATE(*(mPos - 1))) {
454 0 : ch = SURROGATE_TO_UCS4(*--mPos, ch);
455 : }
456 :
457 0 : if (!IsClusterExtender(ch)) {
458 0 : break;
459 : }
460 0 : } while (mPos > mLimit);
461 :
462 : // XXX May need to handle conjoining Jamo
463 :
464 0 : NS_ASSERTION(mPos >= mLimit,
465 : "ClusterReverseIterator::Next has overshot the string!");
466 : }
467 :
468 : uint32_t
469 0 : CountGraphemeClusters(const char16_t* aText, uint32_t aLength)
470 : {
471 0 : ClusterIterator iter(aText, aLength);
472 0 : uint32_t result = 0;
473 0 : while (!iter.AtEnd()) {
474 0 : ++result;
475 0 : iter.Next();
476 : }
477 0 : return result;
478 : }
479 :
480 : } // end namespace unicode
481 :
482 : } // end namespace mozilla
|