Line data Source code
1 : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "nsUnicharUtils.h"
7 : #include "nsUTF8Utils.h"
8 : #include "nsUnicodeProperties.h"
9 : #include "mozilla/Likely.h"
10 : #include "mozilla/HashFunctions.h"
11 :
12 : // We map x -> x, except for upper-case letters,
13 : // which we map to their lower-case equivalents.
// Lookup table indexed by a 7-bit ASCII value. Every entry is the identity
// mapping except for 'A'..'Z' (0x41..0x5a), which yield 'a'..'z'.
static const uint8_t gASCIIToLower [128] = {
  // 0x00 - 0x1f: control characters (unchanged)
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  // 0x20 - 0x3f: space, punctuation and digits (unchanged)
  0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
  0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
  0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
  // 0x40 - 0x5f: '@', then 'A'..'Z' folded to 'a'..'z', then "[\]^_"
  0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
  0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
  // 0x60 - 0x7f: '`', 'a'..'z', "{|}~" and DEL (unchanged)
  0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
  0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
  0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
  0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
};
24 :
// ASCII classification helpers. Each macro may evaluate its argument more
// than once, so do not pass expressions with side effects.
#define IS_ASCII(u)       ((u) <= 0x7F)
#define IS_ASCII_UPPER(u) (((u) >= 'A') && ((u) <= 'Z'))
#define IS_ASCII_LOWER(u) (((u) >= 'a') && ((u) <= 'z'))
#define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
#define IS_ASCII_SPACE(u) ((u) == ' ')
30 :
31 : // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
32 : // when they're called from within the case-insensitive comparators, so we
33 : // define inlined versions.
34 : static MOZ_ALWAYS_INLINE uint32_t
35 96387 : ToLowerCase_inline(uint32_t aChar)
36 : {
37 96387 : if (IS_ASCII(aChar)) {
38 96387 : return gASCIIToLower[aChar];
39 : }
40 :
41 0 : return mozilla::unicode::GetLowercase(aChar);
42 : }
43 :
44 : static MOZ_ALWAYS_INLINE uint32_t
45 4 : ToLowerCaseASCII_inline(const uint32_t aChar)
46 : {
47 4 : if (IS_ASCII(aChar)) {
48 4 : return gASCIIToLower[aChar];
49 : }
50 :
51 0 : return aChar;
52 : }
53 :
54 : void
55 4937 : ToLowerCase(nsAString& aString)
56 : {
57 4937 : char16_t *buf = aString.BeginWriting();
58 4937 : ToLowerCase(buf, buf, aString.Length());
59 4937 : }
60 :
61 : void
62 0 : ToLowerCase(const nsAString& aSource,
63 : nsAString& aDest)
64 : {
65 0 : const char16_t *in = aSource.BeginReading();
66 0 : uint32_t len = aSource.Length();
67 :
68 0 : aDest.SetLength(len);
69 0 : char16_t *out = aDest.BeginWriting();
70 :
71 0 : ToLowerCase(in, out, len);
72 0 : }
73 :
74 : uint32_t
75 0 : ToLowerCaseASCII(const uint32_t aChar)
76 : {
77 0 : return ToLowerCaseASCII_inline(aChar);
78 : }
79 :
80 : void
81 276 : ToUpperCase(nsAString& aString)
82 : {
83 276 : char16_t *buf = aString.BeginWriting();
84 276 : ToUpperCase(buf, buf, aString.Length());
85 276 : }
86 :
87 : void
88 0 : ToUpperCase(const nsAString& aSource,
89 : nsAString& aDest)
90 : {
91 0 : const char16_t *in = aSource.BeginReading();
92 0 : uint32_t len = aSource.Length();
93 :
94 0 : aDest.SetLength(len);
95 0 : char16_t *out = aDest.BeginWriting();
96 :
97 0 : ToUpperCase(in, out, len);
98 0 : }
99 :
100 : #ifdef MOZILLA_INTERNAL_API
101 :
102 : int32_t
103 167 : nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
104 : const char16_t* rhs,
105 : uint32_t lLength,
106 : uint32_t rLength) const
107 : {
108 167 : return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
109 167 : (lLength > rLength) ? 1 : -1;
110 : }
111 :
112 : int32_t
113 0 : nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
114 : const char* rhs,
115 : uint32_t lLength,
116 : uint32_t rLength) const
117 : {
118 0 : return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
119 : }
120 :
121 : int32_t
122 2 : nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
123 : const char16_t* rhs,
124 : uint32_t lLength,
125 : uint32_t rLength) const
126 : {
127 2 : if (lLength != rLength) {
128 0 : if (lLength > rLength)
129 0 : return 1;
130 0 : return -1;
131 : }
132 :
133 2 : while (rLength) {
134 : // we don't care about surrogates here, because we're only
135 : // lowercasing the ASCII range
136 2 : char16_t l = *lhs++;
137 2 : char16_t r = *rhs++;
138 2 : if (l != r) {
139 2 : l = ToLowerCaseASCII_inline(l);
140 2 : r = ToLowerCaseASCII_inline(r);
141 :
142 2 : if (l > r)
143 2 : return 1;
144 0 : else if (r > l)
145 0 : return -1;
146 : }
147 0 : rLength--;
148 : }
149 :
150 0 : return 0;
151 : }
152 :
153 : #endif // MOZILLA_INTERNAL_API
154 :
155 : uint32_t
156 96359 : ToLowerCase(uint32_t aChar)
157 : {
158 96359 : return ToLowerCase_inline(aChar);
159 : }
160 :
161 : void
162 4937 : ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
163 : {
164 86744 : for (uint32_t i = 0; i < aLen; i++) {
165 81807 : uint32_t ch = aIn[i];
166 81807 : if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
167 0 : NS_IS_LOW_SURROGATE(aIn[i + 1])) {
168 0 : ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
169 0 : NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
170 0 : aOut[i++] = H_SURROGATE(ch);
171 0 : aOut[i] = L_SURROGATE(ch);
172 0 : continue;
173 : }
174 81807 : aOut[i] = ToLowerCase(ch);
175 : }
176 4937 : }
177 :
178 : uint32_t
179 15820 : ToUpperCase(uint32_t aChar)
180 : {
181 15820 : if (IS_ASCII(aChar)) {
182 15820 : if (IS_ASCII_LOWER(aChar)) {
183 1303 : return aChar - 0x20;
184 : }
185 14517 : return aChar;
186 : }
187 :
188 0 : return mozilla::unicode::GetUppercase(aChar);
189 : }
190 :
191 : void
192 276 : ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
193 : {
194 1544 : for (uint32_t i = 0; i < aLen; i++) {
195 1268 : uint32_t ch = aIn[i];
196 1268 : if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
197 0 : NS_IS_LOW_SURROGATE(aIn[i + 1])) {
198 0 : ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
199 0 : NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
200 0 : aOut[i++] = H_SURROGATE(ch);
201 0 : aOut[i] = L_SURROGATE(ch);
202 0 : continue;
203 : }
204 1268 : aOut[i] = ToUpperCase(ch);
205 : }
206 276 : }
207 :
208 : uint32_t
209 0 : ToTitleCase(uint32_t aChar)
210 : {
211 0 : if (IS_ASCII(aChar)) {
212 0 : return ToUpperCase(aChar);
213 : }
214 :
215 0 : return mozilla::unicode::GetTitlecaseForLower(aChar);
216 : }
217 :
218 : int32_t
219 167 : CaseInsensitiveCompare(const char16_t *a,
220 : const char16_t *b,
221 : uint32_t len)
222 : {
223 167 : NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
224 :
225 167 : if (len) {
226 1597 : do {
227 1611 : uint32_t c1 = *a++;
228 1611 : uint32_t c2 = *b++;
229 :
230 : // Unfortunately, we need to check for surrogates BEFORE we check
231 : // for equality, because we could have identical high surrogates
232 : // but non-identical characters, so we can't just skip them
233 :
234 : // If c1 isn't a surrogate, we don't bother to check c2;
235 : // in the case where it _is_ a surrogate, we're definitely going to get
236 : // a mismatch, and don't need to interpret and lowercase it
237 :
238 1611 : if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
239 0 : c1 = SURROGATE_TO_UCS4(c1, *a++);
240 0 : if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
241 0 : c2 = SURROGATE_TO_UCS4(c2, *b++);
242 : }
243 : // If c2 wasn't a surrogate, decrementing len means we'd stop
244 : // short of the end of string b, but that doesn't actually matter
245 : // because we're going to find a mismatch and return early
246 0 : --len;
247 : }
248 :
249 1611 : if (c1 != c2) {
250 14 : c1 = ToLowerCase_inline(c1);
251 14 : c2 = ToLowerCase_inline(c2);
252 14 : if (c1 != c2) {
253 14 : if (c1 < c2) {
254 12 : return -1;
255 : }
256 2 : return 1;
257 : }
258 : }
259 : } while (--len != 0);
260 : }
261 153 : return 0;
262 : }
263 :
264 : // Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext
265 : // to the byte following the end of the sequence.
266 : //
267 : // If the sequence is invalid, or if computing the codepoint would take us off
268 : // the end of the string (as marked by aEnd), returns -1 and does not set
269 : // aNext. Note that this function doesn't check that aStr < aEnd -- it assumes
270 : // you've done that already.
271 : static MOZ_ALWAYS_INLINE uint32_t
272 0 : GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
273 : {
274 : // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
275 : // sign extend.
276 0 : const unsigned char *str = (unsigned char*)aStr;
277 :
278 0 : if (UTF8traits::isASCII(str[0])) {
279 : // It's ASCII; just convert to lower-case and return it.
280 0 : *aNext = aStr + 1;
281 0 : return gASCIIToLower[*str];
282 : }
283 0 : if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
284 : // It's a two-byte sequence, so it looks like
285 : // 110XXXXX 10XXXXXX.
286 : // This is definitely in the BMP, so we can store straightaway into a
287 : // uint16_t.
288 :
289 : uint16_t c;
290 0 : c = (str[0] & 0x1F) << 6;
291 0 : c += (str[1] & 0x3F);
292 :
293 : // we don't go through ToLowerCase here, because we know this isn't
294 : // an ASCII character so the ASCII fast-path there is useless
295 0 : c = mozilla::unicode::GetLowercase(c);
296 :
297 0 : *aNext = aStr + 2;
298 0 : return c;
299 : }
300 0 : if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
301 : // It's a three-byte sequence, so it looks like
302 : // 1110XXXX 10XXXXXX 10XXXXXX.
303 : // This will just barely fit into 16-bits, so store into a uint16_t.
304 :
305 : uint16_t c;
306 0 : c = (str[0] & 0x0F) << 12;
307 0 : c += (str[1] & 0x3F) << 6;
308 0 : c += (str[2] & 0x3F);
309 :
310 0 : c = mozilla::unicode::GetLowercase(c);
311 :
312 0 : *aNext = aStr + 3;
313 0 : return c;
314 : }
315 0 : if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
316 : // It's a four-byte sequence, so it looks like
317 : // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
318 :
319 : uint32_t c;
320 0 : c = (str[0] & 0x07) << 18;
321 0 : c += (str[1] & 0x3F) << 12;
322 0 : c += (str[2] & 0x3F) << 6;
323 0 : c += (str[3] & 0x3F);
324 :
325 0 : c = mozilla::unicode::GetLowercase(c);
326 :
327 0 : *aNext = aStr + 4;
328 0 : return c;
329 : }
330 :
331 : // Hm, we don't understand this sequence.
332 0 : return -1;
333 : }
334 :
335 0 : int32_t CaseInsensitiveCompare(const char *aLeft,
336 : const char *aRight,
337 : uint32_t aLeftBytes,
338 : uint32_t aRightBytes)
339 : {
340 0 : const char *leftEnd = aLeft + aLeftBytes;
341 0 : const char *rightEnd = aRight + aRightBytes;
342 :
343 0 : while (aLeft < leftEnd && aRight < rightEnd) {
344 0 : uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
345 0 : if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
346 0 : return -1;
347 :
348 0 : uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
349 0 : if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
350 0 : return -1;
351 :
352 : // Now leftChar and rightChar are lower-case, so we can compare them.
353 0 : if (leftChar != rightChar) {
354 0 : if (leftChar > rightChar)
355 0 : return 1;
356 0 : return -1;
357 : }
358 : }
359 :
360 : // Make sure that if one string is longer than the other we return the
361 : // correct result.
362 0 : if (aLeft < leftEnd)
363 0 : return 1;
364 0 : if (aRight < rightEnd)
365 0 : return -1;
366 :
367 0 : return 0;
368 : }
369 :
370 : bool
371 0 : CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
372 : const char* aLeftEnd, const char* aRightEnd,
373 : const char** aLeftNext, const char** aRightNext,
374 : bool* aErr)
375 : {
376 0 : NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
377 0 : NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
378 0 : NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
379 0 : NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
380 0 : NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
381 :
382 0 : uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
383 0 : if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
384 0 : *aErr = true;
385 0 : return false;
386 : }
387 :
388 0 : uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
389 0 : if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
390 0 : *aErr = true;
391 0 : return false;
392 : }
393 :
394 : // Can't have an error past this point.
395 0 : *aErr = false;
396 :
397 0 : return leftChar == rightChar;
398 : }
399 :
400 : namespace mozilla {
401 :
402 : uint32_t
403 844 : HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
404 : {
405 844 : uint32_t hash = 0;
406 844 : const char* s = aUTF8;
407 844 : const char* end = aUTF8 + aLength;
408 :
409 844 : *aErr = false;
410 :
411 31524 : while (s < end)
412 : {
413 15340 : uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
414 15340 : if (*aErr) {
415 0 : return 0;
416 : }
417 :
418 15340 : if (ucs4 < PLANE1_BASE) {
419 15340 : hash = AddToHash(hash, ucs4);
420 : }
421 : else {
422 0 : hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
423 : }
424 : }
425 :
426 844 : return hash;
427 : }
428 :
429 : bool
430 0 : IsSegmentBreakSkipChar(uint32_t u)
431 : {
432 0 : return unicode::IsEastAsianWidthFWH(u) &&
433 0 : unicode::GetScriptCode(u) != unicode::Script::HANGUL;
434 : }
435 :
436 : } // namespace mozilla
|