LCOV - output.info - intl/unicharutil/util/nsUnicharUtils.cpp

LCOV - code coverage report

Current view:	top level - intl/unicharutil/util - nsUnicharUtils.cpp (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	72	187	38.5 %
Date:	2017-07-14 16:53:18	Functions:	12	21	57.1 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       5             : 
       6             : #include "nsUnicharUtils.h"
       7             : #include "nsUTF8Utils.h"
       8             : #include "nsUnicodeProperties.h"
       9             : #include "mozilla/Likely.h"
      10             : #include "mozilla/HashFunctions.h"
      11             : 
      12             : // We map x -> x, except for upper-case letters,
      13             : // which we map to their lower-case equivalents.
      14             : static const uint8_t gASCIIToLower [128] = {
      15             :     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
      16             :     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
      17             :     0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
      18             :     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
      19             :     0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
      20             :     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
      21             :     0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
      22             :     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
      23             : };
      24             : 
      25             : #define IS_ASCII(u)       ((u) < 0x80)
      26             : #define IS_ASCII_UPPER(u) (('A' <= (u)) && ((u) <= 'Z'))
      27             : #define IS_ASCII_LOWER(u) (('a' <= (u)) && ((u) <= 'z'))
      28             : #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
      29             : #define IS_ASCII_SPACE(u) (' ' == (u))
      30             : 
      31             : // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
      32             : // when they're called from within the case-insensitive comparators, so we
      33             : // define inlined versions.
      34             : static MOZ_ALWAYS_INLINE uint32_t
      35       96387 : ToLowerCase_inline(uint32_t aChar)
      36             : {
      37       96387 :   if (IS_ASCII(aChar)) {
      38       96387 :     return gASCIIToLower[aChar];
      39             :   }
      40             : 
      41           0 :   return mozilla::unicode::GetLowercase(aChar);
      42             : }
      43             : 
      44             : static MOZ_ALWAYS_INLINE uint32_t
      45           4 : ToLowerCaseASCII_inline(const uint32_t aChar)
      46             : {
      47           4 :   if (IS_ASCII(aChar)) {
      48           4 :     return gASCIIToLower[aChar];
      49             :   }
      50             : 
      51           0 :   return aChar;
      52             : }
      53             : 
      54             : void
      55        4937 : ToLowerCase(nsAString& aString)
      56             : {
      57        4937 :   char16_t *buf = aString.BeginWriting();
      58        4937 :   ToLowerCase(buf, buf, aString.Length());
      59        4937 : }
      60             : 
      61             : void
      62           0 : ToLowerCase(const nsAString& aSource,
      63             :             nsAString& aDest)
      64             : {
      65           0 :   const char16_t *in = aSource.BeginReading();
      66           0 :   uint32_t len = aSource.Length();
      67             : 
      68           0 :   aDest.SetLength(len);
      69           0 :   char16_t *out = aDest.BeginWriting();
      70             : 
      71           0 :   ToLowerCase(in, out, len);
      72           0 : }
      73             : 
      74             : uint32_t
      75           0 : ToLowerCaseASCII(const uint32_t aChar)
      76             : {
      77           0 :   return ToLowerCaseASCII_inline(aChar);
      78             : }
      79             : 
      80             : void
      81         276 : ToUpperCase(nsAString& aString)
      82             : {
      83         276 :   char16_t *buf = aString.BeginWriting();
      84         276 :   ToUpperCase(buf, buf, aString.Length());
      85         276 : }
      86             : 
      87             : void
      88           0 : ToUpperCase(const nsAString& aSource,
      89             :             nsAString& aDest)
      90             : {
      91           0 :   const char16_t *in = aSource.BeginReading();
      92           0 :   uint32_t len = aSource.Length();
      93             : 
      94           0 :   aDest.SetLength(len);
      95           0 :   char16_t *out = aDest.BeginWriting();
      96             : 
      97           0 :   ToUpperCase(in, out, len);
      98           0 : }
      99             : 
     100             : #ifdef MOZILLA_INTERNAL_API
     101             : 
     102             : int32_t
     103         167 : nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
     104             :                                               const char16_t* rhs,
     105             :                                               uint32_t lLength,
     106             :                                               uint32_t rLength) const
     107             : {
     108         167 :   return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
     109         167 :          (lLength > rLength) ? 1 : -1;
     110             : }
     111             : 
     112             : int32_t
     113           0 : nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
     114             :                                                   const char* rhs,
     115             :                                                   uint32_t lLength,
     116             :                                                   uint32_t rLength) const
     117             : {
     118           0 :   return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
     119             : }
     120             : 
     121             : int32_t
     122           2 : nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
     123             :                                                    const char16_t* rhs,
     124             :                                                    uint32_t lLength,
     125             :                                                    uint32_t rLength) const
     126             : {
     127           2 :   if (lLength != rLength) {
     128           0 :     if (lLength > rLength)
     129           0 :       return 1;
     130           0 :     return -1;
     131             :   }
     132             : 
     133           2 :   while (rLength) {
     134             :     // we don't care about surrogates here, because we're only
     135             :     // lowercasing the ASCII range
     136           2 :     char16_t l = *lhs++;
     137           2 :     char16_t r = *rhs++;
     138           2 :     if (l != r) {
     139           2 :       l = ToLowerCaseASCII_inline(l);
     140           2 :       r = ToLowerCaseASCII_inline(r);
     141             : 
     142           2 :       if (l > r)
     143           2 :         return 1;
     144           0 :       else if (r > l)
     145           0 :         return -1;
     146             :     }
     147           0 :     rLength--;
     148             :   }
     149             : 
     150           0 :   return 0;
     151             : }
     152             : 
     153             : #endif // MOZILLA_INTERNAL_API
     154             : 
     155             : uint32_t
     156       96359 : ToLowerCase(uint32_t aChar)
     157             : {
     158       96359 :   return ToLowerCase_inline(aChar);
     159             : }
     160             : 
     161             : void
     162        4937 : ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
     163             : {
     164       86744 :   for (uint32_t i = 0; i < aLen; i++) {
     165       81807 :     uint32_t ch = aIn[i];
     166       81807 :     if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
     167           0 :         NS_IS_LOW_SURROGATE(aIn[i + 1])) {
     168           0 :       ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
     169           0 :       NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
     170           0 :       aOut[i++] = H_SURROGATE(ch);
     171           0 :       aOut[i] = L_SURROGATE(ch);
     172           0 :       continue;
     173             :     }
     174       81807 :     aOut[i] = ToLowerCase(ch);
     175             :   }
     176        4937 : }
     177             : 
     178             : uint32_t
     179       15820 : ToUpperCase(uint32_t aChar)
     180             : {
     181       15820 :   if (IS_ASCII(aChar)) {
     182       15820 :     if (IS_ASCII_LOWER(aChar)) {
     183        1303 :       return aChar - 0x20;
     184             :     }
     185       14517 :     return aChar;
     186             :   }
     187             : 
     188           0 :   return mozilla::unicode::GetUppercase(aChar);
     189             : }
     190             : 
     191             : void
     192         276 : ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
     193             : {
     194        1544 :   for (uint32_t i = 0; i < aLen; i++) {
     195        1268 :     uint32_t ch = aIn[i];
     196        1268 :     if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
     197           0 :         NS_IS_LOW_SURROGATE(aIn[i + 1])) {
     198           0 :       ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
     199           0 :       NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
     200           0 :       aOut[i++] = H_SURROGATE(ch);
     201           0 :       aOut[i] = L_SURROGATE(ch);
     202           0 :       continue;
     203             :     }
     204        1268 :     aOut[i] = ToUpperCase(ch);
     205             :   }
     206         276 : }
     207             : 
     208             : uint32_t
     209           0 : ToTitleCase(uint32_t aChar)
     210             : {
     211           0 :   if (IS_ASCII(aChar)) {
     212           0 :     return ToUpperCase(aChar);
     213             :   }
     214             : 
     215           0 :   return mozilla::unicode::GetTitlecaseForLower(aChar);
     216             : }
     217             : 
     218             : int32_t
     219         167 : CaseInsensitiveCompare(const char16_t *a,
     220             :                        const char16_t *b,
     221             :                        uint32_t len)
     222             : {
     223         167 :   NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
     224             : 
     225         167 :   if (len) {
     226        1597 :     do {
     227        1611 :       uint32_t c1 = *a++;
     228        1611 :       uint32_t c2 = *b++;
     229             : 
     230             :       // Unfortunately, we need to check for surrogates BEFORE we check
     231             :       // for equality, because we could have identical high surrogates
     232             :       // but non-identical characters, so we can't just skip them
     233             : 
     234             :       // If c1 isn't a surrogate, we don't bother to check c2;
     235             :       // in the case where it _is_ a surrogate, we're definitely going to get
     236             :       // a mismatch, and don't need to interpret and lowercase it
     237             : 
     238        1611 :       if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
     239           0 :         c1 = SURROGATE_TO_UCS4(c1, *a++);
     240           0 :         if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
     241           0 :           c2 = SURROGATE_TO_UCS4(c2, *b++);
     242             :         }
     243             :         // If c2 wasn't a surrogate, decrementing len means we'd stop
     244             :         // short of the end of string b, but that doesn't actually matter
     245             :         // because we're going to find a mismatch and return early
     246           0 :         --len;
     247             :       }
     248             : 
     249        1611 :       if (c1 != c2) {
     250          14 :         c1 = ToLowerCase_inline(c1);
     251          14 :         c2 = ToLowerCase_inline(c2);
     252          14 :         if (c1 != c2) {
     253          14 :           if (c1 < c2) {
     254          12 :             return -1;
     255             :           }
     256           2 :           return 1;
     257             :         }
     258             :       }
     259             :     } while (--len != 0);
     260             :   }
     261         153 :   return 0;
     262             : }
     263             : 
     264             : // Calculates the codepoint of the UTF8 sequence starting at aStr.  Sets aNext
     265             : // to the byte following the end of the sequence.
     266             : //
     267             : // If the sequence is invalid, or if computing the codepoint would take us off
     268             : // the end of the string (as marked by aEnd), returns -1 and does not set
     269             : // aNext.  Note that this function doesn't check that aStr < aEnd -- it assumes
     270             : // you've done that already.
     271             : static MOZ_ALWAYS_INLINE uint32_t
     272           0 : GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
     273             : {
     274             :   // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
     275             :   // sign extend.
     276           0 :   const unsigned char *str = (unsigned char*)aStr;
     277             : 
     278           0 :   if (UTF8traits::isASCII(str[0])) {
     279             :     // It's ASCII; just convert to lower-case and return it.
     280           0 :     *aNext = aStr + 1;
     281           0 :     return gASCIIToLower[*str];
     282             :   }
     283           0 :   if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
     284             :     // It's a two-byte sequence, so it looks like
     285             :     //  110XXXXX 10XXXXXX.
     286             :     // This is definitely in the BMP, so we can store straightaway into a
     287             :     // uint16_t.
     288             : 
     289             :     uint16_t c;
     290           0 :     c  = (str[0] & 0x1F) << 6;
     291           0 :     c += (str[1] & 0x3F);
     292             : 
     293             :     // we don't go through ToLowerCase here, because we know this isn't
     294             :     // an ASCII character so the ASCII fast-path there is useless
     295           0 :     c = mozilla::unicode::GetLowercase(c);
     296             : 
     297           0 :     *aNext = aStr + 2;
     298           0 :     return c;
     299             :   }
     300           0 :   if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
     301             :     // It's a three-byte sequence, so it looks like
     302             :     //  1110XXXX 10XXXXXX 10XXXXXX.
     303             :     // This will just barely fit into 16-bits, so store into a uint16_t.
     304             : 
     305             :     uint16_t c;
     306           0 :     c  = (str[0] & 0x0F) << 12;
     307           0 :     c += (str[1] & 0x3F) << 6;
     308           0 :     c += (str[2] & 0x3F);
     309             : 
     310           0 :     c = mozilla::unicode::GetLowercase(c);
     311             : 
     312           0 :     *aNext = aStr + 3;
     313           0 :     return c;
     314             :   }
     315           0 :   if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
     316             :     // It's a four-byte sequence, so it looks like
     317             :     //   11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
     318             : 
     319             :     uint32_t c;
     320           0 :     c  = (str[0] & 0x07) << 18;
     321           0 :     c += (str[1] & 0x3F) << 12;
     322           0 :     c += (str[2] & 0x3F) << 6;
     323           0 :     c += (str[3] & 0x3F);
     324             : 
     325           0 :     c = mozilla::unicode::GetLowercase(c);
     326             : 
     327           0 :     *aNext = aStr + 4;
     328           0 :     return c;
     329             :   }
     330             : 
     331             :   // Hm, we don't understand this sequence.
     332           0 :   return -1;
     333             : }
     334             : 
     335           0 : int32_t CaseInsensitiveCompare(const char *aLeft,
     336             :                                const char *aRight,
     337             :                                uint32_t aLeftBytes,
     338             :                                uint32_t aRightBytes)
     339             : {
     340           0 :   const char *leftEnd = aLeft + aLeftBytes;
     341           0 :   const char *rightEnd = aRight + aRightBytes;
     342             : 
     343           0 :   while (aLeft < leftEnd && aRight < rightEnd) {
     344           0 :     uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
     345           0 :     if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
     346           0 :       return -1;
     347             : 
     348           0 :     uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
     349           0 :     if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
     350           0 :       return -1;
     351             : 
     352             :     // Now leftChar and rightChar are lower-case, so we can compare them.
     353           0 :     if (leftChar != rightChar) {
     354           0 :       if (leftChar > rightChar)
     355           0 :         return 1;
     356           0 :       return -1;
     357             :     }
     358             :   }
     359             : 
     360             :   // Make sure that if one string is longer than the other we return the
     361             :   // correct result.
     362           0 :   if (aLeft < leftEnd)
     363           0 :     return 1;
     364           0 :   if (aRight < rightEnd)
     365           0 :     return -1;
     366             : 
     367           0 :   return 0;
     368             : }
     369             : 
     370             : bool
     371           0 : CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
     372             :                               const char* aLeftEnd, const char* aRightEnd,
     373             :                               const char** aLeftNext, const char** aRightNext,
     374             :                               bool* aErr)
     375             : {
     376           0 :   NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
     377           0 :   NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
     378           0 :   NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
     379           0 :   NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
     380           0 :   NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
     381             : 
     382           0 :   uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
     383           0 :   if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
     384           0 :     *aErr = true;
     385           0 :     return false;
     386             :   }
     387             : 
     388           0 :   uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
     389           0 :   if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
     390           0 :     *aErr = true;
     391           0 :     return false;
     392             :   }
     393             : 
     394             :   // Can't have an error past this point.
     395           0 :   *aErr = false;
     396             : 
     397           0 :   return leftChar == rightChar;
     398             : }
     399             : 
     400             : namespace mozilla {
     401             : 
     402             : uint32_t
     403         844 : HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
     404             : {
     405         844 :   uint32_t hash = 0;
     406         844 :   const char* s = aUTF8;
     407         844 :   const char* end = aUTF8 + aLength;
     408             : 
     409         844 :   *aErr = false;
     410             : 
     411       31524 :   while (s < end)
     412             :   {
     413       15340 :     uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
     414       15340 :     if (*aErr) {
     415           0 :       return 0;
     416             :     }
     417             : 
     418       15340 :     if (ucs4 < PLANE1_BASE) {
     419       15340 :       hash = AddToHash(hash, ucs4);
     420             :     }
     421             :     else {
     422           0 :       hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
     423             :     }
     424             :   }
     425             : 
     426         844 :   return hash;
     427             : }
     428             : 
     429             : bool
     430           0 : IsSegmentBreakSkipChar(uint32_t u)
     431             : {
     432           0 :   return unicode::IsEastAsianWidthFWH(u) &&
     433           0 :          unicode::GetScriptCode(u) != unicode::Script::HANGUL;
     434             : }
     435             : 
     436             : } // namespace mozilla

Generated by: LCOV version 1.13