LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - collationruleparser.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 540 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 29 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : * Copyright (C) 2013-2015, International Business Machines
       6             : * Corporation and others.  All Rights Reserved.
       7             : *******************************************************************************
       8             : * collationruleparser.cpp
       9             : *
      10             : * (replaced the former ucol_tok.cpp)
      11             : *
      12             : * created on: 2013apr10
      13             : * created by: Markus W. Scherer
      14             : */
      15             : 
      16             : #include "unicode/utypes.h"
      17             : 
      18             : #if !UCONFIG_NO_COLLATION
      19             : 
      20             : #include "unicode/normalizer2.h"
      21             : #include "unicode/parseerr.h"
      22             : #include "unicode/uchar.h"
      23             : #include "unicode/ucol.h"
      24             : #include "unicode/uloc.h"
      25             : #include "unicode/unistr.h"
      26             : #include "unicode/utf16.h"
      27             : #include "charstr.h"
      28             : #include "cmemory.h"
      29             : #include "collation.h"
      30             : #include "collationdata.h"
      31             : #include "collationruleparser.h"
      32             : #include "collationsettings.h"
      33             : #include "collationtailoring.h"
      34             : #include "cstring.h"
      35             : #include "patternprops.h"
      36             : #include "uassert.h"
      37             : #include "uvectr32.h"
      38             : 
      39             : U_NAMESPACE_BEGIN
      40             : 
      41             : namespace {
      42             : 
      43             : static const UChar BEFORE[] = { 0x5b, 0x62, 0x65, 0x66, 0x6f, 0x72, 0x65, 0 };  // "[before"
      44             : const int32_t BEFORE_LENGTH = 7;
      45             : 
      46             : }  // namespace
      47             : 
      48           0 : CollationRuleParser::Sink::~Sink() {}
      49             : 
      50             : void
      51           0 : CollationRuleParser::Sink::suppressContractions(const UnicodeSet &, const char *&, UErrorCode &) {}
      52             : 
      53             : void
      54           0 : CollationRuleParser::Sink::optimize(const UnicodeSet &, const char *&, UErrorCode &) {}
      55             : 
      56           0 : CollationRuleParser::Importer::~Importer() {}
      57             : 
      58           0 : CollationRuleParser::CollationRuleParser(const CollationData *base, UErrorCode &errorCode)
      59           0 :         : nfd(*Normalizer2::getNFDInstance(errorCode)),
      60           0 :           nfc(*Normalizer2::getNFCInstance(errorCode)),
      61             :           rules(NULL), baseData(base), settings(NULL),
      62             :           parseError(NULL), errorReason(NULL),
      63             :           sink(NULL), importer(NULL),
      64           0 :           ruleIndex(0) {
      65           0 : }
      66             : 
      67           0 : CollationRuleParser::~CollationRuleParser() {
      68           0 : }
      69             : 
      70             : void
      71           0 : CollationRuleParser::parse(const UnicodeString &ruleString,
      72             :                            CollationSettings &outSettings,
      73             :                            UParseError *outParseError,
      74             :                            UErrorCode &errorCode) {
      75           0 :     if(U_FAILURE(errorCode)) { return; }
      76           0 :     settings = &outSettings;
      77           0 :     parseError = outParseError;
      78           0 :     if(parseError != NULL) {
      79           0 :         parseError->line = 0;
      80           0 :         parseError->offset = -1;
      81           0 :         parseError->preContext[0] = 0;
      82           0 :         parseError->postContext[0] = 0;
      83             :     }
      84           0 :     errorReason = NULL;
      85           0 :     parse(ruleString, errorCode);
      86             : }
      87             : 
      88             : void
      89           0 : CollationRuleParser::parse(const UnicodeString &ruleString, UErrorCode &errorCode) {
      90           0 :     if(U_FAILURE(errorCode)) { return; }
      91           0 :     rules = &ruleString;
      92           0 :     ruleIndex = 0;
      93             : 
      94           0 :     while(ruleIndex < rules->length()) {
      95           0 :         UChar c = rules->charAt(ruleIndex);
      96           0 :         if(PatternProps::isWhiteSpace(c)) {
      97           0 :             ++ruleIndex;
      98           0 :             continue;
      99             :         }
     100           0 :         switch(c) {
     101             :         case 0x26:  // '&'
     102           0 :             parseRuleChain(errorCode);
     103           0 :             break;
     104             :         case 0x5b:  // '['
     105           0 :             parseSetting(errorCode);
     106           0 :             break;
     107             :         case 0x23:  // '#' starts a comment, until the end of the line
     108           0 :             ruleIndex = skipComment(ruleIndex + 1);
     109           0 :             break;
     110             :         case 0x40:  // '@' is equivalent to [backwards 2]
     111           0 :             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
     112           0 :                               UCOL_ON, 0, errorCode);
     113           0 :             ++ruleIndex;
     114           0 :             break;
     115             :         case 0x21:  // '!' used to turn on Thai/Lao character reversal
     116             :             // Accept but ignore. The root collator has contractions
     117             :             // that are equivalent to the character reversal, where appropriate.
     118           0 :             ++ruleIndex;
     119           0 :             break;
     120             :         default:
     121           0 :             setParseError("expected a reset or setting or comment", errorCode);
     122           0 :             break;
     123             :         }
     124           0 :         if(U_FAILURE(errorCode)) { return; }
     125             :     }
     126             : }
     127             : 
     128             : void
     129           0 : CollationRuleParser::parseRuleChain(UErrorCode &errorCode) {
     130           0 :     int32_t resetStrength = parseResetAndPosition(errorCode);
     131           0 :     UBool isFirstRelation = TRUE;
     132             :     for(;;) {
     133           0 :         int32_t result = parseRelationOperator(errorCode);
     134           0 :         if(U_FAILURE(errorCode)) { return; }
     135           0 :         if(result < 0) {
     136           0 :             if(ruleIndex < rules->length() && rules->charAt(ruleIndex) == 0x23) {
     137             :                 // '#' starts a comment, until the end of the line
     138           0 :                 ruleIndex = skipComment(ruleIndex + 1);
     139           0 :                 continue;
     140             :             }
     141           0 :             if(isFirstRelation) {
     142           0 :                 setParseError("reset not followed by a relation", errorCode);
     143             :             }
     144           0 :             return;
     145             :         }
     146           0 :         int32_t strength = result & STRENGTH_MASK;
     147           0 :         if(resetStrength < UCOL_IDENTICAL) {
     148             :             // reset-before rule chain
     149           0 :             if(isFirstRelation) {
     150           0 :                 if(strength != resetStrength) {
     151           0 :                     setParseError("reset-before strength differs from its first relation", errorCode);
     152           0 :                     return;
     153             :                 }
     154             :             } else {
     155           0 :                 if(strength < resetStrength) {
     156           0 :                     setParseError("reset-before strength followed by a stronger relation", errorCode);
     157           0 :                     return;
     158             :                 }
     159             :             }
     160             :         }
     161           0 :         int32_t i = ruleIndex + (result >> OFFSET_SHIFT);  // skip over the relation operator
     162           0 :         if((result & STARRED_FLAG) == 0) {
     163           0 :             parseRelationStrings(strength, i, errorCode);
     164             :         } else {
     165           0 :             parseStarredCharacters(strength, i, errorCode);
     166             :         }
     167           0 :         if(U_FAILURE(errorCode)) { return; }
     168           0 :         isFirstRelation = FALSE;
     169           0 :     }
     170             : }
     171             : 
     172             : int32_t
     173           0 : CollationRuleParser::parseResetAndPosition(UErrorCode &errorCode) {
     174           0 :     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
     175           0 :     int32_t i = skipWhiteSpace(ruleIndex + 1);
     176             :     int32_t j;
     177             :     UChar c;
     178             :     int32_t resetStrength;
     179           0 :     if(rules->compare(i, BEFORE_LENGTH, BEFORE, 0, BEFORE_LENGTH) == 0 &&
     180           0 :             (j = i + BEFORE_LENGTH) < rules->length() &&
     181           0 :             PatternProps::isWhiteSpace(rules->charAt(j)) &&
     182           0 :             ((j = skipWhiteSpace(j + 1)) + 1) < rules->length() &&
     183           0 :             0x31 <= (c = rules->charAt(j)) && c <= 0x33 &&
     184           0 :             rules->charAt(j + 1) == 0x5d) {
     185             :         // &[before n] with n=1 or 2 or 3
     186           0 :         resetStrength = UCOL_PRIMARY + (c - 0x31);
     187           0 :         i = skipWhiteSpace(j + 2);
     188             :     } else {
     189           0 :         resetStrength = UCOL_IDENTICAL;
     190             :     }
     191           0 :     if(i >= rules->length()) {
     192           0 :         setParseError("reset without position", errorCode);
     193           0 :         return UCOL_DEFAULT;
     194             :     }
     195           0 :     UnicodeString str;
     196           0 :     if(rules->charAt(i) == 0x5b) {  // '['
     197           0 :         i = parseSpecialPosition(i, str, errorCode);
     198             :     } else {
     199           0 :         i = parseTailoringString(i, str, errorCode);
     200             :     }
     201           0 :     sink->addReset(resetStrength, str, errorReason, errorCode);
     202           0 :     if(U_FAILURE(errorCode)) { setErrorContext(); }
     203           0 :     ruleIndex = i;
     204           0 :     return resetStrength;
     205             : }
     206             : 
     207             : int32_t
     208           0 : CollationRuleParser::parseRelationOperator(UErrorCode &errorCode) {
     209           0 :     if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
     210           0 :     ruleIndex = skipWhiteSpace(ruleIndex);
     211           0 :     if(ruleIndex >= rules->length()) { return UCOL_DEFAULT; }
     212             :     int32_t strength;
     213           0 :     int32_t i = ruleIndex;
     214           0 :     UChar c = rules->charAt(i++);
     215           0 :     switch(c) {
     216             :     case 0x3c:  // '<'
     217           0 :         if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<
     218           0 :             ++i;
     219           0 :             if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<
     220           0 :                 ++i;
     221           0 :                 if(i < rules->length() && rules->charAt(i) == 0x3c) {  // <<<<
     222           0 :                     ++i;
     223           0 :                     strength = UCOL_QUATERNARY;
     224             :                 } else {
     225           0 :                     strength = UCOL_TERTIARY;
     226             :                 }
     227             :             } else {
     228           0 :                 strength = UCOL_SECONDARY;
     229             :             }
     230             :         } else {
     231           0 :             strength = UCOL_PRIMARY;
     232             :         }
     233           0 :         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
     234           0 :             ++i;
     235           0 :             strength |= STARRED_FLAG;
     236             :         }
     237           0 :         break;
     238             :     case 0x3b:  // ';' same as <<
     239           0 :         strength = UCOL_SECONDARY;
     240           0 :         break;
     241             :     case 0x2c:  // ',' same as <<<
     242           0 :         strength = UCOL_TERTIARY;
     243           0 :         break;
     244             :     case 0x3d:  // '='
     245           0 :         strength = UCOL_IDENTICAL;
     246           0 :         if(i < rules->length() && rules->charAt(i) == 0x2a) {  // '*'
     247           0 :             ++i;
     248           0 :             strength |= STARRED_FLAG;
     249             :         }
     250           0 :         break;
     251             :     default:
     252           0 :         return UCOL_DEFAULT;
     253             :     }
     254           0 :     return ((i - ruleIndex) << OFFSET_SHIFT) | strength;
     255             : }
     256             : 
     257             : void
     258           0 : CollationRuleParser::parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode) {
     259             :     // Parse
     260             :     //     prefix | str / extension
     261             :     // where prefix and extension are optional.
     262           0 :     UnicodeString prefix, str, extension;
     263           0 :     i = parseTailoringString(i, str, errorCode);
     264           0 :     if(U_FAILURE(errorCode)) { return; }
     265           0 :     UChar next = (i < rules->length()) ? rules->charAt(i) : 0;
     266           0 :     if(next == 0x7c) {  // '|' separates the context prefix from the string.
     267           0 :         prefix = str;
     268           0 :         i = parseTailoringString(i + 1, str, errorCode);
     269           0 :         if(U_FAILURE(errorCode)) { return; }
     270           0 :         next = (i < rules->length()) ? rules->charAt(i) : 0;
     271             :     }
     272           0 :     if(next == 0x2f) {  // '/' separates the string from the extension.
     273           0 :         i = parseTailoringString(i + 1, extension, errorCode);
     274             :     }
     275           0 :     if(!prefix.isEmpty()) {
     276           0 :         UChar32 prefix0 = prefix.char32At(0);
     277           0 :         UChar32 c = str.char32At(0);
     278           0 :         if(!nfc.hasBoundaryBefore(prefix0) || !nfc.hasBoundaryBefore(c)) {
     279             :             setParseError("in 'prefix|str', prefix and str must each start with an NFC boundary",
     280           0 :                           errorCode);
     281           0 :             return;
     282             :         }
     283             :     }
     284           0 :     sink->addRelation(strength, prefix, str, extension, errorReason, errorCode);
     285           0 :     if(U_FAILURE(errorCode)) { setErrorContext(); }
     286           0 :     ruleIndex = i;
     287             : }
     288             : 
     289             : void
     290           0 : CollationRuleParser::parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode) {
     291           0 :     UnicodeString empty, raw;
     292           0 :     i = parseString(skipWhiteSpace(i), raw, errorCode);
     293           0 :     if(U_FAILURE(errorCode)) { return; }
     294           0 :     if(raw.isEmpty()) {
     295           0 :         setParseError("missing starred-relation string", errorCode);
     296           0 :         return;
     297             :     }
     298           0 :     UChar32 prev = -1;
     299           0 :     int32_t j = 0;
     300             :     for(;;) {
     301           0 :         while(j < raw.length()) {
     302           0 :             UChar32 c = raw.char32At(j);
     303           0 :             if(!nfd.isInert(c)) {
     304           0 :                 setParseError("starred-relation string is not all NFD-inert", errorCode);
     305           0 :                 return;
     306             :             }
     307           0 :             sink->addRelation(strength, empty, UnicodeString(c), empty, errorReason, errorCode);
     308           0 :             if(U_FAILURE(errorCode)) {
     309           0 :                 setErrorContext();
     310           0 :                 return;
     311             :             }
     312           0 :             j += U16_LENGTH(c);
     313           0 :             prev = c;
     314             :         }
     315           0 :         if(i >= rules->length() || rules->charAt(i) != 0x2d) {  // '-'
     316           0 :             break;
     317             :         }
     318           0 :         if(prev < 0) {
     319           0 :             setParseError("range without start in starred-relation string", errorCode);
     320           0 :             return;
     321             :         }
     322           0 :         i = parseString(i + 1, raw, errorCode);
     323           0 :         if(U_FAILURE(errorCode)) { return; }
     324           0 :         if(raw.isEmpty()) {
     325           0 :             setParseError("range without end in starred-relation string", errorCode);
     326           0 :             return;
     327             :         }
     328           0 :         UChar32 c = raw.char32At(0);
     329           0 :         if(c < prev) {
     330           0 :             setParseError("range start greater than end in starred-relation string", errorCode);
     331           0 :             return;
     332             :         }
     333             :         // range prev-c
     334           0 :         UnicodeString s;
     335           0 :         while(++prev <= c) {
     336           0 :             if(!nfd.isInert(prev)) {
     337           0 :                 setParseError("starred-relation string range is not all NFD-inert", errorCode);
     338           0 :                 return;
     339             :             }
     340           0 :             if(U_IS_SURROGATE(prev)) {
     341           0 :                 setParseError("starred-relation string range contains a surrogate", errorCode);
     342           0 :                 return;
     343             :             }
     344           0 :             if(0xfffd <= prev && prev <= 0xffff) {
     345           0 :                 setParseError("starred-relation string range contains U+FFFD, U+FFFE or U+FFFF", errorCode);
     346           0 :                 return;
     347             :             }
     348           0 :             s.setTo(prev);
     349           0 :             sink->addRelation(strength, empty, s, empty, errorReason, errorCode);
     350           0 :             if(U_FAILURE(errorCode)) {
     351           0 :                 setErrorContext();
     352           0 :                 return;
     353             :             }
     354             :         }
     355           0 :         prev = -1;
     356           0 :         j = U16_LENGTH(c);
     357           0 :     }
     358           0 :     ruleIndex = skipWhiteSpace(i);
     359             : }
     360             : 
     361             : int32_t
     362           0 : CollationRuleParser::parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
     363           0 :     i = parseString(skipWhiteSpace(i), raw, errorCode);
     364           0 :     if(U_SUCCESS(errorCode) && raw.isEmpty()) {
     365           0 :         setParseError("missing relation string", errorCode);
     366             :     }
     367           0 :     return skipWhiteSpace(i);
     368             : }
     369             : 
     370             : int32_t
     371           0 : CollationRuleParser::parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode) {
     372           0 :     if(U_FAILURE(errorCode)) { return i; }
     373           0 :     raw.remove();
     374           0 :     while(i < rules->length()) {
     375           0 :         UChar32 c = rules->charAt(i++);
     376           0 :         if(isSyntaxChar(c)) {
     377           0 :             if(c == 0x27) {  // apostrophe
     378           0 :                 if(i < rules->length() && rules->charAt(i) == 0x27) {
     379             :                     // Double apostrophe, encodes a single one.
     380           0 :                     raw.append((UChar)0x27);
     381           0 :                     ++i;
     382           0 :                     continue;
     383             :                 }
     384             :                 // Quote literal text until the next single apostrophe.
     385             :                 for(;;) {
     386           0 :                     if(i == rules->length()) {
     387           0 :                         setParseError("quoted literal text missing terminating apostrophe", errorCode);
     388           0 :                         return i;
     389             :                     }
     390           0 :                     c = rules->charAt(i++);
     391           0 :                     if(c == 0x27) {
     392           0 :                         if(i < rules->length() && rules->charAt(i) == 0x27) {
     393             :                             // Double apostrophe inside quoted literal text,
     394             :                             // still encodes a single apostrophe.
     395           0 :                             ++i;
     396             :                         } else {
     397           0 :                             break;
     398             :                         }
     399             :                     }
     400           0 :                     raw.append((UChar)c);
     401             :                 }
     402           0 :             } else if(c == 0x5c) {  // backslash
     403           0 :                 if(i == rules->length()) {
     404           0 :                     setParseError("backslash escape at the end of the rule string", errorCode);
     405           0 :                     return i;
     406             :                 }
     407           0 :                 c = rules->char32At(i);
     408           0 :                 raw.append(c);
     409           0 :                 i += U16_LENGTH(c);
     410             :             } else {
     411             :                 // Any other syntax character terminates a string.
     412           0 :                 --i;
     413           0 :                 break;
     414             :             }
     415           0 :         } else if(PatternProps::isWhiteSpace(c)) {
     416             :             // Unquoted white space terminates a string.
     417           0 :             --i;
     418           0 :             break;
     419             :         } else {
     420           0 :             raw.append((UChar)c);
     421             :         }
     422             :     }
     423           0 :     for(int32_t j = 0; j < raw.length();) {
     424           0 :         UChar32 c = raw.char32At(j);
     425           0 :         if(U_IS_SURROGATE(c)) {
     426           0 :             setParseError("string contains an unpaired surrogate", errorCode);
     427           0 :             return i;
     428             :         }
     429           0 :         if(0xfffd <= c && c <= 0xffff) {
     430           0 :             setParseError("string contains U+FFFD, U+FFFE or U+FFFF", errorCode);
     431           0 :             return i;
     432             :         }
     433           0 :         j += U16_LENGTH(c);
     434             :     }
     435           0 :     return i;
     436             : }
     437             : 
     438             : namespace {
     439             : 
     440             : static const char *const positions[] = {
     441             :     "first tertiary ignorable",
     442             :     "last tertiary ignorable",
     443             :     "first secondary ignorable",
     444             :     "last secondary ignorable",
     445             :     "first primary ignorable",
     446             :     "last primary ignorable",
     447             :     "first variable",
     448             :     "last variable",
     449             :     "first regular",
     450             :     "last regular",
     451             :     "first implicit",
     452             :     "last implicit",
     453             :     "first trailing",
     454             :     "last trailing"
     455             : };
     456             : 
     457             : }  // namespace
     458             : 
     459             : int32_t
     460           0 : CollationRuleParser::parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode) {
     461           0 :     if(U_FAILURE(errorCode)) { return 0; }
     462           0 :     UnicodeString raw;
     463           0 :     int32_t j = readWords(i + 1, raw);
     464           0 :     if(j > i && rules->charAt(j) == 0x5d && !raw.isEmpty()) {  // words end with ]
     465           0 :         ++j;
     466           0 :         for(int32_t pos = 0; pos < UPRV_LENGTHOF(positions); ++pos) {
     467           0 :             if(raw == UnicodeString(positions[pos], -1, US_INV)) {
     468           0 :                 str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + pos));
     469           0 :                 return j;
     470             :             }
     471             :         }
     472           0 :         if(raw == UNICODE_STRING_SIMPLE("top")) {
     473           0 :             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_REGULAR));
     474           0 :             return j;
     475             :         }
     476           0 :         if(raw == UNICODE_STRING_SIMPLE("variable top")) {
     477           0 :             str.setTo((UChar)POS_LEAD).append((UChar)(POS_BASE + LAST_VARIABLE));
     478           0 :             return j;
     479             :         }
     480             :     }
     481           0 :     setParseError("not a valid special reset position", errorCode);
     482           0 :     return i;
     483             : }
     484             : 
     485             : void
     486           0 : CollationRuleParser::parseSetting(UErrorCode &errorCode) {
     487           0 :     if(U_FAILURE(errorCode)) { return; }
     488           0 :     UnicodeString raw;
     489           0 :     int32_t i = ruleIndex + 1;
     490           0 :     int32_t j = readWords(i, raw);
     491           0 :     if(j <= i || raw.isEmpty()) {
     492           0 :         setParseError("expected a setting/option at '['", errorCode);
     493             :     }
     494           0 :     if(rules->charAt(j) == 0x5d) {  // words end with ]
     495           0 :         ++j;
     496           0 :         if(raw.startsWith(UNICODE_STRING_SIMPLE("reorder")) &&
     497           0 :                 (raw.length() == 7 || raw.charAt(7) == 0x20)) {
     498           0 :             parseReordering(raw, errorCode);
     499           0 :             ruleIndex = j;
     500           0 :             return;
     501             :         }
     502           0 :         if(raw == UNICODE_STRING_SIMPLE("backwards 2")) {
     503           0 :             settings->setFlag(CollationSettings::BACKWARD_SECONDARY,
     504           0 :                               UCOL_ON, 0, errorCode);
     505           0 :             ruleIndex = j;
     506           0 :             return;
     507             :         }
     508           0 :         UnicodeString v;
     509           0 :         int32_t valueIndex = raw.lastIndexOf((UChar)0x20);
     510           0 :         if(valueIndex >= 0) {
     511           0 :             v.setTo(raw, valueIndex + 1);
     512           0 :             raw.truncate(valueIndex);
     513             :         }
     514           0 :         if(raw == UNICODE_STRING_SIMPLE("strength") && v.length() == 1) {
     515           0 :             int32_t value = UCOL_DEFAULT;
     516           0 :             UChar c = v.charAt(0);
     517           0 :             if(0x31 <= c && c <= 0x34) {  // 1..4
     518           0 :                 value = UCOL_PRIMARY + (c - 0x31);
     519           0 :             } else if(c == 0x49) {  // 'I'
     520           0 :                 value = UCOL_IDENTICAL;
     521             :             }
     522           0 :             if(value != UCOL_DEFAULT) {
     523           0 :                 settings->setStrength(value, 0, errorCode);
     524           0 :                 ruleIndex = j;
     525           0 :                 return;
     526             :             }
     527           0 :         } else if(raw == UNICODE_STRING_SIMPLE("alternate")) {
     528           0 :             UColAttributeValue value = UCOL_DEFAULT;
     529           0 :             if(v == UNICODE_STRING_SIMPLE("non-ignorable")) {
     530           0 :                 value = UCOL_NON_IGNORABLE;
     531           0 :             } else if(v == UNICODE_STRING_SIMPLE("shifted")) {
     532           0 :                 value = UCOL_SHIFTED;
     533             :             }
     534           0 :             if(value != UCOL_DEFAULT) {
     535           0 :                 settings->setAlternateHandling(value, 0, errorCode);
     536           0 :                 ruleIndex = j;
     537           0 :                 return;
     538             :             }
     539           0 :         } else if(raw == UNICODE_STRING_SIMPLE("maxVariable")) {
     540           0 :             int32_t value = UCOL_DEFAULT;
     541           0 :             if(v == UNICODE_STRING_SIMPLE("space")) {
     542           0 :                 value = CollationSettings::MAX_VAR_SPACE;
     543           0 :             } else if(v == UNICODE_STRING_SIMPLE("punct")) {
     544           0 :                 value = CollationSettings::MAX_VAR_PUNCT;
     545           0 :             } else if(v == UNICODE_STRING_SIMPLE("symbol")) {
     546           0 :                 value = CollationSettings::MAX_VAR_SYMBOL;
     547           0 :             } else if(v == UNICODE_STRING_SIMPLE("currency")) {
     548           0 :                 value = CollationSettings::MAX_VAR_CURRENCY;
     549             :             }
     550           0 :             if(value != UCOL_DEFAULT) {
     551           0 :                 settings->setMaxVariable(value, 0, errorCode);
     552           0 :                 settings->variableTop = baseData->getLastPrimaryForGroup(
     553             :                     UCOL_REORDER_CODE_FIRST + value);
     554           0 :                 U_ASSERT(settings->variableTop != 0);
     555           0 :                 ruleIndex = j;
     556           0 :                 return;
     557             :             }
     558           0 :         } else if(raw == UNICODE_STRING_SIMPLE("caseFirst")) {
     559           0 :             UColAttributeValue value = UCOL_DEFAULT;
     560           0 :             if(v == UNICODE_STRING_SIMPLE("off")) {
     561           0 :                 value = UCOL_OFF;
     562           0 :             } else if(v == UNICODE_STRING_SIMPLE("lower")) {
     563           0 :                 value = UCOL_LOWER_FIRST;
     564           0 :             } else if(v == UNICODE_STRING_SIMPLE("upper")) {
     565           0 :                 value = UCOL_UPPER_FIRST;
     566             :             }
     567           0 :             if(value != UCOL_DEFAULT) {
     568           0 :                 settings->setCaseFirst(value, 0, errorCode);
     569           0 :                 ruleIndex = j;
     570           0 :                 return;
     571             :             }
     572           0 :         } else if(raw == UNICODE_STRING_SIMPLE("caseLevel")) {
     573           0 :             UColAttributeValue value = getOnOffValue(v);
     574           0 :             if(value != UCOL_DEFAULT) {
     575           0 :                 settings->setFlag(CollationSettings::CASE_LEVEL, value, 0, errorCode);
     576           0 :                 ruleIndex = j;
     577           0 :                 return;
     578             :             }
     579           0 :         } else if(raw == UNICODE_STRING_SIMPLE("normalization")) {
     580           0 :             UColAttributeValue value = getOnOffValue(v);
     581           0 :             if(value != UCOL_DEFAULT) {
     582           0 :                 settings->setFlag(CollationSettings::CHECK_FCD, value, 0, errorCode);
     583           0 :                 ruleIndex = j;
     584           0 :                 return;
     585             :             }
     586           0 :         } else if(raw == UNICODE_STRING_SIMPLE("numericOrdering")) {
     587           0 :             UColAttributeValue value = getOnOffValue(v);
     588           0 :             if(value != UCOL_DEFAULT) {
     589           0 :                 settings->setFlag(CollationSettings::NUMERIC, value, 0, errorCode);
     590           0 :                 ruleIndex = j;
     591           0 :                 return;
     592             :             }
     593           0 :         } else if(raw == UNICODE_STRING_SIMPLE("hiraganaQ")) {
     594           0 :             UColAttributeValue value = getOnOffValue(v);
     595           0 :             if(value != UCOL_DEFAULT) {
     596           0 :                 if(value == UCOL_ON) {
     597           0 :                     setParseError("[hiraganaQ on] is not supported", errorCode);
     598             :                 }
     599           0 :                 ruleIndex = j;
     600           0 :                 return;
     601             :             }
     602           0 :         } else if(raw == UNICODE_STRING_SIMPLE("import")) {
     603           0 :             CharString lang;
     604           0 :             lang.appendInvariantChars(v, errorCode);
     605           0 :             if(errorCode == U_MEMORY_ALLOCATION_ERROR) { return; }
     606             :             // BCP 47 language tag -> ICU locale ID
     607             :             char localeID[ULOC_FULLNAME_CAPACITY];
     608             :             int32_t parsedLength;
     609           0 :             int32_t length = uloc_forLanguageTag(lang.data(), localeID, ULOC_FULLNAME_CAPACITY,
     610           0 :                                                  &parsedLength, &errorCode);
     611           0 :             if(U_FAILURE(errorCode) ||
     612           0 :                     parsedLength != lang.length() || length >= ULOC_FULLNAME_CAPACITY) {
     613           0 :                 errorCode = U_ZERO_ERROR;
     614           0 :                 setParseError("expected language tag in [import langTag]", errorCode);
     615           0 :                 return;
     616             :             }
     617             :             // localeID minus all keywords
     618             :             char baseID[ULOC_FULLNAME_CAPACITY];
     619           0 :             length = uloc_getBaseName(localeID, baseID, ULOC_FULLNAME_CAPACITY, &errorCode);
     620           0 :             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
     621           0 :                 errorCode = U_ZERO_ERROR;
     622           0 :                 setParseError("expected language tag in [import langTag]", errorCode);
     623           0 :                 return;
     624             :             }
     625           0 :             if(length == 3 && uprv_memcmp(baseID, "und", 3) == 0) {
     626           0 :                 uprv_strcpy(baseID, "root");
     627             :             }
     628             :             // @collation=type, or length=0 if not specified
     629             :             char collationType[ULOC_KEYWORDS_CAPACITY];
     630             :             length = uloc_getKeywordValue(localeID, "collation",
     631             :                                           collationType, ULOC_KEYWORDS_CAPACITY,
     632           0 :                                           &errorCode);
     633           0 :             if(U_FAILURE(errorCode) || length >= ULOC_KEYWORDS_CAPACITY) {
     634           0 :                 errorCode = U_ZERO_ERROR;
     635           0 :                 setParseError("expected language tag in [import langTag]", errorCode);
     636           0 :                 return;
     637             :             }
     638           0 :             if(importer == NULL) {
     639           0 :                 setParseError("[import langTag] is not supported", errorCode);
     640             :             } else {
     641           0 :                 UnicodeString importedRules;
     642           0 :                 importer->getRules(baseID, length > 0 ? collationType : "standard",
     643           0 :                                    importedRules, errorReason, errorCode);
     644           0 :                 if(U_FAILURE(errorCode)) {
     645           0 :                     if(errorReason == NULL) {
     646           0 :                         errorReason = "[import langTag] failed";
     647             :                     }
     648           0 :                     setErrorContext();
     649           0 :                     return;
     650             :                 }
     651           0 :                 const UnicodeString *outerRules = rules;
     652           0 :                 int32_t outerRuleIndex = ruleIndex;
     653           0 :                 parse(importedRules, errorCode);
     654           0 :                 if(U_FAILURE(errorCode)) {
     655           0 :                     if(parseError != NULL) {
     656           0 :                         parseError->offset = outerRuleIndex;
     657             :                     }
     658             :                 }
     659           0 :                 rules = outerRules;
     660           0 :                 ruleIndex = j;
     661             :             }
     662           0 :             return;
     663             :         }
     664           0 :     } else if(rules->charAt(j) == 0x5b) {  // words end with [
     665           0 :         UnicodeSet set;
     666           0 :         j = parseUnicodeSet(j, set, errorCode);
     667           0 :         if(U_FAILURE(errorCode)) { return; }
     668           0 :         if(raw == UNICODE_STRING_SIMPLE("optimize")) {
     669           0 :             sink->optimize(set, errorReason, errorCode);
     670           0 :             if(U_FAILURE(errorCode)) { setErrorContext(); }
     671           0 :             ruleIndex = j;
     672           0 :             return;
     673           0 :         } else if(raw == UNICODE_STRING_SIMPLE("suppressContractions")) {
     674           0 :             sink->suppressContractions(set, errorReason, errorCode);
     675           0 :             if(U_FAILURE(errorCode)) { setErrorContext(); }
     676           0 :             ruleIndex = j;
     677           0 :             return;
     678             :         }
     679             :     }
     680           0 :     setParseError("not a valid setting/option", errorCode);
     681             : }
     682             : 
     683             : void  // Applies a [reorder ...] setting: raw is expected to start with "reorder", followed by space-separated code words.
     684           0 : CollationRuleParser::parseReordering(const UnicodeString &raw, UErrorCode &errorCode) {  // failures are reported via errorCode
     685           0 :     if(U_FAILURE(errorCode)) { return; }  // do nothing if an earlier step already failed
     686           0 :     int32_t i = 7;  // after "reorder" (7 code units)
     687           0 :     if(i == raw.length()) {
     688             :         // empty [reorder] with no codes
     689           0 :         settings->resetReordering();
     690           0 :         return;
     691             :     }
     692             :     // Parse the codes in [reorder aa bb cc].
     693           0 :     UVector32 reorderCodes(errorCode);
     694           0 :     if(U_FAILURE(errorCode)) { return; }
     695           0 :     CharString word;  // reused for every word; cleared at the top of each iteration
     696           0 :     while(i < raw.length()) {
     697           0 :         ++i;  // skip the word-separating space
     698           0 :         int32_t limit = raw.indexOf((UChar)0x20, i);  // next U+0020 at or after i ends this word
     699           0 :         if(limit < 0) { limit = raw.length(); }  // no further space: the last word runs to the end of raw
     700           0 :         word.clear().appendInvariantChars(raw.tempSubStringBetween(i, limit), errorCode);  // word = raw[i, limit) as char *
     701           0 :         if(U_FAILURE(errorCode)) { return; }
     702           0 :         int32_t code = getReorderCode(word.data());
     703           0 :         if(code < 0) {  // getReorderCode() returns -1 for a word it does not recognize
     704           0 :             setParseError("unknown script or reorder code", errorCode);
     705           0 :             return;
     706             :         }
     707           0 :         reorderCodes.addElement(code, errorCode);
     708           0 :         if(U_FAILURE(errorCode)) { return; }
     709           0 :         i = limit;  // now at the next separating space, or at raw.length() which ends the loop
     710             :     }
     711           0 :     settings->setReordering(*baseData, reorderCodes.getBuffer(), reorderCodes.size(), errorCode);  // apply all collected codes in one call
     712             : }
     713             : 
     714             : static const char *const gSpecialReorderCodes[] = {  // names matched by getReorderCode(); index i maps to UCOL_REORDER_CODE_FIRST + i, so the order matters
     715             :     "space", "punct", "symbol", "currency", "digit"
     716             : };
     717             : 
     718             : int32_t  // Maps one reorder-code word to its integer code; returns -1 if the word is not recognized.
     719           0 : CollationRuleParser::getReorderCode(const char *word) {
     720           0 :     for(int32_t i = 0; i < UPRV_LENGTHOF(gSpecialReorderCodes); ++i) {  // 1. the special names in gSpecialReorderCodes
     721           0 :         if(uprv_stricmp(word, gSpecialReorderCodes[i]) == 0) {
     722           0 :             return UCOL_REORDER_CODE_FIRST + i;  // table index is the offset from UCOL_REORDER_CODE_FIRST
     723             :         }
     724             :     }
     725           0 :     int32_t script = u_getPropertyValueEnum(UCHAR_SCRIPT, word);  // 2. a UCHAR_SCRIPT property value name
     726           0 :     if(script >= 0) {  // a negative result is treated as "no such script name"
     727           0 :         return script;
     728             :     }
     729           0 :     if(uprv_stricmp(word, "others") == 0) {  // 3. the "others" keyword, checked last
     730           0 :         return UCOL_REORDER_CODE_OTHERS;  // same as Zzzz = USCRIPT_UNKNOWN
     731             :     }
     732           0 :     return -1;  // nothing matched
     733             : }
     734             : 
     735             : UColAttributeValue  // Maps the exact strings "on" and "off" to UCOL_ON and UCOL_OFF; any other string yields UCOL_DEFAULT.
     736           0 : CollationRuleParser::getOnOffValue(const UnicodeString &s) {
     737           0 :     if(s == UNICODE_STRING_SIMPLE("on")) {
     738           0 :         return UCOL_ON;
     739           0 :     } else if(s == UNICODE_STRING_SIMPLE("off")) {
     740           0 :         return UCOL_OFF;
     741             :     } else {
     742           0 :         return UCOL_DEFAULT;  // callers in this file compare against UCOL_DEFAULT to detect an invalid value
     743             :     }
     744             : }
     745             : 
     746             : int32_t  // Parses the bracketed UnicodeSet pattern that starts at rules index i into set, then requires the ']' that closes the enclosing option.
     747           0 : CollationRuleParser::parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode) {  // returns the index after that ']'; on error calls setParseError() and returns the index reached
     748             :     // Collect a UnicodeSet pattern between a balanced pair of [brackets].
     749           0 :     int32_t level = 0;  // bracket nesting depth; NOTE(review): assumes rules->charAt(i) is '[' -- confirm at call sites
     750           0 :     int32_t j = i;
     751             :     for(;;) {
     752           0 :         if(j == rules->length()) {  // ran off the end before the brackets balanced
     753           0 :             setParseError("unbalanced UnicodeSet pattern brackets", errorCode);
     754           0 :             return j;
     755             :         }
     756           0 :         UChar c = rules->charAt(j++);
     757           0 :         if(c == 0x5b) {  // '['
     758           0 :             ++level;
     759           0 :         } else if(c == 0x5d) {  // ']'
     760           0 :             if(--level == 0) { break; }  // depth back to 0: j is now just past the pattern's closing bracket
     761             :         }
     762           0 :     }
     763           0 :     set.applyPattern(rules->tempSubStringBetween(i, j), errorCode);  // the pattern is rules[i, j)
     764           0 :     if(U_FAILURE(errorCode)) {
     765           0 :         errorCode = U_ZERO_ERROR;  // reset first: setParseError() does nothing while errorCode is a failure
     766           0 :         setParseError("not a valid UnicodeSet pattern", errorCode);
     767           0 :         return j;
     768             :     }
     769           0 :     j = skipWhiteSpace(j);
     770           0 :     if(j == rules->length() || rules->charAt(j) != 0x5d) {  // 0x5d is ']'
     771           0 :         setParseError("missing option-terminating ']' after UnicodeSet pattern", errorCode);
     772           0 :         return j;
     773             :     }
     774           0 :     return ++j;  // index just past the option-terminating ']'
     775             : }
     776             : 
     777             : int32_t  // Reads words from rules starting at index i into raw, replacing each run of white space with a single U+0020.
     778           0 : CollationRuleParser::readWords(int32_t i, UnicodeString &raw) const {  // returns the index of the syntax character that ended the words, or 0 if the rules end first
     779             :     static const UChar sp = 0x20;
     780           0 :     raw.remove();  // discard whatever raw held before
     781           0 :     i = skipWhiteSpace(i);  // leading white space is not copied into raw
     782             :     for(;;) {
     783           0 :         if(i >= rules->length()) { return 0; }  // end of rules without a terminating syntax character
     784           0 :         UChar c = rules->charAt(i);
     785           0 :         if(isSyntaxChar(c) && c != 0x2d && c != 0x5f) {  // syntax except -_ ('-' and '_' are kept as word characters)
     786           0 :             if(raw.isEmpty()) { return i; }
     787           0 :             if(raw.endsWith(&sp, 1)) {  // remove trailing space
     788           0 :                 raw.truncate(raw.length() - 1);
     789             :             }
     790           0 :             return i;  // i still points at the terminating syntax character
     791             :         }
     792           0 :         if(PatternProps::isWhiteSpace(c)) {
     793           0 :             raw.append(sp);  // one U+0020 stands for the whole white-space run
     794           0 :             i = skipWhiteSpace(i + 1);
     795             :         } else {
     796           0 :             raw.append(c);
     797           0 :             ++i;
     798             :         }
     799           0 :     }
     800             : }
     801             : 
     802             : int32_t  // Scans rules from index i and returns the index just past the first line terminator found; without one, the loop runs to the end of the rules.
     803           0 : CollationRuleParser::skipComment(int32_t i) const {
     804             :     // skip to past the newline
     805           0 :     while(i < rules->length()) {
     806           0 :         UChar c = rules->charAt(i++);  // i already points past c when the loop breaks
     807             :         // LF or FF or CR or NEL or LS or PS
     808           0 :         if(c == 0xa || c == 0xc || c == 0xd || c == 0x85 || c == 0x2028 || c == 0x2029) {
     809             :             // Unicode Newline Guidelines: "A readline function should stop at NLF, LS, FF, or PS."
     810             :             // NLF (new line function) = CR or LF or CR+LF or NEL.
     811             :             // No need to collect all of CR+LF because a following LF will be ignored anyway.
     812             :             break;
     813             :         }
     814             :     }
     815           0 :     return i;
     816             : }
     817             : 
     818             : void  // Records a parse failure: sets errorCode to U_INVALID_FORMAT_ERROR, stores reason in errorReason, and fills in parseError if there is one.
     819           0 : CollationRuleParser::setParseError(const char *reason, UErrorCode &errorCode) {
     820           0 :     if(U_FAILURE(errorCode)) { return; }  // an existing failure is kept; this call then changes nothing
     821             :     // Error code consistent with the old parser (from ca. 2001),
     822             :     // rather than U_PARSE_ERROR;
     823           0 :     errorCode = U_INVALID_FORMAT_ERROR;
     824           0 :     errorReason = reason;  // stores the pointer itself, not a copy of the text
     825           0 :     if(parseError != NULL) { setErrorContext(); }
     826             : }
     827             : 
     828             : void  // Fills parseError (if non-NULL) with the offset ruleIndex and with the rule text just before and from that position.
     829           0 : CollationRuleParser::setErrorContext() {
     830           0 :     if(parseError == NULL) { return; }  // nowhere to store the context
     831             : 
     832             :     // Note: This relies on the calling code maintaining the ruleIndex
     833             :     // at a position that is useful for debugging.
     834             :     // For example, at the beginning of a reset or relation etc.
     835           0 :     parseError->offset = ruleIndex;
     836           0 :     parseError->line = 0;  // We are not counting line numbers.
     837             : 
     838             :     // before ruleIndex
     839           0 :     int32_t start = ruleIndex - (U_PARSE_CONTEXT_LEN - 1);  // at most U_PARSE_CONTEXT_LEN - 1 code units, leaving room for the terminator
     840           0 :     if(start < 0) {
     841           0 :         start = 0;  // fewer code units than that precede ruleIndex
     842           0 :     } else if(start > 0 && U16_IS_TRAIL(rules->charAt(start))) {
     843           0 :         ++start;  // do not begin the context on a trail surrogate
     844             :     }
     845           0 :     int32_t length = ruleIndex - start;
     846           0 :     rules->extract(start, length, parseError->preContext);
     847           0 :     parseError->preContext[length] = 0;  // terminate the copied text with a 0 code unit
     848             : 
     849             :     // starting from ruleIndex
     850           0 :     length = rules->length() - ruleIndex;
     851           0 :     if(length >= U_PARSE_CONTEXT_LEN) {  // more text remains than fits: cut it to U_PARSE_CONTEXT_LEN - 1 code units
     852           0 :         length = U_PARSE_CONTEXT_LEN - 1;
     853           0 :         if(U16_IS_LEAD(rules->charAt(ruleIndex + length - 1))) {
     854           0 :             --length;  // do not end the context on a lead surrogate
     855             :         }
     856             :     }
     857           0 :     rules->extract(ruleIndex, length, parseError->postContext);
     858           0 :     parseError->postContext[length] = 0;  // terminate the copied text with a 0 code unit
     859             : }
     860             : 
     861             : UBool  // True for the ASCII characters in U+0021..U+007E that are neither digits (0x30..0x39) nor letters (0x41..0x5a, 0x61..0x7a).
     862           0 : CollationRuleParser::isSyntaxChar(UChar32 c) {
     863           0 :     return 0x21 <= c && c <= 0x7e &&  // overall range '!'..'~'
     864           0 :             (c <= 0x2f || (0x3a <= c && c <= 0x40) ||  // '!'..'/' and ':'..'@'
     865           0 :             (0x5b <= c && c <= 0x60) || (0x7b <= c));  // '['..'`' and '{'..'~'
     866             : }
     867             : 
     868             : int32_t  // Advances i past consecutive PatternProps white space in rules and returns the resulting index.
     869           0 : CollationRuleParser::skipWhiteSpace(int32_t i) const {
     870           0 :     while(i < rules->length() && PatternProps::isWhiteSpace(rules->charAt(i))) {  // stops at the end of rules or at the first non-white-space character
     871           0 :         ++i;
     872             :     }
     873           0 :     return i;
     874             : }
     875             : 
     876             : U_NAMESPACE_END
     877             : 
     878             : #endif  // !UCONFIG_NO_COLLATION

Generated by: LCOV version 1.13