LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - collationbuilder.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 890 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 51 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : * Copyright (C) 2013-2014, International Business Machines
       6             : * Corporation and others.  All Rights Reserved.
       7             : *******************************************************************************
       8             : * collationbuilder.cpp
       9             : *
      10             : * (replaced the former ucol_bld.cpp)
      11             : *
      12             : * created on: 2013may06
      13             : * created by: Markus W. Scherer
      14             : */
      15             : 
      16             : #ifdef DEBUG_COLLATION_BUILDER
      17             : #include <stdio.h>
      18             : #endif
      19             : 
      20             : #include "unicode/utypes.h"
      21             : 
      22             : #if !UCONFIG_NO_COLLATION
      23             : 
      24             : #include "unicode/caniter.h"
      25             : #include "unicode/normalizer2.h"
      26             : #include "unicode/tblcoll.h"
      27             : #include "unicode/parseerr.h"
      28             : #include "unicode/uchar.h"
      29             : #include "unicode/ucol.h"
      30             : #include "unicode/unistr.h"
      31             : #include "unicode/usetiter.h"
      32             : #include "unicode/utf16.h"
      33             : #include "unicode/uversion.h"
      34             : #include "cmemory.h"
      35             : #include "collation.h"
      36             : #include "collationbuilder.h"
      37             : #include "collationdata.h"
      38             : #include "collationdatabuilder.h"
      39             : #include "collationfastlatin.h"
      40             : #include "collationroot.h"
      41             : #include "collationrootelements.h"
      42             : #include "collationruleparser.h"
      43             : #include "collationsettings.h"
      44             : #include "collationtailoring.h"
      45             : #include "collationweights.h"
      46             : #include "normalizer2impl.h"
      47             : #include "uassert.h"
      48             : #include "ucol_imp.h"
      49             : #include "utf16collationiterator.h"
      50             : 
      51             : U_NAMESPACE_BEGIN
      52             : 
      53             : namespace {
      54             : 
      55             : class BundleImporter : public CollationRuleParser::Importer {
      56             : public:
      57           0 :     BundleImporter() {}
      58             :     virtual ~BundleImporter();
      59             :     virtual void getRules(
      60             :             const char *localeID, const char *collationType,
      61             :             UnicodeString &rules,
      62             :             const char *&errorReason, UErrorCode &errorCode);
      63             : };
      64             : 
      65           0 : BundleImporter::~BundleImporter() {}
      66             : 
      67             : void
      68           0 : BundleImporter::getRules(
      69             :         const char *localeID, const char *collationType,
      70             :         UnicodeString &rules,
      71             :         const char *& /*errorReason*/, UErrorCode &errorCode) {
      72           0 :     CollationLoader::loadRules(localeID, collationType, rules, errorCode);
      73           0 : }
      74             : 
      75             : }  // namespace
      76             : 
      77             : // RuleBasedCollator implementation ---------------------------------------- ***
      78             : 
      79             : // These methods are here, rather than in rulebasedcollator.cpp,
      80             : // for modularization:
      81             : // Most code using Collator does not need to build a Collator from rules.
      82             : // By moving these constructors and helper methods to a separate file,
      83             : // most code will not have a static dependency on the builder code.
      84             : 
      85           0 : RuleBasedCollator::RuleBasedCollator()
      86             :         : data(NULL),
      87             :           settings(NULL),
      88             :           tailoring(NULL),
      89             :           cacheEntry(NULL),
      90             :           validLocale(""),
      91             :           explicitlySetAttributes(0),
      92           0 :           actualLocaleIsSameAsValid(FALSE) {
      93           0 : }
      94             : 
      95           0 : RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)
      96             :         : data(NULL),
      97             :           settings(NULL),
      98             :           tailoring(NULL),
      99             :           cacheEntry(NULL),
     100             :           validLocale(""),
     101             :           explicitlySetAttributes(0),
     102           0 :           actualLocaleIsSameAsValid(FALSE) {
     103           0 :     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode);
     104           0 : }
     105             : 
     106           0 : RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,
     107           0 :                                      UErrorCode &errorCode)
     108             :         : data(NULL),
     109             :           settings(NULL),
     110             :           tailoring(NULL),
     111             :           cacheEntry(NULL),
     112             :           validLocale(""),
     113             :           explicitlySetAttributes(0),
     114           0 :           actualLocaleIsSameAsValid(FALSE) {
     115           0 :     internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode);
     116           0 : }
     117             : 
     118           0 : RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
     119             :                                      UColAttributeValue decompositionMode,
     120           0 :                                      UErrorCode &errorCode)
     121             :         : data(NULL),
     122             :           settings(NULL),
     123             :           tailoring(NULL),
     124             :           cacheEntry(NULL),
     125             :           validLocale(""),
     126             :           explicitlySetAttributes(0),
     127           0 :           actualLocaleIsSameAsValid(FALSE) {
     128           0 :     internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode);
     129           0 : }
     130             : 
     131           0 : RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
     132             :                                      ECollationStrength strength,
     133             :                                      UColAttributeValue decompositionMode,
     134           0 :                                      UErrorCode &errorCode)
     135             :         : data(NULL),
     136             :           settings(NULL),
     137             :           tailoring(NULL),
     138             :           cacheEntry(NULL),
     139             :           validLocale(""),
     140             :           explicitlySetAttributes(0),
     141           0 :           actualLocaleIsSameAsValid(FALSE) {
     142           0 :     internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode);
     143           0 : }
     144             : 
     145           0 : RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
     146             :                                      UParseError &parseError, UnicodeString &reason,
     147           0 :                                      UErrorCode &errorCode)
     148             :         : data(NULL),
     149             :           settings(NULL),
     150             :           tailoring(NULL),
     151             :           cacheEntry(NULL),
     152             :           validLocale(""),
     153             :           explicitlySetAttributes(0),
     154           0 :           actualLocaleIsSameAsValid(FALSE) {
     155           0 :     internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);
     156           0 : }
     157             : 
     158             : void
     159           0 : RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
     160             :                                           int32_t strength,
     161             :                                           UColAttributeValue decompositionMode,
     162             :                                           UParseError *outParseError, UnicodeString *outReason,
     163             :                                           UErrorCode &errorCode) {
     164           0 :     const CollationTailoring *base = CollationRoot::getRoot(errorCode);
     165           0 :     if(U_FAILURE(errorCode)) { return; }
     166           0 :     if(outReason != NULL) { outReason->remove(); }
     167           0 :     CollationBuilder builder(base, errorCode);
     168           0 :     UVersionInfo noVersion = { 0, 0, 0, 0 };
     169           0 :     BundleImporter importer;
     170             :     LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,
     171             :                                                              &importer,
     172           0 :                                                              outParseError, errorCode));
     173           0 :     if(U_FAILURE(errorCode)) {
     174           0 :         const char *reason = builder.getErrorReason();
     175           0 :         if(reason != NULL && outReason != NULL) {
     176           0 :             *outReason = UnicodeString(reason, -1, US_INV);
     177             :         }
     178           0 :         return;
     179             :     }
     180           0 :     t->actualLocale.setToBogus();
     181           0 :     adoptTailoring(t.orphan(), errorCode);
     182             :     // Set attributes after building the collator,
     183             :     // to keep the default settings consistent with the rule string.
     184           0 :     if(strength != UCOL_DEFAULT) {
     185           0 :         setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);
     186             :     }
     187           0 :     if(decompositionMode != UCOL_DEFAULT) {
     188           0 :         setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
     189             :     }
     190             : }
     191             : 
     192             : // CollationBuilder implementation ----------------------------------------- ***
     193             : 
     194             : // Some compilers don't care if constants are defined in the .cpp file.
     195             : // MS Visual C++ does not like it, but gcc requires it. clang does not care.
     196             : #ifndef _MSC_VER
     197             : const int32_t CollationBuilder::HAS_BEFORE2;
     198             : const int32_t CollationBuilder::HAS_BEFORE3;
     199             : #endif
     200             : 
     201           0 : CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
     202           0 :         : nfd(*Normalizer2::getNFDInstance(errorCode)),
     203           0 :           fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
     204           0 :           nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
     205             :           base(b),
     206           0 :           baseData(b->data),
     207           0 :           rootElements(b->data->rootElements, b->data->rootElementsLength),
     208             :           variableTop(0),
     209           0 :           dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),
     210             :           errorReason(NULL),
     211             :           cesLength(0),
     212           0 :           rootPrimaryIndexes(errorCode), nodes(errorCode) {
     213           0 :     nfcImpl.ensureCanonIterData(errorCode);
     214           0 :     if(U_FAILURE(errorCode)) {
     215           0 :         errorReason = "CollationBuilder fields initialization failed";
     216           0 :         return;
     217             :     }
     218           0 :     if(dataBuilder == NULL) {
     219           0 :         errorCode = U_MEMORY_ALLOCATION_ERROR;
     220           0 :         return;
     221             :     }
     222           0 :     dataBuilder->initForTailoring(baseData, errorCode);
     223           0 :     if(U_FAILURE(errorCode)) {
     224           0 :         errorReason = "CollationBuilder initialization failed";
     225             :     }
     226             : }
     227             : 
     228           0 : CollationBuilder::~CollationBuilder() {
     229           0 :     delete dataBuilder;
     230           0 : }
     231             : 
     232             : CollationTailoring *
     233           0 : CollationBuilder::parseAndBuild(const UnicodeString &ruleString,
     234             :                                 const UVersionInfo rulesVersion,
     235             :                                 CollationRuleParser::Importer *importer,
     236             :                                 UParseError *outParseError,
     237             :                                 UErrorCode &errorCode) {
     238           0 :     if(U_FAILURE(errorCode)) { return NULL; }
     239           0 :     if(baseData->rootElements == NULL) {
     240           0 :         errorCode = U_MISSING_RESOURCE_ERROR;
     241           0 :         errorReason = "missing root elements data, tailoring not supported";
     242           0 :         return NULL;
     243             :     }
     244           0 :     LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));
     245           0 :     if(tailoring.isNull() || tailoring->isBogus()) {
     246           0 :         errorCode = U_MEMORY_ALLOCATION_ERROR;
     247           0 :         return NULL;
     248             :     }
     249           0 :     CollationRuleParser parser(baseData, errorCode);
     250           0 :     if(U_FAILURE(errorCode)) { return NULL; }
     251             :     // Note: This always bases &[last variable] and &[first regular]
     252             :     // on the root collator's maxVariable/variableTop.
     253             :     // If we wanted this to change after [maxVariable x], then we would keep
     254             :     // the tailoring.settings pointer here and read its variableTop when we need it.
     255             :     // See http://unicode.org/cldr/trac/ticket/6070
     256           0 :     variableTop = base->settings->variableTop;
     257           0 :     parser.setSink(this);
     258           0 :     parser.setImporter(importer);
     259           0 :     CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
     260           0 :     parser.parse(ruleString, ownedSettings, outParseError, errorCode);
     261           0 :     errorReason = parser.getErrorReason();
     262           0 :     if(U_FAILURE(errorCode)) { return NULL; }
     263           0 :     if(dataBuilder->hasMappings()) {
     264           0 :         makeTailoredCEs(errorCode);
     265           0 :         closeOverComposites(errorCode);
     266           0 :         finalizeCEs(errorCode);
     267             :         // Copy all of ASCII, and Latin-1 letters, into each tailoring.
     268           0 :         optimizeSet.add(0, 0x7f);
     269           0 :         optimizeSet.add(0xc0, 0xff);
     270             :         // Hangul is decomposed on the fly during collation,
     271             :         // and the tailoring data is always built with HANGUL_TAG specials.
     272           0 :         optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
     273           0 :         dataBuilder->optimize(optimizeSet, errorCode);
     274           0 :         tailoring->ensureOwnedData(errorCode);
     275           0 :         if(U_FAILURE(errorCode)) { return NULL; }
     276           0 :         if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
     277           0 :         dataBuilder->build(*tailoring->ownedData, errorCode);
     278           0 :         tailoring->builder = dataBuilder;
     279           0 :         dataBuilder = NULL;
     280             :     } else {
     281           0 :         tailoring->data = baseData;
     282             :     }
     283           0 :     if(U_FAILURE(errorCode)) { return NULL; }
     284           0 :     ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
     285           0 :         tailoring->data, ownedSettings,
     286             :         ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
     287           0 :     tailoring->rules = ruleString;
     288           0 :     tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination
     289           0 :     tailoring->setVersion(base->version, rulesVersion);
     290           0 :     return tailoring.orphan();
     291             : }
     292             : 
     293             : void
     294           0 : CollationBuilder::addReset(int32_t strength, const UnicodeString &str,
     295             :                            const char *&parserErrorReason, UErrorCode &errorCode) {
     296           0 :     if(U_FAILURE(errorCode)) { return; }
     297           0 :     U_ASSERT(!str.isEmpty());
     298           0 :     if(str.charAt(0) == CollationRuleParser::POS_LEAD) {
     299           0 :         ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);
     300           0 :         cesLength = 1;
     301           0 :         if(U_FAILURE(errorCode)) { return; }
     302           0 :         U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);
     303             :     } else {
     304             :         // normal reset to a character or string
     305           0 :         UnicodeString nfdString = nfd.normalize(str, errorCode);
     306           0 :         if(U_FAILURE(errorCode)) {
     307           0 :             parserErrorReason = "normalizing the reset position";
     308           0 :             return;
     309             :         }
     310           0 :         cesLength = dataBuilder->getCEs(nfdString, ces, 0);
     311           0 :         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
     312           0 :             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     313           0 :             parserErrorReason = "reset position maps to too many collation elements (more than 31)";
     314           0 :             return;
     315             :         }
     316             :     }
     317           0 :     if(strength == UCOL_IDENTICAL) { return; }  // simple reset-at-position
     318             : 
     319             :     // &[before strength]position
     320           0 :     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);
     321           0 :     int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
     322           0 :     if(U_FAILURE(errorCode)) { return; }
     323             : 
     324           0 :     int64_t node = nodes.elementAti(index);
     325             :     // If the index is for a "weaker" node,
     326             :     // then skip backwards over this and further "weaker" nodes.
     327           0 :     while(strengthFromNode(node) > strength) {
     328           0 :         index = previousIndexFromNode(node);
     329           0 :         node = nodes.elementAti(index);
     330             :     }
     331             : 
     332             :     // Find or insert a node whose index we will put into a temporary CE.
     333           0 :     if(strengthFromNode(node) == strength && isTailoredNode(node)) {
     334             :         // Reset to just before this same-strength tailored node.
     335           0 :         index = previousIndexFromNode(node);
     336           0 :     } else if(strength == UCOL_PRIMARY) {
     337             :         // root primary node (has no previous index)
     338           0 :         uint32_t p = weight32FromNode(node);
     339           0 :         if(p == 0) {
     340           0 :             errorCode = U_UNSUPPORTED_ERROR;
     341           0 :             parserErrorReason = "reset primary-before ignorable not possible";
     342           0 :             return;
     343             :         }
     344           0 :         if(p <= rootElements.getFirstPrimary()) {
     345             :             // There is no primary gap between ignorables and the space-first-primary.
     346           0 :             errorCode = U_UNSUPPORTED_ERROR;
     347           0 :             parserErrorReason = "reset primary-before first non-ignorable not supported";
     348           0 :             return;
     349             :         }
     350           0 :         if(p == Collation::FIRST_TRAILING_PRIMARY) {
     351             :             // We do not support tailoring to an unassigned-implicit CE.
     352           0 :             errorCode = U_UNSUPPORTED_ERROR;
     353           0 :             parserErrorReason = "reset primary-before [first trailing] not supported";
     354           0 :             return;
     355             :         }
     356           0 :         p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));
     357           0 :         index = findOrInsertNodeForPrimary(p, errorCode);
     358             :         // Go to the last node in this list:
     359             :         // Tailor after the last node between adjacent root nodes.
     360             :         for(;;) {
     361           0 :             node = nodes.elementAti(index);
     362           0 :             int32_t nextIndex = nextIndexFromNode(node);
     363           0 :             if(nextIndex == 0) { break; }
     364           0 :             index = nextIndex;
     365           0 :         }
     366             :     } else {
     367             :         // &[before 2] or &[before 3]
     368           0 :         index = findCommonNode(index, UCOL_SECONDARY);
     369           0 :         if(strength >= UCOL_TERTIARY) {
     370           0 :             index = findCommonNode(index, UCOL_TERTIARY);
     371             :         }
     372             :         // findCommonNode() stayed on the stronger node or moved to
     373             :         // an explicit common-weight node of the reset-before strength.
     374           0 :         node = nodes.elementAti(index);
     375           0 :         if(strengthFromNode(node) == strength) {
     376             :             // Found a same-strength node with an explicit weight.
     377           0 :             uint32_t weight16 = weight16FromNode(node);
     378           0 :             if(weight16 == 0) {
     379           0 :                 errorCode = U_UNSUPPORTED_ERROR;
     380           0 :                 if(strength == UCOL_SECONDARY) {
     381           0 :                     parserErrorReason = "reset secondary-before secondary ignorable not possible";
     382             :                 } else {
     383           0 :                     parserErrorReason = "reset tertiary-before completely ignorable not possible";
     384             :                 }
     385           0 :                 return;
     386             :             }
     387           0 :             U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);
     388             :             // Reset to just before this node.
     389             :             // Insert the preceding same-level explicit weight if it is not there already.
     390             :             // Which explicit weight immediately precedes this one?
     391           0 :             weight16 = getWeight16Before(index, node, strength);
     392             :             // Does this preceding weight have a node?
     393             :             uint32_t previousWeight16;
     394           0 :             int32_t previousIndex = previousIndexFromNode(node);
     395           0 :             for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
     396           0 :                 node = nodes.elementAti(i);
     397           0 :                 int32_t previousStrength = strengthFromNode(node);
     398           0 :                 if(previousStrength < strength) {
     399           0 :                     U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);
     400             :                     // Either the reset element has an above-common weight and
     401             :                     // the parent node provides the implied common weight,
     402             :                     // or the reset element has a weight<=common in the node
     403             :                     // right after the parent, and we need to insert the preceding weight.
     404           0 :                     previousWeight16 = Collation::COMMON_WEIGHT16;
     405           0 :                     break;
     406           0 :                 } else if(previousStrength == strength && !isTailoredNode(node)) {
     407           0 :                     previousWeight16 = weight16FromNode(node);
     408           0 :                     break;
     409             :                 }
     410             :                 // Skip weaker nodes and same-level tailored nodes.
     411           0 :             }
     412           0 :             if(previousWeight16 == weight16) {
     413             :                 // The preceding weight has a node,
     414             :                 // maybe with following weaker or tailored nodes.
     415             :                 // Reset to the last of them.
     416           0 :                 index = previousIndex;
     417             :             } else {
     418             :                 // Insert a node with the preceding weight, reset to that.
     419           0 :                 node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
     420           0 :                 index = insertNodeBetween(previousIndex, index, node, errorCode);
     421             :             }
     422             :         } else {
     423             :             // Found a stronger node with implied strength-common weight.
     424           0 :             uint32_t weight16 = getWeight16Before(index, node, strength);
     425           0 :             index = findOrInsertWeakNode(index, weight16, strength, errorCode);
     426             :         }
     427             :         // Strength of the temporary CE = strength of its reset position.
     428             :         // Code above raises an error if the before-strength is stronger.
     429           0 :         strength = ceStrength(ces[cesLength - 1]);
     430             :     }
     431           0 :     if(U_FAILURE(errorCode)) {
     432           0 :         parserErrorReason = "inserting reset position for &[before n]";
     433           0 :         return;
     434             :     }
     435           0 :     ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
     436             : }
     437             : 
     438             : uint32_t
     439           0 : CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
     440           0 :     U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));
     441             :     // Collect the root CE weights if this node is for a root CE.
     442             :     // If it is not, then return the low non-primary boundary for a tailored CE.
     443             :     uint32_t t;
     444           0 :     if(strengthFromNode(node) == UCOL_TERTIARY) {
     445           0 :         t = weight16FromNode(node);
     446             :     } else {
     447           0 :         t = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
     448             :     }
     449           0 :     while(strengthFromNode(node) > UCOL_SECONDARY) {
     450           0 :         index = previousIndexFromNode(node);
     451           0 :         node = nodes.elementAti(index);
     452             :     }
     453           0 :     if(isTailoredNode(node)) {
     454           0 :         return Collation::BEFORE_WEIGHT16;
     455             :     }
     456             :     uint32_t s;
     457           0 :     if(strengthFromNode(node) == UCOL_SECONDARY) {
     458           0 :         s = weight16FromNode(node);
     459             :     } else {
     460           0 :         s = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
     461             :     }
     462           0 :     while(strengthFromNode(node) > UCOL_PRIMARY) {
     463           0 :         index = previousIndexFromNode(node);
     464           0 :         node = nodes.elementAti(index);
     465             :     }
     466           0 :     if(isTailoredNode(node)) {
     467           0 :         return Collation::BEFORE_WEIGHT16;
     468             :     }
     469             :     // [p, s, t] is a root CE. Return the preceding weight for the requested level.
     470           0 :     uint32_t p = weight32FromNode(node);
     471             :     uint32_t weight16;
     472           0 :     if(level == UCOL_SECONDARY) {
     473           0 :         weight16 = rootElements.getSecondaryBefore(p, s);
     474             :     } else {
     475           0 :         weight16 = rootElements.getTertiaryBefore(p, s, t);
     476           0 :         U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
     477             :     }
     478           0 :     return weight16;
     479             : }
     480             : 
     481             : int64_t
     482           0 : CollationBuilder::getSpecialResetPosition(const UnicodeString &str,
     483             :                                           const char *&parserErrorReason, UErrorCode &errorCode) {
     484           0 :     U_ASSERT(str.length() == 2);
     485             :     int64_t ce;
     486           0 :     int32_t strength = UCOL_PRIMARY;
     487           0 :     UBool isBoundary = FALSE;
     488           0 :     UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;
     489           0 :     U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);
     490           0 :     switch(pos) {
     491             :     case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
     492             :         // Quaternary CEs are not supported.
     493             :         // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
     494           0 :         return 0;
     495             :     case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
     496           0 :         return 0;
     497             :     case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
     498             :         // Look for a tailored tertiary node after [0, 0, 0].
     499           0 :         int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
     500           0 :         if(U_FAILURE(errorCode)) { return 0; }
     501           0 :         int64_t node = nodes.elementAti(index);
     502           0 :         if((index = nextIndexFromNode(node)) != 0) {
     503           0 :             node = nodes.elementAti(index);
     504           0 :             U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);
     505           0 :             if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {
     506           0 :                 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);
     507             :             }
     508             :         }
     509           0 :         return rootElements.getFirstTertiaryCE();
     510             :         // No need to look for nodeHasAnyBefore() on a tertiary node.
     511             :     }
     512             :     case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
     513           0 :         ce = rootElements.getLastTertiaryCE();
     514           0 :         strength = UCOL_TERTIARY;
     515           0 :         break;
     516             :     case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
     517             :         // Look for a tailored secondary node after [0, 0, *].
     518           0 :         int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
     519           0 :         if(U_FAILURE(errorCode)) { return 0; }
     520           0 :         int64_t node = nodes.elementAti(index);
     521           0 :         while((index = nextIndexFromNode(node)) != 0) {
     522           0 :             node = nodes.elementAti(index);
     523           0 :             strength = strengthFromNode(node);
     524           0 :             if(strength < UCOL_SECONDARY) { break; }
     525           0 :             if(strength == UCOL_SECONDARY) {
     526           0 :                 if(isTailoredNode(node)) {
     527           0 :                     if(nodeHasBefore3(node)) {
     528           0 :                         index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
     529           0 :                         U_ASSERT(isTailoredNode(nodes.elementAti(index)));
     530             :                     }
     531           0 :                     return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);
     532             :                 } else {
     533           0 :                     break;
     534             :                 }
     535             :             }
     536             :         }
     537           0 :         ce = rootElements.getFirstSecondaryCE();
     538           0 :         strength = UCOL_SECONDARY;
     539           0 :         break;
     540             :     }
     541             :     case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
     542           0 :         ce = rootElements.getLastSecondaryCE();
     543           0 :         strength = UCOL_SECONDARY;
     544           0 :         break;
     545             :     case CollationRuleParser::FIRST_VARIABLE:
     546           0 :         ce = rootElements.getFirstPrimaryCE();
     547           0 :         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
     548           0 :         break;
     549             :     case CollationRuleParser::LAST_VARIABLE:
     550           0 :         ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
     551           0 :         break;
     552             :     case CollationRuleParser::FIRST_REGULAR:
     553           0 :         ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
     554           0 :         isBoundary = TRUE;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
     555           0 :         break;
     556             :     case CollationRuleParser::LAST_REGULAR:
     557             :         // Use the Hani-first-primary rather than the actual last "regular" CE before it,
     558             :         // for backward compatibility with behavior before the introduction of
     559             :         // script-first-primary CEs in the root collator.
     560           0 :         ce = rootElements.firstCEWithPrimaryAtLeast(
     561           0 :             baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
     562           0 :         break;
     563             :     case CollationRuleParser::FIRST_IMPLICIT:
     564           0 :         ce = baseData->getSingleCE(0x4e00, errorCode);
     565           0 :         break;
     566             :     case CollationRuleParser::LAST_IMPLICIT:
     567             :         // We do not support tailoring to an unassigned-implicit CE.
     568           0 :         errorCode = U_UNSUPPORTED_ERROR;
     569           0 :         parserErrorReason = "reset to [last implicit] not supported";
     570           0 :         return 0;
     571             :     case CollationRuleParser::FIRST_TRAILING:
     572           0 :         ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
     573           0 :         isBoundary = TRUE;  // trailing first primary (there is no mapping for it)
     574           0 :         break;
     575             :     case CollationRuleParser::LAST_TRAILING:
     576           0 :         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     577           0 :         parserErrorReason = "LDML forbids tailoring to U+FFFF";
     578           0 :         return 0;
     579             :     default:
     580           0 :         U_ASSERT(FALSE);
     581             :         return 0;
     582             :     }
     583             : 
     584           0 :     int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);
     585           0 :     if(U_FAILURE(errorCode)) { return 0; }
     586           0 :     int64_t node = nodes.elementAti(index);
     587           0 :     if((pos & 1) == 0) {
     588             :         // even pos = [first xyz]
     589           0 :         if(!nodeHasAnyBefore(node) && isBoundary) {
     590             :             // A <group> first primary boundary is artificially added to FractionalUCA.txt.
     591             :             // It is reachable via its special contraction, but is not normally used.
     592             :             // Find the first character tailored after the boundary CE,
     593             :             // or the first real root CE after it.
     594           0 :             if((index = nextIndexFromNode(node)) != 0) {
     595             :                 // If there is a following node, then it must be tailored
     596             :                 // because there are no root CEs with a boundary primary
     597             :                 // and non-common secondary/tertiary weights.
     598           0 :                 node = nodes.elementAti(index);
     599           0 :                 U_ASSERT(isTailoredNode(node));
     600           0 :                 ce = tempCEFromIndexAndStrength(index, strength);
     601             :             } else {
     602           0 :                 U_ASSERT(strength == UCOL_PRIMARY);
     603           0 :                 uint32_t p = (uint32_t)(ce >> 32);
     604           0 :                 int32_t pIndex = rootElements.findPrimary(p);
     605           0 :                 UBool isCompressible = baseData->isCompressiblePrimary(p);
     606           0 :                 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
     607           0 :                 ce = Collation::makeCE(p);
     608           0 :                 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);
     609           0 :                 if(U_FAILURE(errorCode)) { return 0; }
     610           0 :                 node = nodes.elementAti(index);
     611             :             }
     612             :         }
     613           0 :         if(nodeHasAnyBefore(node)) {
     614             :             // Get the first node that was tailored before this one at a weaker strength.
     615           0 :             if(nodeHasBefore2(node)) {
     616           0 :                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
     617           0 :                 node = nodes.elementAti(index);
     618             :             }
     619           0 :             if(nodeHasBefore3(node)) {
     620           0 :                 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
     621             :             }
     622           0 :             U_ASSERT(isTailoredNode(nodes.elementAti(index)));
     623           0 :             ce = tempCEFromIndexAndStrength(index, strength);
     624             :         }
     625             :     } else {
     626             :         // odd pos = [last xyz]
     627             :         // Find the last node that was tailored after the [last xyz]
     628             :         // at a strength no greater than the position's strength.
     629             :         for(;;) {
     630           0 :             int32_t nextIndex = nextIndexFromNode(node);
     631           0 :             if(nextIndex == 0) { break; }
     632           0 :             int64_t nextNode = nodes.elementAti(nextIndex);
     633           0 :             if(strengthFromNode(nextNode) < strength) { break; }
     634           0 :             index = nextIndex;
     635           0 :             node = nextNode;
     636           0 :         }
     637             :         // Do not make a temporary CE for a root node.
     638             :         // This last node might be the node for the root CE itself,
     639             :         // or a node with a common secondary or tertiary weight.
     640           0 :         if(isTailoredNode(node)) {
     641           0 :             ce = tempCEFromIndexAndStrength(index, strength);
     642             :         }
     643             :     }
     644           0 :     return ce;
     645             : }
     646             : 
     647             : void
     648           0 : CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
     649             :                               const UnicodeString &str, const UnicodeString &extension,
     650             :                               const char *&parserErrorReason, UErrorCode &errorCode) {
     651           0 :     if(U_FAILURE(errorCode)) { return; }
     652           0 :     UnicodeString nfdPrefix;
     653           0 :     if(!prefix.isEmpty()) {
     654           0 :         nfd.normalize(prefix, nfdPrefix, errorCode);
     655           0 :         if(U_FAILURE(errorCode)) {
     656           0 :             parserErrorReason = "normalizing the relation prefix";
     657           0 :             return;
     658             :         }
     659             :     }
     660           0 :     UnicodeString nfdString = nfd.normalize(str, errorCode);
     661           0 :     if(U_FAILURE(errorCode)) {
     662           0 :         parserErrorReason = "normalizing the relation string";
     663           0 :         return;
     664             :     }
     665             : 
     666             :     // The runtime code decomposes Hangul syllables on the fly,
     667             :     // with recursive processing but without making the Jamo pieces visible for matching.
     668             :     // It does not work with certain types of contextual mappings.
     669           0 :     int32_t nfdLength = nfdString.length();
     670           0 :     if(nfdLength >= 2) {
     671           0 :         UChar c = nfdString.charAt(0);
     672           0 :         if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {
     673             :             // While handling a Hangul syllable, contractions starting with Jamo L or V
     674             :             // would not see the following Jamo of that syllable.
     675           0 :             errorCode = U_UNSUPPORTED_ERROR;
     676           0 :             parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";
     677           0 :             return;
     678             :         }
     679           0 :         c = nfdString.charAt(nfdLength - 1);
     680           0 :         if(Hangul::isJamoL(c) ||
     681           0 :                 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {
     682             :             // A contraction ending with Jamo L or L+V would require
     683             :             // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
     684             :             // or decomposing a following Hangul syllable on the fly, during contraction matching.
     685           0 :             errorCode = U_UNSUPPORTED_ERROR;
     686           0 :             parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";
     687           0 :             return;
     688             :         }
     689             :         // A Hangul syllable completely inside a contraction is ok.
     690             :     }
     691             :     // Note: If there is a prefix, then the parser checked that
     692             :     // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
     693             :     // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
     694             :     // (While handling a Hangul syllable, prefixes on Jamo V or T
     695             :     // would not see the previous Jamo of that syllable.)
     696             : 
     697           0 :     if(strength != UCOL_IDENTICAL) {
     698             :         // Find the node index after which we insert the new tailored node.
     699           0 :         int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
     700           0 :         U_ASSERT(cesLength > 0);
     701           0 :         int64_t ce = ces[cesLength - 1];
     702           0 :         if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) {
     703             :             // There is no primary gap between ignorables and the space-first-primary.
     704           0 :             errorCode = U_UNSUPPORTED_ERROR;
     705           0 :             parserErrorReason = "tailoring primary after ignorables not supported";
     706           0 :             return;
     707             :         }
     708           0 :         if(strength == UCOL_QUATERNARY && ce == 0) {
     709             :             // The CE data structure does not support non-zero quaternary weights
     710             :             // on tertiary ignorables.
     711           0 :             errorCode = U_UNSUPPORTED_ERROR;
     712           0 :             parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";
     713           0 :             return;
     714             :         }
     715             :         // Insert the new tailored node.
     716           0 :         index = insertTailoredNodeAfter(index, strength, errorCode);
     717           0 :         if(U_FAILURE(errorCode)) {
     718           0 :             parserErrorReason = "modifying collation elements";
     719           0 :             return;
     720             :         }
     721             :         // Strength of the temporary CE:
     722             :         // The new relation may yield a stronger CE but not a weaker one.
     723           0 :         int32_t tempStrength = ceStrength(ce);
     724           0 :         if(strength < tempStrength) { tempStrength = strength; }
     725           0 :         ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
     726             :     }
     727             : 
     728           0 :     setCaseBits(nfdString, parserErrorReason, errorCode);
     729           0 :     if(U_FAILURE(errorCode)) { return; }
     730             : 
     731           0 :     int32_t cesLengthBeforeExtension = cesLength;
     732           0 :     if(!extension.isEmpty()) {
     733           0 :         UnicodeString nfdExtension = nfd.normalize(extension, errorCode);
     734           0 :         if(U_FAILURE(errorCode)) {
     735           0 :             parserErrorReason = "normalizing the relation extension";
     736           0 :             return;
     737             :         }
     738           0 :         cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);
     739           0 :         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
     740           0 :             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     741           0 :             parserErrorReason =
     742             :                 "extension string adds too many collation elements (more than 31 total)";
     743           0 :             return;
     744             :         }
     745             :     }
     746           0 :     uint32_t ce32 = Collation::UNASSIGNED_CE32;
     747           0 :     if((prefix != nfdPrefix || str != nfdString) &&
     748           0 :             !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
     749             :         // Map from the original input to the CEs.
     750             :         // We do this in case the canonical closure is incomplete,
     751             :         // so that it is possible to explicitly provide the missing mappings.
     752           0 :         ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
     753             :     }
     754           0 :     addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
     755           0 :     if(U_FAILURE(errorCode)) {
     756           0 :         parserErrorReason = "writing collation elements";
     757           0 :         return;
     758             :     }
     759           0 :     cesLength = cesLengthBeforeExtension;
     760             : }
     761             : 
     762             : int32_t
     763           0 : CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,
     764             :                                          UErrorCode &errorCode) {
     765           0 :     if(U_FAILURE(errorCode)) { return 0; }
     766           0 :     U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);
     767             : 
     768             :     // Find the last CE that is at least as "strong" as the requested difference.
     769             :     // Note: Stronger is smaller (UCOL_PRIMARY=0).
     770             :     int64_t ce;
     771           0 :     for(;; --cesLength) {
     772           0 :         if(cesLength == 0) {
     773           0 :             ce = ces[0] = 0;
     774           0 :             cesLength = 1;
     775           0 :             break;
     776             :         } else {
     777           0 :             ce = ces[cesLength - 1];
     778             :         }
     779           0 :         if(ceStrength(ce) <= strength) { break; }
     780             :     }
     781             : 
     782           0 :     if(isTempCE(ce)) {
     783             :         // No need to findCommonNode() here for lower levels
     784             :         // because insertTailoredNodeAfter() will do that anyway.
     785           0 :         return indexFromTempCE(ce);
     786             :     }
     787             : 
     788             :     // root CE
     789           0 :     if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {
     790           0 :         errorCode = U_UNSUPPORTED_ERROR;
     791           0 :         parserErrorReason = "tailoring relative to an unassigned code point not supported";
     792           0 :         return 0;
     793             :     }
     794           0 :     return findOrInsertNodeForRootCE(ce, strength, errorCode);
     795             : }
     796             : 
     797             : int32_t
     798           0 : CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {
     799           0 :     if(U_FAILURE(errorCode)) { return 0; }
     800           0 :     U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);
     801             : 
     802             :     // Find or insert the node for each of the root CE's weights,
     803             :     // down to the requested level/strength.
     804             :     // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
     805           0 :     U_ASSERT((ce & 0xc0) == 0);
     806           0 :     int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode);
     807           0 :     if(strength >= UCOL_SECONDARY) {
     808           0 :         uint32_t lower32 = (uint32_t)ce;
     809           0 :         index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
     810           0 :         if(strength >= UCOL_TERTIARY) {
     811           0 :             index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,
     812           0 :                                          UCOL_TERTIARY, errorCode);
     813             :         }
     814             :     }
     815           0 :     return index;
     816             : }
     817             : 
     818             : namespace {
     819             : 
     820             : /**
     821             :  * Like Java Collections.binarySearch(List, key, Comparator).
     822             :  *
     823             :  * @return the index>=0 where the item was found,
     824             :  *         or the index<0 for inserting the string at ~index in sorted order
     825             :  *         (index into rootPrimaryIndexes)
     826             :  */
     827             : int32_t
     828           0 : binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
     829             :                                const int64_t *nodes, uint32_t p) {
     830           0 :     if(length == 0) { return ~0; }
     831           0 :     int32_t start = 0;
     832           0 :     int32_t limit = length;
     833             :     for (;;) {
     834           0 :         int32_t i = (start + limit) / 2;
     835           0 :         int64_t node = nodes[rootPrimaryIndexes[i]];
     836           0 :         uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)
     837           0 :         if (p == nodePrimary) {
     838           0 :             return i;
     839           0 :         } else if (p < nodePrimary) {
     840           0 :             if (i == start) {
     841           0 :                 return ~start;  // insert s before i
     842             :             }
     843           0 :             limit = i;
     844             :         } else {
     845           0 :             if (i == start) {
     846           0 :                 return ~(start + 1);  // insert s after i
     847             :             }
     848           0 :             start = i;
     849             :         }
     850           0 :     }
     851             : }
     852             : 
     853             : }  // namespace
     854             : 
     855             : int32_t
     856           0 : CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {
     857           0 :     if(U_FAILURE(errorCode)) { return 0; }
     858             : 
     859           0 :     int32_t rootIndex = binarySearchForRootPrimaryNode(
     860           0 :         rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
     861           0 :     if(rootIndex >= 0) {
     862           0 :         return rootPrimaryIndexes.elementAti(rootIndex);
     863             :     } else {
     864             :         // Start a new list of nodes with this primary.
     865           0 :         int32_t index = nodes.size();
     866           0 :         nodes.addElement(nodeFromWeight32(p), errorCode);
     867           0 :         rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);
     868           0 :         return index;
     869             :     }
     870             : }
     871             : 
     872             : int32_t
     873           0 : CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
     874           0 :     if(U_FAILURE(errorCode)) { return 0; }
     875           0 :     U_ASSERT(0 <= index && index < nodes.size());
     876           0 :     U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);
     877             : 
     878           0 :     if(weight16 == Collation::COMMON_WEIGHT16) {
     879           0 :         return findCommonNode(index, level);
     880             :     }
     881             : 
     882             :     // If this will be the first below-common weight for the parent node,
     883             :     // then we will also need to insert a common weight after it.
     884           0 :     int64_t node = nodes.elementAti(index);
     885           0 :     U_ASSERT(strengthFromNode(node) < level);  // parent node is stronger
     886           0 :     if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
     887           0 :         int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
     888           0 :         if((node & hasThisLevelBefore) == 0) {
     889             :             // The parent node has an implied level-common weight.
     890             :             int64_t commonNode =
     891           0 :                 nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);
     892           0 :             if(level == UCOL_SECONDARY) {
     893             :                 // Move the HAS_BEFORE3 flag from the parent node
     894             :                 // to the new secondary common node.
     895           0 :                 commonNode |= node & HAS_BEFORE3;
     896           0 :                 node &= ~(int64_t)HAS_BEFORE3;
     897             :             }
     898           0 :             nodes.setElementAt(node | hasThisLevelBefore, index);
     899             :             // Insert below-common-weight node.
     900           0 :             int32_t nextIndex = nextIndexFromNode(node);
     901           0 :             node = nodeFromWeight16(weight16) | nodeFromStrength(level);
     902           0 :             index = insertNodeBetween(index, nextIndex, node, errorCode);
     903             :             // Insert common-weight node.
     904           0 :             insertNodeBetween(index, nextIndex, commonNode, errorCode);
     905             :             // Return index of below-common-weight node.
     906           0 :             return index;
     907             :         }
     908             :     }
     909             : 
     910             :     // Find the root CE's weight for this level.
     911             :     // Postpone insertion if not found:
     912             :     // Insert the new root node before the next stronger node,
     913             :     // or before the next root node with the same strength and a larger weight.
     914             :     int32_t nextIndex;
     915           0 :     while((nextIndex = nextIndexFromNode(node)) != 0) {
     916           0 :         node = nodes.elementAti(nextIndex);
     917           0 :         int32_t nextStrength = strengthFromNode(node);
     918           0 :         if(nextStrength <= level) {
     919             :             // Insert before a stronger node.
     920           0 :             if(nextStrength < level) { break; }
     921             :             // nextStrength == level
     922           0 :             if(!isTailoredNode(node)) {
     923           0 :                 uint32_t nextWeight16 = weight16FromNode(node);
     924           0 :                 if(nextWeight16 == weight16) {
     925             :                     // Found the node for the root CE up to this level.
     926           0 :                     return nextIndex;
     927             :                 }
     928             :                 // Insert before a node with a larger same-strength weight.
     929           0 :                 if(nextWeight16 > weight16) { break; }
     930             :             }
     931             :         }
     932             :         // Skip the next node.
     933           0 :         index = nextIndex;
     934             :     }
     935           0 :     node = nodeFromWeight16(weight16) | nodeFromStrength(level);
     936           0 :     return insertNodeBetween(index, nextIndex, node, errorCode);
     937             : }
     938             : 
     939             : int32_t
     940           0 : CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {
     941           0 :     if(U_FAILURE(errorCode)) { return 0; }
     942           0 :     U_ASSERT(0 <= index && index < nodes.size());
     943           0 :     if(strength >= UCOL_SECONDARY) {
     944           0 :         index = findCommonNode(index, UCOL_SECONDARY);
     945           0 :         if(strength >= UCOL_TERTIARY) {
     946           0 :             index = findCommonNode(index, UCOL_TERTIARY);
     947             :         }
     948             :     }
     949             :     // Postpone insertion:
     950             :     // Insert the new node before the next one with a strength at least as strong.
     951           0 :     int64_t node = nodes.elementAti(index);
     952             :     int32_t nextIndex;
     953           0 :     while((nextIndex = nextIndexFromNode(node)) != 0) {
     954           0 :         node = nodes.elementAti(nextIndex);
     955           0 :         if(strengthFromNode(node) <= strength) { break; }
     956             :         // Skip the next node which has a weaker (larger) strength than the new one.
     957           0 :         index = nextIndex;
     958             :     }
     959           0 :     node = IS_TAILORED | nodeFromStrength(strength);
     960           0 :     return insertNodeBetween(index, nextIndex, node, errorCode);
     961             : }
     962             : 
     963             : int32_t
     964           0 : CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,
     965             :                                     UErrorCode &errorCode) {
     966           0 :     if(U_FAILURE(errorCode)) { return 0; }
     967           0 :     U_ASSERT(previousIndexFromNode(node) == 0);
     968           0 :     U_ASSERT(nextIndexFromNode(node) == 0);
     969           0 :     U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
     970             :     // Append the new node and link it to the existing nodes.
     971           0 :     int32_t newIndex = nodes.size();
     972           0 :     node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
     973           0 :     nodes.addElement(node, errorCode);
     974           0 :     if(U_FAILURE(errorCode)) { return 0; }
     975             :     // nodes[index].nextIndex = newIndex
     976           0 :     node = nodes.elementAti(index);
     977           0 :     nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
     978             :     // nodes[nextIndex].previousIndex = newIndex
     979           0 :     if(nextIndex != 0) {
     980           0 :         node = nodes.elementAti(nextIndex);
     981           0 :         nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
     982             :     }
     983           0 :     return newIndex;
     984             : }
     985             : 
     986             : int32_t
     987           0 : CollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
     988           0 :     U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);
     989           0 :     int64_t node = nodes.elementAti(index);
     990           0 :     if(strengthFromNode(node) >= strength) {
     991             :         // The current node is no stronger.
     992           0 :         return index;
     993             :     }
     994           0 :     if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
     995             :         // The current node implies the strength-common weight.
     996           0 :         return index;
     997             :     }
     998           0 :     index = nextIndexFromNode(node);
     999           0 :     node = nodes.elementAti(index);
    1000           0 :     U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
    1001             :             weight16FromNode(node) < Collation::COMMON_WEIGHT16);
    1002             :     // Skip to the explicit common node.
    1003           0 :     do {
    1004           0 :         index = nextIndexFromNode(node);
    1005           0 :         node = nodes.elementAti(index);
    1006           0 :         U_ASSERT(strengthFromNode(node) >= strength);
    1007           0 :     } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
    1008           0 :             weight16FromNode(node) < Collation::COMMON_WEIGHT16);
    1009           0 :     U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
    1010           0 :     return index;
    1011             : }
    1012             : 
    1013             : void
    1014           0 : CollationBuilder::setCaseBits(const UnicodeString &nfdString,
    1015             :                               const char *&parserErrorReason, UErrorCode &errorCode) {
    1016           0 :     if(U_FAILURE(errorCode)) { return; }
    1017           0 :     int32_t numTailoredPrimaries = 0;
    1018           0 :     for(int32_t i = 0; i < cesLength; ++i) {
    1019           0 :         if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }
    1020             :     }
    1021             :     // We should not be able to get too many case bits because
    1022             :     // cesLength<=31==MAX_EXPANSION_LENGTH.
    1023             :     // 31 pairs of case bits fit into an int64_t without setting its sign bit.
    1024           0 :     U_ASSERT(numTailoredPrimaries <= 31);
    1025             : 
    1026           0 :     int64_t cases = 0;
    1027           0 :     if(numTailoredPrimaries > 0) {
    1028           0 :         const UChar *s = nfdString.getBuffer();
    1029           0 :         UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length());
    1030           0 :         int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;
    1031           0 :         if(U_FAILURE(errorCode)) {
    1032           0 :             parserErrorReason = "fetching root CEs for tailored string";
    1033           0 :             return;
    1034             :         }
    1035           0 :         U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);
    1036             : 
    1037           0 :         uint32_t lastCase = 0;
    1038           0 :         int32_t numBasePrimaries = 0;
    1039           0 :         for(int32_t i = 0; i < baseCEsLength; ++i) {
    1040           0 :             int64_t ce = baseCEs.getCE(i);
    1041           0 :             if((ce >> 32) != 0) {
    1042           0 :                 ++numBasePrimaries;
    1043           0 :                 uint32_t c = ((uint32_t)ce >> 14) & 3;
    1044           0 :                 U_ASSERT(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
    1045           0 :                 if(numBasePrimaries < numTailoredPrimaries) {
    1046           0 :                     cases |= (int64_t)c << ((numBasePrimaries - 1) * 2);
    1047           0 :                 } else if(numBasePrimaries == numTailoredPrimaries) {
    1048           0 :                     lastCase = c;
    1049           0 :                 } else if(c != lastCase) {
    1050             :                     // There are more base primary CEs than tailored primaries.
    1051             :                     // Set mixed case if the case bits of the remainder differ.
    1052           0 :                     lastCase = 1;
    1053             :                     // Nothing more can change.
    1054           0 :                     break;
    1055             :                 }
    1056             :             }
    1057             :         }
    1058           0 :         if(numBasePrimaries >= numTailoredPrimaries) {
    1059           0 :             cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2);
    1060             :         }
    1061             :     }
    1062             : 
    1063           0 :     for(int32_t i = 0; i < cesLength; ++i) {
    1064           0 :         int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff);  // clear old case bits
    1065           0 :         int32_t strength = ceStrength(ce);
    1066           0 :         if(strength == UCOL_PRIMARY) {
    1067           0 :             ce |= (cases & 3) << 14;
    1068           0 :             cases >>= 2;
    1069           0 :         } else if(strength == UCOL_TERTIARY) {
    1070             :             // Tertiary CEs must have uppercase bits.
    1071             :             // See the LDML spec, and comments in class CollationCompare.
    1072           0 :             ce |= 0x8000;
    1073             :         }
    1074             :         // Tertiary ignorable CEs must have 0 case bits.
    1075             :         // We set 0 case bits for secondary CEs too
    1076             :         // since currently only U+0345 is cased and maps to a secondary CE,
    1077             :         // and it is lowercase. Other secondaries are uncased.
    1078             :         // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
    1079           0 :         ces[i] = ce;
    1080             :     }
    1081             : }
    1082             : 
    1083             : void
    1084           0 : CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,
    1085             :                                        UErrorCode &errorCode) {
    1086           0 :     if(U_FAILURE(errorCode)) { return; }
    1087           0 :     dataBuilder->suppressContractions(set, errorCode);
    1088           0 :     if(U_FAILURE(errorCode)) {
    1089           0 :         parserErrorReason = "application of [suppressContractions [set]] failed";
    1090             :     }
    1091             : }
    1092             : 
    1093             : void
    1094           0 : CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,
    1095             :                            UErrorCode &errorCode) {
    1096           0 :     if(U_FAILURE(errorCode)) { return; }
    1097           0 :     optimizeSet.addAll(set);
    1098             : }
    1099             : 
    1100             : uint32_t
    1101           0 : CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
    1102             :                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
    1103             :                                  UErrorCode &errorCode) {
    1104             :     // Map from the NFD input to the CEs.
    1105           0 :     ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
    1106           0 :     ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
    1107           0 :     addTailComposites(nfdPrefix, nfdString, errorCode);
    1108           0 :     return ce32;
    1109             : }
    1110             : 
    1111             : uint32_t
    1112           0 : CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
    1113             :                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
    1114             :                                  UErrorCode &errorCode) {
    1115           0 :     if(U_FAILURE(errorCode)) { return ce32; }
    1116             : 
    1117             :     // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
    1118           0 :     if(nfdPrefix.isEmpty()) {
    1119           0 :         CanonicalIterator stringIter(nfdString, errorCode);
    1120           0 :         if(U_FAILURE(errorCode)) { return ce32; }
    1121           0 :         UnicodeString prefix;
    1122             :         for(;;) {
    1123           0 :             UnicodeString str = stringIter.next();
    1124           0 :             if(str.isBogus()) { break; }
    1125           0 :             if(ignoreString(str, errorCode) || str == nfdString) { continue; }
    1126           0 :             ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
    1127           0 :             if(U_FAILURE(errorCode)) { return ce32; }
    1128           0 :         }
    1129             :     } else {
    1130           0 :         CanonicalIterator prefixIter(nfdPrefix, errorCode);
    1131           0 :         CanonicalIterator stringIter(nfdString, errorCode);
    1132           0 :         if(U_FAILURE(errorCode)) { return ce32; }
    1133             :         for(;;) {
    1134           0 :             UnicodeString prefix = prefixIter.next();
    1135           0 :             if(prefix.isBogus()) { break; }
    1136           0 :             if(ignorePrefix(prefix, errorCode)) { continue; }
    1137           0 :             UBool samePrefix = prefix == nfdPrefix;
    1138             :             for(;;) {
    1139           0 :                 UnicodeString str = stringIter.next();
    1140           0 :                 if(str.isBogus()) { break; }
    1141           0 :                 if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }
    1142           0 :                 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
    1143           0 :                 if(U_FAILURE(errorCode)) { return ce32; }
    1144           0 :             }
    1145           0 :             stringIter.reset();
    1146           0 :         }
    1147             :     }
    1148           0 :     return ce32;
    1149             : }
    1150             : 
    1151             : void
    1152           0 : CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
    1153             :                                     UErrorCode &errorCode) {
    1154           0 :     if(U_FAILURE(errorCode)) { return; }
    1155             : 
    1156             :     // Look for the last starter in the NFD string.
    1157             :     UChar32 lastStarter;
    1158           0 :     int32_t indexAfterLastStarter = nfdString.length();
    1159             :     for(;;) {
    1160           0 :         if(indexAfterLastStarter == 0) { return; }  // no starter at all
    1161           0 :         lastStarter = nfdString.char32At(indexAfterLastStarter - 1);
    1162           0 :         if(nfd.getCombiningClass(lastStarter) == 0) { break; }
    1163           0 :         indexAfterLastStarter -= U16_LENGTH(lastStarter);
    1164             :     }
    1165             :     // No closure to Hangul syllables since we decompose them on the fly.
    1166           0 :     if(Hangul::isJamoL(lastStarter)) { return; }
    1167             : 
    1168             :     // Are there any composites whose decomposition starts with the lastStarter?
    1169             :     // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
    1170             :     // We might find some more equivalent mappings here if it did.
    1171           0 :     UnicodeSet composites;
    1172           0 :     if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
    1173             : 
    1174           0 :     UnicodeString decomp;
    1175           0 :     UnicodeString newNFDString, newString;
    1176             :     int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];
    1177           0 :     UnicodeSetIterator iter(composites);
    1178           0 :     while(iter.next()) {
    1179           0 :         U_ASSERT(!iter.isString());
    1180           0 :         UChar32 composite = iter.getCodepoint();
    1181           0 :         nfd.getDecomposition(composite, decomp);
    1182           0 :         if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
    1183             :                                      newNFDString, newString, errorCode)) {
    1184           0 :             continue;
    1185             :         }
    1186           0 :         int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);
    1187           0 :         if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {
    1188             :             // Ignore mappings that we cannot store.
    1189           0 :             continue;
    1190             :         }
    1191             :         // Note: It is possible that the newCEs do not make use of the mapping
    1192             :         // for which we are adding the tail composites, in which case we might be adding
    1193             :         // unnecessary mappings.
    1194             :         // For example, when we add tail composites for ae^ (^=combining circumflex),
    1195             :         // UCA discontiguous-contraction matching does not find any matches
    1196             :         // for ae_^ (_=any combining diacritic below) *unless* there is also
    1197             :         // a contraction mapping for ae.
    1198             :         // Thus, if there is no ae contraction, then the ae^ mapping is ignored
    1199             :         // while fetching the newCEs for ae_^.
    1200             :         // TODO: Try to detect this effectively.
    1201             :         // (Alternatively, print a warning when prefix contractions are missing.)
    1202             : 
    1203             :         // We do not need an explicit mapping for the NFD strings.
    1204             :         // It is fine if the NFD input collates like this via a sequence of mappings.
    1205             :         // It also saves a little bit of space, and may reduce the set of characters with contractions.
    1206             :         uint32_t ce32 = addIfDifferent(nfdPrefix, newString,
    1207           0 :                                        newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);
    1208           0 :         if(ce32 != Collation::UNASSIGNED_CE32) {
    1209             :             // was different, was added
    1210           0 :             addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);
    1211             :         }
    1212             :     }
    1213             : }
    1214             : 
    1215             : UBool
    1216           0 : CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
    1217             :                                            int32_t indexAfterLastStarter,
    1218             :                                            UChar32 composite, const UnicodeString &decomp,
    1219             :                                            UnicodeString &newNFDString, UnicodeString &newString,
    1220             :                                            UErrorCode &errorCode) const {
    1221           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
    1222           0 :     U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
    1223           0 :     int32_t lastStarterLength = decomp.moveIndex32(0, 1);
    1224           0 :     if(lastStarterLength == decomp.length()) {
    1225             :         // Singleton decompositions should be found by addWithClosure()
    1226             :         // and the CanonicalIterator, so we can ignore them here.
    1227           0 :         return FALSE;
    1228             :     }
    1229           0 :     if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
    1230             :                          decomp, lastStarterLength, 0x7fffffff) == 0) {
    1231             :         // same strings, nothing new to be found here
    1232           0 :         return FALSE;
    1233             :     }
    1234             : 
    1235             :     // Make new FCD strings that combine a composite, or its decomposition,
    1236             :     // into the nfdString's last starter and the combining marks following it.
    1237             :     // Make an NFD version, and a version with the composite.
    1238           0 :     newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
    1239           0 :     newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);
    1240             : 
    1241             :     // The following is related to discontiguous contraction matching,
    1242             :     // but builds only FCD strings (or else returns FALSE).
    1243           0 :     int32_t sourceIndex = indexAfterLastStarter;
    1244           0 :     int32_t decompIndex = lastStarterLength;
    1245             :     // Small optimization: We keep the source character across loop iterations
    1246             :     // because we do not always consume it,
    1247             :     // and then need not fetch it again nor look up its combining class again.
    1248           0 :     UChar32 sourceChar = U_SENTINEL;
    1249             :     // The cc variables need to be declared before the loop so that at the end
    1250             :     // they are set to the last combining classes seen.
    1251           0 :     uint8_t sourceCC = 0;
    1252           0 :     uint8_t decompCC = 0;
    1253             :     for(;;) {
    1254           0 :         if(sourceChar < 0) {
    1255           0 :             if(sourceIndex >= nfdString.length()) { break; }
    1256           0 :             sourceChar = nfdString.char32At(sourceIndex);
    1257           0 :             sourceCC = nfd.getCombiningClass(sourceChar);
    1258           0 :             U_ASSERT(sourceCC != 0);
    1259             :         }
    1260             :         // We consume a decomposition character in each iteration.
    1261           0 :         if(decompIndex >= decomp.length()) { break; }
    1262           0 :         UChar32 decompChar = decomp.char32At(decompIndex);
    1263           0 :         decompCC = nfd.getCombiningClass(decompChar);
    1264             :         // Compare the two characters and their combining classes.
    1265           0 :         if(decompCC == 0) {
    1266             :             // Unable to merge because the source contains a non-zero combining mark
    1267             :             // but the composite's decomposition contains another starter.
    1268             :             // The strings would not be equivalent.
    1269           0 :             return FALSE;
    1270           0 :         } else if(sourceCC < decompCC) {
    1271             :             // Composite + sourceChar would not be FCD.
    1272           0 :             return FALSE;
    1273           0 :         } else if(decompCC < sourceCC) {
    1274           0 :             newNFDString.append(decompChar);
    1275           0 :             decompIndex += U16_LENGTH(decompChar);
    1276           0 :         } else if(decompChar != sourceChar) {
    1277             :             // Blocked because same combining class.
    1278           0 :             return FALSE;
    1279             :         } else {  // match: decompChar == sourceChar
    1280           0 :             newNFDString.append(decompChar);
    1281           0 :             decompIndex += U16_LENGTH(decompChar);
    1282           0 :             sourceIndex += U16_LENGTH(decompChar);
    1283           0 :             sourceChar = U_SENTINEL;
    1284             :         }
    1285           0 :     }
    1286             :     // We are at the end of at least one of the two inputs.
    1287           0 :     if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
    1288           0 :         if(sourceCC < decompCC) {
    1289             :             // Appending the next source character to the composite would not be FCD.
    1290           0 :             return FALSE;
    1291             :         }
    1292           0 :         newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
    1293           0 :         newString.append(nfdString, sourceIndex, 0x7fffffff);
    1294           0 :     } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
    1295           0 :         newNFDString.append(decomp, decompIndex, 0x7fffffff);
    1296             :     }
    1297           0 :     U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
    1298           0 :     U_ASSERT(fcd.isNormalized(newString, errorCode));
    1299           0 :     U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
    1300           0 :     return TRUE;
    1301             : }
    1302             : 
    1303             : UBool
    1304           0 : CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
    1305             :     // Do not map non-FCD prefixes.
    1306           0 :     return !isFCD(s, errorCode);
    1307             : }
    1308             : 
    1309             : UBool
    1310           0 : CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
    1311             :     // Do not map non-FCD strings.
    1312             :     // Do not map strings that start with Hangul syllables: We decompose those on the fly.
    1313           0 :     return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));
    1314             : }
    1315             : 
    1316             : UBool
    1317           0 : CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {
    1318           0 :     return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);
    1319             : }
    1320             : 
    1321             : void
    1322           0 : CollationBuilder::closeOverComposites(UErrorCode &errorCode) {
    1323           0 :     UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
    1324           0 :     if(U_FAILURE(errorCode)) { return; }
    1325             :     // Hangul is decomposed on the fly during collation.
    1326           0 :     composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
    1327           0 :     UnicodeString prefix;  // empty
    1328           0 :     UnicodeString nfdString;
    1329           0 :     UnicodeSetIterator iter(composites);
    1330           0 :     while(iter.next()) {
    1331           0 :         U_ASSERT(!iter.isString());
    1332           0 :         nfd.getDecomposition(iter.getCodepoint(), nfdString);
    1333           0 :         cesLength = dataBuilder->getCEs(nfdString, ces, 0);
    1334           0 :         if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
    1335             :             // Too many CEs from the decomposition (unusual), ignore this composite.
    1336             :             // We could add a capacity parameter to getCEs() and reallocate if necessary.
    1337             :             // However, this can only really happen in contrived cases.
    1338           0 :             continue;
    1339             :         }
    1340           0 :         const UnicodeString &composite(iter.getString());
    1341           0 :         addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);
    1342             :     }
    1343             : }
    1344             : 
    1345             : uint32_t
    1346           0 : CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,
    1347             :                                  const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
    1348             :                                  UErrorCode &errorCode) {
    1349           0 :     if(U_FAILURE(errorCode)) { return ce32; }
    1350             :     int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];
    1351           0 :     int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);
    1352           0 :     if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
    1353           0 :         if(ce32 == Collation::UNASSIGNED_CE32) {
    1354           0 :             ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);
    1355             :         }
    1356           0 :         dataBuilder->addCE32(prefix, str, ce32, errorCode);
    1357             :     }
    1358           0 :     return ce32;
    1359             : }
    1360             : 
    1361             : UBool
    1362           0 : CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,
    1363             :                           const int64_t ces2[], int32_t ces2Length) {
    1364           0 :     if(ces1Length != ces2Length) {
    1365           0 :         return FALSE;
    1366             :     }
    1367           0 :     U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);
    1368           0 :     for(int32_t i = 0; i < ces1Length; ++i) {
    1369           0 :         if(ces1[i] != ces2[i]) { return FALSE; }
    1370             :     }
    1371           0 :     return TRUE;
    1372             : }
    1373             : 
    1374             : #ifdef DEBUG_COLLATION_BUILDER
    1375             : 
    1376             : uint32_t
    1377             : alignWeightRight(uint32_t w) {
    1378             :     if(w != 0) {
    1379             :         while((w & 0xff) == 0) { w >>= 8; }
    1380             :     }
    1381             :     return w;
    1382             : }
    1383             : 
    1384             : #endif
    1385             : 
    1386             : void
    1387           0 : CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
    1388           0 :     if(U_FAILURE(errorCode)) { return; }
    1389             : 
    1390           0 :     CollationWeights primaries, secondaries, tertiaries;
    1391           0 :     int64_t *nodesArray = nodes.getBuffer();
    1392             : #ifdef DEBUG_COLLATION_BUILDER
    1393             :         puts("\nCollationBuilder::makeTailoredCEs()");
    1394             : #endif
    1395             : 
    1396           0 :     for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
    1397           0 :         int32_t i = rootPrimaryIndexes.elementAti(rpi);
    1398           0 :         int64_t node = nodesArray[i];
    1399           0 :         uint32_t p = weight32FromNode(node);
    1400           0 :         uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;
    1401           0 :         uint32_t t = s;
    1402           0 :         uint32_t q = 0;
    1403           0 :         UBool pIsTailored = FALSE;
    1404           0 :         UBool sIsTailored = FALSE;
    1405           0 :         UBool tIsTailored = FALSE;
    1406             : #ifdef DEBUG_COLLATION_BUILDER
    1407             :         printf("\nprimary     %lx\n", (long)alignWeightRight(p));
    1408             : #endif
    1409           0 :         int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
    1410           0 :         int32_t nextIndex = nextIndexFromNode(node);
    1411           0 :         while(nextIndex != 0) {
    1412           0 :             i = nextIndex;
    1413           0 :             node = nodesArray[i];
    1414           0 :             nextIndex = nextIndexFromNode(node);
    1415           0 :             int32_t strength = strengthFromNode(node);
    1416           0 :             if(strength == UCOL_QUATERNARY) {
    1417           0 :                 U_ASSERT(isTailoredNode(node));
    1418             : #ifdef DEBUG_COLLATION_BUILDER
    1419             :                 printf("      quat+     ");
    1420             : #endif
    1421           0 :                 if(q == 3) {
    1422           0 :                     errorCode = U_BUFFER_OVERFLOW_ERROR;
    1423           0 :                     errorReason = "quaternary tailoring gap too small";
    1424           0 :                     return;
    1425             :                 }
    1426           0 :                 ++q;
    1427             :             } else {
    1428           0 :                 if(strength == UCOL_TERTIARY) {
    1429           0 :                     if(isTailoredNode(node)) {
    1430             : #ifdef DEBUG_COLLATION_BUILDER
    1431             :                         printf("    ter+        ");
    1432             : #endif
    1433           0 :                         if(!tIsTailored) {
    1434             :                             // First tailored tertiary node for [p, s].
    1435           0 :                             int32_t tCount = countTailoredNodes(nodesArray, nextIndex,
    1436           0 :                                                                 UCOL_TERTIARY) + 1;
    1437             :                             uint32_t tLimit;
    1438           0 :                             if(t == 0) {
    1439             :                                 // Gap at the beginning of the tertiary CE range.
    1440           0 :                                 t = rootElements.getTertiaryBoundary() - 0x100;
    1441           0 :                                 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
    1442           0 :                             } else if(!pIsTailored && !sIsTailored) {
    1443             :                                 // p and s are root weights.
    1444           0 :                                 tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
    1445           0 :                             } else if(t == Collation::BEFORE_WEIGHT16) {
    1446           0 :                                 tLimit = Collation::COMMON_WEIGHT16;
    1447             :                             } else {
    1448             :                                 // [p, s] is tailored.
    1449           0 :                                 U_ASSERT(t == Collation::COMMON_WEIGHT16);
    1450           0 :                                 tLimit = rootElements.getTertiaryBoundary();
    1451             :                             }
    1452           0 :                             U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);
    1453           0 :                             tertiaries.initForTertiary();
    1454           0 :                             if(!tertiaries.allocWeights(t, tLimit, tCount)) {
    1455           0 :                                 errorCode = U_BUFFER_OVERFLOW_ERROR;
    1456           0 :                                 errorReason = "tertiary tailoring gap too small";
    1457           0 :                                 return;
    1458             :                             }
    1459           0 :                             tIsTailored = TRUE;
    1460             :                         }
    1461           0 :                         t = tertiaries.nextWeight();
    1462           0 :                         U_ASSERT(t != 0xffffffff);
    1463             :                     } else {
    1464           0 :                         t = weight16FromNode(node);
    1465           0 :                         tIsTailored = FALSE;
    1466             : #ifdef DEBUG_COLLATION_BUILDER
    1467             :                         printf("    ter     %lx\n", (long)alignWeightRight(t));
    1468             : #endif
    1469             :                     }
    1470             :                 } else {
    1471           0 :                     if(strength == UCOL_SECONDARY) {
    1472           0 :                         if(isTailoredNode(node)) {
    1473             : #ifdef DEBUG_COLLATION_BUILDER
    1474             :                             printf("  sec+          ");
    1475             : #endif
    1476           0 :                             if(!sIsTailored) {
    1477             :                                 // First tailored secondary node for p.
    1478           0 :                                 int32_t sCount = countTailoredNodes(nodesArray, nextIndex,
    1479           0 :                                                                     UCOL_SECONDARY) + 1;
    1480             :                                 uint32_t sLimit;
    1481           0 :                                 if(s == 0) {
    1482             :                                     // Gap at the beginning of the secondary CE range.
    1483           0 :                                     s = rootElements.getSecondaryBoundary() - 0x100;
    1484           0 :                                     sLimit = rootElements.getFirstSecondaryCE() >> 16;
    1485           0 :                                 } else if(!pIsTailored) {
    1486             :                                     // p is a root primary.
    1487           0 :                                     sLimit = rootElements.getSecondaryAfter(pIndex, s);
    1488           0 :                                 } else if(s == Collation::BEFORE_WEIGHT16) {
    1489           0 :                                     sLimit = Collation::COMMON_WEIGHT16;
    1490             :                                 } else {
    1491             :                                     // p is a tailored primary.
    1492           0 :                                     U_ASSERT(s == Collation::COMMON_WEIGHT16);
    1493           0 :                                     sLimit = rootElements.getSecondaryBoundary();
    1494             :                                 }
    1495           0 :                                 if(s == Collation::COMMON_WEIGHT16) {
    1496             :                                     // Do not tailor into the getSortKey() range of
    1497             :                                     // compressed common secondaries.
    1498           0 :                                     s = rootElements.getLastCommonSecondary();
    1499             :                                 }
    1500           0 :                                 secondaries.initForSecondary();
    1501           0 :                                 if(!secondaries.allocWeights(s, sLimit, sCount)) {
    1502           0 :                                     errorCode = U_BUFFER_OVERFLOW_ERROR;
    1503           0 :                                     errorReason = "secondary tailoring gap too small";
    1504             : #ifdef DEBUG_COLLATION_BUILDER
    1505             :                                     printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",
    1506             :                                            (long)alignWeightRight(s), (long)alignWeightRight(sLimit),
    1507             :                                            (long)alignWeightRight(sCount));
    1508             : #endif
    1509           0 :                                     return;
    1510             :                                 }
    1511           0 :                                 sIsTailored = TRUE;
    1512             :                             }
    1513           0 :                             s = secondaries.nextWeight();
    1514           0 :                             U_ASSERT(s != 0xffffffff);
    1515             :                         } else {
    1516           0 :                             s = weight16FromNode(node);
    1517           0 :                             sIsTailored = FALSE;
    1518             : #ifdef DEBUG_COLLATION_BUILDER
    1519             :                             printf("  sec       %lx\n", (long)alignWeightRight(s));
    1520             : #endif
    1521             :                         }
    1522             :                     } else /* UCOL_PRIMARY */ {
    1523           0 :                         U_ASSERT(isTailoredNode(node));
    1524             : #ifdef DEBUG_COLLATION_BUILDER
    1525             :                         printf("pri+            ");
    1526             : #endif
    1527           0 :                         if(!pIsTailored) {
    1528             :                             // First tailored primary node in this list.
    1529           0 :                             int32_t pCount = countTailoredNodes(nodesArray, nextIndex,
    1530           0 :                                                                 UCOL_PRIMARY) + 1;
    1531           0 :                             UBool isCompressible = baseData->isCompressiblePrimary(p);
    1532             :                             uint32_t pLimit =
    1533           0 :                                 rootElements.getPrimaryAfter(p, pIndex, isCompressible);
    1534           0 :                             primaries.initForPrimary(isCompressible);
    1535           0 :                             if(!primaries.allocWeights(p, pLimit, pCount)) {
    1536           0 :                                 errorCode = U_BUFFER_OVERFLOW_ERROR;  // TODO: introduce a more specific UErrorCode?
    1537           0 :                                 errorReason = "primary tailoring gap too small";
    1538           0 :                                 return;
    1539             :                             }
    1540           0 :                             pIsTailored = TRUE;
    1541             :                         }
    1542           0 :                         p = primaries.nextWeight();
    1543           0 :                         U_ASSERT(p != 0xffffffff);
    1544           0 :                         s = Collation::COMMON_WEIGHT16;
    1545           0 :                         sIsTailored = FALSE;
    1546             :                     }
    1547           0 :                     t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;
    1548           0 :                     tIsTailored = FALSE;
    1549             :                 }
    1550           0 :                 q = 0;
    1551             :             }
    1552           0 :             if(isTailoredNode(node)) {
    1553           0 :                 nodesArray[i] = Collation::makeCE(p, s, t, q);
    1554             : #ifdef DEBUG_COLLATION_BUILDER
    1555             :                 printf("%016llx\n", (long long)nodesArray[i]);
    1556             : #endif
    1557             :             }
    1558             :         }
    1559             :     }
    1560             : }
    1561             : 
    1562             : int32_t
    1563           0 : CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {
    1564           0 :     int32_t count = 0;
    1565             :     for(;;) {
    1566           0 :         if(i == 0) { break; }
    1567           0 :         int64_t node = nodesArray[i];
    1568           0 :         if(strengthFromNode(node) < strength) { break; }
    1569           0 :         if(strengthFromNode(node) == strength) {
    1570           0 :             if(isTailoredNode(node)) {
    1571           0 :                 ++count;
    1572             :             } else {
    1573           0 :                 break;
    1574             :             }
    1575             :         }
    1576           0 :         i = nextIndexFromNode(node);
    1577           0 :     }
    1578           0 :     return count;
    1579             : }
    1580             : 
    1581             : class CEFinalizer : public CollationDataBuilder::CEModifier {
    1582             : public:
    1583           0 :     CEFinalizer(const int64_t *ces) : finalCEs(ces) {}
    1584             :     virtual ~CEFinalizer();
    1585           0 :     virtual int64_t modifyCE32(uint32_t ce32) const {
    1586           0 :         U_ASSERT(!Collation::isSpecialCE32(ce32));
    1587           0 :         if(CollationBuilder::isTempCE32(ce32)) {
    1588             :             // retain case bits
    1589           0 :             return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
    1590             :         } else {
    1591           0 :             return Collation::NO_CE;
    1592             :         }
    1593             :     }
    1594           0 :     virtual int64_t modifyCE(int64_t ce) const {
    1595           0 :         if(CollationBuilder::isTempCE(ce)) {
    1596             :             // retain case bits
    1597           0 :             return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);
    1598             :         } else {
    1599           0 :             return Collation::NO_CE;
    1600             :         }
    1601             :     }
    1602             : 
    1603             : private:
    1604             :     const int64_t *finalCEs;
    1605             : };
    1606             : 
    1607           0 : CEFinalizer::~CEFinalizer() {}
    1608             : 
    1609             : void
    1610           0 : CollationBuilder::finalizeCEs(UErrorCode &errorCode) {
    1611           0 :     if(U_FAILURE(errorCode)) { return; }
    1612           0 :     LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode);
    1613           0 :     if(U_FAILURE(errorCode)) {
    1614           0 :         return;
    1615             :     }
    1616           0 :     newBuilder->initForTailoring(baseData, errorCode);
    1617           0 :     CEFinalizer finalizer(nodes.getBuffer());
    1618           0 :     newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);
    1619           0 :     if(U_FAILURE(errorCode)) { return; }
    1620           0 :     delete dataBuilder;
    1621           0 :     dataBuilder = newBuilder.orphan();
    1622             : }
    1623             : 
    1624             : int32_t
    1625           0 : CollationBuilder::ceStrength(int64_t ce) {
    1626             :     return
    1627           0 :         isTempCE(ce) ? strengthFromTempCE(ce) :
    1628           0 :         (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :
    1629           0 :         ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY :
    1630             :         ce != 0 ? UCOL_TERTIARY :
    1631           0 :         UCOL_IDENTICAL;
    1632             : }
    1633             : 
    1634             : U_NAMESPACE_END
    1635             : 
    1636             : U_NAMESPACE_USE
    1637             : 
    1638             : U_CAPI UCollator * U_EXPORT2
    1639           0 : ucol_openRules(const UChar *rules, int32_t rulesLength,
    1640             :                UColAttributeValue normalizationMode, UCollationStrength strength,
    1641             :                UParseError *parseError, UErrorCode *pErrorCode) {
    1642           0 :     if(U_FAILURE(*pErrorCode)) { return NULL; }
    1643           0 :     if(rules == NULL && rulesLength != 0) {
    1644           0 :         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    1645           0 :         return NULL;
    1646             :     }
    1647           0 :     RuleBasedCollator *coll = new RuleBasedCollator();
    1648           0 :     if(coll == NULL) {
    1649           0 :         *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
    1650           0 :         return NULL;
    1651             :     }
    1652           0 :     UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength);
    1653           0 :     coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode);
    1654           0 :     if(U_FAILURE(*pErrorCode)) {
    1655           0 :         delete coll;
    1656           0 :         return NULL;
    1657             :     }
    1658           0 :     return coll->toUCollator();
    1659             : }
    1660             : 
    1661             : static const int32_t internalBufferSize = 512;
    1662             : 
    1663             : // The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp
    1664             : // because it calls UnicodeSet "builder" code that depends on all Unicode properties,
    1665             : // and the rest of the collation "runtime" code only depends on normalization.
    1666             : // This function is not related to the collation builder,
    1667             : // but it did not seem worth moving it into its own .cpp file,
    1668             : // nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.
    1669             : U_CAPI int32_t U_EXPORT2
    1670           0 : ucol_getUnsafeSet( const UCollator *coll,
    1671             :                   USet *unsafe,
    1672             :                   UErrorCode *status)
    1673             : {
    1674             :     UChar buffer[internalBufferSize];
    1675           0 :     int32_t len = 0;
    1676             : 
    1677           0 :     uset_clear(unsafe);
    1678             : 
    1679             :     // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
    1680             :     static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
    1681             :                                     0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
    1682             : 
    1683             :     // add chars that fail the fcd check
    1684           0 :     uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
    1685             : 
    1686             :     // add lead/trail surrogates
    1687             :     // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,
    1688             :     // not when testing code *points*)
    1689           0 :     uset_addRange(unsafe, 0xd800, 0xdfff);
    1690             : 
    1691           0 :     USet *contractions = uset_open(0,0);
    1692             : 
    1693           0 :     int32_t i = 0, j = 0;
    1694           0 :     ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
    1695           0 :     int32_t contsSize = uset_size(contractions);
    1696           0 :     UChar32 c = 0;
    1697             :     // Contraction set consists only of strings
    1698             :     // to get unsafe code points, we need to
    1699             :     // break the strings apart and add them to the unsafe set
    1700           0 :     for(i = 0; i < contsSize; i++) {
    1701           0 :         len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
    1702           0 :         if(len > 0) {
    1703           0 :             j = 0;
    1704           0 :             while(j < len) {
    1705           0 :                 U16_NEXT(buffer, j, len, c);
    1706           0 :                 if(j < len) {
    1707           0 :                     uset_add(unsafe, c);
    1708             :                 }
    1709             :             }
    1710             :         }
    1711             :     }
    1712             : 
    1713           0 :     uset_close(contractions);
    1714             : 
    1715           0 :     return uset_size(unsafe);
    1716             : }
    1717             : 
    1718             : #endif  // !UCONFIG_NO_COLLATION

Generated by: LCOV version 1.13