LCOV - output.info - intl/icu/source/i18n/collationfastlatinbuilder.cpp

LCOV - code coverage report

Current view:	top level - intl/icu/source/i18n - collationfastlatinbuilder.cpp (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	383	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	19	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : * Copyright (C) 2013-2015, International Business Machines
       6             : * Corporation and others.  All Rights Reserved.
       7             : *******************************************************************************
       8             : * collationfastlatinbuilder.cpp
       9             : *
      10             : * created on: 2013aug09
      11             : * created by: Markus W. Scherer
      12             : */
      13             : 
      14             : #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0  // 0 or 1 or 2
      15             : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
      16             : #include <stdio.h>
      17             : #include <string>
      18             : #endif
      19             : 
      20             : #include "unicode/utypes.h"
      21             : 
      22             : #if !UCONFIG_NO_COLLATION
      23             : 
      24             : #include "unicode/ucol.h"
      25             : #include "unicode/ucharstrie.h"
      26             : #include "unicode/unistr.h"
      27             : #include "unicode/uobject.h"
      28             : #include "unicode/uscript.h"
      29             : #include "cmemory.h"
      30             : #include "collation.h"
      31             : #include "collationdata.h"
      32             : #include "collationfastlatin.h"
      33             : #include "collationfastlatinbuilder.h"
      34             : #include "uassert.h"
      35             : #include "uvectr64.h"
      36             : 
      37             : U_NAMESPACE_BEGIN
      38             : 
      39             : struct CollationData;
      40             : 
      41             : namespace {
      42             : 
      43             : /**
      44             :  * Compare two signed int64_t values as if they were unsigned.
      45             :  */
      46             : int32_t
      47           0 : compareInt64AsUnsigned(int64_t a, int64_t b) {
      48           0 :     if((uint64_t)a < (uint64_t)b) {
      49           0 :         return -1;
      50           0 :     } else if((uint64_t)a > (uint64_t)b) {
      51           0 :         return 1;
      52             :     } else {
      53           0 :         return 0;
      54             :     }
      55             : }
      56             : 
      57             : // TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp
      58             : /**
      59             :  * Like Java Collections.binarySearch(List, String, Comparator).
      60             :  *
      61             :  * @return the index>=0 where the item was found,
      62             :  *         or the index<0 for inserting the string at ~index in sorted order
      63             :  */
      64             : int32_t
      65           0 : binarySearch(const int64_t list[], int32_t limit, int64_t ce) {
      66           0 :     if (limit == 0) { return ~0; }
      67           0 :     int32_t start = 0;
      68             :     for (;;) {
      69           0 :         int32_t i = (start + limit) / 2;
      70           0 :         int32_t cmp = compareInt64AsUnsigned(ce, list[i]);
      71           0 :         if (cmp == 0) {
      72           0 :             return i;
      73           0 :         } else if (cmp < 0) {
      74           0 :             if (i == start) {
      75           0 :                 return ~start;  // insert ce before i
      76             :             }
      77           0 :             limit = i;
      78             :         } else {
      79           0 :             if (i == start) {
      80           0 :                 return ~(start + 1);  // insert ce after i
      81             :             }
      82           0 :             start = i;
      83             :         }
      84           0 :     }
      85             : }
      86             : 
      87             : }  // namespace
      88             : 
      89           0 : CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode &errorCode)
      90             :         : ce0(0), ce1(0),
      91             :           contractionCEs(errorCode), uniqueCEs(errorCode),
      92             :           miniCEs(NULL),
      93             :           firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0),
      94             :           firstShortPrimary(0), shortPrimaryOverflow(FALSE),
      95           0 :           headerLength(0) {
      96           0 : }
      97             : 
      98           0 : CollationFastLatinBuilder::~CollationFastLatinBuilder() {
      99           0 :     uprv_free(miniCEs);
     100           0 : }
     101             : 
     102             : UBool
     103           0 : CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorCode) {
     104           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     105           0 :     if(!result.isEmpty()) {  // This builder is not reusable.
     106           0 :         errorCode = U_INVALID_STATE_ERROR;
     107           0 :         return FALSE;
     108             :     }
     109           0 :     if(!loadGroups(data, errorCode)) { return FALSE; }
     110             : 
     111             :     // Fast handling of digits.
     112           0 :     firstShortPrimary = firstDigitPrimary;
     113           0 :     getCEs(data, errorCode);
     114           0 :     if(!encodeUniqueCEs(errorCode)) { return FALSE; }
     115           0 :     if(shortPrimaryOverflow) {
     116             :         // Give digits long mini primaries,
     117             :         // so that there are more short primaries for letters.
     118           0 :         firstShortPrimary = firstLatinPrimary;
     119           0 :         resetCEs();
     120           0 :         getCEs(data, errorCode);
     121           0 :         if(!encodeUniqueCEs(errorCode)) { return FALSE; }
     122             :     }
     123             :     // Note: If we still have a short-primary overflow but not a long-primary overflow,
     124             :     // then we could calculate how many more long primaries would fit,
     125             :     // and set the firstShortPrimary to that many after the current firstShortPrimary,
     126             :     // and try again.
     127             :     // However, this might only benefit the en_US_POSIX tailoring,
     128             :     // and it is simpler to suppress building fast Latin data for it in genrb,
     129             :     // or by returning FALSE here if shortPrimaryOverflow.
     130             : 
     131           0 :     UBool ok = !shortPrimaryOverflow &&
     132           0 :             encodeCharCEs(errorCode) && encodeContractions(errorCode);
     133           0 :     contractionCEs.removeAllElements();  // might reduce heap memory usage
     134           0 :     uniqueCEs.removeAllElements();
     135           0 :     return ok;
     136             : }
     137             : 
     138             : UBool
     139           0 : CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) {
     140           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     141           0 :     headerLength = 1 + NUM_SPECIAL_GROUPS;
     142           0 :     uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength;
     143           0 :     result.append((UChar)r0);
     144             :     // The first few reordering groups should be special groups
     145             :     // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
     146           0 :     for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
     147           0 :         lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i);
     148           0 :         if(lastSpecialPrimaries[i] == 0) {
     149             :             // missing data
     150           0 :             return FALSE;
     151             :         }
     152           0 :         result.append((UChar)0);  // reserve a slot for this group
     153             :     }
     154             : 
     155           0 :     firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT);
     156           0 :     firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN);
     157           0 :     lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN);
     158           0 :     if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
     159             :         // missing data
     160           0 :         return FALSE;
     161             :     }
     162           0 :     return TRUE;
     163             : }
     164             : 
     165             : UBool
     166           0 : CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const {
     167             :     // Both or neither need to be encoded as short primaries,
     168             :     // so that we can test only one and use the same bit mask.
     169           0 :     if(p >= firstShortPrimary) {
     170           0 :         return q >= firstShortPrimary;
     171           0 :     } else if(q >= firstShortPrimary) {
     172           0 :         return FALSE;
     173             :     }
     174             :     // Both or neither must be potentially-variable,
     175             :     // so that we can test only one and determine if both are variable.
     176           0 :     uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
     177           0 :     if(p > lastVariablePrimary) {
     178           0 :         return q > lastVariablePrimary;
     179           0 :     } else if(q > lastVariablePrimary) {
     180           0 :         return FALSE;
     181             :     }
     182             :     // Both will be encoded with long mini primaries.
     183             :     // They must be in the same special reordering group,
     184             :     // so that we can test only one and determine if both are variable.
     185           0 :     U_ASSERT(p != 0 && q != 0);
     186           0 :     for(int32_t i = 0;; ++i) {  // will terminate
     187           0 :         uint32_t lastPrimary = lastSpecialPrimaries[i];
     188           0 :         if(p <= lastPrimary) {
     189           0 :             return q <= lastPrimary;
     190           0 :         } else if(q <= lastPrimary) {
     191           0 :             return FALSE;
     192             :         }
     193           0 :     }
     194             : }
     195             : 
     196             : void
     197           0 : CollationFastLatinBuilder::resetCEs() {
     198           0 :     contractionCEs.removeAllElements();
     199           0 :     uniqueCEs.removeAllElements();
     200           0 :     shortPrimaryOverflow = FALSE;
     201           0 :     result.truncate(headerLength);
     202           0 : }
     203             : 
     204             : void
     205           0 : CollationFastLatinBuilder::getCEs(const CollationData &data, UErrorCode &errorCode) {
     206           0 :     if(U_FAILURE(errorCode)) { return; }
     207           0 :     int32_t i = 0;
     208           0 :     for(UChar c = 0;; ++i, ++c) {
     209           0 :         if(c == CollationFastLatin::LATIN_LIMIT) {
     210           0 :             c = CollationFastLatin::PUNCT_START;
     211           0 :         } else if(c == CollationFastLatin::PUNCT_LIMIT) {
     212           0 :             break;
     213             :         }
     214             :         const CollationData *d;
     215           0 :         uint32_t ce32 = data.getCE32(c);
     216           0 :         if(ce32 == Collation::FALLBACK_CE32) {
     217           0 :             d = data.base;
     218           0 :             ce32 = d->getCE32(c);
     219             :         } else {
     220           0 :             d = &data;
     221             :         }
     222           0 :         if(getCEsFromCE32(*d, c, ce32, errorCode)) {
     223           0 :             charCEs[i][0] = ce0;
     224           0 :             charCEs[i][1] = ce1;
     225           0 :             addUniqueCE(ce0, errorCode);
     226           0 :             addUniqueCE(ce1, errorCode);
     227             :         } else {
     228             :             // bail out for c
     229           0 :             charCEs[i][0] = ce0 = Collation::NO_CE;
     230           0 :             charCEs[i][1] = ce1 = 0;
     231             :         }
     232           0 :         if(c == 0 && !isContractionCharCE(ce0)) {
     233             :             // Always map U+0000 to a contraction.
     234             :             // Write a contraction list with only a default value if there is no real contraction.
     235           0 :             U_ASSERT(contractionCEs.isEmpty());
     236           0 :             addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
     237           0 :             charCEs[0][0] = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
     238           0 :             charCEs[0][1] = 0;
     239             :         }
     240           0 :     }
     241             :     // Terminate the last contraction list.
     242           0 :     contractionCEs.addElement(CollationFastLatin::CONTR_CHAR_MASK, errorCode);
     243             : }
     244             : 
     245             : UBool
     246           0 : CollationFastLatinBuilder::getCEsFromCE32(const CollationData &data, UChar32 c, uint32_t ce32,
     247             :                                           UErrorCode &errorCode) {
     248           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     249           0 :     ce32 = data.getFinalCE32(ce32);
     250           0 :     ce1 = 0;
     251           0 :     if(Collation::isSimpleOrLongCE32(ce32)) {
     252           0 :         ce0 = Collation::ceFromCE32(ce32);
     253             :     } else {
     254           0 :         switch(Collation::tagFromCE32(ce32)) {
     255             :         case Collation::LATIN_EXPANSION_TAG:
     256           0 :             ce0 = Collation::latinCE0FromCE32(ce32);
     257           0 :             ce1 = Collation::latinCE1FromCE32(ce32);
     258           0 :             break;
     259             :         case Collation::EXPANSION32_TAG: {
     260           0 :             const uint32_t *ce32s = data.ce32s + Collation::indexFromCE32(ce32);
     261           0 :             int32_t length = Collation::lengthFromCE32(ce32);
     262           0 :             if(length <= 2) {
     263           0 :                 ce0 = Collation::ceFromCE32(ce32s[0]);
     264           0 :                 if(length == 2) {
     265           0 :                     ce1 = Collation::ceFromCE32(ce32s[1]);
     266             :                 }
     267           0 :                 break;
     268             :             } else {
     269           0 :                 return FALSE;
     270             :             }
     271             :         }
     272             :         case Collation::EXPANSION_TAG: {
     273           0 :             const int64_t *ces = data.ces + Collation::indexFromCE32(ce32);
     274           0 :             int32_t length = Collation::lengthFromCE32(ce32);
     275           0 :             if(length <= 2) {
     276           0 :                 ce0 = ces[0];
     277           0 :                 if(length == 2) {
     278           0 :                     ce1 = ces[1];
     279             :                 }
     280           0 :                 break;
     281             :             } else {
     282           0 :                 return FALSE;
     283             :             }
     284             :         }
     285             :         // Note: We could support PREFIX_TAG (assert c>=0)
     286             :         // by recursing on its default CE32 and checking that none of the prefixes starts
     287             :         // with a fast Latin character.
     288             :         // However, currently (2013) there are only the L-before-middle-dot
     289             :         // prefix mappings in the Latin range, and those would be rejected anyway.
     290             :         case Collation::CONTRACTION_TAG:
     291           0 :             U_ASSERT(c >= 0);
     292           0 :             return getCEsFromContractionCE32(data, ce32, errorCode);
     293             :         case Collation::OFFSET_TAG:
     294           0 :             U_ASSERT(c >= 0);
     295           0 :             ce0 = data.getCEFromOffsetCE32(c, ce32);
     296           0 :             break;
     297             :         default:
     298           0 :             return FALSE;
     299             :         }
     300             :     }
     301             :     // A mapping can be completely ignorable.
     302           0 :     if(ce0 == 0) { return ce1 == 0; }
     303             :     // We do not support an ignorable ce0 unless it is completely ignorable.
     304           0 :     uint32_t p0 = (uint32_t)(ce0 >> 32);
     305           0 :     if(p0 == 0) { return FALSE; }
     306             :     // We only support primaries up to the Latin script.
     307           0 :     if(p0 > lastLatinPrimary) { return FALSE; }
     308             :     // We support non-common secondary and case weights only together with short primaries.
     309           0 :     uint32_t lower32_0 = (uint32_t)ce0;
     310           0 :     if(p0 < firstShortPrimary) {
     311           0 :         uint32_t sc0 = lower32_0 & Collation::SECONDARY_AND_CASE_MASK;
     312           0 :         if(sc0 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
     313             :     }
     314             :     // No below-common tertiary weights.
     315           0 :     if((lower32_0 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
     316           0 :     if(ce1 != 0) {
     317             :         // Both primaries must be in the same group,
     318             :         // or both must get short mini primaries,
     319             :         // or a short-primary CE is followed by a secondary CE.
     320             :         // This is so that we can test the first primary and use the same mask for both,
     321             :         // and determine for both whether they are variable.
     322           0 :         uint32_t p1 = (uint32_t)(ce1 >> 32);
     323           0 :         if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return FALSE; }
     324           0 :         uint32_t lower32_1 = (uint32_t)ce1;
     325             :         // No tertiary CEs.
     326           0 :         if((lower32_1 >> 16) == 0) { return FALSE; }
     327             :         // We support non-common secondary and case weights
     328             :         // only for secondary CEs or together with short primaries.
     329           0 :         if(p1 != 0 && p1 < firstShortPrimary) {
     330           0 :             uint32_t sc1 = lower32_1 & Collation::SECONDARY_AND_CASE_MASK;
     331           0 :             if(sc1 != Collation::COMMON_SECONDARY_CE) { return FALSE; }
     332             :         }
     333             :         // No below-common tertiary weights.
     334           0 :         if((lower32_1 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; }
     335             :     }
     336             :     // No quaternary weights.
     337           0 :     if(((ce0 | ce1) & Collation::QUATERNARY_MASK) != 0) { return FALSE; }
     338           0 :     return TRUE;
     339             : }
     340             : 
     341             : UBool
     342           0 : CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData &data, uint32_t ce32,
     343             :                                                      UErrorCode &errorCode) {
     344           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     345           0 :     const UChar *p = data.contexts + Collation::indexFromCE32(ce32);
     346           0 :     ce32 = CollationData::readCE32(p);  // Default if no suffix match.
     347             :     // Since the original ce32 is not a prefix mapping,
     348             :     // the default ce32 must not be another contraction.
     349           0 :     U_ASSERT(!Collation::isContractionCE32(ce32));
     350           0 :     int32_t contractionIndex = contractionCEs.size();
     351           0 :     if(getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
     352           0 :         addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode);
     353             :     } else {
     354             :         // Bail out for c-without-contraction.
     355           0 :         addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, Collation::NO_CE, 0, errorCode);
     356             :     }
     357             :     // Handle an encodable contraction unless the next contraction is too long
     358             :     // and starts with the same character.
     359           0 :     int32_t prevX = -1;
     360           0 :     UBool addContraction = FALSE;
     361           0 :     UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
     362           0 :     while(suffixes.next(errorCode)) {
     363           0 :         const UnicodeString &suffix = suffixes.getString();
     364           0 :         int32_t x = CollationFastLatin::getCharIndex(suffix.charAt(0));
     365           0 :         if(x < 0) { continue; }  // ignore anything but fast Latin text
     366           0 :         if(x == prevX) {
     367           0 :             if(addContraction) {
     368             :                 // Bail out for all contractions starting with this character.
     369           0 :                 addContractionEntry(x, Collation::NO_CE, 0, errorCode);
     370           0 :                 addContraction = FALSE;
     371             :             }
     372           0 :             continue;
     373             :         }
     374           0 :         if(addContraction) {
     375           0 :             addContractionEntry(prevX, ce0, ce1, errorCode);
     376             :         }
     377           0 :         ce32 = (uint32_t)suffixes.getValue();
     378           0 :         if(suffix.length() == 1 && getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) {
     379           0 :             addContraction = TRUE;
     380             :         } else {
     381           0 :             addContractionEntry(x, Collation::NO_CE, 0, errorCode);
     382           0 :             addContraction = FALSE;
     383             :         }
     384           0 :         prevX = x;
     385             :     }
     386           0 :     if(addContraction) {
     387           0 :         addContractionEntry(prevX, ce0, ce1, errorCode);
     388             :     }
     389           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     390             :     // Note: There might not be any fast Latin contractions, but
     391             :     // we need to enter contraction handling anyway so that we can bail out
     392             :     // when there is a non-fast-Latin character following.
     393             :     // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
     394             :     // following umlaut and bail out, rather than return the difference of Y vs. u.
     395           0 :     ce0 = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;
     396           0 :     ce1 = 0;
     397           0 :     return TRUE;
     398             : }
     399             : 
     400             : void
     401           0 : CollationFastLatinBuilder::addContractionEntry(int32_t x, int64_t cce0, int64_t cce1,
     402             :                                                UErrorCode &errorCode) {
     403           0 :     contractionCEs.addElement(x, errorCode);
     404           0 :     contractionCEs.addElement(cce0, errorCode);
     405           0 :     contractionCEs.addElement(cce1, errorCode);
     406           0 :     addUniqueCE(cce0, errorCode);
     407           0 :     addUniqueCE(cce1, errorCode);
     408           0 : }
     409             : 
     410             : void
     411           0 : CollationFastLatinBuilder::addUniqueCE(int64_t ce, UErrorCode &errorCode) {
     412           0 :     if(U_FAILURE(errorCode)) { return; }
     413           0 :     if(ce == 0 || (uint32_t)(ce >> 32) == Collation::NO_CE_PRIMARY) { return; }
     414           0 :     ce &= ~(int64_t)Collation::CASE_MASK;  // blank out case bits
     415           0 :     int32_t i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
     416           0 :     if(i < 0) {
     417           0 :         uniqueCEs.insertElementAt(ce, ~i, errorCode);
     418             :     }
     419             : }
     420             : 
     421             : uint32_t
     422           0 : CollationFastLatinBuilder::getMiniCE(int64_t ce) const {
     423           0 :     ce &= ~(int64_t)Collation::CASE_MASK;  // blank out case bits
     424           0 :     int32_t index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
     425           0 :     U_ASSERT(index >= 0);
     426           0 :     return miniCEs[index];
     427             : }
     428             : 
     429             : UBool
     430           0 : CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) {
     431           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     432           0 :     uprv_free(miniCEs);
     433           0 :     miniCEs = (uint16_t *)uprv_malloc(uniqueCEs.size() * 2);
     434           0 :     if(miniCEs == NULL) {
     435           0 :         errorCode = U_MEMORY_ALLOCATION_ERROR;
     436           0 :         return FALSE;
     437             :     }
     438           0 :     int32_t group = 0;
     439           0 :     uint32_t lastGroupPrimary = lastSpecialPrimaries[group];
     440             :     // The lowest unique CE must be at least a secondary CE.
     441           0 :     U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0);
     442           0 :     uint32_t prevPrimary = 0;
     443           0 :     uint32_t prevSecondary = 0;
     444           0 :     uint32_t pri = 0;
     445           0 :     uint32_t sec = 0;
     446           0 :     uint32_t ter = CollationFastLatin::COMMON_TER;
     447           0 :     for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
     448           0 :         int64_t ce = uniqueCEs.elementAti(i);
     449             :         // Note: At least one of the p/s/t weights changes from one unique CE to the next.
     450             :         // (uniqueCEs does not store case bits.)
     451           0 :         uint32_t p = (uint32_t)(ce >> 32);
     452           0 :         if(p != prevPrimary) {
     453           0 :             while(p > lastGroupPrimary) {
     454           0 :                 U_ASSERT(pri <= CollationFastLatin::MAX_LONG);
     455             :                 // Set the group's header entry to the
     456             :                 // last "long primary" in or before the group.
     457           0 :                 result.setCharAt(1 + group, (UChar)pri);
     458           0 :                 if(++group < NUM_SPECIAL_GROUPS) {
     459           0 :                     lastGroupPrimary = lastSpecialPrimaries[group];
     460             :                 } else {
     461           0 :                     lastGroupPrimary = 0xffffffff;
     462           0 :                     break;
     463             :                 }
     464             :             }
     465           0 :             if(p < firstShortPrimary) {
     466           0 :                 if(pri == 0) {
     467           0 :                     pri = CollationFastLatin::MIN_LONG;
     468           0 :                 } else if(pri < CollationFastLatin::MAX_LONG) {
     469           0 :                     pri += CollationFastLatin::LONG_INC;
     470             :                 } else {
     471             : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
     472             :                     printf("long-primary overflow for %08x\n", p);
     473             : #endif
     474           0 :                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
     475           0 :                     continue;
     476             :                 }
     477             :             } else {
     478           0 :                 if(pri < CollationFastLatin::MIN_SHORT) {
     479           0 :                     pri = CollationFastLatin::MIN_SHORT;
     480           0 :                 } else if(pri < (CollationFastLatin::MAX_SHORT - CollationFastLatin::SHORT_INC)) {
     481             :                     // Reserve the highest primary weight for U+FFFF.
     482           0 :                     pri += CollationFastLatin::SHORT_INC;
     483             :                 } else {
     484             : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
     485             :                     printf("short-primary overflow for %08x\n", p);
     486             : #endif
     487           0 :                     shortPrimaryOverflow = TRUE;
     488           0 :                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
     489           0 :                     continue;
     490             :                 }
     491             :             }
     492           0 :             prevPrimary = p;
     493           0 :             prevSecondary = Collation::COMMON_WEIGHT16;
     494           0 :             sec = CollationFastLatin::COMMON_SEC;
     495           0 :             ter = CollationFastLatin::COMMON_TER;
     496             :         }
     497           0 :         uint32_t lower32 = (uint32_t)ce;
     498           0 :         uint32_t s = lower32 >> 16;
     499           0 :         if(s != prevSecondary) {
     500           0 :             if(pri == 0) {
     501           0 :                 if(sec == 0) {
     502           0 :                     sec = CollationFastLatin::MIN_SEC_HIGH;
     503           0 :                 } else if(sec < CollationFastLatin::MAX_SEC_HIGH) {
     504           0 :                     sec += CollationFastLatin::SEC_INC;
     505             :                 } else {
     506           0 :                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
     507           0 :                     continue;
     508             :                 }
     509           0 :                 prevSecondary = s;
     510           0 :                 ter = CollationFastLatin::COMMON_TER;
     511           0 :             } else if(s < Collation::COMMON_WEIGHT16) {
     512           0 :                 if(sec == CollationFastLatin::COMMON_SEC) {
     513           0 :                     sec = CollationFastLatin::MIN_SEC_BEFORE;
     514           0 :                 } else if(sec < CollationFastLatin::MAX_SEC_BEFORE) {
     515           0 :                     sec += CollationFastLatin::SEC_INC;
     516             :                 } else {
     517           0 :                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
     518           0 :                     continue;
     519             :                 }
     520           0 :             } else if(s == Collation::COMMON_WEIGHT16) {
     521           0 :                 sec = CollationFastLatin::COMMON_SEC;
     522             :             } else {
     523           0 :                 if(sec < CollationFastLatin::MIN_SEC_AFTER) {
     524           0 :                     sec = CollationFastLatin::MIN_SEC_AFTER;
     525           0 :                 } else if(sec < CollationFastLatin::MAX_SEC_AFTER) {
     526           0 :                     sec += CollationFastLatin::SEC_INC;
     527             :                 } else {
     528           0 :                     miniCEs[i] = CollationFastLatin::BAIL_OUT;
     529           0 :                     continue;
     530             :                 }
     531             :             }
     532           0 :             prevSecondary = s;
     533           0 :             ter = CollationFastLatin::COMMON_TER;
     534             :         }
     535           0 :         U_ASSERT((lower32 & Collation::CASE_MASK) == 0);  // blanked out in uniqueCEs
     536           0 :         uint32_t t = lower32 & Collation::ONLY_TERTIARY_MASK;
     537           0 :         if(t > Collation::COMMON_WEIGHT16) {
     538           0 :             if(ter < CollationFastLatin::MAX_TER_AFTER) {
     539           0 :                 ++ter;
     540             :             } else {
     541           0 :                 miniCEs[i] = CollationFastLatin::BAIL_OUT;
     542           0 :                 continue;
     543             :             }
     544             :         }
     545           0 :         if(CollationFastLatin::MIN_LONG <= pri && pri <= CollationFastLatin::MAX_LONG) {
     546           0 :             U_ASSERT(sec == CollationFastLatin::COMMON_SEC);
     547           0 :             miniCEs[i] = (uint16_t)(pri | ter);
     548             :         } else {
     549           0 :             miniCEs[i] = (uint16_t)(pri | sec | ter);
     550             :         }
     551             :     }
     552             : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
     553             :     printf("last mini primary: %04x\n", pri);
     554             : #endif
     555             : #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
     556             :     for(int32_t i = 0; i < uniqueCEs.size(); ++i) {
     557             :         int64_t ce = uniqueCEs.elementAti(i);
     558             :         printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
     559             :     }
     560             : #endif
     561           0 :     return U_SUCCESS(errorCode);
     562             : }
     563             : 
     564             : UBool
     565           0 : CollationFastLatinBuilder::encodeCharCEs(UErrorCode &errorCode) {
     566           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     567           0 :     int32_t miniCEsStart = result.length();
     568           0 :     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
     569           0 :         result.append((UChar)0);  // initialize to completely ignorable
     570             :     }
     571           0 :     int32_t indexBase = result.length();
     572           0 :     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
     573           0 :         int64_t ce = charCEs[i][0];
     574           0 :         if(isContractionCharCE(ce)) { continue; }  // defer contraction
     575           0 :         uint32_t miniCE = encodeTwoCEs(ce, charCEs[i][1]);
     576           0 :         if(miniCE > 0xffff) {
     577             :             // Note: There is a chance that this new expansion is the same as a previous one,
     578             :             // and if so, then we could reuse the other expansion.
     579             :             // However, that seems unlikely.
     580           0 :             int32_t expansionIndex = result.length() - indexBase;
     581           0 :             if(expansionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
     582           0 :                 miniCE = CollationFastLatin::BAIL_OUT;
     583             :             } else {
     584           0 :                 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
     585           0 :                 miniCE = CollationFastLatin::EXPANSION | expansionIndex;
     586             :             }
     587             :         }
     588           0 :         result.setCharAt(miniCEsStart + i, (UChar)miniCE);
     589             :     }
     590           0 :     return U_SUCCESS(errorCode);
     591             : }
     592             : 
     593             : UBool
     594           0 : CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) {
     595             :     // We encode all contraction lists so that the first word of a list
     596             :     // terminates the previous list, and we only need one additional terminator at the end.
     597           0 :     if(U_FAILURE(errorCode)) { return FALSE; }
     598           0 :     int32_t indexBase = headerLength + CollationFastLatin::NUM_FAST_CHARS;
     599           0 :     int32_t firstContractionIndex = result.length();
     600           0 :     for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) {
     601           0 :         int64_t ce = charCEs[i][0];
     602           0 :         if(!isContractionCharCE(ce)) { continue; }
     603           0 :         int32_t contractionIndex = result.length() - indexBase;
     604           0 :         if(contractionIndex > (int32_t)CollationFastLatin::INDEX_MASK) {
     605           0 :             result.setCharAt(headerLength + i, CollationFastLatin::BAIL_OUT);
     606           0 :             continue;
     607             :         }
     608           0 :         UBool firstTriple = TRUE;
     609           0 :         for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) {
     610           0 :             int32_t x = contractionCEs.elementAti(index);
     611           0 :             if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; }
     612           0 :             int64_t cce0 = contractionCEs.elementAti(index + 1);
     613           0 :             int64_t cce1 = contractionCEs.elementAti(index + 2);
     614           0 :             uint32_t miniCE = encodeTwoCEs(cce0, cce1);
     615           0 :             if(miniCE == CollationFastLatin::BAIL_OUT) {
     616           0 :                 result.append((UChar)(x | (1 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
     617           0 :             } else if(miniCE <= 0xffff) {
     618           0 :                 result.append((UChar)(x | (2 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
     619           0 :                 result.append((UChar)miniCE);
     620             :             } else {
     621           0 :                 result.append((UChar)(x | (3 << CollationFastLatin::CONTR_LENGTH_SHIFT)));
     622           0 :                 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE);
     623             :             }
     624           0 :             firstTriple = FALSE;
     625           0 :         }
     626             :         // Note: There is a chance that this new contraction list is the same as a previous one,
     627             :         // and if so, then we could truncate the result and reuse the other list.
     628             :         // However, that seems unlikely.
     629           0 :         result.setCharAt(headerLength + i,
     630           0 :                          (UChar)(CollationFastLatin::CONTRACTION | contractionIndex));
     631             :     }
     632           0 :     if(result.length() > firstContractionIndex) {
     633             :         // Terminate the last contraction list.
     634           0 :         result.append((UChar)CollationFastLatin::CONTR_CHAR_MASK);
     635             :     }
     636           0 :     if(result.isBogus()) {
     637           0 :         errorCode = U_MEMORY_ALLOCATION_ERROR;
     638           0 :         return FALSE;
     639             :     }
     640             : #if DEBUG_COLLATION_FAST_LATIN_BUILDER
     641             :     printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
     642             :     puts("   header & below-digit groups map");
     643             :     int32_t i = 0;
     644             :     for(; i < headerLength; ++i) {
     645             :         printf(" %04x", result[i]);
     646             :     }
     647             :     printf("\n   char mini CEs");
     648             :     U_ASSERT(CollationFastLatin::NUM_FAST_CHARS % 16 == 0);
     649             :     for(; i < indexBase; i += 16) {
     650             :         UChar32 c = i - headerLength;
     651             :         if(c >= CollationFastLatin::LATIN_LIMIT) {
     652             :             c = CollationFastLatin::PUNCT_START + c - CollationFastLatin::LATIN_LIMIT;
     653             :         }
     654             :         printf("\n %04x:", c);
     655             :         for(int32_t j = 0; j < 16; ++j) {
     656             :             printf(" %04x", result[i + j]);
     657             :         }
     658             :     }
     659             :     printf("\n   expansions & contractions");
     660             :     for(; i < result.length(); ++i) {
     661             :         if((i - indexBase) % 16 == 0) { puts(""); }
     662             :         printf(" %04x", result[i]);
     663             :     }
     664             :     puts("");
     665             : #endif
     666           0 :     return TRUE;
     667             : }
     668             : 
     669             : uint32_t
     670           0 : CollationFastLatinBuilder::encodeTwoCEs(int64_t first, int64_t second) const {
     671           0 :     if(first == 0) {
     672           0 :         return 0;  // completely ignorable
     673             :     }
     674           0 :     if(first == Collation::NO_CE) {
     675           0 :         return CollationFastLatin::BAIL_OUT;
     676             :     }
     677           0 :     U_ASSERT((uint32_t)(first >> 32) != Collation::NO_CE_PRIMARY);
     678             : 
     679           0 :     uint32_t miniCE = getMiniCE(first);
     680           0 :     if(miniCE == CollationFastLatin::BAIL_OUT) { return miniCE; }
     681           0 :     if(miniCE >= CollationFastLatin::MIN_SHORT) {
     682             :         // Extract & copy the case bits.
     683             :         // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
     684           0 :         uint32_t c = (((uint32_t)first & Collation::CASE_MASK) >> (14 - 3));
     685             :         // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
     686           0 :         c += CollationFastLatin::LOWER_CASE;
     687           0 :         miniCE |= c;
     688             :     }
     689           0 :     if(second == 0) { return miniCE; }
     690             : 
     691           0 :     uint32_t miniCE1 = getMiniCE(second);
     692           0 :     if(miniCE1 == CollationFastLatin::BAIL_OUT) { return miniCE1; }
     693             : 
     694           0 :     uint32_t case1 = (uint32_t)second & Collation::CASE_MASK;
     695           0 :     if(miniCE >= CollationFastLatin::MIN_SHORT &&
     696           0 :             (miniCE & CollationFastLatin::SECONDARY_MASK) == CollationFastLatin::COMMON_SEC) {
     697             :         // Try to combine the two mini CEs into one.
     698           0 :         uint32_t sec1 = miniCE1 & CollationFastLatin::SECONDARY_MASK;
     699           0 :         uint32_t ter1 = miniCE1 & CollationFastLatin::TERTIARY_MASK;
     700           0 :         if(sec1 >= CollationFastLatin::MIN_SEC_HIGH && case1 == 0 &&
     701             :                 ter1 == CollationFastLatin::COMMON_TER) {
     702             :             // sec1>=sec_high implies pri1==0.
     703           0 :             return (miniCE & ~CollationFastLatin::SECONDARY_MASK) | sec1;
     704             :         }
     705             :     }
     706             : 
     707           0 :     if(miniCE1 <= CollationFastLatin::SECONDARY_MASK || CollationFastLatin::MIN_SHORT <= miniCE1) {
     708             :         // Secondary CE, or a CE with a short primary, copy the case bits.
     709           0 :         case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE;
     710           0 :         miniCE1 |= case1;
     711             :     }
     712           0 :     return (miniCE << 16) | miniCE1;
     713             : }
     714             : 
     715             : U_NAMESPACE_END
     716             : 
     717             : #endif  // !UCONFIG_NO_COLLATION

Generated by: LCOV version 1.13