LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - collationdata.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 207 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 12 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : * Copyright (C) 2012-2015, International Business Machines
       6             : * Corporation and others.  All Rights Reserved.
       7             : *******************************************************************************
       8             : * collationdata.cpp
       9             : *
      10             : * created on: 2012jul28
      11             : * created by: Markus W. Scherer
      12             : */
      13             : 
      14             : #include "unicode/utypes.h"
      15             : 
      16             : #if !UCONFIG_NO_COLLATION
      17             : 
      18             : #include "unicode/ucol.h"
      19             : #include "unicode/udata.h"
      20             : #include "unicode/uscript.h"
      21             : #include "cmemory.h"
      22             : #include "collation.h"
      23             : #include "collationdata.h"
      24             : #include "uassert.h"
      25             : #include "utrie2.h"
      26             : #include "uvectr32.h"
      27             : 
      28             : U_NAMESPACE_BEGIN
      29             : 
      30             : uint32_t
      31           0 : CollationData::getIndirectCE32(uint32_t ce32) const {
      32           0 :     U_ASSERT(Collation::isSpecialCE32(ce32));
      33           0 :     int32_t tag = Collation::tagFromCE32(ce32);
      34           0 :     if(tag == Collation::DIGIT_TAG) {
      35             :         // Fetch the non-numeric-collation CE32.
      36           0 :         ce32 = ce32s[Collation::indexFromCE32(ce32)];
      37           0 :     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
      38           0 :         ce32 = Collation::UNASSIGNED_CE32;
      39           0 :     } else if(tag == Collation::U0000_TAG) {
      40             :         // Fetch the normal ce32 for U+0000.
      41           0 :         ce32 = ce32s[0];
      42             :     }
      43           0 :     return ce32;
      44             : }
      45             : 
      46             : uint32_t
      47           0 : CollationData::getFinalCE32(uint32_t ce32) const {
      48           0 :     if(Collation::isSpecialCE32(ce32)) {
      49           0 :         ce32 = getIndirectCE32(ce32);
      50             :     }
      51           0 :     return ce32;
      52             : }
      53             : 
      54             : int64_t
      55           0 : CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
      56           0 :     if(U_FAILURE(errorCode)) { return 0; }
      57             :     // Keep parallel with CollationDataBuilder::getSingleCE().
      58             :     const CollationData *d;
      59           0 :     uint32_t ce32 = getCE32(c);
      60           0 :     if(ce32 == Collation::FALLBACK_CE32) {
      61           0 :         d = base;
      62           0 :         ce32 = base->getCE32(c);
      63             :     } else {
      64           0 :         d = this;
      65             :     }
      66           0 :     while(Collation::isSpecialCE32(ce32)) {
      67           0 :         switch(Collation::tagFromCE32(ce32)) {
      68             :         case Collation::LATIN_EXPANSION_TAG:
      69             :         case Collation::BUILDER_DATA_TAG:
      70             :         case Collation::PREFIX_TAG:
      71             :         case Collation::CONTRACTION_TAG:
      72             :         case Collation::HANGUL_TAG:
      73             :         case Collation::LEAD_SURROGATE_TAG:
      74           0 :             errorCode = U_UNSUPPORTED_ERROR;
      75           0 :             return 0;
      76             :         case Collation::FALLBACK_TAG:
      77             :         case Collation::RESERVED_TAG_3:
      78           0 :             errorCode = U_INTERNAL_PROGRAM_ERROR;
      79           0 :             return 0;
      80             :         case Collation::LONG_PRIMARY_TAG:
      81           0 :             return Collation::ceFromLongPrimaryCE32(ce32);
      82             :         case Collation::LONG_SECONDARY_TAG:
      83           0 :             return Collation::ceFromLongSecondaryCE32(ce32);
      84             :         case Collation::EXPANSION32_TAG:
      85           0 :             if(Collation::lengthFromCE32(ce32) == 1) {
      86           0 :                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
      87           0 :                 break;
      88             :             } else {
      89           0 :                 errorCode = U_UNSUPPORTED_ERROR;
      90           0 :                 return 0;
      91             :             }
      92             :         case Collation::EXPANSION_TAG: {
      93           0 :             if(Collation::lengthFromCE32(ce32) == 1) {
      94           0 :                 return d->ces[Collation::indexFromCE32(ce32)];
      95             :             } else {
      96           0 :                 errorCode = U_UNSUPPORTED_ERROR;
      97           0 :                 return 0;
      98             :             }
      99             :         }
     100             :         case Collation::DIGIT_TAG:
     101             :             // Fetch the non-numeric-collation CE32 and continue.
     102           0 :             ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
     103           0 :             break;
     104             :         case Collation::U0000_TAG:
     105           0 :             U_ASSERT(c == 0);
     106             :             // Fetch the normal ce32 for U+0000 and continue.
     107           0 :             ce32 = d->ce32s[0];
     108           0 :             break;
     109             :         case Collation::OFFSET_TAG:
     110           0 :             return d->getCEFromOffsetCE32(c, ce32);
     111             :         case Collation::IMPLICIT_TAG:
     112           0 :             return Collation::unassignedCEFromCodePoint(c);
     113             :         }
     114             :     }
     115           0 :     return Collation::ceFromSimpleCE32(ce32);
     116             : }
     117             : 
     118             : uint32_t
     119           0 : CollationData::getFirstPrimaryForGroup(int32_t script) const {
     120           0 :     int32_t index = getScriptIndex(script);
     121           0 :     return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
     122             : }
     123             : 
     124             : uint32_t
     125           0 : CollationData::getLastPrimaryForGroup(int32_t script) const {
     126           0 :     int32_t index = getScriptIndex(script);
     127           0 :     if(index == 0) {
     128           0 :         return 0;
     129             :     }
     130           0 :     uint32_t limit = scriptStarts[index + 1];
     131           0 :     return (limit << 16) - 1;
     132             : }
     133             : 
     134             : int32_t
     135           0 : CollationData::getGroupForPrimary(uint32_t p) const {
     136           0 :     p >>= 16;
     137           0 :     if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
     138           0 :         return -1;
     139             :     }
     140           0 :     int32_t index = 1;
     141           0 :     while(p >= scriptStarts[index + 1]) { ++index; }
     142           0 :     for(int32_t i = 0; i < numScripts; ++i) {
     143           0 :         if(scriptsIndex[i] == index) {
     144           0 :             return i;
     145             :         }
     146             :     }
     147           0 :     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
     148           0 :         if(scriptsIndex[numScripts + i] == index) {
     149           0 :             return UCOL_REORDER_CODE_FIRST + i;
     150             :         }
     151             :     }
     152           0 :     return -1;
     153             : }
     154             : 
     155             : int32_t
     156           0 : CollationData::getScriptIndex(int32_t script) const {
     157           0 :     if(script < 0) {
     158           0 :         return 0;
     159           0 :     } else if(script < numScripts) {
     160           0 :         return scriptsIndex[script];
     161           0 :     } else if(script < UCOL_REORDER_CODE_FIRST) {
     162           0 :         return 0;
     163             :     } else {
     164           0 :         script -= UCOL_REORDER_CODE_FIRST;
     165           0 :         if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
     166           0 :             return scriptsIndex[numScripts + script];
     167             :         } else {
     168           0 :             return 0;
     169             :         }
     170             :     }
     171             : }
     172             : 
     173             : int32_t
     174           0 : CollationData::getEquivalentScripts(int32_t script,
     175             :                                     int32_t dest[], int32_t capacity,
     176             :                                     UErrorCode &errorCode) const {
     177           0 :     if(U_FAILURE(errorCode)) { return 0; }
     178           0 :     int32_t index = getScriptIndex(script);
     179           0 :     if(index == 0) { return 0; }
     180           0 :     if(script >= UCOL_REORDER_CODE_FIRST) {
     181             :         // Special groups have no aliases.
     182           0 :         if(capacity > 0) {
     183           0 :             dest[0] = script;
     184             :         } else {
     185           0 :             errorCode = U_BUFFER_OVERFLOW_ERROR;
     186             :         }
     187           0 :         return 1;
     188             :     }
     189             : 
     190           0 :     int32_t length = 0;
     191           0 :     for(int32_t i = 0; i < numScripts; ++i) {
     192           0 :         if(scriptsIndex[i] == index) {
     193           0 :             if(length < capacity) {
     194           0 :                 dest[length] = i;
     195             :             }
     196           0 :             ++length;
     197             :         }
     198             :     }
     199           0 :     if(length > capacity) {
     200           0 :         errorCode = U_BUFFER_OVERFLOW_ERROR;
     201             :     }
     202           0 :     return length;
     203             : }
     204             : 
     205             : void
     206           0 : CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
     207             :                                  UVector32 &ranges, UErrorCode &errorCode) const {
     208           0 :     makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
     209           0 : }
     210             : 
     211             : void
     212           0 : CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
     213             :                                  UBool latinMustMove,
     214             :                                  UVector32 &ranges, UErrorCode &errorCode) const {
     215           0 :     if(U_FAILURE(errorCode)) { return; }
     216           0 :     ranges.removeAllElements();
     217           0 :     if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
     218           0 :         return;
     219             :     }
     220             : 
     221             :     // Maps each script-or-group range to a new lead byte.
     222             :     uint8_t table[MAX_NUM_SCRIPT_RANGES];
     223           0 :     uprv_memset(table, 0, sizeof(table));
     224             : 
     225             :     {
     226             :         // Set "don't care" values for reserved ranges.
     227           0 :         int32_t index = scriptsIndex[
     228           0 :                 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
     229           0 :         if(index != 0) {
     230           0 :             table[index] = 0xff;
     231             :         }
     232           0 :         index = scriptsIndex[
     233           0 :                 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
     234           0 :         if(index != 0) {
     235           0 :             table[index] = 0xff;
     236             :         }
     237             :     }
     238             : 
     239             :     // Never reorder special low and high primary lead bytes.
     240           0 :     U_ASSERT(scriptStartsLength >= 2);
     241           0 :     U_ASSERT(scriptStarts[0] == 0);
     242           0 :     int32_t lowStart = scriptStarts[1];
     243           0 :     U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
     244           0 :     int32_t highLimit = scriptStarts[scriptStartsLength - 1];
     245           0 :     U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
     246             : 
     247             :     // Get the set of special reorder codes in the input list.
     248             :     // This supports a fixed number of special reorder codes;
     249             :     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
     250           0 :     uint32_t specials = 0;
     251           0 :     for(int32_t i = 0; i < length; ++i) {
     252           0 :         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
     253           0 :         if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
     254           0 :             specials |= (uint32_t)1 << reorderCode;
     255             :         }
     256             :     }
     257             : 
     258             :     // Start the reordering with the special low reorder codes that do not occur in the input.
     259           0 :     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
     260           0 :         int32_t index = scriptsIndex[numScripts + i];
     261           0 :         if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
     262           0 :             lowStart = addLowScriptRange(table, index, lowStart);
     263             :         }
     264             :     }
     265             : 
     266             :     // Skip the reserved range before Latin if Latin is the first script,
     267             :     // so that we do not move it unnecessarily.
     268           0 :     int32_t skippedReserved = 0;
     269           0 :     if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
     270           0 :         int32_t index = scriptsIndex[USCRIPT_LATIN];
     271           0 :         U_ASSERT(index != 0);
     272           0 :         int32_t start = scriptStarts[index];
     273           0 :         U_ASSERT(lowStart <= start);
     274           0 :         skippedReserved = start - lowStart;
     275           0 :         lowStart = start;
     276             :     }
     277             : 
     278             :     // Reorder according to the input scripts, continuing from the bottom of the primary range.
     279           0 :     int32_t originalLength = length;  // length will be decremented if "others" is in the list.
     280           0 :     UBool hasReorderToEnd = FALSE;
     281           0 :     for(int32_t i = 0; i < length;) {
     282           0 :         int32_t script = reorder[i++];
     283           0 :         if(script == USCRIPT_UNKNOWN) {
     284             :             // Put the remaining scripts at the top.
     285           0 :             hasReorderToEnd = TRUE;
     286           0 :             while(i < length) {
     287           0 :                 script = reorder[--length];
     288           0 :                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
     289             :                         script == UCOL_REORDER_CODE_DEFAULT) {
     290           0 :                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     291           0 :                     return;
     292             :                 }
     293           0 :                 int32_t index = getScriptIndex(script);
     294           0 :                 if(index == 0) { continue; }
     295           0 :                 if(table[index] != 0) {  // Duplicate or equivalent script.
     296           0 :                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     297           0 :                     return;
     298             :                 }
     299           0 :                 highLimit = addHighScriptRange(table, index, highLimit);
     300             :             }
     301           0 :             break;
     302             :         }
     303           0 :         if(script == UCOL_REORDER_CODE_DEFAULT) {
     304             :             // The default code must be the only one in the list, and that is handled by the caller.
     305             :             // Otherwise it must not be used.
     306           0 :             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     307           0 :             return;
     308             :         }
     309           0 :         int32_t index = getScriptIndex(script);
     310           0 :         if(index == 0) { continue; }
     311           0 :         if(table[index] != 0) {  // Duplicate or equivalent script.
     312           0 :             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
     313           0 :             return;
     314             :         }
     315           0 :         lowStart = addLowScriptRange(table, index, lowStart);
     316             :     }
     317             : 
     318             :     // Put all remaining scripts into the middle.
     319           0 :     for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
     320           0 :         int32_t leadByte = table[i];
     321           0 :         if(leadByte != 0) { continue; }
     322           0 :         int32_t start = scriptStarts[i];
     323           0 :         if(!hasReorderToEnd && start > lowStart) {
     324             :             // No need to move this script.
     325           0 :             lowStart = start;
     326             :         }
     327           0 :         lowStart = addLowScriptRange(table, i, lowStart);
     328             :     }
     329           0 :     if(lowStart > highLimit) {
     330           0 :         if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
     331             :             // Try not skipping the before-Latin reserved range.
     332           0 :             makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
     333           0 :             return;
     334             :         }
     335             :         // We need more primary lead bytes than available, despite the reserved ranges.
     336           0 :         errorCode = U_BUFFER_OVERFLOW_ERROR;
     337           0 :         return;
     338             :     }
     339             : 
     340             :     // Turn lead bytes into a list of (limit, offset) pairs.
     341             :     // Encode each pair in one list element:
     342             :     // Upper 16 bits = limit, lower 16 = signed lead byte offset.
     343           0 :     int32_t offset = 0;
     344           0 :     for(int32_t i = 1;; ++i) {
     345           0 :         int32_t nextOffset = offset;
     346           0 :         while(i < scriptStartsLength - 1) {
     347           0 :             int32_t newLeadByte = table[i];
     348           0 :             if(newLeadByte == 0xff) {
     349             :                 // "Don't care" lead byte for reserved range, continue with current offset.
     350             :             } else {
     351           0 :                 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
     352           0 :                 if(nextOffset != offset) { break; }
     353             :             }
     354           0 :             ++i;
     355             :         }
     356           0 :         if(offset != 0 || i < scriptStartsLength - 1) {
     357           0 :             ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
     358             :         }
     359           0 :         if(i == scriptStartsLength - 1) { break; }
     360           0 :         offset = nextOffset;
     361           0 :     }
     362             : }
     363             : 
     364             : int32_t
     365           0 : CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
     366           0 :     int32_t start = scriptStarts[index];
     367           0 :     if((start & 0xff) < (lowStart & 0xff)) {
     368           0 :         lowStart += 0x100;
     369             :     }
     370           0 :     table[index] = (uint8_t)(lowStart >> 8);
     371           0 :     int32_t limit = scriptStarts[index + 1];
     372           0 :     lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
     373           0 :     return lowStart;
     374             : }
     375             : 
     376             : int32_t
     377           0 : CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
     378           0 :     int32_t limit = scriptStarts[index + 1];
     379           0 :     if((limit & 0xff) > (highLimit & 0xff)) {
     380           0 :         highLimit -= 0x100;
     381             :     }
     382           0 :     int32_t start = scriptStarts[index];
     383           0 :     highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
     384           0 :     table[index] = (uint8_t)(highLimit >> 8);
     385           0 :     return highLimit;
     386             : }
     387             : 
     388             : U_NAMESPACE_END
     389             : 
     390             : #endif  // !UCONFIG_NO_COLLATION

Generated by: LCOV version 1.13