LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - collationdata.h (source / functions) Hit Total Coverage
Test: output.info Lines: 0 23 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 10 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : * Copyright (C) 2010-2015, International Business Machines
       6             : * Corporation and others.  All Rights Reserved.
       7             : *******************************************************************************
       8             : * collationdata.h
       9             : *
      10             : * created on: 2010oct27
      11             : * created by: Markus W. Scherer
      12             : */
      13             : 
      14             : #ifndef __COLLATIONDATA_H__
      15             : #define __COLLATIONDATA_H__
      16             : 
      17             : #include "unicode/utypes.h"
      18             : 
      19             : #if !UCONFIG_NO_COLLATION
      20             : 
      21             : #include "unicode/ucol.h"
      22             : #include "unicode/uniset.h"
      23             : #include "collation.h"
      24             : #include "normalizer2impl.h"
      25             : #include "utrie2.h"
      26             : 
      27             : struct UDataMemory;
      28             : 
      29             : U_NAMESPACE_BEGIN
      30             : 
      31             : class UVector32;
      32             : 
      33             : /**
      34             :  * Collation data container.
      35             :  * Immutable data created by a CollationDataBuilder, or loaded from a file,
      36             :  * or deserialized from API-provided binary data.
      37             :  *
      38             :  * Includes data for the collation base (root/default), aliased if this is not the base.
      39             :  */
      40             : struct U_I18N_API CollationData : public UMemory {
      41             :     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
      42             :     // parallel with the ranges, and resetting ranges that are indexed.
      43             :     // The reordering builder code could clone the resulting template array.
      44             :     enum {
      45             :         REORDER_RESERVED_BEFORE_LATIN = UCOL_REORDER_CODE_FIRST + 14,
      46             :         REORDER_RESERVED_AFTER_LATIN
      47             :     };
      48             : 
      49             :     enum {
      50             :         MAX_NUM_SPECIAL_REORDER_CODES = 8,
      51             :         /** C++ only, used by the data reader to check scriptStartsLength. */
      52             :         MAX_NUM_SCRIPT_RANGES = 256
      53             :     };
      54             :     /** Binds nfcImpl to nfc, sets numericPrimary to 0x12000000, and sets every pointer to NULL and every length/count to 0. */
      55           0 :     CollationData(const Normalizer2Impl &nfc)
      56           0 :             : trie(NULL),
      57             :               ce32s(NULL), ces(NULL), contexts(NULL), base(NULL),
      58             :               jamoCE32s(NULL),
      59             :               nfcImpl(nfc),
      60             :               numericPrimary(0x12000000),
      61             :               ce32sLength(0), cesLength(0), contextsLength(0),
      62             :               compressibleBytes(NULL),
      63             :               unsafeBackwardSet(NULL),
      64             :               fastLatinTable(NULL), fastLatinTableLength(0),
      65             :               numScripts(0), scriptsIndex(NULL), scriptStarts(NULL), scriptStartsLength(0),
      66           0 :               rootElements(NULL), rootElementsLength(0) {}
      67             :     /** Returns the 32-bit value that the main lookup trie holds for c (UTRIE2_GET32). */
      68           0 :     uint32_t getCE32(UChar32 c) const {
      69           0 :         return UTRIE2_GET32(trie, c);
      70             :     }
      71             :     /** Looks up c in the main trie via UTRIE2_GET32_FROM_SUPP; presumably c must be a supplementary code point (confirm in utrie2.h). */
      72           0 :     uint32_t getCE32FromSupplementary(UChar32 c) const {
      73           0 :         return UTRIE2_GET32_FROM_SUPP(trie, c);
      74             :     }
      75             :     /** Below U+0660 only ASCII 0..9 (0x30..0x39) count as digits; from U+0660 up, c is a digit if its CE32 has DIGIT_TAG. */
      76           0 :     UBool isDigit(UChar32 c) const {
      77           0 :         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
      78           0 :                 Collation::hasCE32Tag(getCE32(c), Collation::DIGIT_TAG);
      79             :     }
      80             :     /** True if unsafeBackwardSet contains c, or if numeric is set and isDigit(c) is true. */
      81           0 :     UBool isUnsafeBackward(UChar32 c, UBool numeric) const {
      82           0 :         return unsafeBackwardSet->contains(c) || (numeric && isDigit(c));
      83             :     }
      84             :     /** Returns compressibleBytes[b]; b is not range-checked here. */
      85           0 :     UBool isCompressibleLeadByte(uint32_t b) const {
      86           0 :         return compressibleBytes[b];
      87             :     }
      88             :     /** Tests whether the lead byte of p (its top 8 bits, p >> 24) is compressible. */
      89           0 :     inline UBool isCompressiblePrimary(uint32_t p) const {
      90           0 :         return isCompressibleLeadByte(p >> 24);
      91             :     }
      92             : 
      93             :     /**
      94             :      * Returns the CE32 built from two contexts words: (p[0] << 16) | p[1].
      95             :      * Access to the defaultCE32 for contraction and prefix matching.
      96             :      */
      97           0 :     static uint32_t readCE32(const UChar *p) {
      98           0 :         return ((uint32_t)p[0] << 16) | p[1];
      99             :     }
     100             : 
     101             :     /**
     102             :      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
     103             :      * Requires that ce32 is special.
     104             :      */
     105             :     uint32_t getIndirectCE32(uint32_t ce32) const;
     106             :     /**
     107             :      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
     108             :      * if ce32 is special.
     109             :      */
     110             :     uint32_t getFinalCE32(uint32_t ce32) const;
     111             : 
     112             :     /**
     113             :      * Computes a CE from c's ce32 which has the OFFSET_TAG.
     114             :      */
     115           0 :     int64_t getCEFromOffsetCE32(UChar32 c, uint32_t ce32) const {
     116           0 :         int64_t dataCE = ces[Collation::indexFromCE32(ce32)];
     117           0 :         return Collation::makeCE(Collation::getThreeBytePrimaryForOffsetData(c, dataCE));
     118             :     }
     119             : 
     120             :     /**
     121             :      * Returns the single CE that c maps to.
     122             :      * Sets U_UNSUPPORTED_ERROR if c does not map to a single CE.
     123             :      */
     124             :     int64_t getSingleCE(UChar32 c, UErrorCode &errorCode) const;
     125             : 
     126             :     /**
     127             :      * Returns the FCD16 value for code point c. c must be >= 0.
     128             :      */
     129           0 :     uint16_t getFCD16(UChar32 c) const {
     130           0 :         return nfcImpl.getFCD16(c);
     131             :     }
     132             : 
     133             :     /**
     134             :      * Returns the first primary for the script's reordering group.
     135             :      * @return the primary with only the first primary lead byte of the group
     136             :      *         (not necessarily an actual root collator primary weight),
     137             :      *         or 0 if the script is unknown
     138             :      */
     139             :     uint32_t getFirstPrimaryForGroup(int32_t script) const;
     140             : 
     141             :     /**
     142             :      * Returns the last primary for the script's reordering group.
     143             :      * @return the last primary of the group
     144             :      *         (not an actual root collator primary weight),
     145             :      *         or 0 if the script is unknown
     146             :      */
     147             :     uint32_t getLastPrimaryForGroup(int32_t script) const;
     148             : 
     149             :     /**
     150             :      * Finds the reordering group which contains the primary weight.
     151             :      * @return the first script of the group, or -1 if the weight is beyond the last group
     152             :      */
     153             :     int32_t getGroupForPrimary(uint32_t p) const;
     154             : 
     155             :     int32_t getEquivalentScripts(int32_t script,
     156             :                                  int32_t dest[], int32_t capacity, UErrorCode &errorCode) const;
     157             : 
     158             :     /**
     159             :      * Writes the permutation of primary-weight ranges
     160             :      * for the given reordering of scripts and groups.
     161             :      * The caller checks for illegal arguments and
     162             :      * takes care of [DEFAULT] and memory allocation.
     163             :      *
     164             :      * Each list element will be a (limit, offset) pair as described
     165             :      * for the CollationSettings::reorderRanges.
     166             :      * The list will be empty if no ranges are reordered.
     167             :      */
     168             :     void makeReorderRanges(const int32_t *reorder, int32_t length,
     169             :                            UVector32 &ranges, UErrorCode &errorCode) const;
     170             : 
     171             :     /** @see jamoCE32s */
     172             :     static const int32_t JAMO_CE32S_LENGTH = 19 + 21 + 27;
     173             : 
     174             :     /** Main lookup trie. */
     175             :     const UTrie2 *trie;
     176             :     /**
     177             :      * Array of CE32 values.
     178             :      * At index 0 there must be CE32(U+0000)
     179             :      * to support U+0000's special-tag for NUL-termination handling.
     180             :      */
     181             :     const uint32_t *ce32s;
     182             :     /** Array of CE values for expansions and OFFSET_TAG. */
     183             :     const int64_t *ces;
     184             :     /** Array of prefix and contraction-suffix matching data. */
     185             :     const UChar *contexts;
     186             :     /** Base collation data, or NULL if this data itself is a base. */
     187             :     const CollationData *base;
     188             :     /**
     189             :      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
     190             :      * They are normally simple CE32s, rarely expansions.
     191             :      * For fast handling of HANGUL_TAG.
     192             :      */
     193             :     const uint32_t *jamoCE32s;
     194             :     const Normalizer2Impl &nfcImpl;  // Bound in the constructor; getFCD16() delegates to it.
     195             :     /** The single-byte primary weight (xx000000) for numeric collation. */
     196             :     uint32_t numericPrimary;
     197             :     /** Presumably the element counts of ce32s, ces and contexts (all 0 after construction); confirm against the data reader. */
     198             :     int32_t ce32sLength;
     199             :     int32_t cesLength;
     200             :     int32_t contextsLength;
     201             : 
     202             :     /** 256 flags for which primary-weight lead bytes are compressible. */
     203             :     const UBool *compressibleBytes;
     204             :     /**
     205             :      * Set of code points that are unsafe for starting string comparison after an identical prefix,
     206             :      * or in backwards CE iteration.
     207             :      */
     208             :     const UnicodeSet *unsafeBackwardSet;
     209             : 
     210             :     /**
     211             :      * Fast Latin table for common-Latin-text string comparisons.
     212             :      * Data structure see class CollationFastLatin.
     213             :      */
     214             :     const uint16_t *fastLatinTable;
     215             :     int32_t fastLatinTableLength;
     216             : 
     217             :     /**
     218             :      * Data for scripts and reordering groups.
     219             :      * Uses include building a reordering permutation table and
     220             :      * providing script boundaries to AlphabeticIndex.
     221             :      */
     222             :     int32_t numScripts;
     223             :     /**
     224             :      * The length of scriptsIndex is numScripts+16.
     225             :      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
     226             :      * 16 special reorder codes (not all used) are mapped starting at numScripts.
     227             :      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
     228             :      * There are special codes at the end for reorder-reserved primary ranges.
     229             :      *
     230             :      * Multiple scripts may share a range and index, for example Hira & Kana.
     231             :      */
     232             :     const uint16_t *scriptsIndex;
     233             :     /**
     234             :      * Start primary weight (top 16 bits only) for a group/script/reserved range
     235             :      * indexed by scriptsIndex.
     236             :      * The first range (separators & terminators) and the last range (trailing weights)
     237             :      * are not reorderable, and no scriptsIndex entry points to them.
     238             :      */
     239             :     const uint16_t *scriptStarts;
     240             :     int32_t scriptStartsLength;
     241             : 
     242             :     /**
     243             :      * Collation elements in the root collator.
     244             :      * Used by the CollationRootElements class. The data structure is described there.
     245             :      * NULL in a tailoring.
     246             :      */
     247             :     const uint32_t *rootElements;
     248             :     int32_t rootElementsLength;
     249             : 
     250             : private:
     251             :     int32_t getScriptIndex(int32_t script) const;
     252             :     void makeReorderRanges(const int32_t *reorder, int32_t length,
     253             :                            UBool latinMustMove,
     254             :                            UVector32 &ranges, UErrorCode &errorCode) const;
     255             :     int32_t addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const;
     256             :     int32_t addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const;
     257             : };
     258             : 
     259             : U_NAMESPACE_END
     260             : 
     261             : #endif  // !UCONFIG_NO_COLLATION
     262             : #endif  // __COLLATIONDATA_H__

Generated by: LCOV version 1.13