LCOV - code coverage report
Current view: top level - intl/icu/source/i18n - coleitr.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 221 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 29 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : * Copyright (C) 1996-2014, International Business Machines Corporation and
       6             : * others. All Rights Reserved.
       7             : *******************************************************************************
       8             : */
       9             : 
      10             : /*
      11             : * File coleitr.cpp
      12             : *
      13             : * Created by: Helena Shih
      14             : *
      15             : * Modification History:
      16             : *
      17             : *  Date      Name        Description
      18             : *
      19             : *  6/23/97   helena      Adding comments to make code more readable.
      20             : * 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
      21             : * 12/10/99   aliu        Ported Thai collation support from Java.
      22             : * 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
      23             : * 02/19/01   swquek      Removed CollationElementIterator() since it is 
      24             : *                        private constructor and no calls are made to it
      25             : * 2012-2014  markus      Rewritten in C++ again.
      26             : */
      27             : 
      28             : #include "unicode/utypes.h"
      29             : 
      30             : #if !UCONFIG_NO_COLLATION
      31             : 
      32             : #include "unicode/chariter.h"
      33             : #include "unicode/coleitr.h"
      34             : #include "unicode/tblcoll.h"
      35             : #include "unicode/ustring.h"
      36             : #include "cmemory.h"
      37             : #include "collation.h"
      38             : #include "collationdata.h"
      39             : #include "collationiterator.h"
      40             : #include "collationsets.h"
      41             : #include "collationtailoring.h"
      42             : #include "uassert.h"
      43             : #include "uhash.h"
      44             : #include "utf16collationiterator.h"
      45             : #include "uvectr32.h"
      46             : 
      47             : /* Constants --------------------------------------------------------------- */
      48             : 
      49             : U_NAMESPACE_BEGIN
      50             : 
      51           0 : UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationElementIterator)  // NOTE(review): presumably emits the class-ID (RTTI) functions for this class; confirm against the macro's definition.
      52             : 
      53             : /* CollationElementIterator public constructor/destructor ------------------ */
      54             : 
      55           0 : CollationElementIterator::CollationElementIterator(
      56           0 :                                          const CollationElementIterator& other) 
      57           0 :         : UObject(other), iter_(NULL), rbc_(NULL), otherHalf_(0), dir_(0), offsets_(NULL) {
                           // Copy constructor. The members are first put into an empty state
                           // (NULL pointers, zero counters); the assignment operator then does
                           // the actual copying, and it deletes iter_ and tests offsets_ for NULL.
      58           0 :     *this = other;
      59           0 : }
      60             : 
      61           0 : CollationElementIterator::~CollationElementIterator()
      62             : {
                           // Frees the two objects this class deletes itself: the collation
                           // iterator and the offsets vector. rbc_ is not deleted here.
      63           0 :     delete iter_;
      64           0 :     delete offsets_;
      65           0 : }
      66             : 
      67             : /* CollationElementIterator public methods --------------------------------- */
      68             : 
      69             : namespace {
      70             : 
                       // File-local helpers that split one 64-bit CE into two 32-bit values.
                       // Callers in this file pass p = upper 32 bits of the CE and
                       // lower32 = lower 32 bits of the CE.
                       // Bits 7..6 of lower32 are used by neither half.

                       // First half: the upper 16 bits of p stay in place, bits 31..24 of lower32
                       // move to bits 15..8, and bits 15..8 of lower32 move to bits 7..0.
      71           0 : uint32_t getFirstHalf(uint32_t p, uint32_t lower32) {
      72           0 :     return (p & 0xffff0000) | ((lower32 >> 16) & 0xff00) | ((lower32 >> 8) & 0xff);
      73             : }
                       // Second half: the lower 16 bits of p move to bits 31..16, bits 23..16 of
                       // lower32 move to bits 15..8, and bits 5..0 of lower32 stay in place.
      74           0 : uint32_t getSecondHalf(uint32_t p, uint32_t lower32) {
      75           0 :     return (p << 16) | ((lower32 >> 8) & 0xff00) | (lower32 & 0x3f);
      76             : }
                       // Returns true if any of the CE bits that feed getSecondHalf() is set
                       // (bits 47..32, 23..16 and 5..0 of the 64-bit CE), i.e. if the second
                       // half of this CE would be non-zero.
      77           0 : UBool ceNeedsTwoParts(int64_t ce) {
      78           0 :     return (ce & INT64_C(0xffff00ff003f)) != 0;
      79             : }
      80             : 
      81             : }  // namespace
      82             : 
      83           0 : int32_t CollationElementIterator::getOffset() const
      84             : {
                           // Returns the text offset for the iterator's current position.
                           // During backward iteration with recorded offsets the value is read
                           // from offsets_; in every other case it is whatever iter_ reports.
      85           0 :     if (dir_ < 0 && offsets_ != NULL && !offsets_->isEmpty()) {
      86             :         // CollationIterator::previousCE() decrements the CEs length
      87             :         // while it pops CEs from its internal buffer.
      88           0 :         int32_t i = iter_->getCEsLength();
      89           0 :         if (otherHalf_ != 0) {
      90             :             // Return the trailing CE offset while we are in the middle of a 64-bit CE.
      91           0 :             ++i;
      92             :         }
                               // The index must stay inside the recorded offsets.
      93           0 :         U_ASSERT(i < offsets_->size());
      94           0 :         return offsets_->elementAti(i);
      95             :     }
      96           0 :     return iter_->getOffset();
      97             : }
      98             : 
      99             : /**
     100             : * Get the ordering priority of the next character in the string.
                       * A collation element that needs two 32-bit values is delivered over two
                       * consecutive calls: the first half now, the continuation half on the
                       * following call.
                       * @param status the error code status; set to U_INVALID_STATE_ERROR when
                       *        called while the iterator is moving backward.
     101             : * @return the next character's ordering. Returns NULLORDER if an error has 
     102             : *         occurred or if the end of string has been reached
     103             : */
     104           0 : int32_t CollationElementIterator::next(UErrorCode& status)
     105             : {
     106           0 :     if (U_FAILURE(status)) { return NULLORDER; }
                           // dir_ values handled below: >1 forward iteration in progress,
                           // 1 just after setOffset(), 0 just after reset()/setText(), <0 backward.
     107           0 :     if (dir_ > 1) {
     108             :         // Continue forward iteration. Test this first.
     109           0 :         if (otherHalf_ != 0) {
                                   // Deliver the continuation half saved by the previous call.
     110           0 :             uint32_t oh = otherHalf_;
     111           0 :             otherHalf_ = 0;
     112           0 :             return oh;
     113             :         }
     114           0 :     } else if (dir_ == 1) {
     115             :         // next() after setOffset()
     116           0 :         dir_ = 2;
     117           0 :     } else if (dir_ == 0) {
     118             :         // The iter_ is already reset to the start of the text.
     119           0 :         dir_ = 2;
     120             :     } else /* dir_ < 0 */ {
     121             :         // illegal change of direction
     122           0 :         status = U_INVALID_STATE_ERROR;
     123           0 :         return NULLORDER;
     124             :     }
     125             :     // No need to keep all CEs in the buffer when we iterate.
     126           0 :     iter_->clearCEsIfNoneRemaining();
     127           0 :     int64_t ce = iter_->nextCE(status);
     128           0 :     if (ce == Collation::NO_CE) { return NULLORDER; }
     129             :     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
     130           0 :     uint32_t p = (uint32_t)(ce >> 32);
     131           0 :     uint32_t lower32 = (uint32_t)ce;
     132           0 :     uint32_t firstHalf = getFirstHalf(p, lower32);
     133           0 :     uint32_t secondHalf = getSecondHalf(p, lower32);
     134           0 :     if (secondHalf != 0) {
                               // Keep the second half for the next call (see the dir_ > 1 branch above).
     135           0 :         otherHalf_ = secondHalf | 0xc0;  // continuation CE
     136             :     }
     137           0 :     return firstHalf;
     138             : }
     139             : 
     140           0 : UBool CollationElementIterator::operator!=(
     141             :                                   const CollationElementIterator& other) const
     142             : {
                           // Defined as the negation of operator==.
     143           0 :     return !(*this == other);
     144             : }
     145             : 
     146           0 : UBool CollationElementIterator::operator==(
     147             :                                     const CollationElementIterator& that) const
     148             : {
                           // Two iterators compare equal when they use the same or an equal
                           // collator, hold the same pending half-CE, have the same normalizeDir()
                           // value, iterate over equal text, and their underlying collation
                           // iterators compare equal. offsets_ does not take part in the comparison.
     149           0 :     if (this == &that) {
     150           0 :         return TRUE;
     151             :     }
     152             : 
     153             :     return
                               // NOTE(review): when the two rbc_ pointers differ, both are
                               // dereferenced; this assumes neither is NULL -- confirm.
     154           0 :         (rbc_ == that.rbc_ || *rbc_ == *that.rbc_) &&
     155           0 :         otherHalf_ == that.otherHalf_ &&
     156           0 :         normalizeDir() == that.normalizeDir() &&
     157           0 :         string_ == that.string_ &&
     158           0 :         *iter_ == *that.iter_;
     159             : }
     160             : 
     161             : /**
     162             : * Get the ordering priority of the previous collation element in the string.
                       * A collation element that needs two 32-bit values is delivered over two
                       * consecutive calls in reverse order: the continuation half first, then
                       * the first half.
     163             : * @param status the error code status; set to U_INVALID_STATE_ERROR when called
                       *        while the iterator is moving forward, or U_MEMORY_ALLOCATION_ERROR
                       *        when the offsets vector cannot be allocated.
     164             : * @return the previous element's ordering. Returns NULLORDER if an error has 
     165             : *         occurred or if the start of string has been reached.
     166             : */
     167           0 : int32_t CollationElementIterator::previous(UErrorCode& status)
     168             : {
     169           0 :     if (U_FAILURE(status)) { return NULLORDER; }
     170           0 :     if (dir_ < 0) {
     171             :         // Continue backwards iteration. Test this first.
     172           0 :         if (otherHalf_ != 0) {
                                   // Deliver the first half saved by the previous call.
     173           0 :             uint32_t oh = otherHalf_;
     174           0 :             otherHalf_ = 0;
     175           0 :             return oh;
     176             :         }
     177           0 :     } else if (dir_ == 0) {
                               // previous() right after reset()/setText(): start from the end of the text.
     178           0 :         iter_->resetToOffset(string_.length());
     179           0 :         dir_ = -1;
     180           0 :     } else if (dir_ == 1) {
     181             :         // previous() after setOffset()
     182           0 :         dir_ = -1;
     183             :     } else /* dir_ > 1 */ {
     184             :         // illegal change of direction
     185           0 :         status = U_INVALID_STATE_ERROR;
     186           0 :         return NULLORDER;
     187             :     }
                           // The offsets vector is created on first backward use.
     188           0 :     if (offsets_ == NULL) {
     189           0 :         offsets_ = new UVector32(status);
     190           0 :         if (offsets_ == NULL) {
     191           0 :             status = U_MEMORY_ALLOCATION_ERROR;
     192           0 :             return NULLORDER;
     193             :         }
     194             :     }
     195             :     // If we already have expansion CEs, then we also have offsets.
     196             :     // Otherwise remember the trailing offset in case we need to
     197             :     // write offsets for an artificial expansion.
     198           0 :     int32_t limitOffset = iter_->getCEsLength() == 0 ? iter_->getOffset() : 0;
     199           0 :     int64_t ce = iter_->previousCE(*offsets_, status);
     200           0 :     if (ce == Collation::NO_CE) { return NULLORDER; }
     201             :     // Turn the 64-bit CE into two old-style 32-bit CEs, without quaternary bits.
     202           0 :     uint32_t p = (uint32_t)(ce >> 32);
     203           0 :     uint32_t lower32 = (uint32_t)ce;
     204           0 :     uint32_t firstHalf = getFirstHalf(p, lower32);
     205           0 :     uint32_t secondHalf = getSecondHalf(p, lower32);
     206           0 :     if (secondHalf != 0) {
     207           0 :         if (offsets_->isEmpty()) {
     208             :             // When we convert a single 64-bit CE into two 32-bit CEs,
     209             :             // we need to make this artificial expansion behave like a normal expansion.
     210             :             // See CollationIterator::previousCE().
                                   // Two entries are recorded: the offset before this CE and the
                                   // trailing offset captured above.
     211           0 :             offsets_->addElement(iter_->getOffset(), status);
     212           0 :             offsets_->addElement(limitOffset, status);
     213             :         }
                               // Going backward, the continuation half comes out first; the first
                               // half is kept for the next call.
     214           0 :         otherHalf_ = firstHalf;
     215           0 :         return secondHalf | 0xc0;  // continuation CE
     216             :     }
     217           0 :     return firstHalf;
     218             : }
     219             : 
     220             : /**
     221             : * Resets the cursor to the beginning of the string.
                       * Also discards any pending half-CE and clears the iteration direction
                       * (dir_ == 0), the state from which both next() and previous() may start.
     222             : */
     223           0 : void CollationElementIterator::reset()
     224             : {
     225           0 :     iter_ ->resetToOffset(0);
     226           0 :     otherHalf_ = 0;
     227           0 :     dir_ = 0;
     228           0 : }
     229             : 
     230           0 : void CollationElementIterator::setOffset(int32_t newOffset, 
     231             :                                          UErrorCode& status)
     232             : {
                           // Positions the iterator at newOffset, or at the last safe offset no
                           // greater than newOffset when newOffset falls on characters that the
                           // collator reports as unsafe. Clears any pending half-CE and sets
                           // dir_ to 1, the state from which both next() and previous() may continue.
                           // Does nothing if status already indicates a failure.
     233           0 :     if (U_FAILURE(status)) { return; }
                           // Only offsets strictly inside the text are adjusted.
     234           0 :     if (0 < newOffset && newOffset < string_.length()) {
     235           0 :         int32_t offset = newOffset;
                               // Step backward while the character at offset is unsafe.
     236           0 :         do {
     237           0 :             UChar c = string_.charAt(offset);
                                   // Stop at a safe code unit; a lead surrogate is additionally
                                   // judged by the whole code point starting at this offset.
     238           0 :             if (!rbc_->isUnsafe(c) ||
     239           0 :                     (U16_IS_LEAD(c) && !rbc_->isUnsafe(string_.char32At(offset)))) {
     240           0 :                 break;
     241             :             }
     242             :             // Back up to before this unsafe character.
     243           0 :             --offset;
     244           0 :         } while (offset > 0);
     245           0 :         if (offset < newOffset) {
     246             :             // We might have backed up more than necessary.
     247             :             // For example, contractions "ch" and "cu" make both 'h' and 'u' unsafe,
     248             :             // but for text "chu" setOffset(2) should remain at 2
     249             :             // although we initially back up to offset 0.
     250             :             // Find the last safe offset no greater than newOffset by iterating forward.
     251           0 :             int32_t lastSafeOffset = offset;
     252           0 :             do {
     253           0 :                 iter_->resetToOffset(lastSafeOffset);
                                       // Read CEs until the text offset moves away from lastSafeOffset.
     254           0 :                 do {
     255           0 :                     iter_->nextCE(status);
     256           0 :                     if (U_FAILURE(status)) { return; }
     257           0 :                 } while ((offset = iter_->getOffset()) == lastSafeOffset);
                                       // Accept the new boundary only if it does not pass the requested offset.
     258           0 :                 if (offset <= newOffset) {
     259           0 :                     lastSafeOffset = offset;
     260             :                 }
     261           0 :             } while (offset < newOffset);
     262           0 :             newOffset = lastSafeOffset;
     263             :         }
     264             :     }
     265           0 :     iter_->resetToOffset(newOffset);
     266           0 :     otherHalf_ = 0;
     267           0 :     dir_ = 1;
     268             : }
     269             : 
     270             : /**
     271             : * Sets the source to the new source string.
                       * The string is copied into string_ and a new collation iterator is created
                       * over that copy; the pending half-CE and the direction are cleared.
                       * @param source the text to iterate over.
                       * @param status the error code status; set to U_MEMORY_ALLOCATION_ERROR if
                       *        the new iterator cannot be allocated.
     272             : */
     273           0 : void CollationElementIterator::setText(const UnicodeString& source,
     274             :                                        UErrorCode& status)
     275             : {
     276           0 :     if (U_FAILURE(status)) {
     277           0 :         return;
     278             :     }
     279             : 
     280           0 :     string_ = source;
     281           0 :     const UChar *s = string_.getBuffer();
     282             :     CollationIterator *newIter;
     283           0 :     UBool numeric = rbc_->settings->isNumeric();
                           // The collator settings select the iterator type: the plain UTF-16
                           // iterator when dontCheckFCD() is set, the FCD variant otherwise.
     284           0 :     if (rbc_->settings->dontCheckFCD()) {
     285           0 :         newIter = new UTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
     286             :     } else {
     287           0 :         newIter = new FCDUTF16CollationIterator(rbc_->data, numeric, s, s, s + string_.length());
     288             :     }
     289           0 :     if (newIter == NULL) {
                               // NOTE(review): string_ has already been replaced at this point while
                               // the old iter_ is kept; confirm that the old iterator does not
                               // still refer to the previous text buffer.
     290           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     291           0 :         return;
     292             :     }
                           // Swap in the new iterator only after it was created successfully.
     293           0 :     delete iter_;
     294           0 :     iter_ = newIter;
     295           0 :     otherHalf_ = 0;
     296           0 :     dir_ = 0;
     297             : }
     298             : 
     299             : // Sets the source to the text of the new character iterator.
                       // The text is first extracted into string_ with source.getText(), then the
                       // UnicodeString overload of setText() builds the iterator over it.
     300           0 : void CollationElementIterator::setText(CharacterIterator& source, 
     301             :                                        UErrorCode& status)
     302             : {
     303           0 :     if (U_FAILURE(status)) 
     304           0 :         return;
     305             : 
     306           0 :     source.getText(string_);
                           // string_ is passed as the source here, so the overload's "string_ = source"
                           // is a self-assignment.
     307           0 :     setText(string_, status);
     308             : }
     309             : 
     310           0 : int32_t CollationElementIterator::strengthOrder(int32_t order) const
     311             : {
                           // Reduces an order value to the bits that matter at the collator's strength:
                           // primary strength keeps the upper 16 bits, secondary strength keeps the
                           // upper 24 bits, and any other strength returns the value unchanged.
     312           0 :     UColAttributeValue s = (UColAttributeValue)rbc_->settings->getStrength();
     313             :     // Mask off the unwanted differences.
     314           0 :     if (s == UCOL_PRIMARY) {
     315           0 :         order &= 0xffff0000;
     316             :     }
     317           0 :     else if (s == UCOL_SECONDARY) {
     318           0 :         order &= 0xffffff00;
     319             :     }
     320             : 
     321           0 :     return order;
     322             : }
     323             : 
     324             : /* CollationElementIterator private constructors/destructors --------------- */
     325             : 
     326             : /** 
     327             : * This is the "real" constructor for this class; it constructs an iterator
     328             : * over the source text using the specified collator
                       * @param source the text to iterate over; it is handed to setText().
                       * @param coll the collator; only the pointer is stored (in rbc_).
                       * @param status the error code status, passed on to setText().
     329             : */
     330           0 : CollationElementIterator::CollationElementIterator(
     331             :                                                const UnicodeString &source,
     332             :                                                const RuleBasedCollator *coll,
     333           0 :                                                UErrorCode &status)
     334           0 :         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
                           // iter_ starts out NULL; setText() creates the actual collation iterator.
     335           0 :     setText(source, status);
     336           0 : }
     337             : 
     338             : /** 
     339             : * This is the "real" constructor for this class; it constructs an iterator over 
     340             : * the text of the source character iterator using the specified collator
                       * @param source the character iterator that supplies the text; it is handed
                       *        to setText().
                       * @param coll the collator; only the pointer is stored (in rbc_).
                       * @param status the error code status, passed on to setText().
     341             : */
     342           0 : CollationElementIterator::CollationElementIterator(
     343             :                                            const CharacterIterator &source,
     344             :                                            const RuleBasedCollator *coll,
     345           0 :                                            UErrorCode &status)
     346           0 :         : iter_(NULL), rbc_(coll), otherHalf_(0), dir_(0), offsets_(NULL) {
     347             :     // We only call source.getText() which should be const anyway.
     348           0 :     setText(const_cast<CharacterIterator &>(source), status);
     349           0 : }
     350             : 
     351             : /* CollationElementIterator private methods -------------------------------- */
     352             : 
     353           0 : const CollationElementIterator& CollationElementIterator::operator=(
     354             :                                          const CollationElementIterator& other)
     355             : {
                           // Assignment: clones other's collation iterator according to its concrete
                           // type, then copies the collator pointer, the pending half-CE, the
                           // direction and the text. If other's iterator is of neither tested type,
                           // or the clone is NULL, those members are left unchanged.
     356           0 :     if (this == &other) {
     357           0 :         return *this;
     358             :     }
     359             : 
     360             :     CollationIterator *newIter;
                           // NOTE(review): both clone calls below receive string_.getBuffer() from
                           // this object's string_ as it is *before* "string_ = other.string_" runs
                           // further down. If the cloned iterator keeps pointing into that buffer,
                           // it may refer to stale or released text once string_ is reassigned --
                           // confirm against the iterator copy constructors.
     361             :     const FCDUTF16CollationIterator *otherFCDIter =
     362           0 :             dynamic_cast<const FCDUTF16CollationIterator *>(other.iter_);
     363           0 :     if(otherFCDIter != NULL) {
     364           0 :         newIter = new FCDUTF16CollationIterator(*otherFCDIter, string_.getBuffer());
     365             :     } else {
     366             :         const UTF16CollationIterator *otherIter =
     367           0 :                 dynamic_cast<const UTF16CollationIterator *>(other.iter_);
     368           0 :         if(otherIter != NULL) {
     369           0 :             newIter = new UTF16CollationIterator(*otherIter, string_.getBuffer());
     370             :         } else {
     371           0 :             newIter = NULL;
     372             :         }
     373             :     }
                           // Replace the current state only when a new iterator is available.
     374           0 :     if(newIter != NULL) {
     375           0 :         delete iter_;
     376           0 :         iter_ = newIter;
     377           0 :         rbc_ = other.rbc_;
     378           0 :         otherHalf_ = other.otherHalf_;
     379           0 :         dir_ = other.dir_;
     380             : 
     381           0 :         string_ = other.string_;
     382             :     }
                           // Offsets are copied only when other is iterating backward and has
                           // recorded some; otherwise this object's offsets_ is left as it was.
                           // NOTE(review): the local errorCode is never examined, so a failed
                           // allocation or assign() goes unreported -- confirm this is intended.
     383           0 :     if(other.dir_ < 0 && other.offsets_ != NULL && !other.offsets_->isEmpty()) {
     384           0 :         UErrorCode errorCode = U_ZERO_ERROR;
     385           0 :         if(offsets_ == NULL) {
     386           0 :             offsets_ = new UVector32(other.offsets_->size(), errorCode);
     387             :         }
     388           0 :         if(offsets_ != NULL) {
     389           0 :             offsets_->assign(*other.offsets_, errorCode);
     390             :         }
     391             :     }
     392           0 :     return *this;
     393             : }
     394             : 
     395             : namespace {
     396             : 
                       // CESink that records, keyed by the last 32-bit "half" of an expansion, the
                       // largest number of halves seen in any expansion that ends with that value.
     397             : class MaxExpSink : public ContractionsAndExpansions::CESink {
     398             : public:
                           // h is the hash table that handleExpansion() updates; ec is the error
                           // code handed to uhash_iputi(). Both are stored, not copied.
     399           0 :     MaxExpSink(UHashtable *h, UErrorCode &ec) : maxExpansions(h), errorCode(ec) {}
     400             :     virtual ~MaxExpSink();
                           // Single CEs are ignored.
     401           0 :     virtual void handleCE(int64_t /*ce*/) {}
                           // Counts the 32-bit halves of the expansion ces[0..length) and stores the
                           // count under the last half if it exceeds the value currently in the table.
     402           0 :     virtual void handleExpansion(const int64_t ces[], int32_t length) {
     403           0 :         if (length <= 1) {
     404             :             // We do not need to add single CEs into the map.
     405           0 :             return;
     406             :         }
     407           0 :         int32_t count = 0;  // number of CE "halves"
     408           0 :         for (int32_t i = 0; i < length; ++i) {
     409           0 :             count += ceNeedsTwoParts(ces[i]) ? 2 : 1;
     410             :         }
     411             :         // last "half" of the last CE
     412           0 :         int64_t ce = ces[length - 1];
     413           0 :         uint32_t p = (uint32_t)(ce >> 32);
     414           0 :         uint32_t lower32 = (uint32_t)ce;
     415           0 :         uint32_t lastHalf = getSecondHalf(p, lower32);
     416           0 :         if (lastHalf == 0) {
                                   // The last CE has no second half, so its first half is the key.
     417           0 :             lastHalf = getFirstHalf(p, lower32);
     418           0 :             U_ASSERT(lastHalf != 0);
     419             :         } else {
     420           0 :             lastHalf |= 0xc0;  // old-style continuation CE
     421             :         }
                               // Keep only the maximum count per key.
     422           0 :         if (count > uhash_igeti(maxExpansions, (int32_t)lastHalf)) {
     423           0 :             uhash_iputi(maxExpansions, (int32_t)lastHalf, count, &errorCode);
     424             :         }
     425             :     }
     426             : 
     427             : private:
     428             :     UHashtable *maxExpansions;
     429             :     UErrorCode &errorCode;
     430             : };
     431             : 
                       // Defined out of line, with an empty body.
     432           0 : MaxExpSink::~MaxExpSink() {}
     433             : 
     434             : }  // namespace
     435             : 
     436             : UHashtable *
     437           0 : CollationElementIterator::computeMaxExpansions(const CollationData *data, UErrorCode &errorCode) {
                           // Builds the hash table of maximum expansion lengths for the given
                           // collation data by feeding its expansions through a MaxExpSink.
                           // Returns NULL on failure; the table opened here is closed again if
                           // forData() fails.
     438           0 :     if (U_FAILURE(errorCode)) { return NULL; }
     439             :     UHashtable *maxExpansions = uhash_open(uhash_hashLong, uhash_compareLong,
     440           0 :                                            uhash_compareLong, &errorCode);
     441           0 :     if (U_FAILURE(errorCode)) { return NULL; }
     442           0 :     MaxExpSink sink(maxExpansions, errorCode);
     443           0 :     ContractionsAndExpansions(NULL, NULL, &sink, TRUE).forData(data, errorCode);
     444           0 :     if (U_FAILURE(errorCode)) {
     445           0 :         uhash_close(maxExpansions);
     446           0 :         return NULL;
     447             :     }
     448           0 :     return maxExpansions;
     449             : }
     450             : 
     451             : int32_t
     452           0 : CollationElementIterator::getMaxExpansion(int32_t order) const {
                           // Instance form: looks the order up in the maxExpansions table of this
                           // iterator's collator tailoring via the static overload.
     453           0 :     return getMaxExpansion(rbc_->tailoring->maxExpansions, order);
     454             : }
     455             : 
     456             : int32_t
     457           0 : CollationElementIterator::getMaxExpansion(const UHashtable *maxExpansions, int32_t order) {
                           // Returns the maximum expansion length recorded for order.
                           // An order of 0 yields 1. A non-zero table entry is returned as is.
                           // Without a table entry the result is 2 for an old-style continuation CE
                           // (both 0xc0 bits set) and 1 for anything else.
     458           0 :     if (order == 0) { return 1; }
     459             :     int32_t max;
                           // A NULL table is allowed and simply skips the lookup.
     460           0 :     if(maxExpansions != NULL && (max = uhash_igeti(maxExpansions, order)) != 0) {
     461           0 :         return max;
     462             :     }
     463           0 :     if ((order & 0xc0) == 0xc0) {
     464             :         // old-style continuation CE
     465           0 :         return 2;
     466             :     } else {
     467           0 :         return 1;
     468             :     }
     469             : }
     470             : 
     471             : U_NAMESPACE_END
     472             : 
     473             : #endif /* #if !UCONFIG_NO_COLLATION */

Generated by: LCOV version 1.13