LCOV - code coverage report
Current view: top level - intl/icu/source/common - normlzr.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 233 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 39 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             :  *************************************************************************
       5             :  * COPYRIGHT: 
       6             :  * Copyright (c) 1996-2012, International Business Machines Corporation and
       7             :  * others. All Rights Reserved.
       8             :  *************************************************************************
       9             :  */
      10             : 
      11             : #include "unicode/utypes.h"
      12             : 
      13             : #if !UCONFIG_NO_NORMALIZATION
      14             : 
      15             : #include "unicode/uniset.h"
      16             : #include "unicode/unistr.h"
      17             : #include "unicode/chariter.h"
      18             : #include "unicode/schriter.h"
      19             : #include "unicode/uchriter.h"
      20             : #include "unicode/normlzr.h"
      21             : #include "unicode/utf16.h"
      22             : #include "cmemory.h"
      23             : #include "normalizer2impl.h"
      24             : #include "uprops.h"  // for uniset_getUnicode32Instance()
      25             : 
      26             : #if defined(_ARM64_) && defined(move32)
      27             :  // System can define move32 intrinsics, but the char iters define move32 method
      28             :  // using same undef trick in headers, so undef here to re-enable the method.
      29             : #undef move32
      30             : #endif
      31             : 
      32             : U_NAMESPACE_BEGIN
      33             : 
      34           0 : UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
      35             : 
      36             : //-------------------------------------------------------------------------
      37             : // Constructors and other boilerplate
      38             : //-------------------------------------------------------------------------
      39             : 
      40           0 : Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
      41             :     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
      42           0 :     text(new StringCharacterIterator(str)),
      43             :     currentIndex(0), nextIndex(0),
      44           0 :     buffer(), bufferPos(0)
      45             : {
      46           0 :     init();
      47           0 : }
      48             : 
      49           0 : Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
      50             :     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
      51           0 :     text(new UCharCharacterIterator(str, length)),
      52             :     currentIndex(0), nextIndex(0),
      53           0 :     buffer(), bufferPos(0)
      54             : {
      55           0 :     init();
      56           0 : }
      57             : 
      58           0 : Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
      59             :     UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
      60           0 :     text(iter.clone()),
      61             :     currentIndex(0), nextIndex(0),
      62           0 :     buffer(), bufferPos(0)
      63             : {
      64           0 :     init();
      65           0 : }
      66             : 
      67           0 : Normalizer::Normalizer(const Normalizer &copy) :
      68           0 :     UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
      69           0 :     text(copy.text->clone()),
      70           0 :     currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
      71           0 :     buffer(copy.buffer), bufferPos(copy.bufferPos)
      72             : {
      73           0 :     init();
      74           0 : }
      75             : 
      76             : void
      77           0 : Normalizer::init() {
      78           0 :     UErrorCode errorCode=U_ZERO_ERROR;
      79           0 :     fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
      80           0 :     if(fOptions&UNORM_UNICODE_3_2) {
      81           0 :         delete fFilteredNorm2;
      82           0 :         fNorm2=fFilteredNorm2=
      83           0 :             new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
      84             :     }
      85           0 :     if(U_FAILURE(errorCode)) {
      86           0 :         errorCode=U_ZERO_ERROR;
      87           0 :         fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
      88             :     }
      89           0 : }
      90             : 
      91           0 : Normalizer::~Normalizer()
      92             : {
      93           0 :     delete fFilteredNorm2;
      94           0 :     delete text;
      95           0 : }
      96             : 
      97             : Normalizer* 
      98           0 : Normalizer::clone() const
      99             : {
     100           0 :     return new Normalizer(*this);
     101             : }
     102             : 
     103             : /**
     104             :  * Generates a hash code for this iterator.
     105             :  */
     106           0 : int32_t Normalizer::hashCode() const
     107             : {
     108           0 :     return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
     109             : }
     110             :     
     111           0 : UBool Normalizer::operator==(const Normalizer& that) const
     112             : {
     113             :     return
     114           0 :         this==&that ||
     115           0 :         (fUMode==that.fUMode &&
     116           0 :         fOptions==that.fOptions &&
     117           0 :         *text==*that.text &&
     118           0 :         buffer==that.buffer &&
     119           0 :         bufferPos==that.bufferPos &&
     120           0 :         nextIndex==that.nextIndex);
     121             : }
     122             : 
     123             : //-------------------------------------------------------------------------
     124             : // Static utility methods
     125             : //-------------------------------------------------------------------------
     126             : 
     127             : void U_EXPORT2
     128           0 : Normalizer::normalize(const UnicodeString& source, 
     129             :                       UNormalizationMode mode, int32_t options,
     130             :                       UnicodeString& result, 
     131             :                       UErrorCode &status) {
     132           0 :     if(source.isBogus() || U_FAILURE(status)) {
     133           0 :         result.setToBogus();
     134           0 :         if(U_SUCCESS(status)) {
     135           0 :             status=U_ILLEGAL_ARGUMENT_ERROR;
     136             :         }
     137             :     } else {
     138           0 :         UnicodeString localDest;
     139             :         UnicodeString *dest;
     140             : 
     141           0 :         if(&source!=&result) {
     142           0 :             dest=&result;
     143             :         } else {
     144             :             // the source and result strings are the same object, use a temporary one
     145           0 :             dest=&localDest;
     146             :         }
     147           0 :         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
     148           0 :         if(U_SUCCESS(status)) {
     149           0 :             if(options&UNORM_UNICODE_3_2) {
     150           0 :                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
     151           0 :                     normalize(source, *dest, status);
     152             :             } else {
     153           0 :                 n2->normalize(source, *dest, status);
     154             :             }
     155             :         }
     156           0 :         if(dest==&localDest && U_SUCCESS(status)) {
     157           0 :             result=*dest;
     158             :         }
     159             :     }
     160           0 : }
     161             : 
     162             : void U_EXPORT2
     163           0 : Normalizer::compose(const UnicodeString& source, 
     164             :                     UBool compat, int32_t options,
     165             :                     UnicodeString& result, 
     166             :                     UErrorCode &status) {
     167           0 :     normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
     168           0 : }
     169             : 
     170             : void U_EXPORT2
     171           0 : Normalizer::decompose(const UnicodeString& source, 
     172             :                       UBool compat, int32_t options,
     173             :                       UnicodeString& result, 
     174             :                       UErrorCode &status) {
     175           0 :     normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
     176           0 : }
     177             : 
     178             : UNormalizationCheckResult
     179           0 : Normalizer::quickCheck(const UnicodeString& source,
     180             :                        UNormalizationMode mode, int32_t options,
     181             :                        UErrorCode &status) {
     182           0 :     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
     183           0 :     if(U_SUCCESS(status)) {
     184           0 :         if(options&UNORM_UNICODE_3_2) {
     185           0 :             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
     186           0 :                 quickCheck(source, status);
     187             :         } else {
     188           0 :             return n2->quickCheck(source, status);
     189             :         }
     190             :     } else {
     191           0 :         return UNORM_MAYBE;
     192             :     }
     193             : }
     194             : 
     195             : UBool
     196           0 : Normalizer::isNormalized(const UnicodeString& source,
     197             :                          UNormalizationMode mode, int32_t options,
     198             :                          UErrorCode &status) {
     199           0 :     const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
     200           0 :     if(U_SUCCESS(status)) {
     201           0 :         if(options&UNORM_UNICODE_3_2) {
     202           0 :             return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
     203           0 :                 isNormalized(source, status);
     204             :         } else {
     205           0 :             return n2->isNormalized(source, status);
     206             :         }
     207             :     } else {
     208           0 :         return FALSE;
     209             :     }
     210             : }
     211             : 
     212             : UnicodeString & U_EXPORT2
     213           0 : Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
     214             :                         UnicodeString &result,
     215             :                         UNormalizationMode mode, int32_t options,
     216             :                         UErrorCode &errorCode) {
     217           0 :     if(left.isBogus() || right.isBogus() || U_FAILURE(errorCode)) {
     218           0 :         result.setToBogus();
     219           0 :         if(U_SUCCESS(errorCode)) {
     220           0 :             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     221             :         }
     222             :     } else {
     223           0 :         UnicodeString localDest;
     224             :         UnicodeString *dest;
     225             : 
     226           0 :         if(&right!=&result) {
     227           0 :             dest=&result;
     228             :         } else {
     229             :             // the right and result strings are the same object, use a temporary one
     230           0 :             dest=&localDest;
     231             :         }
     232           0 :         *dest=left;
     233           0 :         const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
     234           0 :         if(U_SUCCESS(errorCode)) {
     235           0 :             if(options&UNORM_UNICODE_3_2) {
     236           0 :                 FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
     237           0 :                     append(*dest, right, errorCode);
     238             :             } else {
     239           0 :                 n2->append(*dest, right, errorCode);
     240             :             }
     241             :         }
     242           0 :         if(dest==&localDest && U_SUCCESS(errorCode)) {
     243           0 :             result=*dest;
     244             :         }
     245             :     }
     246           0 :     return result;
     247             : }
     248             : 
     249             : //-------------------------------------------------------------------------
     250             : // Iteration API
     251             : //-------------------------------------------------------------------------
     252             : 
     253             : /**
     254             :  * Return the current character in the normalized text.
     255             :  */
     256           0 : UChar32 Normalizer::current() {
     257           0 :     if(bufferPos<buffer.length() || nextNormalize()) {
     258           0 :         return buffer.char32At(bufferPos);
     259             :     } else {
     260           0 :         return DONE;
     261             :     }
     262             : }
     263             : 
     264             : /**
     265             :  * Return the next character in the normalized text and advance
     266             :  * the iteration position by one.  If the end
     267             :  * of the text has already been reached, {@link #DONE} is returned.
     268             :  */
     269           0 : UChar32 Normalizer::next() {
     270           0 :     if(bufferPos<buffer.length() ||  nextNormalize()) {
     271           0 :         UChar32 c=buffer.char32At(bufferPos);
     272           0 :         bufferPos+=U16_LENGTH(c);
     273           0 :         return c;
     274             :     } else {
     275           0 :         return DONE;
     276             :     }
     277             : }
     278             : 
     279             : /**
     280             :  * Return the previous character in the normalized text and decrement
     281             :  * the iteration position by one.  If the beginning
     282             :  * of the text has already been reached, {@link #DONE} is returned.
     283             :  */
     284           0 : UChar32 Normalizer::previous() {
     285           0 :     if(bufferPos>0 || previousNormalize()) {
     286           0 :         UChar32 c=buffer.char32At(bufferPos-1);
     287           0 :         bufferPos-=U16_LENGTH(c);
     288           0 :         return c;
     289             :     } else {
     290           0 :         return DONE;
     291             :     }
     292             : }
     293             : 
     294           0 : void Normalizer::reset() {
     295           0 :     currentIndex=nextIndex=text->setToStart();
     296           0 :     clearBuffer();
     297           0 : }
     298             : 
     299             : void
     300           0 : Normalizer::setIndexOnly(int32_t index) {
     301           0 :     text->setIndex(index);  // pins index
     302           0 :     currentIndex=nextIndex=text->getIndex();
     303           0 :     clearBuffer();
     304           0 : }
     305             : 
     306             : /**
     307             :  * Return the first character in the normalized text.  This resets
     308             :  * the <tt>Normalizer's</tt> position to the beginning of the text.
     309             :  */
     310           0 : UChar32 Normalizer::first() {
     311           0 :     reset();
     312           0 :     return next();
     313             : }
     314             : 
     315             : /**
     316             :  * Return the last character in the normalized text.  This resets
     317             :  * the <tt>Normalizer's</tt> position to be just before the
     318             :  * the input text corresponding to that normalized character.
     319             :  */
     320           0 : UChar32 Normalizer::last() {
     321           0 :     currentIndex=nextIndex=text->setToEnd();
     322           0 :     clearBuffer();
     323           0 :     return previous();
     324             : }
     325             : 
     326             : /**
     327             :  * Retrieve the current iteration position in the input text that is
     328             :  * being normalized.  This method is useful in applications such as
     329             :  * searching, where you need to be able to determine the position in
     330             :  * the input text that corresponds to a given normalized output character.
     331             :  * <p>
     332             :  * <b>Note:</b> This method sets the position in the <em>input</em>, while
     333             :  * {@link #next} and {@link #previous} iterate through characters in the
     334             :  * <em>output</em>.  This means that there is not necessarily a one-to-one
     335             :  * correspondence between characters returned by <tt>next</tt> and
     336             :  * <tt>previous</tt> and the indices passed to and returned from
     337             :  * <tt>setIndex</tt> and {@link #getIndex}.
     338             :  *
     339             :  */
     340           0 : int32_t Normalizer::getIndex() const {
     341           0 :     if(bufferPos<buffer.length()) {
     342           0 :         return currentIndex;
     343             :     } else {
     344           0 :         return nextIndex;
     345             :     }
     346             : }
     347             : 
     348             : /**
     349             :  * Retrieve the index of the start of the input text.  This is the begin index
     350             :  * of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
     351             :  * over which this <tt>Normalizer</tt> is iterating
     352             :  */
     353           0 : int32_t Normalizer::startIndex() const {
     354           0 :     return text->startIndex();
     355             : }
     356             : 
     357             : /**
     358             :  * Retrieve the index of the end of the input text.  This is the end index
     359             :  * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
     360             :  * over which this <tt>Normalizer</tt> is iterating
     361             :  */
     362           0 : int32_t Normalizer::endIndex() const {
     363           0 :     return text->endIndex();
     364             : }
     365             : 
     366             : //-------------------------------------------------------------------------
     367             : // Property access methods
     368             : //-------------------------------------------------------------------------
     369             : 
     370             : void
     371           0 : Normalizer::setMode(UNormalizationMode newMode) 
     372             : {
     373           0 :     fUMode = newMode;
     374           0 :     init();
     375           0 : }
     376             : 
     377             : UNormalizationMode
     378           0 : Normalizer::getUMode() const
     379             : {
     380           0 :     return fUMode;
     381             : }
     382             : 
     383             : void
     384           0 : Normalizer::setOption(int32_t option, 
     385             :                       UBool value) 
     386             : {
     387           0 :     if (value) {
     388           0 :         fOptions |= option;
     389             :     } else {
     390           0 :         fOptions &= (~option);
     391             :     }
     392           0 :     init();
     393           0 : }
     394             : 
     395             : UBool
     396           0 : Normalizer::getOption(int32_t option) const
     397             : {
     398           0 :     return (fOptions & option) != 0;
     399             : }
     400             : 
     401             : /**
     402             :  * Set the input text over which this <tt>Normalizer</tt> will iterate.
     403             :  * The iteration position is set to the beginning of the input text.
     404             :  */
     405             : void
     406           0 : Normalizer::setText(const UnicodeString& newText, 
     407             :                     UErrorCode &status)
     408             : {
     409           0 :     if (U_FAILURE(status)) {
     410           0 :         return;
     411             :     }
     412           0 :     CharacterIterator *newIter = new StringCharacterIterator(newText);
     413           0 :     if (newIter == NULL) {
     414           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     415           0 :         return;
     416             :     }
     417           0 :     delete text;
     418           0 :     text = newIter;
     419           0 :     reset();
     420             : }
     421             : 
     422             : /**
     423             :  * Set the input text over which this <tt>Normalizer</tt> will iterate.
     424             :  * The iteration position is set to the beginning of the string.
     425             :  */
     426             : void
     427           0 : Normalizer::setText(const CharacterIterator& newText, 
     428             :                     UErrorCode &status) 
     429             : {
     430           0 :     if (U_FAILURE(status)) {
     431           0 :         return;
     432             :     }
     433           0 :     CharacterIterator *newIter = newText.clone();
     434           0 :     if (newIter == NULL) {
     435           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     436           0 :         return;
     437             :     }
     438           0 :     delete text;
     439           0 :     text = newIter;
     440           0 :     reset();
     441             : }
     442             : 
     443             : void
     444           0 : Normalizer::setText(ConstChar16Ptr newText,
     445             :                     int32_t length,
     446             :                     UErrorCode &status)
     447             : {
     448           0 :     if (U_FAILURE(status)) {
     449           0 :         return;
     450             :     }
     451           0 :     CharacterIterator *newIter = new UCharCharacterIterator(newText, length);
     452           0 :     if (newIter == NULL) {
     453           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     454           0 :         return;
     455             :     }
     456           0 :     delete text;
     457           0 :     text = newIter;
     458           0 :     reset();
     459             : }
     460             : 
     461             : /**
     462             :  * Copies the text under iteration into the UnicodeString referred to by "result".
     463             :  * @param result Receives a copy of the text under iteration.
     464             :  */
     465             : void
     466           0 : Normalizer::getText(UnicodeString&  result) 
     467             : {
     468           0 :     text->getText(result);
     469           0 : }
     470             : 
     471             : //-------------------------------------------------------------------------
     472             : // Private utility methods
     473             : //-------------------------------------------------------------------------
     474             : 
     475           0 : void Normalizer::clearBuffer() {
     476           0 :     buffer.remove();
     477           0 :     bufferPos=0;
     478           0 : }
     479             : 
     480             : UBool
     481           0 : Normalizer::nextNormalize() {
     482           0 :     clearBuffer();
     483           0 :     currentIndex=nextIndex;
     484           0 :     text->setIndex(nextIndex);
     485           0 :     if(!text->hasNext()) {
     486           0 :         return FALSE;
     487             :     }
     488             :     // Skip at least one character so we make progress.
     489           0 :     UnicodeString segment(text->next32PostInc());
     490           0 :     while(text->hasNext()) {
     491             :         UChar32 c;
     492           0 :         if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
     493           0 :             text->move32(-1, CharacterIterator::kCurrent);
     494           0 :             break;
     495             :         }
     496           0 :         segment.append(c);
     497             :     }
     498           0 :     nextIndex=text->getIndex();
     499           0 :     UErrorCode errorCode=U_ZERO_ERROR;
     500           0 :     fNorm2->normalize(segment, buffer, errorCode);
     501           0 :     return U_SUCCESS(errorCode) && !buffer.isEmpty();
     502             : }
     503             : 
     504             : UBool
     505           0 : Normalizer::previousNormalize() {
     506           0 :     clearBuffer();
     507           0 :     nextIndex=currentIndex;
     508           0 :     text->setIndex(currentIndex);
     509           0 :     if(!text->hasPrevious()) {
     510           0 :         return FALSE;
     511             :     }
     512           0 :     UnicodeString segment;
     513           0 :     while(text->hasPrevious()) {
     514           0 :         UChar32 c=text->previous32();
     515           0 :         segment.insert(0, c);
     516           0 :         if(fNorm2->hasBoundaryBefore(c)) {
     517           0 :             break;
     518             :         }
     519             :     }
     520           0 :     currentIndex=text->getIndex();
     521           0 :     UErrorCode errorCode=U_ZERO_ERROR;
     522           0 :     fNorm2->normalize(segment, buffer, errorCode);
     523           0 :     bufferPos=buffer.length();
     524           0 :     return U_SUCCESS(errorCode) && !buffer.isEmpty();
     525             : }
     526             : 
     527             : U_NAMESPACE_END
     528             : 
     529             : #endif /* #if !UCONFIG_NO_NORMALIZATION */

Generated by: LCOV version 1.13