LCOV - output.info - intl/icu/source/common/normalizer2impl.cpp

LCOV - code coverage report

Current view:	top level - intl/icu/source/common - normalizer2impl.cpp (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	38	1064	3.6 %
Date:	2017-07-14 16:53:18	Functions:	2	64	3.1 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : *
       6             : *   Copyright (C) 2009-2014, International Business Machines
       7             : *   Corporation and others.  All Rights Reserved.
       8             : *
       9             : *******************************************************************************
      10             : *   file name:  normalizer2impl.cpp
      11             : *   encoding:   UTF-8
      12             : *   tab size:   8 (not used)
      13             : *   indentation:4
      14             : *
      15             : *   created on: 2009nov22
      16             : *   created by: Markus W. Scherer
      17             : */
      18             : 
      19             : #include "unicode/utypes.h"
      20             : 
      21             : #if !UCONFIG_NO_NORMALIZATION
      22             : 
      23             : #include "unicode/normalizer2.h"
      24             : #include "unicode/udata.h"
      25             : #include "unicode/ustring.h"
      26             : #include "unicode/utf16.h"
      27             : #include "cmemory.h"
      28             : #include "mutex.h"
      29             : #include "normalizer2impl.h"
      30             : #include "putilimp.h"
      31             : #include "uassert.h"
      32             : #include "uset_imp.h"
      33             : #include "utrie2.h"
      34             : #include "uvector.h"
      35             : 
      36             : U_NAMESPACE_BEGIN
      37             : 
      38             : // ReorderingBuffer -------------------------------------------------------- ***
      39             : 
      40           0 : UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) {
      41           0 :     int32_t length=str.length();
      42           0 :     start=str.getBuffer(destCapacity);
      43           0 :     if(start==NULL) {
      44             :         // getBuffer() already did str.setToBogus()
      45           0 :         errorCode=U_MEMORY_ALLOCATION_ERROR;
      46           0 :         return FALSE;
      47             :     }
      48           0 :     limit=start+length;
      49           0 :     remainingCapacity=str.getCapacity()-length;
      50           0 :     reorderStart=start;
      51           0 :     if(start==limit) {
      52           0 :         lastCC=0;
      53             :     } else {
      54           0 :         setIterator();
      55           0 :         lastCC=previousCC();
      56             :         // Set reorderStart after the last code point with cc<=1 if there is one.
      57           0 :         if(lastCC>1) {
      58           0 :             while(previousCC()>1) {}
      59             :         }
      60           0 :         reorderStart=codePointLimit;
      61             :     }
      62           0 :     return TRUE;
      63             : }
      64             : 
      65           0 : UBool ReorderingBuffer::equals(const UChar *otherStart, const UChar *otherLimit) const {
      66           0 :     int32_t length=(int32_t)(limit-start);
      67             :     return
      68           0 :         length==(int32_t)(otherLimit-otherStart) &&
      69           0 :         0==u_memcmp(start, otherStart, length);
      70             : }
      71             : 
      72           0 : UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
      73           0 :     if(remainingCapacity<2 && !resize(2, errorCode)) {
      74           0 :         return FALSE;
      75             :     }
      76           0 :     if(lastCC<=cc || cc==0) {
      77           0 :         limit[0]=U16_LEAD(c);
      78           0 :         limit[1]=U16_TRAIL(c);
      79           0 :         limit+=2;
      80           0 :         lastCC=cc;
      81           0 :         if(cc<=1) {
      82           0 :             reorderStart=limit;
      83             :         }
      84             :     } else {
      85           0 :         insert(c, cc);
      86             :     }
      87           0 :     remainingCapacity-=2;
      88           0 :     return TRUE;
      89             : }
      90             : 
      91           0 : UBool ReorderingBuffer::append(const UChar *s, int32_t length,
      92             :                                uint8_t leadCC, uint8_t trailCC,
      93             :                                UErrorCode &errorCode) {
      94           0 :     if(length==0) {
      95           0 :         return TRUE;
      96             :     }
      97           0 :     if(remainingCapacity<length && !resize(length, errorCode)) {
      98           0 :         return FALSE;
      99             :     }
     100           0 :     remainingCapacity-=length;
     101           0 :     if(lastCC<=leadCC || leadCC==0) {
     102           0 :         if(trailCC<=1) {
     103           0 :             reorderStart=limit+length;
     104           0 :         } else if(leadCC<=1) {
     105           0 :             reorderStart=limit+1;  // Ok if not a code point boundary.
     106             :         }
     107           0 :         const UChar *sLimit=s+length;
     108           0 :         do { *limit++=*s++; } while(s!=sLimit);
     109           0 :         lastCC=trailCC;
     110             :     } else {
     111           0 :         int32_t i=0;
     112             :         UChar32 c;
     113           0 :         U16_NEXT(s, i, length, c);
     114           0 :         insert(c, leadCC);  // insert first code point
     115           0 :         while(i<length) {
     116           0 :             U16_NEXT(s, i, length, c);
     117           0 :             if(i<length) {
     118             :                 // s must be in NFD, otherwise we need to use getCC().
     119           0 :                 leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
     120             :             } else {
     121           0 :                 leadCC=trailCC;
     122             :             }
     123           0 :             append(c, leadCC, errorCode);
     124             :         }
     125             :     }
     126           0 :     return TRUE;
     127             : }
     128             : 
     129           0 : UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) {
     130           0 :     int32_t cpLength=U16_LENGTH(c);
     131           0 :     if(remainingCapacity<cpLength && !resize(cpLength, errorCode)) {
     132           0 :         return FALSE;
     133             :     }
     134           0 :     remainingCapacity-=cpLength;
     135           0 :     if(cpLength==1) {
     136           0 :         *limit++=(UChar)c;
     137             :     } else {
     138           0 :         limit[0]=U16_LEAD(c);
     139           0 :         limit[1]=U16_TRAIL(c);
     140           0 :         limit+=2;
     141             :     }
     142           0 :     lastCC=0;
     143           0 :     reorderStart=limit;
     144           0 :     return TRUE;
     145             : }
     146             : 
     147           0 : UBool ReorderingBuffer::appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode) {
     148           0 :     if(s==sLimit) {
     149           0 :         return TRUE;
     150             :     }
     151           0 :     int32_t length=(int32_t)(sLimit-s);
     152           0 :     if(remainingCapacity<length && !resize(length, errorCode)) {
     153           0 :         return FALSE;
     154             :     }
     155           0 :     u_memcpy(limit, s, length);
     156           0 :     limit+=length;
     157           0 :     remainingCapacity-=length;
     158           0 :     lastCC=0;
     159           0 :     reorderStart=limit;
     160           0 :     return TRUE;
     161             : }
     162             : 
     163           0 : void ReorderingBuffer::remove() {
     164           0 :     reorderStart=limit=start;
     165           0 :     remainingCapacity=str.getCapacity();
     166           0 :     lastCC=0;
     167           0 : }
     168             : 
     169           0 : void ReorderingBuffer::removeSuffix(int32_t suffixLength) {
     170           0 :     if(suffixLength<(limit-start)) {
     171           0 :         limit-=suffixLength;
     172           0 :         remainingCapacity+=suffixLength;
     173             :     } else {
     174           0 :         limit=start;
     175           0 :         remainingCapacity=str.getCapacity();
     176             :     }
     177           0 :     lastCC=0;
     178           0 :     reorderStart=limit;
     179           0 : }
     180             : 
     181           0 : UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) {
     182           0 :     int32_t reorderStartIndex=(int32_t)(reorderStart-start);
     183           0 :     int32_t length=(int32_t)(limit-start);
     184           0 :     str.releaseBuffer(length);
     185           0 :     int32_t newCapacity=length+appendLength;
     186           0 :     int32_t doubleCapacity=2*str.getCapacity();
     187           0 :     if(newCapacity<doubleCapacity) {
     188           0 :         newCapacity=doubleCapacity;
     189             :     }
     190           0 :     if(newCapacity<256) {
     191           0 :         newCapacity=256;
     192             :     }
     193           0 :     start=str.getBuffer(newCapacity);
     194           0 :     if(start==NULL) {
     195             :         // getBuffer() already did str.setToBogus()
     196           0 :         errorCode=U_MEMORY_ALLOCATION_ERROR;
     197           0 :         return FALSE;
     198             :     }
     199           0 :     reorderStart=start+reorderStartIndex;
     200           0 :     limit=start+length;
     201           0 :     remainingCapacity=str.getCapacity()-length;
     202           0 :     return TRUE;
     203             : }
     204             : 
     205           0 : void ReorderingBuffer::skipPrevious() {
     206           0 :     codePointLimit=codePointStart;
     207           0 :     UChar c=*--codePointStart;
     208           0 :     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(*(codePointStart-1))) {
     209           0 :         --codePointStart;
     210             :     }
     211           0 : }
     212             : 
     213           0 : uint8_t ReorderingBuffer::previousCC() {
     214           0 :     codePointLimit=codePointStart;
     215           0 :     if(reorderStart>=codePointStart) {
     216           0 :         return 0;
     217             :     }
     218           0 :     UChar32 c=*--codePointStart;
     219           0 :     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP) {
     220           0 :         return 0;
     221             :     }
     222             : 
     223             :     UChar c2;
     224           0 :     if(U16_IS_TRAIL(c) && start<codePointStart && U16_IS_LEAD(c2=*(codePointStart-1))) {
     225           0 :         --codePointStart;
     226           0 :         c=U16_GET_SUPPLEMENTARY(c2, c);
     227             :     }
     228           0 :     return Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
     229             : }
     230             : 
     231             : // Inserts c somewhere before the last character.
     232             : // Requires 0<cc<lastCC which implies reorderStart<limit.
     233           0 : void ReorderingBuffer::insert(UChar32 c, uint8_t cc) {
     234           0 :     for(setIterator(), skipPrevious(); previousCC()>cc;) {}
     235             :     // insert c at codePointLimit, after the character with prevCC<=cc
     236           0 :     UChar *q=limit;
     237           0 :     UChar *r=limit+=U16_LENGTH(c);
     238           0 :     do {
     239           0 :         *--r=*--q;
     240           0 :     } while(codePointLimit!=q);
     241           0 :     writeCodePoint(q, c);
     242           0 :     if(cc<=1) {
     243           0 :         reorderStart=r;
     244             :     }
     245           0 : }
     246             : 
     247             : // Normalizer2Impl --------------------------------------------------------- ***
     248             : 
     249             : struct CanonIterData : public UMemory {
     250             :     CanonIterData(UErrorCode &errorCode);
     251             :     ~CanonIterData();
     252             :     void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
     253             :     UTrie2 *trie;
     254             :     UVector canonStartSets;  // contains UnicodeSet *
     255             : };
     256             : 
     257           0 : Normalizer2Impl::~Normalizer2Impl() {
     258           0 :     delete fCanonIterData;
     259           0 : }
     260             : 
     261             : void
     262           5 : Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
     263             :                       const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
     264           5 :     minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
     265           5 :     minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
     266             : 
     267           5 :     minYesNo=inIndexes[IX_MIN_YES_NO];
     268           5 :     minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
     269           5 :     minNoNo=inIndexes[IX_MIN_NO_NO];
     270           5 :     limitNoNo=inIndexes[IX_LIMIT_NO_NO];
     271           5 :     minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
     272             : 
     273           5 :     normTrie=inTrie;
     274             : 
     275           5 :     maybeYesCompositions=inExtraData;
     276           5 :     extraData=maybeYesCompositions+(MIN_NORMAL_MAYBE_YES-minMaybeYes);
     277             : 
     278           5 :     smallFCD=inSmallFCD;
     279             : 
     280             :     // Build tccc180[].
     281             :     // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
     282           5 :     uint8_t bits=0;
     283          65 :     for(UChar c=0; c<0x180; bits>>=1) {
     284          60 :         if((c&0xff)==0) {
     285          10 :             bits=smallFCD[c>>8];  // one byte per 0x100 code points
     286             :         }
     287          60 :         if(bits&1) {
     288         990 :             for(int i=0; i<0x20; ++i, ++c) {
     289         960 :                 tccc180[c]=(uint8_t)getFCD16FromNormData(c);
     290             :             }
     291             :         } else {
     292          30 :             uprv_memset(tccc180+c, 0, 0x20);
     293          30 :             c+=0x20;
     294             :         }
     295             :     }
     296           5 : }
     297             : 
     298           0 : uint8_t Normalizer2Impl::getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const {
     299             :     UChar32 c;
     300           0 :     if(cpStart==(cpLimit-1)) {
     301           0 :         c=*cpStart;
     302             :     } else {
     303           0 :         c=U16_GET_SUPPLEMENTARY(cpStart[0], cpStart[1]);
     304             :     }
     305           0 :     uint16_t prevNorm16=getNorm16(c);
     306           0 :     if(prevNorm16<=minYesNo) {
     307           0 :         return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
     308             :     } else {
     309           0 :         return (uint8_t)(*getMapping(prevNorm16)>>8);  // tccc from yesNo
     310             :     }
     311             : }
     312             : 
     313             : namespace {
     314             : 
     315             : class LcccContext {
     316             : public:
     317           0 :     LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
     318             : 
     319           0 :     void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
     320           0 :         if(impl.isAlgorithmicNoNo(norm16)) {
     321             :             // Range of code points with same-norm16-value algorithmic decompositions.
     322             :             // They might have different non-zero FCD16 values.
     323           0 :             do {
     324           0 :                 uint16_t fcd16=impl.getFCD16(start);
     325           0 :                 if(fcd16>0xff) { set.add(start); }
     326             :             } while(++start<=end);
     327             :         } else {
     328           0 :             uint16_t fcd16=impl.getFCD16(start);
     329           0 :             if(fcd16>0xff) { set.add(start, end); }
     330             :         }
     331           0 :     }
     332             : 
     333             : private:
     334             :     const Normalizer2Impl &impl;
     335             :     UnicodeSet &set;
     336             : };
     337             : 
     338             : struct PropertyStartsContext {
     339           0 :     PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
     340           0 :             : impl(ni), sa(adder) {}
     341             : 
     342             :     const Normalizer2Impl &impl;
     343             :     const USetAdder *sa;
     344             : };
     345             : 
     346             : }  // namespace
     347             : 
     348             : U_CDECL_BEGIN
     349             : 
     350             : static UBool U_CALLCONV
     351           0 : enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
     352           0 :     ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
     353           0 :     return TRUE;
     354             : }
     355             : 
     356             : static UBool U_CALLCONV
     357           0 : enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
     358             :     /* add the start code point to the USet */
     359           0 :     const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
     360           0 :     const USetAdder *sa=ctx->sa;
     361           0 :     sa->add(sa->set, start);
     362           0 :     if(start!=end && ctx->impl.isAlgorithmicNoNo((uint16_t)value)) {
     363             :         // Range of code points with same-norm16-value algorithmic decompositions.
     364             :         // They might have different non-zero FCD16 values.
     365           0 :         uint16_t prevFCD16=ctx->impl.getFCD16(start);
     366           0 :         while(++start<=end) {
     367           0 :             uint16_t fcd16=ctx->impl.getFCD16(start);
     368           0 :             if(fcd16!=prevFCD16) {
     369           0 :                 sa->add(sa->set, start);
     370           0 :                 prevFCD16=fcd16;
     371             :             }
     372             :         }
     373             :     }
     374           0 :     return TRUE;
     375             : }
     376             : 
     377             : static UBool U_CALLCONV
     378           0 : enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
     379             :     /* add the start code point to the USet */
     380           0 :     const USetAdder *sa=(const USetAdder *)context;
     381           0 :     sa->add(sa->set, start);
     382           0 :     return TRUE;
     383             : }
     384             : 
     385             : static uint32_t U_CALLCONV
     386           0 : segmentStarterMapper(const void * /*context*/, uint32_t value) {
     387           0 :     return value&CANON_NOT_SEGMENT_STARTER;
     388             : }
     389             : 
     390             : U_CDECL_END
     391             : 
     392             : void
     393           0 : Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
     394             :     /* add the start code point of each same-value range of each trie */
     395           0 :     LcccContext context(*this, set);
     396           0 :     utrie2_enum(normTrie, NULL, enumLcccRange, &context);
     397           0 : }
     398             : 
     399             : void
     400           0 : Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
     401             :     /* add the start code point of each same-value range of each trie */
     402           0 :     PropertyStartsContext context(*this, sa);
     403           0 :     utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
     404             : 
     405             :     /* add Hangul LV syllables and LV+1 because of skippables */
     406           0 :     for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
     407           0 :         sa->add(sa->set, c);
     408           0 :         sa->add(sa->set, c+1);
     409             :     }
     410           0 :     sa->add(sa->set, Hangul::HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
     411           0 : }
     412             : 
     413             : void
     414           0 : Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
     415             :     /* add the start code point of each same-value range of the canonical iterator data trie */
     416           0 :     if(ensureCanonIterData(errorCode)) {
     417             :         // currently only used for the SEGMENT_STARTER property
     418           0 :         utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
     419             :     }
     420           0 : }
     421             : 
     422             : const UChar *
     423           0 : Normalizer2Impl::copyLowPrefixFromNulTerminated(const UChar *src,
     424             :                                                 UChar32 minNeedDataCP,
     425             :                                                 ReorderingBuffer *buffer,
     426             :                                                 UErrorCode &errorCode) const {
     427             :     // Make some effort to support NUL-terminated strings reasonably.
     428             :     // Take the part of the fast quick check loop that does not look up
     429             :     // data and check the first part of the string.
     430             :     // After this prefix, determine the string length to simplify the rest
     431             :     // of the code.
     432           0 :     const UChar *prevSrc=src;
     433             :     UChar c;
     434           0 :     while((c=*src++)<minNeedDataCP && c!=0) {}
     435             :     // Back out the last character for full processing.
     436             :     // Copy this prefix.
     437           0 :     if(--src!=prevSrc) {
     438           0 :         if(buffer!=NULL) {
     439           0 :             buffer->appendZeroCC(prevSrc, src, errorCode);
     440             :         }
     441             :     }
     442           0 :     return src;
     443             : }
     444             : 
     445             : UnicodeString &
     446           0 : Normalizer2Impl::decompose(const UnicodeString &src, UnicodeString &dest,
     447             :                            UErrorCode &errorCode) const {
     448           0 :     if(U_FAILURE(errorCode)) {
     449           0 :         dest.setToBogus();
     450           0 :         return dest;
     451             :     }
     452           0 :     const UChar *sArray=src.getBuffer();
     453           0 :     if(&dest==&src || sArray==NULL) {
     454           0 :         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
     455           0 :         dest.setToBogus();
     456           0 :         return dest;
     457             :     }
     458           0 :     decompose(sArray, sArray+src.length(), dest, src.length(), errorCode);
     459           0 :     return dest;
     460             : }
     461             : 
     462             : void
     463           0 : Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
     464             :                            UnicodeString &dest,
     465             :                            int32_t destLengthEstimate,
     466             :                            UErrorCode &errorCode) const {
     467           0 :     if(destLengthEstimate<0 && limit!=NULL) {
     468           0 :         destLengthEstimate=(int32_t)(limit-src);
     469             :     }
     470           0 :     dest.remove();
     471           0 :     ReorderingBuffer buffer(*this, dest);
     472           0 :     if(buffer.init(destLengthEstimate, errorCode)) {
     473           0 :         decompose(src, limit, &buffer, errorCode);
     474             :     }
     475           0 : }
     476             : 
     477             : // Dual functionality:
     478             : // buffer!=NULL: normalize
     479             : // buffer==NULL: isNormalized/spanQuickCheckYes
     480             : const UChar *
     481           0 : Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
     482             :                            ReorderingBuffer *buffer,
     483             :                            UErrorCode &errorCode) const {
     484           0 :     UChar32 minNoCP=minDecompNoCP;
     485           0 :     if(limit==NULL) {
     486           0 :         src=copyLowPrefixFromNulTerminated(src, minNoCP, buffer, errorCode);
     487           0 :         if(U_FAILURE(errorCode)) {
     488           0 :             return src;
     489             :         }
     490           0 :         limit=u_strchr(src, 0);
     491             :     }
     492             : 
     493             :     const UChar *prevSrc;
     494           0 :     UChar32 c=0;
     495           0 :     uint16_t norm16=0;
     496             : 
     497             :     // only for quick check
     498           0 :     const UChar *prevBoundary=src;
     499           0 :     uint8_t prevCC=0;
     500             : 
     501             :     for(;;) {
     502             :         // count code units below the minimum or with irrelevant data for the quick check
     503           0 :         for(prevSrc=src; src!=limit;) {
     504           0 :             if( (c=*src)<minNoCP ||
     505           0 :                 isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
     506             :             ) {
     507           0 :                 ++src;
     508           0 :             } else if(!U16_IS_SURROGATE(c)) {
     509           0 :                 break;
     510             :             } else {
     511             :                 UChar c2;
     512           0 :                 if(U16_IS_SURROGATE_LEAD(c)) {
     513           0 :                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
     514           0 :                         c=U16_GET_SUPPLEMENTARY(c, c2);
     515             :                     }
     516             :                 } else /* trail surrogate */ {
     517           0 :                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
     518           0 :                         --src;
     519           0 :                         c=U16_GET_SUPPLEMENTARY(c2, c);
     520             :                     }
     521             :                 }
     522           0 :                 if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
     523           0 :                     src+=U16_LENGTH(c);
     524             :                 } else {
     525           0 :                     break;
     526             :                 }
     527             :             }
     528             :         }
     529             :         // copy these code units all at once
     530           0 :         if(src!=prevSrc) {
     531           0 :             if(buffer!=NULL) {
     532           0 :                 if(!buffer->appendZeroCC(prevSrc, src, errorCode)) {
     533           0 :                     break;
     534             :                 }
     535             :             } else {
     536           0 :                 prevCC=0;
     537           0 :                 prevBoundary=src;
     538             :             }
     539             :         }
     540           0 :         if(src==limit) {
     541           0 :             break;
     542             :         }
     543             : 
     544             :         // Check one above-minimum, relevant code point.
     545           0 :         src+=U16_LENGTH(c);
     546           0 :         if(buffer!=NULL) {
     547           0 :             if(!decompose(c, norm16, *buffer, errorCode)) {
     548           0 :                 break;
     549             :             }
     550             :         } else {
     551           0 :             if(isDecompYes(norm16)) {
     552           0 :                 uint8_t cc=getCCFromYesOrMaybe(norm16);
     553           0 :                 if(prevCC<=cc || cc==0) {
     554           0 :                     prevCC=cc;
     555           0 :                     if(cc<=1) {
     556           0 :                         prevBoundary=src;
     557             :                     }
     558           0 :                     continue;
     559             :                 }
     560             :             }
     561           0 :             return prevBoundary;  // "no" or cc out of order
     562             :         }
     563           0 :     }
     564           0 :     return src;
     565             : }
     566             : 
     567             : // Decompose a short piece of text which is likely to contain characters that
     568             : // fail the quick check loop and/or where the quick check loop's overhead
     569             : // is unlikely to be amortized.
     570             : // Called by the compose() and makeFCD() implementations.
     571           0 : UBool Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
     572             :                                       ReorderingBuffer &buffer,
     573             :                                       UErrorCode &errorCode) const {
     574           0 :     while(src<limit) {
     575             :         UChar32 c;
     576             :         uint16_t norm16;
     577           0 :         UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
     578           0 :         if(!decompose(c, norm16, buffer, errorCode)) {
     579           0 :             return FALSE;
     580             :         }
     581             :     }
     582           0 :     return TRUE;
     583             : }
     584             : 
                       // Appends the decomposition of a single code point to the buffer.
                       // c: the code point; norm16: the normalization trie value that belongs to c.
                       // Returns the result of the buffer.append()/appendZeroCC() call that writes
                       // the output; errorCode is only handed through to those calls.
     585           0 : UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
     586             :                                  ReorderingBuffer &buffer,
     587             :                                  UErrorCode &errorCode) const {
     588             :     // Only loops for 1:1 algorithmic mappings.
     589             :     for(;;) {
     590             :         // get the decomposition and the lead and trail cc's
     591           0 :         if(isDecompYes(norm16)) {
     592             :             // c does not decompose
     593           0 :             return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode);
     594           0 :         } else if(isHangul(norm16)) {
     595             :             // Hangul syllable: decompose algorithmically
                                   // Hangul::decompose() fills jamos[] and its return value is used
                                   // as the number of units written (jamos+count is the end pointer).
     596             :             UChar jamos[3];
     597           0 :             return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode);
     598           0 :         } else if(isDecompNoAlgorithmic(norm16)) {
                                   // Replace c by its algorithmic mapping and go around the loop
                                   // again with the mapped code point's own norm16.
     599           0 :             c=mapAlgorithmic(c, norm16);
     600           0 :             norm16=getNorm16(c);
     601             :         } else {
     602             :             // c decomposes, get everything from the variable-length extra data
     603           0 :             const uint16_t *mapping=getMapping(norm16);
     604           0 :             uint16_t firstUnit=*mapping;
                                   // firstUnit: low bits = mapping length, upper byte = trail cc.
     605           0 :             int32_t length=firstUnit&MAPPING_LENGTH_MASK;
     606             :             uint8_t leadCC, trailCC;
     607           0 :             trailCC=(uint8_t)(firstUnit>>8);
     608           0 :             if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
                                       // The optional ccc/lccc word is the unit just before firstUnit;
                                       // its upper byte is the lead cc.
     609           0 :                 leadCC=(uint8_t)(*(mapping-1)>>8);
     610             :             } else {
     611           0 :                 leadCC=0;
     612             :             }
                                   // The mapping text itself starts right after firstUnit.
     613           0 :             return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
     614             :         }
     615           0 :     }
     616             : }
     617             : 
                       // Returns the decomposition of c, or NULL if c does not decompose.
                       // buffer: caller-provided scratch space; it receives the result for Hangul
                       //         syllables and for algorithmic mappings.
                       // length: set to the number of UChars in the result whenever the return value
                       //         is non-NULL; it is not written when NULL is returned.
                       // The returned pointer is either buffer or a pointer into the mapping data
                       // obtained from getMapping().
     618             : const UChar *
     619           0 : Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const {
                           // decomp stays NULL until an algorithmic mapping has been written to buffer.
     620           0 :     const UChar *decomp=NULL;
     621             :     uint16_t norm16;
     622             :     for(;;) {
                               // norm16 is only assigned when c>=minDecompNoCP (short-circuit ||);
                               // the branches below are only reached in that case.
     623           0 :         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
     624             :             // c does not decompose
                                   // (On a later iteration this returns the algorithmic mapping
                                   // already stored in buffer.)
     625           0 :             return decomp;
     626           0 :         } else if(isHangul(norm16)) {
     627             :             // Hangul syllable: decompose algorithmically
     628           0 :             length=Hangul::decompose(c, buffer);
     629           0 :             return buffer;
     630           0 :         } else if(isDecompNoAlgorithmic(norm16)) {
                                   // Store the mapped code point in buffer, then loop to see whether
                                   // the mapped code point decomposes further.
     631           0 :             c=mapAlgorithmic(c, norm16);
     632           0 :             decomp=buffer;
     633           0 :             length=0;
     634           0 :             U16_APPEND_UNSAFE(buffer, length, c);
     635             :         } else {
     636             :             // c decomposes, get everything from the variable-length extra data
     637           0 :             const uint16_t *mapping=getMapping(norm16);
                                   // The first unit holds the length; the text follows it.
     638           0 :             length=*mapping&MAPPING_LENGTH_MASK;
     639           0 :             return (const UChar *)mapping+1;
     640             :         }
     641           0 :     }
     642             : }
     643             : 
     644             : // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1
     645             : // so that a raw mapping fits that consists of one unit ("rm0")
     646             : // plus all but the first two code units of the normal mapping.
     647             : // The maximum length of a normal mapping is 31=MAPPING_LENGTH_MASK.
                       //
                       // Returns the raw decomposition of c, or NULL if c does not decompose.
                       // length is set whenever the return value is non-NULL.
                       // The result is either in the caller's buffer or a pointer into the mapping data.
     648             : const UChar *
     649           0 : Normalizer2Impl::getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const {
     650             :     // We do not loop in this method because an algorithmic mapping itself
     651             :     // becomes a final result rather than having to be decomposed recursively.
     652             :     uint16_t norm16;
                           // norm16 is only assigned (and later only read) when c>=minDecompNoCP.
     653           0 :     if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
     654             :         // c does not decompose
     655           0 :         return NULL;
     656           0 :     } else if(isHangul(norm16)) {
     657             :         // Hangul syllable: decompose algorithmically
                               // The raw Hangul decomposition is reported as exactly two units.
     658           0 :         Hangul::getRawDecomposition(c, buffer);
     659           0 :         length=2;
     660           0 :         return buffer;
     661           0 :     } else if(isDecompNoAlgorithmic(norm16)) {
     662           0 :         c=mapAlgorithmic(c, norm16);
     663           0 :         length=0;
     664           0 :         U16_APPEND_UNSAFE(buffer, length, c);
     665           0 :         return buffer;
     666             :     } else {
     667             :         // c decomposes, get everything from the variable-length extra data
     668           0 :         const uint16_t *mapping=getMapping(norm16);
     669           0 :         uint16_t firstUnit=*mapping;
     670           0 :         int32_t mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
     671           0 :         if(firstUnit&MAPPING_HAS_RAW_MAPPING) {
     672             :             // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
     673             :             // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
                                   // Step back one unit for the ccc/lccc word if bit 7 is set,
                                   // then one more unit to reach rm0.
     674           0 :             const uint16_t *rawMapping=mapping-((firstUnit>>7)&1)-1;
     675           0 :             uint16_t rm0=*rawMapping;
     676           0 :             if(rm0<=MAPPING_LENGTH_MASK) {
                                       // rm0 is a length: the raw mapping is stored in the rm0 units
                                       // immediately before rm0 and is returned in place.
     677           0 :                 length=rm0;
     678           0 :                 return (const UChar *)rawMapping-rm0;
     679             :             } else {
     680             :                 // Copy the normal mapping and replace its first two code units with rm0.
     681           0 :                 buffer[0]=(UChar)rm0;
     682           0 :                 u_memcpy(buffer+1, (const UChar *)mapping+1+2, mLength-2);
     683           0 :                 length=mLength-1;
     684           0 :                 return buffer;
     685             :             }
     686             :         } else {
                                   // No separate raw mapping: the normal mapping is the raw mapping.
     687           0 :             length=mLength;
     688           0 :             return (const UChar *)mapping+1;
     689             :         }
     690             :     }
     691             : }
     692             : 
                       // Appends the text [src, limit[ to the buffer.
                       // safeMiddle: first receives the buffer's reorderable suffix
                       //             (via buffer.copyReorderableSuffixTo()).
                       // doDecompose: if TRUE, the text is decomposed into the buffer by decompose();
                       //              if FALSE, the text is appended as is and only the leading run
                       //              of characters with cc!=0 goes through the cc-aware append().
                       // In the !doDecompose path a NULL limit is handled: the end of the
                       // NUL-terminated text is located with u_strchr().
     693           0 : void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
     694             :                                          UBool doDecompose,
     695             :                                          UnicodeString &safeMiddle,
     696             :                                          ReorderingBuffer &buffer,
     697             :                                          UErrorCode &errorCode) const {
     698           0 :     buffer.copyReorderableSuffixTo(safeMiddle);
     699           0 :     if(doDecompose) {
     700           0 :         decompose(src, limit, &buffer, errorCode);
     701           0 :         return;
     702             :     }
     703             :     // Just merge the strings at the boundary.
     704           0 :     ForwardUTrie2StringIterator iter(normTrie, src, limit);
                           // firstCC: cc of the first character; prevCC: cc of the last character
                           // seen with cc!=0. Both are 0 if the text starts with a cc==0 character.
     705             :     uint8_t firstCC, prevCC, cc;
     706           0 :     firstCC=prevCC=cc=getCC(iter.next16());
     707           0 :     while(cc!=0) {
     708           0 :         prevCC=cc;
     709           0 :         cc=getCC(iter.next16());
     710             :     };
     711           0 :     if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
     712           0 :         limit=u_strchr(iter.codePointStart, 0);
     713             :     }
     714             : 
                           // iter.codePointStart is used as the end of the leading cc!=0 run
                           // (presumably the start of the code point last returned by next16()).
                           // That run is appended with its lead/trail cc; the rest goes in unchanged.
     715           0 :     if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
     716           0 :         buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
     717             :     }
     718             : }
     719             : 
     720             : // Note: hasDecompBoundary() could be implemented as aliases to
     721             : // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
     722             : // at the cost of building the FCD trie for a decomposition normalizer.
                       //
                       // c: the code point to test.
                       // before: TRUE tests for a boundary before c, FALSE for a boundary after c.
                       // Returns TRUE if there is such a decomposition boundary.
                       // The loop only repeats for algorithmic mappings: c is replaced by its
                       // mapping and tested again.
     723           0 : UBool Normalizer2Impl::hasDecompBoundary(UChar32 c, UBool before) const {
     724             :     for(;;) {
                               // Code points below minDecompNoCP always have a boundary.
     725           0 :         if(c<minDecompNoCP) {
     726           0 :             return TRUE;
     727             :         }
     728           0 :         uint16_t norm16=getNorm16(c);
     729           0 :         if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
     730           0 :             return TRUE;
     731           0 :         } else if(norm16>MIN_NORMAL_MAYBE_YES) {
     732           0 :             return FALSE;  // ccc!=0
     733           0 :         } else if(isDecompNoAlgorithmic(norm16)) {
     734           0 :             c=mapAlgorithmic(c, norm16);
     735             :         } else {
     736             :             // c decomposes, get everything from the variable-length extra data
     737           0 :             const uint16_t *mapping=getMapping(norm16);
     738           0 :             uint16_t firstUnit=*mapping;
                                   // A mapping of length 0 never provides a boundary.
     739           0 :             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
     740           0 :                 return FALSE;
     741             :             }
     742           0 :             if(!before) {
     743             :                 // decomp after-boundary: same as hasFCDBoundaryAfter(),
     744             :                 // fcd16<=1 || trailCC==0
                                       // The trail cc is the upper byte of firstUnit, hence the
                                       // comparisons against 0x1ff and 0xff.
     745           0 :                 if(firstUnit>0x1ff) {
     746           0 :                     return FALSE;  // trailCC>1
     747             :                 }
     748           0 :                 if(firstUnit<=0xff) {
     749           0 :                     return TRUE;  // trailCC==0
     750             :                 }
     751             :                 // if(trailCC==1) test leadCC==0, same as checking for before-boundary
     752             :             }
     753             :             // TRUE if leadCC==0 (hasFCDBoundaryBefore())
                                   // leadCC is 0 either because there is no ccc/lccc word or because
                                   // the upper byte of that word (the unit before firstUnit) is 0.
     754           0 :             return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0;
     755             :         }
     756           0 :     }
     757             : }
     758             : 
     759             : /*
     760             :  * Finds the recomposition result for
     761             :  * a forward-combining "lead" character,
     762             :  * specified with a pointer to its compositions list,
     763             :  * and a backward-combining "trail" character.
     764             :  *
     765             :  * If the lead and trail characters combine, then this function returns
     766             :  * the following "compositeAndFwd" value:
     767             :  * Bits 21..1  composite character
     768             :  * Bit      0  set if the composite is a forward-combining starter
     769             :  * otherwise it returns -1.
     770             :  *
     771             :  * The compositions list has (trail, compositeAndFwd) pair entries,
     772             :  * encoded as either pairs or triples of 16-bit units.
     773             :  * The last entry has the high bit of its first unit set.
     774             :  *
     775             :  * The list is sorted by ascending trail characters (there are no duplicates).
     776             :  * A linear search is used.
     777             :  *
     778             :  * See normalizer2impl.h for a more detailed description
     779             :  * of the compositions list format.
                        *
                        * list:  points to the first unit of the lead character's compositions list.
                        * trail: the trail character; the code does not range-check it here.
                        *
                        * Nothing in this function bounds the search by a length: it stops on the
                        * first entry whose first unit is not smaller than the search key, and
                        * relies on the flagged last entry to end the list.
     780             :  */
     781           0 : int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) {
     782             :     uint16_t key1, firstUnit;
     783           0 :     if(trail<COMP_1_TRAIL_LIMIT) {
     784             :         // trail character is 0..33FF
     785             :         // result entry may have 2 or 3 units
     786           0 :         key1=(uint16_t)(trail<<1);
                               // Skip entries with a smaller first unit. The step is 2 units for a
                               // pair and 3 for a triple (assumes COMP_1_TRIPLE is the value 1 --
                               // TODO confirm in normalizer2impl.h).
     787           0 :         while(key1>(firstUnit=*list)) {
     788           0 :             list+=2+(firstUnit&COMP_1_TRIPLE);
     789             :         }
                               // Compare against the trail bits only (the mask presumably removes
                               // the flag bits of the first unit).
     790           0 :         if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
     791           0 :             if(firstUnit&COMP_1_TRIPLE) {
     792           0 :                 return ((int32_t)list[1]<<16)|list[2];
     793             :             } else {
     794           0 :                 return list[1];
     795             :             }
     796             :         }
     797             :     } else {
     798             :         // trail character is 3400..10FFFF
     799             :         // result entry has 3 units
                               // The trail character is split over two keys: key1 is matched against
                               // the first unit and key2 against the upper bits of the second unit.
     800           0 :         key1=(uint16_t)(COMP_1_TRAIL_LIMIT+
     801           0 :                         (((trail>>COMP_1_TRAIL_SHIFT))&
     802             :                           ~COMP_1_TRIPLE));
     803           0 :         uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT);
     804             :         uint16_t secondUnit;
     805             :         for(;;) {
     806           0 :             if(key1>(firstUnit=*list)) {
     807           0 :                 list+=2+(firstUnit&COMP_1_TRIPLE);
     808           0 :             } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
     809           0 :                 if(key2>(secondUnit=list[1])) {
                                           // Same key1 but a smaller second unit: try the next
                                           // triple unless this was the last entry of the list.
     810           0 :                     if(firstUnit&COMP_1_LAST_TUPLE) {
     811           0 :                         break;
     812             :                     } else {
     813           0 :                         list+=3;
     814             :                     }
     815           0 :                 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
                                           // Match: the remaining bits of the second unit are the
                                           // upper part of compositeAndFwd, list[2] the lower part.
     816           0 :                     return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2];
     817             :                 } else {
     818           0 :                     break;
     819             :                 }
     820             :             } else {
     821           0 :                 break;
     822             :             }
     823             :         }
     824             :     }
                           // No entry for this trail character.
     825           0 :     return -1;
     826             : }
     827             : 
     828             : /**
                         * Adds every composite found in a compositions list to a set.
                         * Each entry is decoded as a pair or a triple of 16-bit units; iteration
                         * stops after the entry whose first unit has COMP_1_LAST_TUPLE set.
                         * The list must contain at least one entry (do/while reads before testing).
                         *
     829             :   * @param list some character's compositions list
     830             :   * @param set recursively receives the composites from these compositions
     831             :   */
     832           0 : void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const {
     833             :     uint16_t firstUnit;
     834             :     int32_t compositeAndFwd;
     835           0 :     do {
     836           0 :         firstUnit=*list;
     837           0 :         if((firstUnit&COMP_1_TRIPLE)==0) {
                                   // Pair entry: the second unit is the whole compositeAndFwd value.
     838           0 :             compositeAndFwd=list[1];
     839           0 :             list+=2;
     840             :         } else {
                                   // Triple entry: drop the trail bits from the second unit and
                                   // combine the rest with the third unit.
     841           0 :             compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2];
     842           0 :             list+=3;
     843             :         }
                               // Bit 0 is a flag; the composite code point is in the bits above it.
     844           0 :         UChar32 composite=compositeAndFwd>>1;
     845           0 :         if((compositeAndFwd&1)!=0) {
                                   // Flag set: the composite has a compositions list of its own,
                                   // so collect its composites as well.
     846           0 :             addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
     847             :         }
     848           0 :         set.add(composite);
     849           0 :     } while((firstUnit&COMP_1_LAST_TUPLE)==0);
     850           0 : }
     851             : 
     852             : /*
     853             :  * Recomposes the buffer text starting at recomposeStartIndex
     854             :  * (which is in NFD - decomposed and canonically ordered),
     855             :  * and truncates the buffer contents.
     856             :  *
     857             :  * Note that recomposition never lengthens the text:
     858             :  * Any character consists of either one or two code units;
     859             :  * a composition may contain at most one more code unit than the original starter,
     860             :  * while the combining mark that is removed has at least one code unit.
                        *
                        * buffer: the text from getStart()+recomposeStartIndex to getLimit() is
                        *         rewritten in place; unless that range is empty, the new end is
                        *         reported through buffer.setReorderingLimit().
                        * recomposeStartIndex: offset, in code units, from buffer.getStart().
                        * onlyContiguous: if TRUE, a character with cc!=0 that did not combine
                        *         prevents later characters from combining with the current starter.
     861             :  */
     862           0 : void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
     863             :                                 UBool onlyContiguous) const {
                           // p is the read position; limit is the current end of the text and moves
                           // down each time a combining mark is removed.
     864           0 :     UChar *p=buffer.getStart()+recomposeStartIndex;
     865           0 :     UChar *limit=buffer.getLimit();
     866           0 :     if(p==limit) {
     867           0 :         return;
     868             :     }
     869             : 
     870             :     UChar *starter, *pRemove, *q, *r;
     871             :     const uint16_t *compositionsList;
     872             :     UChar32 c, compositeAndFwd;
     873             :     uint16_t norm16;
     874             :     uint8_t cc, prevCC;
     875             :     UBool starterIsSupplementary;
     876             : 
     877             :     // Some of the following variables are not used until we have a forward-combining starter
     878             :     // and are only initialized now to avoid compiler warnings.
     879           0 :     compositionsList=NULL;  // used as indicator for whether we have a forward-combining starter
     880           0 :     starter=NULL;
     881           0 :     starterIsSupplementary=FALSE;
     882           0 :     prevCC=0;
     883             : 
     884             :     for(;;) {
                               // Read the next code point c and its norm16; p ends up just past c.
     885           0 :         UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
     886           0 :         cc=getCCFromYesOrMaybe(norm16);
     887           0 :         if( // this character combines backward and
     888           0 :             isMaybe(norm16) &&
     889             :             // we have seen a starter that combines forward and
     890           0 :             compositionsList!=NULL &&
     891             :             // the backward-combining character is not blocked
     892           0 :             (prevCC<cc || prevCC==0)
     893             :         ) {
     894           0 :             if(isJamoVT(norm16)) {
     895             :                 // c is a Jamo V/T, see if we can compose it with the previous character.
     896           0 :                 if(c<Hangul::JAMO_T_BASE) {
     897             :                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                                           // prev is the offset of the starter from JAMO_L_BASE. If the
                                           // starter is below that base, the range test below is meant
                                           // to fail (assumes UChar is unsigned so the value wraps).
     898           0 :                     UChar prev=(UChar)(*starter-Hangul::JAMO_L_BASE);
     899           0 :                     if(prev<Hangul::JAMO_L_COUNT) {
                                               // The Jamo V is the single code unit just before p.
     900           0 :                         pRemove=p-1;
     901             :                         UChar syllable=(UChar)
     902           0 :                             (Hangul::HANGUL_BASE+
     903           0 :                              (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
     904           0 :                              Hangul::JAMO_T_COUNT);
     905             :                         UChar t;
     906           0 :                         if(p!=limit && (t=(UChar)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
     907           0 :                             ++p;
     908           0 :                             syllable+=t;  // The next character was a Jamo T.
     909             :                         }
     910           0 :                         *starter=syllable;
     911             :                         // remove the Jamo V/T
                                               // Move the text from p down to pRemove; the text gets
                                               // shorter by one or two units.
     912           0 :                         q=pRemove;
     913           0 :                         r=p;
     914           0 :                         while(r<limit) {
     915           0 :                             *q++=*r++;
     916             :                         }
     917           0 :                         limit=q;
     918           0 :                         p=pRemove;
     919             :                     }
     920             :                 }
     921             :                 /*
     922             :                  * No "else" for Jamo T:
     923             :                  * Since the input is in NFD, there are no Hangul LV syllables that
     924             :                  * a Jamo T could combine with.
     925             :                  * All Jamo Ts are combined above when handling Jamo Vs.
     926             :                  */
     927           0 :                 if(p==limit) {
     928           0 :                     break;
     929             :                 }
                                       // After a Jamo V/T there is no forward-combining starter until
                                       // a new one is found.
     930           0 :                 compositionsList=NULL;
     931           0 :                 continue;
     932           0 :             } else if((compositeAndFwd=combine(compositionsList, c))>=0) {
     933             :                 // The starter and the combining mark (c) do combine.
     934           0 :                 UChar32 composite=compositeAndFwd>>1;
     935             : 
     936             :                 // Replace the starter with the composite, remove the combining mark.
     937           0 :                 pRemove=p-U16_LENGTH(c);  // pRemove & p: start & limit of the combining mark
     938           0 :                 if(starterIsSupplementary) {
     939           0 :                     if(U_IS_SUPPLEMENTARY(composite)) {
     940             :                         // both are supplementary
     941           0 :                         starter[0]=U16_LEAD(composite);
     942           0 :                         starter[1]=U16_TRAIL(composite);
     943             :                     } else {
     944           0 :                         *starter=(UChar)composite;
     945             :                         // The composite is shorter than the starter,
     946             :                         // move the intermediate characters forward one.
     947           0 :                         starterIsSupplementary=FALSE;
     948           0 :                         q=starter+1;
     949           0 :                         r=q+1;
     950           0 :                         while(r<pRemove) {
     951           0 :                             *q++=*r++;
     952             :                         }
                                               // The gap to close now starts one unit earlier.
     953           0 :                         --pRemove;
     954             :                     }
     955           0 :                 } else if(U_IS_SUPPLEMENTARY(composite)) {
     956             :                     // The composite is longer than the starter,
     957             :                     // move the intermediate characters back one.
     958           0 :                     starterIsSupplementary=TRUE;
     959           0 :                     ++starter;  // temporarily increment for the loop boundary
     960           0 :                     q=pRemove;
     961           0 :                     r=++pRemove;
                                           // Copy backwards so that overlapping units are not clobbered.
     962           0 :                     while(starter<q) {
     963           0 :                         *--r=*--q;
     964             :                     }
     965           0 :                     *starter=U16_TRAIL(composite);
     966           0 :                     *--starter=U16_LEAD(composite);  // undo the temporary increment
     967             :                 } else {
     968             :                     // both are on the BMP
     969           0 :                     *starter=(UChar)composite;
     970             :                 }
     971             : 
     972             :                 /* remove the combining mark by moving the following text over it */
                                       // pRemove==p is possible here: the extra unit of a longer
                                       // composite may have used up the space of a one-unit mark.
     973           0 :                 if(pRemove<p) {
     974           0 :                     q=pRemove;
     975           0 :                     r=p;
     976           0 :                     while(r<limit) {
     977           0 :                         *q++=*r++;
     978             :                     }
     979           0 :                     limit=q;
     980           0 :                     p=pRemove;
     981             :                 }
     982             :                 // Keep prevCC because we removed the combining mark.
     983             : 
     984           0 :                 if(p==limit) {
     985           0 :                     break;
     986             :                 }
     987             :                 // Is the composite a starter that combines forward?
     988           0 :                 if(compositeAndFwd&1) {
     989             :                     compositionsList=
     990           0 :                         getCompositionsListForComposite(getNorm16(composite));
     991             :                 } else {
     992           0 :                     compositionsList=NULL;
     993             :                 }
     994             : 
     995             :                 // We combined; continue with looking for compositions.
     996           0 :                 continue;
     997             :             }
     998             :         }
     999             : 
    1000             :         // no combination this time
    1001           0 :         prevCC=cc;
    1002           0 :         if(p==limit) {
    1003           0 :             break;
    1004             :         }
    1005             : 
    1006             :         // If c did not combine, then check if it is a starter.
    1007           0 :         if(cc==0) {
    1008             :             // Found a new starter.
                                   // compositionsList is overwritten either way: it becomes NULL if
                                   // this starter has no compositions list.
    1009           0 :             if((compositionsList=getCompositionsListForDecompYes(norm16))!=NULL) {
    1010             :                 // It may combine with something, prepare for it.
                                       // p is already past c, so the starter begins 1 or 2 units back.
    1011           0 :                 if(U_IS_BMP(c)) {
    1012           0 :                     starterIsSupplementary=FALSE;
    1013           0 :                     starter=p-1;
    1014             :                 } else {
    1015           0 :                     starterIsSupplementary=TRUE;
    1016           0 :                     starter=p-2;
    1017             :                 }
    1018             :             }
    1019           0 :         } else if(onlyContiguous) {
    1020             :             // FCC: no discontiguous compositions; any intervening character blocks.
    1021           0 :             compositionsList=NULL;
    1022             :         }
    1023           0 :     }
                           // Report the (possibly shorter) end of the recomposed text to the buffer.
    1024           0 :     buffer.setReorderingLimit(limit);
    1025             : }
    1026             : 
                       // Returns the composite of the code points a and b, or U_SENTINEL if
                       // they do not combine.
                       // Jamo L + Jamo V and Hangul LV + Jamo T are computed arithmetically;
                       // everything else is looked up in a's compositions list via combine().
    1027             : UChar32
    1028           0 : Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
    1029           0 :     uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
    1030             :     const uint16_t *list;
    1031           0 :     if(isInert(norm16)) {
    1032           0 :         return U_SENTINEL;
    1033           0 :     } else if(norm16<minYesNoMappingsOnly) {
    1034           0 :         if(isJamoL(norm16)) {
                                   // b becomes the Jamo V index; every path in this branch returns,
                                   // so the modified b is not used further down.
    1035           0 :             b-=Hangul::JAMO_V_BASE;
    1036           0 :             if(0<=b && b<Hangul::JAMO_V_COUNT) {
    1037             :                 return
    1038             :                     (Hangul::HANGUL_BASE+
    1039           0 :                      ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
    1040           0 :                      Hangul::JAMO_T_COUNT);
    1041             :             } else {
    1042           0 :                 return U_SENTINEL;
    1043             :             }
    1044           0 :         } else if(isHangul(norm16)) {
                                   // b becomes the Jamo T index; this branch also always returns.
    1045           0 :             b-=Hangul::JAMO_T_BASE;
    1046           0 :             if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
    1047           0 :                 return a+b;
    1048             :             } else {
    1049           0 :                 return U_SENTINEL;
    1050             :             }
    1051             :         } else {
    1052             :             // 'a' has a compositions list in extraData
    1053           0 :             list=extraData+norm16;
    1054           0 :             if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
    1055           0 :                 list+=  // mapping pointer
    1056           0 :                     1+  // +1 to skip the first unit with the mapping length
    1057           0 :                     (*list&MAPPING_LENGTH_MASK);  // + mapping length
    1058             :             }
    1059             :         }
    1060           0 :     } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
                               // Values in these ranges have no compositions list to search.
    1061           0 :         return U_SENTINEL;
    1062             :     } else {
    1063           0 :         list=maybeYesCompositions+norm16-minMaybeYes;
    1064             :     }
    1065           0 :     if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
    1066           0 :         return U_SENTINEL;
    1067             :     }
                           // combine() returns -1 when there is no composition. With an arithmetic
                           // right shift -1>>1 is still -1, so the result can be returned directly
                           // (this presumes U_SENTINEL is -1, as the #else branch implies).
    1068             : #if U_SIGNED_RIGHT_SHIFT_IS_ARITHMETIC
    1069           0 :     return combine(list, b)>>1;
    1070             : #else
    1071             :     int32_t compositeAndFwd=combine(list, b);
    1072             :     return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
    1073             : #endif
    1074             : }
    1075             : 
    1076             : // Composes the UTF-16 text [src, limit[ (limit==NULL: src is NUL-terminated). Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
    1077             : // doCompose: normalize, appending the result to buffer
    1078             : // !doCompose: isNormalized (buffer must be empty and initialized); FALSE is returned as soon as the text is found not to be normalized
    1079             : UBool
    1080           0 : Normalizer2Impl::compose(const UChar *src, const UChar *limit,
    1081             :                          UBool onlyContiguous,
    1082             :                          UBool doCompose,
    1083             :                          ReorderingBuffer &buffer,
    1084             :                          UErrorCode &errorCode) const {
    1085             :     /*
    1086             :      * prevBoundary points to the last character before the current one
    1087             :      * that has a composition boundary before it with ccc==0 and quick check "yes".
    1088             :      * Keeping track of prevBoundary saves us looking for a composition boundary
    1089             :      * when we find a "no" or "maybe".
    1090             :      *
    1091             :      * When we back out from prevSrc back to prevBoundary,
    1092             :      * then we also remove those same characters (which had been simply copied
    1093             :      * or canonically-order-inserted) from the ReorderingBuffer.
    1094             :      * Therefore, at all times, the [prevBoundary..prevSrc[ source units
    1095             :      * must correspond 1:1 to destination units at the end of the destination buffer.
    1096             :      */
    1097           0 :     const UChar *prevBoundary=src;
    1098           0 :     UChar32 minNoMaybeCP=minCompNoMaybeCP;
    1099           0 :     if(limit==NULL) {  // NUL-terminated input: no limit was given
    1100           0 :         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP,
    1101             :                                            doCompose ? &buffer : NULL,
    1102           0 :                                            errorCode);
    1103           0 :         if(U_FAILURE(errorCode)) {
    1104           0 :             return FALSE;
    1105             :         }
    1106           0 :         if(prevBoundary<src) {
    1107             :             // Set prevBoundary to the last character in the prefix.
    1108           0 :             prevBoundary=src-1;
    1109             :         }
    1110           0 :         limit=u_strchr(src, 0);  // find the terminating NUL so that the loops below can compare against limit
    1111             :     }
    1112             : 
    1113             :     const UChar *prevSrc;
    1114           0 :     UChar32 c=0;
    1115           0 :     uint16_t norm16=0;
    1116             : 
    1117             :     // only for isNormalized
    1118           0 :     uint8_t prevCC=0;
    1119             : 
    1120             :     for(;;) {
    1121             :         // count code units below the minimum or with irrelevant data for the quick check
    1122           0 :         for(prevSrc=src; src!=limit;) {
    1123           0 :             if( (c=*src)<minNoMaybeCP ||
    1124           0 :                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
    1125             :             ) {
    1126           0 :                 ++src;
    1127           0 :             } else if(!U16_IS_SURROGATE(c)) {
    1128           0 :                 break;
    1129             :             } else {
    1130             :                 UChar c2;
    1131           0 :                 if(U16_IS_SURROGATE_LEAD(c)) {
    1132           0 :                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
    1133           0 :                         c=U16_GET_SUPPLEMENTARY(c, c2);
    1134             :                     }
    1135             :                 } else /* trail surrogate */ {
    1136           0 :                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
    1137           0 :                         --src;  // back up to the lead surrogate so that src points to the start of the pair
    1138           0 :                         c=U16_GET_SUPPLEMENTARY(c2, c);
    1139             :                     }
    1140             :                 }
    1141           0 :                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
    1142           0 :                     src+=U16_LENGTH(c);
    1143             :                 } else {
    1144           0 :                     break;
    1145             :                 }
    1146             :             }
    1147             :         }
    1148             :         // copy these code units all at once
    1149           0 :         if(src!=prevSrc) {
    1150           0 :             if(doCompose) {
    1151           0 :                 if(!buffer.appendZeroCC(prevSrc, src, errorCode)) {
    1152           0 :                     break;
    1153             :                 }
    1154             :             } else {
    1155           0 :                 prevCC=0;
    1156             :             }
    1157           0 :             if(src==limit) {
    1158           0 :                 break;
    1159             :             }
    1160             :             // Set prevBoundary to the last character in the quick check loop.
    1161           0 :             prevBoundary=src-1;
    1162           0 :             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
    1163           0 :                 U16_IS_LEAD(*(prevBoundary-1))
    1164             :             ) {
    1165           0 :                 --prevBoundary;
    1166             :             }
    1167             :             // The start of the current character (c).
    1168           0 :             prevSrc=src;
    1169           0 :         } else if(src==limit) {
    1170           0 :             break;
    1171             :         }
    1172             : 
    1173           0 :         src+=U16_LENGTH(c);  // step over the current character c, which ended the quick check loop
    1174             :         /*
    1175             :          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
    1176             :          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
    1177             :          * or has ccc!=0.
    1178             :          * Check for Jamo V/T, then for regular characters.
    1179             :          * c is not a Hangul syllable or Jamo L because those have "yes" properties.
    1180             :          */
    1181           0 :         if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
    1182           0 :             UChar prev=*(prevSrc-1);
    1183           0 :             UBool needToDecompose=FALSE;
    1184           0 :             if(c<Hangul::JAMO_T_BASE) {
    1185             :                 // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
    1186           0 :                 prev=(UChar)(prev-Hangul::JAMO_L_BASE);
    1187           0 :                 if(prev<Hangul::JAMO_L_COUNT) {
    1188           0 :                     if(!doCompose) {
    1189           0 :                         return FALSE;
    1190             :                     }
    1191             :                     UChar syllable=(UChar)
    1192           0 :                         (Hangul::HANGUL_BASE+
    1193           0 :                          (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))*
    1194           0 :                          Hangul::JAMO_T_COUNT);
    1195             :                     UChar t;
    1196           0 :                     if(src!=limit && (t=(UChar)(*src-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) {
    1197           0 :                         ++src;
    1198           0 :                         syllable+=t;  // The next character was a Jamo T.
    1199           0 :                         prevBoundary=src;
    1200           0 :                         buffer.setLastChar(syllable);  // overwrites the last buffer unit (the previously copied Jamo L)
    1201           0 :                         continue;
    1202             :                     }
    1203             :                     // If we see L+V+x where x!=T then we drop to the slow path,
    1204             :                     // decompose and recompose.
    1205             :                     // This is to deal with NFKC finding normal L and V but a
    1206             :                     // compatibility variant of a T. We need to either fully compose that
    1207             :                     // combination here (which would complicate the code and may not work
    1208             :                     // with strange custom data) or use the slow path -- or else our replacing
    1209             :                     // two input characters (L+V) with one output character (LV syllable)
    1210             :                     // would violate the invariant that [prevBoundary..prevSrc[ has the same
    1211             :                     // length as what we appended to the buffer since prevBoundary.
    1212           0 :                     needToDecompose=TRUE;
    1213             :                 }
    1214           0 :             } else if(Hangul::isHangulWithoutJamoT(prev)) {
    1215             :                 // c is a Jamo Trailing consonant,
    1216             :                 // compose with previous Hangul LV that does not contain a Jamo T.
    1217           0 :                 if(!doCompose) {
    1218           0 :                     return FALSE;
    1219             :                 }
    1220           0 :                 buffer.setLastChar((UChar)(prev+c-Hangul::JAMO_T_BASE));
    1221           0 :                 prevBoundary=src;
    1222           0 :                 continue;
    1223             :             }
    1224           0 :             if(!needToDecompose) {
    1225             :                 // The Jamo V/T did not compose into a Hangul syllable.
    1226           0 :                 if(doCompose) {
    1227           0 :                     if(!buffer.appendBMP((UChar)c, 0, errorCode)) {
    1228           0 :                         break;
    1229             :                     }
    1230             :                 } else {
    1231           0 :                     prevCC=0;
    1232             :                 }
    1233           0 :                 continue;
    1234             :             }
    1235             :         }
    1236             :         /*
    1237             :          * Source buffer pointers:
    1238             :          *
    1239             :          *  all done      quick check   current char  not yet
    1240             :          *                "yes" but     (c)           processed
    1241             :          *                may combine
    1242             :          *                forward
    1243             :          * [-------------[-------------[-------------[-------------[
    1244             :          * |             |             |             |             |
    1245             :          * orig. src     prevBoundary  prevSrc       src           limit
    1246             :          *
    1247             :          *
    1248             :          * Destination buffer pointers inside the ReorderingBuffer:
    1249             :          *
    1250             :          *  all done      might take    not filled yet
    1251             :          *                characters for
    1252             :          *                reordering
    1253             :          * [-------------[-------------[-------------[
    1254             :          * |             |             |             |
    1255             :          * start         reorderStart  limit         |
    1256             :          *                             +remainingCap.+
    1257             :          */
    1258           0 :         if(norm16>=MIN_YES_YES_WITH_CC) {
    1259           0 :             uint8_t cc=(uint8_t)norm16;  // cc!=0
    1260           0 :             if( onlyContiguous &&  // FCC
    1261           0 :                 (doCompose ? buffer.getLastCC() : prevCC)==0 &&
    1262           0 :                 prevBoundary<prevSrc &&
    1263             :                 // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
    1264             :                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
    1265             :                 // passed the quick check "yes && ccc==0" test.
    1266             :                 // Check whether the last character was a "yesYes" or a "yesNo".
    1267             :                 // If a "yesNo", then we get its trailing ccc from its
    1268             :                 // mapping and check for canonical order.
    1269             :                 // All other cases are ok.
    1270           0 :                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
    1271             :             ) {
    1272             :                 // Fails FCD test, need to decompose and contiguously recompose.
    1273           0 :                 if(!doCompose) {
    1274           0 :                     return FALSE;
    1275             :                 }
    1276           0 :             } else if(doCompose) {
    1277           0 :                 if(!buffer.append(c, cc, errorCode)) {
    1278           0 :                     break;
    1279             :                 }
    1280           0 :                 continue;
    1281           0 :             } else if(prevCC<=cc) {  // isNormalized: combining classes are still in non-decreasing order
    1282           0 :                 prevCC=cc;
    1283           0 :                 continue;
    1284             :             } else {
    1285           0 :                 return FALSE;
    1286             :             }
    1287           0 :         } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
    1288           0 :             return FALSE;
    1289             :         }
    1290             : 
    1291             :         /*
    1292             :          * Find appropriate boundaries around this character,
    1293             :          * decompose the source text from between the boundaries,
    1294             :          * and recompose it.
    1295             :          *
    1296             :          * We may need to remove the last few characters from the ReorderingBuffer
    1297             :          * to account for source text that was copied or appended
    1298             :          * but needs to take part in the recomposition.
    1299             :          */
    1300             : 
    1301             :         /*
    1302             :          * Find the last composition boundary in [prevBoundary..src[.
    1303             :          * It is either the decomposition of the current character (at prevSrc),
    1304             :          * or prevBoundary.
    1305             :          */
    1306           0 :         if(hasCompBoundaryBefore(c, norm16)) {
    1307           0 :             prevBoundary=prevSrc;
    1308           0 :         } else if(doCompose) {
    1309           0 :             buffer.removeSuffix((int32_t)(prevSrc-prevBoundary));  // relies on the 1:1 source/destination invariant described at the top
    1310             :         }
    1311             : 
    1312             :         // Find the next composition boundary in [src..limit[ -
    1313             :         // modifies src to point to the next starter.
    1314           0 :         src=(UChar *)findNextCompBoundary(src, limit);  // NOTE(review): the cast looks unnecessary; both the return value and src are const UChar *
    1315             : 
    1316             :         // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
    1317           0 :         int32_t recomposeStartIndex=buffer.length();
    1318           0 :         if(!decomposeShort(prevBoundary, src, buffer, errorCode)) {
    1319           0 :             break;
    1320             :         }
    1321           0 :         recompose(buffer, recomposeStartIndex, onlyContiguous);
    1322           0 :         if(!doCompose) {
    1323           0 :             if(!buffer.equals(prevBoundary, src)) {  // isNormalized: the recomposed text must equal the source segment
    1324           0 :                 return FALSE;
    1325             :             }
    1326           0 :             buffer.remove();  // isNormalized uses the buffer only as scratch space for this comparison
    1327           0 :             prevCC=0;
    1328             :         }
    1329             : 
    1330             :         // Move to the next starter. We never need to look back before this point again.
    1331           0 :         prevBoundary=src;
    1332           0 :     }
    1333           0 :     return TRUE;  // NOTE(review): also reached via break after a failed buffer append/decomposeShort; presumably errorCode is set then and callers must check it
    1334             : }
    1335             : 
    1336             : // Quick check counterpart of compose() that takes no output buffer. Very similar to compose(): Make the same changes in both places if relevant.
    1337             : // pQCResult==NULL: spanQuickCheckYes; returns prevBoundary at the first "maybe" or "no", or limit if there is none
    1338             : // pQCResult!=NULL: quickCheck (*pQCResult must be UNORM_YES); a "maybe" sets UNORM_MAYBE and scanning continues, a "no" sets UNORM_NO and returns
    1339             : const UChar *
    1340           0 : Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
    1341             :                                    UBool onlyContiguous,
    1342             :                                    UNormalizationCheckResult *pQCResult) const {
    1343             :     /*
    1344             :      * prevBoundary points to the last character before the current one
    1345             :      * that has a composition boundary before it with ccc==0 and quick check "yes".
    1346             :      */
    1347           0 :     const UChar *prevBoundary=src;
    1348           0 :     UChar32 minNoMaybeCP=minCompNoMaybeCP;
    1349           0 :     if(limit==NULL) {  // NUL-terminated input: no limit was given
    1350           0 :         UErrorCode errorCode=U_ZERO_ERROR;  // NOTE(review): never examined afterwards; presumably nothing can fail with a NULL buffer -- confirm
    1351           0 :         src=copyLowPrefixFromNulTerminated(src, minNoMaybeCP, NULL, errorCode);
    1352           0 :         if(prevBoundary<src) {
    1353             :             // Set prevBoundary to the last character in the prefix.
    1354           0 :             prevBoundary=src-1;
    1355             :         }
    1356           0 :         limit=u_strchr(src, 0);
    1357             :     }
    1358             : 
    1359             :     const UChar *prevSrc;
    1360           0 :     UChar32 c=0;
    1361           0 :     uint16_t norm16=0;
    1362           0 :     uint8_t prevCC=0;
    1363             : 
    1364             :     for(;;) {
    1365             :         // count code units below the minimum or with irrelevant data for the quick check
    1366           0 :         for(prevSrc=src;;) {
    1367           0 :             if(src==limit) {
    1368           0 :                 return src;  // end of input; *pQCResult (if any) keeps whatever value it has at this point
    1369             :             }
    1370           0 :             if( (c=*src)<minNoMaybeCP ||
    1371           0 :                 isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
    1372             :             ) {
    1373           0 :                 ++src;
    1374           0 :             } else if(!U16_IS_SURROGATE(c)) {
    1375           0 :                 break;
    1376             :             } else {
    1377             :                 UChar c2;
    1378           0 :                 if(U16_IS_SURROGATE_LEAD(c)) {
    1379           0 :                     if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
    1380           0 :                         c=U16_GET_SUPPLEMENTARY(c, c2);
    1381             :                     }
    1382             :                 } else /* trail surrogate */ {
    1383           0 :                     if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
    1384           0 :                         --src;  // back up to the lead surrogate so that src points to the start of the pair
    1385           0 :                         c=U16_GET_SUPPLEMENTARY(c2, c);
    1386             :                     }
    1387             :                 }
    1388           0 :                 if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
    1389           0 :                     src+=U16_LENGTH(c);
    1390             :                 } else {
    1391           0 :                     break;
    1392             :                 }
    1393             :             }
    1394           0 :         }
    1395           0 :         if(src!=prevSrc) {
    1396             :             // Set prevBoundary to the last character in the quick check loop.
    1397           0 :             prevBoundary=src-1;
    1398           0 :             if( U16_IS_TRAIL(*prevBoundary) && prevSrc<prevBoundary &&
    1399           0 :                 U16_IS_LEAD(*(prevBoundary-1))
    1400             :             ) {
    1401           0 :                 --prevBoundary;
    1402             :             }
    1403           0 :             prevCC=0;
    1404             :             // The start of the current character (c).
    1405           0 :             prevSrc=src;
    1406             :         }
    1407             : 
    1408           0 :         src+=U16_LENGTH(c);  // step over the current character c, which ended the quick check loop
    1409             :         /*
    1410             :          * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
    1411             :          * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
    1412             :          * or has ccc!=0.
    1413             :          */
    1414           0 :         if(isMaybeOrNonZeroCC(norm16)) {
    1415           0 :             uint8_t cc=getCCFromYesOrMaybe(norm16);
    1416           0 :             if( onlyContiguous &&  // FCC
    1417           0 :                 cc!=0 &&
    1418           0 :                 prevCC==0 &&
    1419           0 :                 prevBoundary<prevSrc &&
    1420             :                 // prevCC==0 && prevBoundary<prevSrc tell us that
    1421             :                 // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
    1422             :                 // passed the quick check "yes && ccc==0" test.
    1423             :                 // Check whether the last character was a "yesYes" or a "yesNo".
    1424             :                 // If a "yesNo", then we get its trailing ccc from its
    1425             :                 // mapping and check for canonical order.
    1426             :                 // All other cases are ok.
    1427           0 :                 getTrailCCFromCompYesAndZeroCC(prevBoundary, prevSrc)>cc
    1428             :             ) {
    1429             :                 // Fails FCD test. Falls through to the "no" handling after this if statement.
    1430           0 :             } else if(prevCC<=cc || cc==0) {
    1431           0 :                 prevCC=cc;
    1432           0 :                 if(norm16<MIN_YES_YES_WITH_CC) {  // not a plain "yes" with ccc!=0, so c is a "maybe"
    1433           0 :                     if(pQCResult!=NULL) {
    1434           0 :                         *pQCResult=UNORM_MAYBE;
    1435             :                     } else {
    1436           0 :                         return prevBoundary;  // spanQuickCheckYes: the "yes" span ends before a "maybe"
    1437             :                     }
    1438             :                 }
    1439           0 :                 continue;
    1440             :             }
    1441             :         }
    1442           0 :         if(pQCResult!=NULL) {
    1443           0 :             *pQCResult=UNORM_NO;
    1444             :         }
    1445           0 :         return prevBoundary;
    1446           0 :     }
    1447             : }
    1448             : 
    1449           0 : void Normalizer2Impl::composeAndAppend(const UChar *src, const UChar *limit,  // Appends [src, limit[ to buffer, recomposing the text around the seam between the existing buffer contents and src.
    1450             :                                        UBool doCompose,  // TRUE: compose the rest of src; FALSE: append the rest of src as is via appendZeroCC()
    1451             :                                        UBool onlyContiguous,
    1452             :                                        UnicodeString &safeMiddle,  // receives the suffix removed from buffer (only when the seam is recomposed)
    1453             :                                        ReorderingBuffer &buffer,
    1454             :                                        UErrorCode &errorCode) const {
    1455           0 :     if(!buffer.isEmpty()) {
    1456           0 :         const UChar *firstStarterInSrc=findNextCompBoundary(src, limit);
    1457           0 :         if(src!=firstStarterInSrc) {  // src does not begin at a composition boundary, so it may interact with the end of buffer
    1458           0 :             const UChar *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(),
    1459           0 :                                                                     buffer.getLimit());
    1460           0 :             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest);
    1461           0 :             UnicodeString middle(lastStarterInDest, destSuffixLength);
    1462           0 :             buffer.removeSuffix(destSuffixLength);
    1463           0 :             safeMiddle=middle;  // saved before the src prefix is appended; presumably lets the caller restore the destination -- confirm
    1464           0 :             middle.append(src, (int32_t)(firstStarterInSrc-src));
    1465           0 :             const UChar *middleStart=middle.getBuffer();
    1466           0 :             compose(middleStart, middleStart+middle.length(), onlyContiguous,
    1467           0 :                     TRUE, buffer, errorCode);  // the seam is always composed, regardless of the doCompose parameter
    1468           0 :             if(U_FAILURE(errorCode)) {
    1469           0 :                 return;
    1470             :             }
    1471           0 :             src=firstStarterInSrc;
    1472             :         }
    1473             :     }
    1474           0 :     if(doCompose) {
    1475           0 :         compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
    1476             :     } else {
    1477           0 :         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
    1478           0 :             limit=u_strchr(src, 0);
    1479             :         }
    1480           0 :         buffer.appendZeroCC(src, limit, errorCode);
    1481             :     }
    1482             : }
    1483             : 
    1484             : /**
    1485             :  * Does c have a composition boundary before it?
    1486             :  * True if its decomposition begins with a character that has
    1487             :  * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
    1488             :  * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
    1489             :  * (isCompYesAndZeroCC()) so we need not decompose.
    1490             :  * The norm16 argument is used as the data for c on the first iteration;
    1491             :  * the loop repeats only for algorithmic mappings, with the mapped code point.
    1492             :  */
    1491           0 : UBool Normalizer2Impl::hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
    1492             :     for(;;) {
    1493           0 :         if(isCompYesAndZeroCC(norm16)) {
    1494           0 :             return TRUE;
    1495           0 :         } else if(isMaybeOrNonZeroCC(norm16)) {
    1496           0 :             return FALSE;
    1497           0 :         } else if(isDecompNoAlgorithmic(norm16)) {
    1498           0 :             c=mapAlgorithmic(c, norm16);
    1499           0 :             norm16=getNorm16(c);  // look again, now at the data for the mapped code point
    1500             :         } else {
    1501             :             // c decomposes, get everything from the variable-length extra data
    1502           0 :             const uint16_t *mapping=getMapping(norm16);
    1503           0 :             uint16_t firstUnit=*mapping;
    1504           0 :             if((firstUnit&MAPPING_LENGTH_MASK)==0) {  // empty mapping: there is no first character to test
    1505           0 :                 return FALSE;
    1506             :             }
    1507           0 :             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD) && (*(mapping-1)&0xff00)) {
    1508           0 :                 return FALSE;  // non-zero leadCC
    1509             :             }
    1510           0 :             int32_t i=1;  // skip over the firstUnit
    1511             :             UChar32 c;  // NOTE(review): shadows the parameter c; harmless here because the function returns right after using it
    1512           0 :             U16_NEXT_UNSAFE(mapping, i, c);
    1513           0 :             return isCompYesAndZeroCC(getNorm16(c));
    1514             :         }
    1515           0 :     }
    1516             : }
    1517             : 
    1518           0 : UBool Normalizer2Impl::hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const {  // Does c have a composition boundary after it? Loops only for algorithmic mappings.
    1519             :     for(;;) {
    1520           0 :         uint16_t norm16=getNorm16(c);
    1521           0 :         if(isInert(norm16)) {
    1522           0 :             return TRUE;
    1523           0 :         } else if(norm16<=minYesNo) {
    1524             :             // Hangul: norm16==minYesNo
    1525             :             // Hangul LVT has a boundary after it.
    1526             :             // Hangul LV and non-inert yesYes characters combine forward.
    1527           0 :             return isHangul(norm16) && !Hangul::isHangulWithoutJamoT((UChar)c);
    1528           0 :         } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {  // testInert uses the lower threshold minNoNo, so noNo characters are rejected here too
    1529           0 :             return FALSE;
    1530           0 :         } else if(isDecompNoAlgorithmic(norm16)) {
    1531           0 :             c=mapAlgorithmic(c, norm16);  // test the mapped code point on the next iteration
    1532             :         } else {
    1533             :             // c decomposes, get everything from the variable-length extra data.
    1534             :             // If testInert, then c must be a yesNo character which has lccc=0,
    1535             :             // otherwise it could be a noNo.
    1536           0 :             const uint16_t *mapping=getMapping(norm16);
    1537           0 :             uint16_t firstUnit=*mapping;
    1538             :             // TRUE if
    1539             :             //   not MAPPING_NO_COMP_BOUNDARY_AFTER
    1540             :             //     (which is set if
    1541             :             //       c is not deleted, and
    1542             :             //       it and its decomposition do not combine forward, and it has a starter)
    1543             :             //   and if FCC then trailCC<=1
    1544             :             return
    1545           0 :                 (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
    1546           0 :                 (!onlyContiguous || firstUnit<=0x1ff);  // tccc is firstUnit>>8 (as in getFCD16FromNormData()), so <=0x1ff means trailCC<=1
    1547             :         }
    1548           0 :     }
    1549             : }
    1550             : 
    1551           0 : const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p) const {  // Iterates backward from p and returns the start of the first code point found that has a composition boundary before it.
    1552           0 :     BackwardUTrie2StringIterator iter(normTrie, start, p);
    1553             :     uint16_t norm16;
    1554           0 :     do {
    1555           0 :         norm16=iter.previous16();
    1556           0 :     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));  // NOTE(review): no explicit test for reaching start; presumably previous16() then yields a value that ends the loop -- confirm
    1557             :     // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
    1558             :     // but that's probably not worth the extra cost.
    1559           0 :     return iter.codePointStart;
    1560             : }
    1561             : 
    1562           0 : const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit) const {  // Iterates forward from p and returns the start of the first code point found that has a composition boundary before it.
    1563           0 :     ForwardUTrie2StringIterator iter(normTrie, p, limit);
    1564             :     uint16_t norm16;
    1565           0 :     do {
    1566           0 :         norm16=iter.next16();
    1567           0 :     } while(!hasCompBoundaryBefore(iter.codePoint, norm16));  // NOTE(review): no explicit test for reaching limit; presumably next16() then yields a value that ends the loop -- confirm
    1568           0 :     return iter.codePointStart;
    1569             : }
    1570             : 
    1571             : // Note: normalizer2impl.cpp r30982 (2011-nov-27)
    1572             : // still had getFCDTrie() which built and cached an FCD trie.
    1573             : // That provided faster access to FCD data than getFCD16FromNormData()
    1574             : // but required synchronization and consumed some 10kB of heap memory
    1575             : // in any process that uses FCD (e.g., via collation).
    1576             : // tccc180[] and smallFCD[] are intended to help with any loss of performance,
    1577             : // at least for Latin & CJK.
    1578             : 
    1579             : // Gets the FCD value from the regular normalization data: lccc in bits 15..8 and tccc in bits 7..0 of the result.
    1580        1137 : uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
    1581             :     // Only loops for 1:1 algorithmic mappings.
    1582             :     for(;;) {
    1583        1137 :         uint16_t norm16=getNorm16(c);
    1584        1137 :         if(norm16<=minYesNo) {
    1585             :             // no decomposition or Hangul syllable, all zeros
    1586         170 :             return 0;
    1587         967 :         } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
    1588             :             // combining mark
    1589           0 :             norm16&=0xff;
    1590           0 :             return norm16|(norm16<<8);  // the same ccc serves as both lccc and tccc
    1591         967 :         } else if(norm16>=minMaybeYes) {  // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES: reported as lccc=tccc=0
    1592           0 :             return 0;
    1593         967 :         } else if(isDecompNoAlgorithmic(norm16)) {
    1594         177 :             c=mapAlgorithmic(c, norm16);  // loop again with the mapped code point
    1595             :         } else {
    1596             :             // c decomposes, get everything from the variable-length extra data
    1597         790 :             const uint16_t *mapping=getMapping(norm16);
    1598         790 :             uint16_t firstUnit=*mapping;
    1599         790 :             if((firstUnit&MAPPING_LENGTH_MASK)==0) {
    1600             :                 // A character that is deleted (maps to an empty string) must
    1601             :                 // get the worst-case lccc and tccc values because arbitrary
    1602             :                 // characters on both sides will become adjacent.
    1603           3 :                 return 0x1ff;
    1604             :             } else {
    1605         787 :                 norm16=firstUnit>>8;  // tccc
    1606         787 :                 if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) {
    1607           0 :                     norm16|=*(mapping-1)&0xff00;  // lccc
    1608             :                 }
    1609         787 :                 return norm16;
    1610             :             }
    1611             :         }
    1612         177 :     }
    1613             : }
    1614             : 
    1615             : // Dual functionality:
    1616             : // buffer!=NULL: normalize (output is appended to *buffer; returns src, i.e. where processing stopped)
    1617             : // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes (returns prevBoundary at the first out-of-order character, otherwise src)
    1618             : const UChar *
    1619           0 : Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
    1620             :                          ReorderingBuffer *buffer,
    1621             :                          UErrorCode &errorCode) const {
    1622             :     // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
    1623             :     // Similar to the prevBoundary in the compose() implementation.
    1624           0 :     const UChar *prevBoundary=src;
    1625           0 :     int32_t prevFCD16=0;
    1626           0 :     if(limit==NULL) {  // limit==NULL: NUL-terminated input; limit is computed below with u_strchr()
    1627           0 :         src=copyLowPrefixFromNulTerminated(src, MIN_CCC_LCCC_CP, buffer, errorCode);
    1628           0 :         if(U_FAILURE(errorCode)) {
    1629           0 :             return src;
    1630             :         }
    1631           0 :         if(prevBoundary<src) {  // the helper advanced src past a prefix
    1632           0 :             prevBoundary=src;
    1633             :             // We know that the previous character's lccc==0.
    1634             :             // Fetching the fcd16 value was deferred for this below-U+0300 code point.
    1635           0 :             prevFCD16=getFCD16(*(src-1));
    1636           0 :             if(prevFCD16>1) {
    1637           0 :                 --prevBoundary;
    1638             :             }
    1639             :         }
    1640           0 :         limit=u_strchr(src, 0);
    1641             :     }
    1642             : 
    1643             :     // Note: In this function we use buffer->appendZeroCC() because we track
    1644             :     // the lead and trail combining classes here, rather than leaving it to
    1645             :     // the ReorderingBuffer.
    1646             :     // The exception is the call to decomposeShort() which uses the buffer
    1647             :     // in the normal way.
    1648             : 
    1649             :     const UChar *prevSrc;
    1650           0 :     UChar32 c=0;
    1651           0 :     uint16_t fcd16=0;
    1652             : 
    1653             :     for(;;) {
    1654             :         // count code units with lccc==0
    1655           0 :         for(prevSrc=src; src!=limit;) {
    1656           0 :             if((c=*src)<MIN_CCC_LCCC_CP) {
    1657           0 :                 prevFCD16=~c;  // negative marker that encodes c itself; the real lookup happens later in the prevFCD16<0 branch
    1658           0 :                 ++src;
    1659           0 :             } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
    1660           0 :                 prevFCD16=0;
    1661           0 :                 ++src;
    1662             :             } else {
    1663           0 :                 if(U16_IS_SURROGATE(c)) {
    1664             :                     UChar c2;
    1665           0 :                     if(U16_IS_SURROGATE_LEAD(c)) {
    1666           0 :                         if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
    1667           0 :                             c=U16_GET_SUPPLEMENTARY(c, c2);
    1668             :                         }
    1669             :                     } else /* trail surrogate */ {
    1670           0 :                         if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
    1671           0 :                             --src;  // step back onto the lead surrogate so that src points at the start of the pair
    1672           0 :                             c=U16_GET_SUPPLEMENTARY(c2, c);
    1673             :                         }
    1674             :                     }
    1675             :                 }
    1676           0 :                 if((fcd16=getFCD16FromNormData(c))<=0xff) {  // <=0xff: the high (lccc) byte is zero
    1677           0 :                     prevFCD16=fcd16;
    1678           0 :                     src+=U16_LENGTH(c);
    1679             :                 } else {
    1680           0 :                     break;  // lccc!=0: leave the fast loop with c and fcd16 set, src still at the start of c
    1681             :                 }
    1682             :             }
    1683             :         }
    1684             :         // copy these code units all at once
    1685           0 :         if(src!=prevSrc) {
    1686           0 :             if(buffer!=NULL && !buffer->appendZeroCC(prevSrc, src, errorCode)) {
    1687           0 :                 break;  // appendZeroCC() returned false: stop
    1688             :             }
    1689           0 :             if(src==limit) {
    1690           0 :                 break;
    1691             :             }
    1692           0 :             prevBoundary=src;
    1693             :             // We know that the previous character's lccc==0.
    1694           0 :             if(prevFCD16<0) {
    1695             :                 // Fetching the fcd16 value was deferred for this below-U+0300 code point.
    1696           0 :                 UChar32 prev=~prevFCD16;  // undo the ~c encoding from the fast loop
    1697           0 :                 prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
    1698           0 :                 if(prevFCD16>1) {
    1699           0 :                     --prevBoundary;
    1700             :                 }
    1701             :             } else {
    1702           0 :                 const UChar *p=src-1;
    1703           0 :                 if(U16_IS_TRAIL(*p) && prevSrc<p && U16_IS_LEAD(*(p-1))) {
    1704           0 :                     --p;
    1705             :                     // Need to fetch the previous character's FCD value because
    1706             :                     // prevFCD16 was just for the trail surrogate code point.
    1707           0 :                     prevFCD16=getFCD16FromNormData(U16_GET_SUPPLEMENTARY(p[0], p[1]));
    1708             :                     // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
    1709             :                 }
    1710           0 :                 if(prevFCD16>1) {
    1711           0 :                     prevBoundary=p;  // p is the start of the previous character
    1712             :                 }
    1713             :             }
    1714             :             // The start of the current character (c).
    1715           0 :             prevSrc=src;
    1716           0 :         } else if(src==limit) {
    1717           0 :             break;
    1718             :         }
    1719             : 
    1720           0 :         src+=U16_LENGTH(c);
    1721             :         // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
    1722             :         // Check for proper order, and decompose locally if necessary.
    1723           0 :         if((prevFCD16&0xff)<=(fcd16>>8)) {
    1724             :             // proper order: prev tccc <= current lccc
    1725           0 :             if((fcd16&0xff)<=1) {
    1726           0 :                 prevBoundary=src;
    1727             :             }
    1728           0 :             if(buffer!=NULL && !buffer->appendZeroCC(c, errorCode)) {
    1729           0 :                 break;
    1730             :             }
    1731           0 :             prevFCD16=fcd16;
    1732           0 :             continue;
    1733           0 :         } else if(buffer==NULL) {
    1734           0 :             return prevBoundary;  // quick check "no"
    1735             :         } else {
    1736             :             /*
    1737             :              * Back out the part of the source that we copied or appended
    1738             :              * already but is now going to be decomposed.
    1739             :              * prevSrc is set to after what was copied/appended.
    1740             :              */
    1741           0 :             buffer->removeSuffix((int32_t)(prevSrc-prevBoundary));
    1742             :             /*
    1743             :              * Find the part of the source that needs to be decomposed,
    1744             :              * up to the next safe boundary.
    1745             :              */
    1746           0 :             src=findNextFCDBoundary(src, limit);
    1747             :             /*
    1748             :              * The source text does not fulfill the conditions for FCD.
    1749             :              * Decompose and reorder a limited piece of the text.
    1750             :              */
    1751           0 :             if(!decomposeShort(prevBoundary, src, *buffer, errorCode)) {
    1752           0 :                 break;
    1753             :             }
    1754           0 :             prevBoundary=src;
    1755           0 :             prevFCD16=0;  // tracking restarts after the decomposed piece
    1756             :         }
    1757           0 :     }
    1758           0 :     return src;
    1759             : }
    1760             : 
    1761           0 : void Normalizer2Impl::makeFCDAndAppend(const UChar *src, const UChar *limit,  // Appends [src, limit) to buffer; when buffer is non-empty, the text around the join is first re-processed with makeFCD().
    1762             :                                        UBool doMakeFCD,
    1763             :                                        UnicodeString &safeMiddle,
    1764             :                                        ReorderingBuffer &buffer,
    1765             :                                        UErrorCode &errorCode) const {
    1766           0 :     if(!buffer.isEmpty()) {
    1767           0 :         const UChar *firstBoundaryInSrc=findNextFCDBoundary(src, limit);
    1768           0 :         if(src!=firstBoundaryInSrc) {  // src does not start at a boundary: its prefix must be combined with the end of buffer
    1769           0 :             const UChar *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(),
    1770           0 :                                                                     buffer.getLimit());
    1771           0 :             int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest);
    1772           0 :             UnicodeString middle(lastBoundaryInDest, destSuffixLength);
    1773           0 :             buffer.removeSuffix(destSuffixLength);
    1774           0 :             safeMiddle=middle;  // at this point middle holds only the suffix just removed from buffer
    1775           0 :             middle.append(src, (int32_t)(firstBoundaryInSrc-src));  // middle = removed suffix + source prefix up to its first boundary
    1776           0 :             const UChar *middleStart=middle.getBuffer();
    1777           0 :             makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode);
    1778           0 :             if(U_FAILURE(errorCode)) {
    1779           0 :                 return;
    1780             :             }
    1781           0 :             src=firstBoundaryInSrc;  // the prefix was already handled as part of middle
    1782             :         }
    1783             :     }
    1784           0 :     if(doMakeFCD) {
    1785           0 :         makeFCD(src, limit, &buffer, errorCode);
    1786             :     } else {
    1787           0 :         if(limit==NULL) {  // appendZeroCC() needs limit!=NULL
    1788           0 :             limit=u_strchr(src, 0);
    1789             :         }
    1790           0 :         buffer.appendZeroCC(src, limit, errorCode);  // doMakeFCD is false: the rest of the source is appended without FCD processing
    1791             :     }
    1792             : }
    1793             : 
    1794           0 : const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const UChar *p) const {  // Scans backward from p, not going before start, and returns the resulting position.
    1795           0 :     while(start<p && previousFCD16(start, p)>0xff) {}  // empty body: presumably previousFCD16() moves p back by one code point (defined elsewhere -- confirm); stops once the value is <=0xff
    1796           0 :     return p;
    1797             : }
    1798             : 
    1799           0 : const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *limit) const {  // Returns the start of the first code point in [p, limit) whose FCD16 value is <=0xff; if there is none, returns p after the scan.
    1800           0 :     while(p<limit) {
    1801           0 :         const UChar *codePointStart=p;  // remembered because nextFCD16() presumably advances p (defined elsewhere -- confirm)
    1802           0 :         if(nextFCD16(p, limit)<=0xff) {
    1803           0 :             return codePointStart;
    1804             :         }
    1805             :     }
    1806           0 :     return p;
    1807             : }
    1808             : 
    1809             : // CanonicalIterator data -------------------------------------------------- ***
    1810             : 
    1811           0 : CanonIterData::CanonIterData(UErrorCode &errorCode) :  // Initializes both members; failures are reported through errorCode.
    1812           0 :         trie(utrie2_open(0, 0, &errorCode)),  // closed again in the destructor
    1813           0 :         canonStartSets(uprv_deleteUObject, NULL, errorCode) {}  // uprv_deleteUObject is presumably the element deleter of the vector -- confirm against its constructor
    1814             : 
    1815           0 : CanonIterData::~CanonIterData() {
    1816           0 :     utrie2_close(trie);  // releases the trie opened in the constructor
    1817           0 : }
    1818             : 
    1819           0 : void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {  // Records origin in the trie value (or the start set) that belongs to decompLead.
    1820           0 :     uint32_t canonValue=utrie2_get32(trie, decompLead);
    1821           0 :     if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
    1822             :         // origin is the first character whose decomposition starts with
    1823             :         // the character for which we are setting the value.
    1824           0 :         utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);  // the single origin is stored directly in the value bits
    1825             :     } else {
    1826             :         // origin is not the first character, or it is U+0000.
    1827             :         UnicodeSet *set;
    1828           0 :         if((canonValue&CANON_HAS_SET)==0) {
    1829           0 :             set=new UnicodeSet;
    1830           0 :             if(set==NULL) {
    1831           0 :                 errorCode=U_MEMORY_ALLOCATION_ERROR;
    1832           0 :                 return;
    1833             :             }
    1834           0 :             UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);  // the origin that was stored inline so far (0 if none)
    1835           0 :             canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();  // value bits now hold the current size of canonStartSets, used as the index of the set added below
    1836           0 :             utrie2_set32(trie, decompLead, canonValue, &errorCode);
    1837           0 :             canonStartSets.addElement(set, errorCode);  // NOTE(review): errorCode is not checked before set is used below -- confirm who owns set if this fails
    1838           0 :             if(firstOrigin!=0) {
    1839           0 :                 set->add(firstOrigin);
    1840             :             }
    1841             :         } else {
    1842           0 :             set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)];  // existing set: the value bits are its index
    1843             :         }
    1844           0 :         set->add(origin);
    1845             :     }
    1846             : }
    1847             : 
    1848             : U_CDECL_BEGIN
    1849             : 
    1850             : // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
    1851             : //     context: the Normalizer2Impl
    1852             : static UBool U_CALLCONV
    1853           0 : enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    1854           0 :     UErrorCode errorCode = U_ZERO_ERROR;  // local error code: a failure is only visible to the caller through the return value
    1855           0 :     if (value != 0) {  // ranges whose value is 0 are skipped
    1856           0 :         Normalizer2Impl *impl = (Normalizer2Impl *)context;  // the cast also drops the const qualifier of context
    1857           0 :         impl->makeCanonIterDataFromNorm16(
    1858           0 :             start, end, (uint16_t)value, *impl->fCanonIterData, errorCode);  // value is narrowed to the 16-bit norm16
    1859             :     }
    1860           0 :     return U_SUCCESS(errorCode);
    1861             : }
    1862             : 
    1863             : 
    1864             : 
    1865             : // UInitOnce instantiation function for CanonIterData; on return impl->fCanonIterData is either fully built or NULL with errorCode set to a failure.
    1866             : 
    1867             : static void U_CALLCONV 
    1868           0 : initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
    1869           0 :     U_ASSERT(impl->fCanonIterData == NULL);
    1870           0 :     impl->fCanonIterData = new CanonIterData(errorCode);
    1871           0 :     if (impl->fCanonIterData == NULL) {
    1872           0 :         errorCode=U_MEMORY_ALLOCATION_ERROR;
    1873             :     }
    1874           0 :     if (U_SUCCESS(errorCode)) {
    1875           0 :         utrie2_enum(impl->getNormTrie(), NULL, enumCIDRangeHandler, impl);  // NOTE(review): no result of the enumeration is inspected here, and the handler's error code is local -- confirm whether failures should reach errorCode
    1876           0 :         utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
    1877             :     }
    1878           0 :     if (U_FAILURE(errorCode)) {
    1879           0 :         delete impl->fCanonIterData;  // also reached when the allocation above returned NULL; delete on NULL is a no-op
    1880           0 :         impl->fCanonIterData = NULL;
    1881             :     }
    1882           0 : }
    1883             : 
    1884             : U_CDECL_END
    1885             : 
    1886           0 : void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,  // Updates newData for every code point in [start, end] (end inclusive), all treated with the same norm16.
    1887             :                                                   CanonIterData &newData,
    1888             :                                                   UErrorCode &errorCode) const {
    1889           0 :     if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
    1890             :         // Inert, or 2-way mapping (including Hangul syllable).
    1891             :         // We do not write a canonStartSet for any yesNo character.
    1892             :         // Composites from 2-way mappings are added at runtime from the
    1893             :         // starter's compositions list, and the other characters in
    1894             :         // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
    1895             :         // "maybe" characters.
    1896           0 :         return;
    1897             :     }
    1898           0 :     for(UChar32 c=start; c<=end; ++c) {
    1899           0 :         uint32_t oldValue=utrie2_get32(newData.trie, c);
    1900           0 :         uint32_t newValue=oldValue;  // flags are accumulated here and written back once at the end of the iteration
    1901           0 :         if(norm16>=minMaybeYes) {
    1902             :             // not a segment starter if it occurs in a decomposition or has cc!=0
    1903           0 :             newValue|=CANON_NOT_SEGMENT_STARTER;
    1904           0 :             if(norm16<MIN_NORMAL_MAYBE_YES) {
    1905           0 :                 newValue|=CANON_HAS_COMPOSITIONS;
    1906             :             }
    1907           0 :         } else if(norm16<minYesNo) {
    1908           0 :             newValue|=CANON_HAS_COMPOSITIONS;
    1909             :         } else {
    1910             :             // c has a one-way decomposition
    1911           0 :             UChar32 c2=c;
    1912           0 :             uint16_t norm16_2=norm16;
    1913           0 :             while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {  // follow algorithmic mappings until norm16_2 leaves [limitNoNo, minMaybeYes)
    1914           0 :                 c2=mapAlgorithmic(c2, norm16_2);
    1915           0 :                 norm16_2=getNorm16(c2);
    1916             :             }
    1917           0 :             if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
    1918             :                 // c decomposes, get everything from the variable-length extra data
    1919           0 :                 const uint16_t *mapping=getMapping(norm16_2);
    1920           0 :                 uint16_t firstUnit=*mapping;
    1921           0 :                 int32_t length=firstUnit&MAPPING_LENGTH_MASK;
    1922           0 :                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
    1923           0 :                     if(c==c2 && (*(mapping-1)&0xff)!=0) {  // c==c2: no algorithmic mapping was followed, so the word before the mapping describes c itself
    1924           0 :                         newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
    1925             :                     }
    1926             :                 }
    1927             :                 // Skip empty mappings (no characters in the decomposition).
    1928           0 :                 if(length!=0) {
    1929           0 :                     ++mapping;  // skip over the firstUnit
    1930             :                     // add c to first code point's start set
    1931           0 :                     int32_t i=0;
    1932           0 :                     U16_NEXT_UNSAFE(mapping, i, c2);  // from here on c2 is reused for the code points of the mapping
    1933           0 :                     newData.addToStartSet(c, c2, errorCode);
    1934             :                     // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
    1935             :                     // one-way mapping. A 2-way mapping is possible here after
    1936             :                     // intermediate algorithmic mapping.
    1937           0 :                     if(norm16_2>=minNoNo) {
    1938           0 :                         while(i<length) {
    1939           0 :                             U16_NEXT_UNSAFE(mapping, i, c2);
    1940           0 :                             uint32_t c2Value=utrie2_get32(newData.trie, c2);
    1941           0 :                             if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {  // write only when the flag is not set yet
    1942           0 :                                 utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
    1943           0 :                                              &errorCode);
    1944             :                             }
    1945             :                         }
    1946             :                     }
    1947           0 :                 }
    1948             :             } else {
    1949             :                 // c decomposed to c2 algorithmically; c has cc==0
    1950           0 :                 newData.addToStartSet(c, c2, errorCode);
    1951             :             }
    1952             :         }
    1953           0 :         if(newValue!=oldValue) {  // write back only if a flag was added for c
    1954           0 :             utrie2_set32(newData.trie, c, newValue, &errorCode);
    1955             :         }
    1956             :     }
    1957             : }
    1958             : 
    1959           0 : UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {  // Returns U_SUCCESS(errorCode) after the init-once call below.
    1960             :     // Logically const: Synchronized instantiation.
    1961           0 :     Normalizer2Impl *me=const_cast<Normalizer2Impl *>(this);
    1962           0 :     umtx_initOnce(me->fCanonIterDataInitOnce, &initCanonIterData, me, errorCode);  // initCanonIterData(me, errorCode) is the function handed to umtx_initOnce()
    1963           0 :     return U_SUCCESS(errorCode);
    1964             : }
    1965             : 
    1966           0 : int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
    1967           0 :     return (int32_t)utrie2_get32(fCanonIterData->trie, c);  // 32-bit trie value for c converted to signed; fCanonIterData is dereferenced without a NULL check
    1968             : }
    1969             : 
    1970           0 : const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
    1971           0 :     return *(const UnicodeSet *)fCanonIterData->canonStartSets[n];  // n is used as an index into canonStartSets; no bounds or NULL check is done here
    1972             : }
    1973             : 
    1974           0 : UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const {
    1975           0 :     return getCanonValue(c)>=0;  // true when the sign bit of the value is clear; presumably CANON_NOT_SEGMENT_STARTER is that bit (defined elsewhere -- confirm)
    1976             : }
    1977             : 
    1978           0 : UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {  // Fills set from the canon data of c and returns TRUE; returns FALSE without touching set when there is no such data.
    1979           0 :     int32_t canonValue=getCanonValue(c)&~CANON_NOT_SEGMENT_STARTER;  // the not-a-segment-starter flag is ignored here
    1980           0 :     if(canonValue==0) {
    1981           0 :         return FALSE;
    1982             :     }
    1983           0 :     set.clear();
    1984           0 :     int32_t value=canonValue&CANON_VALUE_MASK;
    1985           0 :     if((canonValue&CANON_HAS_SET)!=0) {
    1986           0 :         set.addAll(getCanonStartSet(value));  // value is an index into the stored start sets
    1987           0 :     } else if(value!=0) {
    1988           0 :         set.add(value);  // value is a single code point
    1989             :     }
    1990           0 :     if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
    1991           0 :         uint16_t norm16=getNorm16(c);
    1992           0 :         if(norm16==JAMO_L) {
    1993             :             UChar32 syllable=
    1994           0 :                 (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
    1995           0 :             set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1);  // adds a contiguous range of JAMO_VT_COUNT code points starting at syllable
    1996             :         } else {
    1997           0 :             addComposites(getCompositionsList(norm16), set);
    1998             :         }
    1999             :     }
    2000           0 :     return TRUE;
    2001             : }
    2002             : 
    2003             : U_NAMESPACE_END
    2004             : 
    2005             : // Normalizer2 data swapping ----------------------------------------------- ***
    2006             : 
    2007             : U_NAMESPACE_USE
    2008             : 
    2009             : U_CAPI int32_t U_EXPORT2
    2010           0 : unorm2_swap(const UDataSwapper *ds,
    2011             :             const void *inData, int32_t length, void *outData,
    2012             :             UErrorCode *pErrorCode) {  // Returns headerSize+size, or 0 on error. When length<0 the copy/swap section below is skipped.
    2013             :     const UDataInfo *pInfo;
    2014             :     int32_t headerSize;
    2015             : 
    2016             :     const uint8_t *inBytes;
    2017             :     uint8_t *outBytes;
    2018             : 
    2019             :     const int32_t *inIndexes;
    2020             :     int32_t indexes[Normalizer2Impl::IX_MIN_MAYBE_YES+1];
    2021             : 
    2022             :     int32_t i, offset, nextOffset, size;
    2023             : 
    2024             :     /* udata_swapDataHeader checks the arguments */
    2025           0 :     headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    2026           0 :     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {  // NOTE(review): pErrorCode is tested for NULL only after it was passed to udata_swapDataHeader() above
    2027           0 :         return 0;
    2028             :     }
    2029             : 
    2030             :     /* check data format and format version */
    2031           0 :     pInfo=(const UDataInfo *)((const char *)inData+4);  // the UDataInfo is read at byte offset 4 of the input
    2032           0 :     if(!(
    2033           0 :         pInfo->dataFormat[0]==0x4e &&   /* dataFormat="Nrm2" */
    2034           0 :         pInfo->dataFormat[1]==0x72 &&
    2035           0 :         pInfo->dataFormat[2]==0x6d &&
    2036           0 :         pInfo->dataFormat[3]==0x32 &&
    2037           0 :         (pInfo->formatVersion[0]==1 || pInfo->formatVersion[0]==2)
    2038             :     )) {
    2039           0 :         udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
    2040           0 :                          pInfo->dataFormat[0], pInfo->dataFormat[1],
    2041           0 :                          pInfo->dataFormat[2], pInfo->dataFormat[3],
    2042           0 :                          pInfo->formatVersion[0]);
    2043           0 :         *pErrorCode=U_UNSUPPORTED_ERROR;
    2044           0 :         return 0;
    2045             :     }
    2046             : 
    2047           0 :     inBytes=(const uint8_t *)inData+headerSize;
    2048           0 :     outBytes=(uint8_t *)outData+headerSize;
    2049             : 
    2050           0 :     inIndexes=(const int32_t *)inBytes;
    2051             : 
    2052           0 :     if(length>=0) {
    2053           0 :         length-=headerSize;  // from here on, length counts only the bytes after the header
    2054           0 :         if(length<(int32_t)sizeof(indexes)) {  // must hold at least the IX_MIN_MAYBE_YES+1 indexes read below
    2055             :             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for Normalizer2 data\n",
    2056           0 :                              length);
    2057           0 :             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    2058           0 :             return 0;
    2059             :         }
    2060             :     }
    2061             : 
    2062             :     /* read the first few indexes */
    2063           0 :     for(i=0; i<=Normalizer2Impl::IX_MIN_MAYBE_YES; ++i) {
    2064           0 :         indexes[i]=udata_readInt32(ds, inIndexes[i]);
    2065             :     }
    2066             : 
    2067             :     /* get the total length of the data */
    2068           0 :     size=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
    2069             : 
    2070           0 :     if(length>=0) {
    2071           0 :         if(length<size) {
    2072             :             udata_printError(ds, "unorm2_swap(): too few bytes (%d after header) for all of Normalizer2 data\n",
    2073           0 :                              length);
    2074           0 :             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    2075           0 :             return 0;
    2076             :         }
    2077             : 
    2078             :         /* copy the data for inaccessible bytes */
    2079           0 :         if(inBytes!=outBytes) {  // skipped when input and output are the same buffer
    2080           0 :             uprv_memcpy(outBytes, inBytes, size);
    2081             :         }
    2082             : 
    2083           0 :         offset=0;
    2084             : 
    2085             :         /* swap the int32_t indexes[] */
    2086           0 :         nextOffset=indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET];
    2087           0 :         ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);  // the indexes occupy the bytes [0, IX_NORM_TRIE_OFFSET value)
    2088           0 :         offset=nextOffset;
    2089             : 
    2090             :         /* swap the UTrie2 */
    2091           0 :         nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
    2092           0 :         utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
    2093           0 :         offset=nextOffset;
    2094             : 
    2095             :         /* swap the uint16_t extraData[] */
    2096           0 :         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET];
    2097           0 :         ds->swapArray16(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
    2098           0 :         offset=nextOffset;
    2099             : 
    2100             :         /* no need to swap the uint8_t smallFCD[] (new in formatVersion 2) */
    2101           0 :         nextOffset=indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET+1];
    2102           0 :         offset=nextOffset;  // only advances offset past smallFCD[]; those bytes were already copied above
    2103             : 
    2104           0 :         U_ASSERT(offset==size);
    2105             :     }
    2106             : 
    2107           0 :     return headerSize+size;
    2108             : }
    2109             : 
    2110             : #endif  // !UCONFIG_NO_NORMALIZATION

Generated by: LCOV version 1.13