LCOV - code coverage report
Current view: top level - intl/icu/source/common - ucase.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 0 484 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 34 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : *******************************************************************************
       5             : *
       6             : *   Copyright (C) 2004-2014, International Business Machines
       7             : *   Corporation and others.  All Rights Reserved.
       8             : *
       9             : *******************************************************************************
      10             : *   file name:  ucase.cpp
      11             : *   encoding:   UTF-8
      12             : *   tab size:   8 (not used)
      13             : *   indentation:4
      14             : *
      15             : *   created on: 2004aug30
      16             : *   created by: Markus W. Scherer
      17             : *
      18             : *   Low-level Unicode character/string case mapping code.
      19             : *   Much code moved here (and modified) from uchar.c.
      20             : */
      21             : 
      22             : #include "unicode/utypes.h"
      23             : #include "unicode/unistr.h"
      24             : #include "unicode/uset.h"
      25             : #include "unicode/udata.h" /* UDataInfo */
      26             : #include "unicode/utf16.h"
      27             : #include "ucmndata.h" /* DataHeader */
      28             : #include "udatamem.h"
      29             : #include "umutex.h"
      30             : #include "uassert.h"
      31             : #include "cmemory.h"
      32             : #include "utrie2.h"
      33             : #include "ucase.h"
      34             : 
      35             : struct UCaseProps { /* Case-mapping property data; presumably the type of ucase_props_singleton, whose members the code in this file reads (TODO confirm in ucase_props_data.h). */
      36             :     UDataMemory *mem; /* NOTE(review): not referenced in the code visible here */
      37             :     const int32_t *indexes; /* NOTE(review): not referenced in the code visible here */
      38             :     const uint16_t *exceptions; /* base of the exceptions words; GET_EXCEPTIONS() adds props>>UCASE_EXC_SHIFT to it */
      39             :     const uint16_t *unfold; /* reverse case folding table, may be NULL; its first row is a header holding row count, row width and string width */
      40             : 
      41             :     UTrie2 trie; /* queried with UTRIE2_GET16() to get the 16-bit props word of a code point */
      42             :     uint8_t formatVersion[4]; /* NOTE(review): not referenced in the code visible here */
      43             : };
      44             : 
      45             : /* ucase_props_data.h is machine-generated by gencase --csource */
      46             : #define INCLUDED_FROM_UCASE_CPP
      47             : #include "ucase_props_data.h"
      48             : 
      49             : /* set of property starts for UnicodeSet ------------------------------------ */
      50             : 
      51             : static UBool U_CALLCONV /* range callback handed to utrie2_enum() by ucase_addPropertyStarts(); context is the USetAdder */
      52           0 : _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
      53             :     /* add only the start code point of the range to the USet; the range end and value are ignored */
      54           0 :     const USetAdder *sa=(const USetAdder *)context;
      55           0 :     sa->add(sa->set, start);
      56           0 :     return TRUE; /* always TRUE; presumably tells utrie2_enum() to keep enumerating (confirm in utrie2.h) */
      57             : }
      58             : 
      59             : U_CFUNC void U_EXPORT2 /* Adds the start code point of every same-value trie range to the set behind sa; does nothing if *pErrorCode already indicates a failure. */
      60           0 : ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
      61           0 :     if(U_FAILURE(*pErrorCode)) {
      62           0 :         return;
      63             :     }
      64             : 
      65             :     /* add the start code point of each same-value range of the trie; sa is passed through as the callback context */
      66           0 :     utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
      67             : 
      68             :     /* add code points with hardcoded properties, plus the ones following them */
      69             : 
      70             :     /* (none right now, see comment below) */
      71             : 
      72             :     /*
      73             :      * Omit code points with hardcoded specialcasing properties
      74             :      * because we do not build property UnicodeSets for them right now.
      75             :      */
      76             : }
      77             : 
      78             : /* data access primitives --------------------------------------------------- */
      79             : 
      80             : #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) /* pointer to the first exceptions word for props; the code here uses it only after PROPS_HAS_EXCEPTION(props) is true */
      81             : 
      82             : #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) /* nonzero if the UCASE_EXCEPTION bit is set in the props word */
      83             : 
      84             : /* flagsOffset[b] = number of 1-bits in the 8-bit value b; SLOT_OFFSET() uses it to count the slots stored before a given slot */
      85             : static const uint8_t flagsOffset[256]={
      86             :     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
      87             :     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
      88             :     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
      89             :     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
      90             :     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
      91             :     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
      92             :     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
      93             :     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
      94             :     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
      95             :     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
      96             :     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
      97             :     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
      98             :     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
      99             :     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     100             :     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     101             :     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
     102             : };
     103             : 
     104             : #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx))) /* nonzero if bit idx of flags is set, i.e. the optional slot idx is present */
     105             : #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)] /* count of set bits below bit idx = number of slots stored before slot idx; NOTE(review): the table has 256 entries, so this assumes idx<=8 */
     106             : 
     107             : /*
     108             :  * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
     109             :  * A slot is one uint16_t, or two (high word first) when UCASE_EXC_DOUBLE_SLOTS is set in excWord.
     110             :  * Expands to a bare if/else statement (no do-while wrapper): use it only where a full statement is allowed.
     111             :  * @param excWord (in) initial exceptions word; @param idx (in) desired slot index
     112             :  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++; must be a plain pointer variable (expanded unparenthesized as *pExc16);
     113             :  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
     114             :  * @param value (out) int32_t or uint32_t output; the macro always assigns it, so callers test HAS_SLOT() first
     115             :  */
     116             : #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
     117             :     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
     118             :         (pExc16)+=SLOT_OFFSET(excWord, idx); \
     119             :         (value)=*pExc16; \
     120             :     } else { \
     121             :         (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
     122             :         (value)=*pExc16++; \
     123             :         (value)=((value)<<16)|*pExc16; \
     124             :     }
     125             : 
     126             : /* simple case mappings ----------------------------------------------------- */
     127             : 
     128             : U_CAPI UChar32 U_EXPORT2 /* Simple lowercase mapping: returns the mapped code point, or c unchanged if the data has no lowercase mapping for it. */
     129           0 : ucase_tolower(UChar32 c) {
     130           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     131           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
     132           0 :         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { /* presumably matches UCASE_UPPER and UCASE_TITLE (confirm the enum order in ucase.h) */
     133           0 :             c+=UCASE_GET_DELTA(props); /* without exceptions the mapping is stored as a delta added to c */
     134             :         }
     135             :     } else {
     136           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
     137           0 :         uint16_t excWord=*pe++; /* first exceptions word holds the slot flags; pe now points at the first slot */
     138           0 :         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
     139           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); /* overwrite c with the lowercase slot value */
     140             :         }
     141             :     }
     142           0 :     return c;
     143             : }
     144             : 
     145             : U_CAPI UChar32 U_EXPORT2 /* Simple uppercase mapping: returns the mapped code point, or c unchanged if the data has no uppercase mapping for it. */
     146           0 : ucase_toupper(UChar32 c) {
     147           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     148           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
     149           0 :         if(UCASE_GET_TYPE(props)==UCASE_LOWER) { /* only lowercase characters carry a delta towards uppercase */
     150           0 :             c+=UCASE_GET_DELTA(props);
     151             :         }
     152             :     } else {
     153           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
     154           0 :         uint16_t excWord=*pe++; /* first exceptions word holds the slot flags; pe now points at the first slot */
     155           0 :         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
     156           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); /* overwrite c with the uppercase slot value */
     157             :         }
     158             :     }
     159           0 :     return c;
     160             : }
     161             : 
     162             : U_CAPI UChar32 U_EXPORT2 /* Simple titlecase mapping: uses the title slot if present, otherwise the upper slot; returns c unchanged if neither applies. */
     163           0 : ucase_totitle(UChar32 c) {
     164           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     165           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
     166           0 :         if(UCASE_GET_TYPE(props)==UCASE_LOWER) { /* same delta as in ucase_toupper(): no separate title delta without exceptions */
     167           0 :             c+=UCASE_GET_DELTA(props);
     168             :         }
     169             :     } else {
     170           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
     171           0 :         uint16_t excWord=*pe++; /* first exceptions word holds the slot flags; pe now points at the first slot */
     172             :         int32_t idx;
     173           0 :         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
     174           0 :             idx=UCASE_EXC_TITLE; /* a dedicated titlecase mapping wins */
     175           0 :         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
     176           0 :             idx=UCASE_EXC_UPPER; /* fall back to the uppercase mapping */
     177             :         } else {
     178           0 :             return c; /* neither slot present: no mapping */
     179             :         }
     180           0 :         GET_SLOT_VALUE(excWord, idx, pe, c);
     181             :     }
     182           0 :     return c;
     183             : }
     184             : 
     185             : static const UChar iDot[2] = { 0x69, 0x307 }; /* <0069 0307>; passed to sa->addString() for U+0130 in ucase_addCaseClosure() */
     186             : static const UChar jDot[2] = { 0x6a, 0x307 }; /* <006A 0307>; not referenced in the code visible here */
     187             : static const UChar iOgonekDot[3] = { 0x12f, 0x307 }; /* <012F 0307>; NOTE(review): declared with 3 elements but only 2 initializers, so the third is 0 */
     188             : static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 }; /* <0069 0307 0300> */
     189             : static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 }; /* <0069 0307 0301> */
     190             : static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 }; /* <0069 0307 0303> */
     191             : 
     192             : 
     193             : U_CFUNC void U_EXPORT2 /* Adds to the set behind sa the code points and strings that are case-equivalent to c; c itself is not added here. */
     194           0 : ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
     195             :     uint16_t props;
     196             : 
     197             :     /*
     198             :      * Hardcode the case closure of i and its relatives and ignore the
     199             :      * data file data for these characters.
     200             :      * The Turkic dotless i and dotted I with their case mapping conditions
     201             :      * and case folding option make the related characters behave specially.
     202             :      * This code matches their closure behavior to their case folding behavior.
     203             :      */
     204             : 
     205           0 :     switch(c) {
     206             :     case 0x49:
     207             :         /* regular i and I are in one equivalence class */
     208           0 :         sa->add(sa->set, 0x69);
     209           0 :         return;
     210             :     case 0x69:
     211           0 :         sa->add(sa->set, 0x49);
     212           0 :         return;
     213             :     case 0x130:
     214             :         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
     215           0 :         sa->addString(sa->set, iDot, 2);
     216           0 :         return;
     217             :     case 0x131:
     218             :         /* dotless i is in a class by itself */
     219           0 :         return;
     220             :     default:
     221             :         /* otherwise use the data file data */
     222           0 :         break;
     223             :     }
     224             : 
     225           0 :     props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     226           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
     227           0 :         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
     228             :             /* add the one simple case mapping, no matter what type it is */
     229           0 :             int32_t delta=UCASE_GET_DELTA(props);
     230           0 :             if(delta!=0) { /* a zero delta would only add c itself */
     231           0 :                 sa->add(sa->set, c+delta);
     232             :             }
     233             :         }
     234             :     } else {
     235             :         /*
     236             :          * c has exceptions, so there may be multiple simple and/or
     237             :          * full case mappings. Add them all.
     238             :          */
     239           0 :         const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
     240             :         const UChar *closure;
     241           0 :         uint16_t excWord=*pe++; /* first exceptions word holds the slot flags */
     242             :         int32_t idx, closureLength, fullLength, length;
     243             : 
     244           0 :         pe0=pe; /* remember the start of the slots; GET_SLOT_VALUE() moves pe, so it is reset to pe0 before each use */
     245             : 
     246             :         /* add all simple case mappings; from here on c is reused as a scratch output variable */
     247           0 :         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
     248           0 :             if(HAS_SLOT(excWord, idx)) {
     249           0 :                 pe=pe0;
     250           0 :                 GET_SLOT_VALUE(excWord, idx, pe, c);
     251           0 :                 sa->add(sa->set, c);
     252             :             }
     253             :         }
     254             : 
     255             :         /* get the closure string pointer & length */
     256           0 :         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
     257           0 :             pe=pe0;
     258           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
     259           0 :             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
     260           0 :             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
     261             :         } else {
     262           0 :             closureLength=0;
     263           0 :             closure=NULL; /* never dereferenced: the closure loop below runs zero times when closureLength==0 */
     264             :         }
     265             : 
     266             :         /* add the full case folding */
     267           0 :         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
     268           0 :             pe=pe0;
     269           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
     270             : 
     271             :             /* start of full case mapping strings */
     272           0 :             ++pe;
     273             : 
     274           0 :             fullLength&=0xffff; /* bits 16 and higher are reserved */
     275             : 
     276             :             /* skip the lowercase result string; fullLength packs the lower, fold, upper and title string lengths, lowest bits first */
     277           0 :             pe+=fullLength&UCASE_FULL_LOWER;
     278           0 :             fullLength>>=4;
     279             : 
     280             :             /* add the full case folding string */
     281           0 :             length=fullLength&0xf;
     282           0 :             if(length!=0) {
     283           0 :                 sa->addString(sa->set, (const UChar *)pe, length);
     284           0 :                 pe+=length;
     285             :             }
     286             : 
     287             :             /* skip the uppercase and titlecase strings */
     288           0 :             fullLength>>=4;
     289           0 :             pe+=fullLength&0xf;
     290           0 :             fullLength>>=4;
     291           0 :             pe+=fullLength; /* only the title length is left after the masking and shifts above */
     292             : 
     293           0 :             closure=(const UChar *)pe; /* behind full case mappings */
     294             :         }
     295             : 
     296             :         /* add each code point in the closure string; closureLength counts UTF-16 code units, idx is advanced by U16_NEXT_UNSAFE */
     297           0 :         for(idx=0; idx<closureLength;) {
     298           0 :             U16_NEXT_UNSAFE(closure, idx, c);
     299           0 :             sa->add(sa->set, c);
     300             :         }
     301             :     }
     302             : }
     303             : 
     304             : /*
     305             :  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
     306             :  * must be length>0 and max>0 and length<=max
     307             :  * returns 0 if equal, >0 if s sorts after t or t ends first, <0 if s sorts before t or s is a proper prefix of t
     308             : static inline int32_t
     309           0 : strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
     310             :     int32_t c1, c2;
     311             : 
     312           0 :     max-=length; /* we require length<=max, so no need to decrement max in the loop */
     313           0 :     do {
     314           0 :         c1=*s++;
     315           0 :         c2=*t++;
     316           0 :         if(c2==0) {
     317           0 :             return 1; /* reached the end of t but not of s */
     318             :         }
     319           0 :         c1-=c2;
     320           0 :         if(c1!=0) {
     321           0 :             return c1; /* return difference result */
     322             :         }
     323             :     } while(--length>0);
     324             :     /* ends with length==0 */
     325             : 
     326           0 :     if(max==0 || *t==0) {
     327           0 :         return 0; /* equal to length of both strings */
     328             :     } else {
     329           0 :         return -max; /* return length difference: max now holds (original max)-(original length), so this is negative */
     330             :     }
     331             : }
     332             : 
     333             : U_CFUNC UBool U_EXPORT2 /* Looks up s[0..length) in the unfold (reverse case folding) table; on a match adds each listed code point and its case closure via sa and returns TRUE, otherwise FALSE. */
     334           0 : ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
     335             :     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
     336             : 
     337           0 :     if(ucase_props_singleton.unfold==NULL || s==NULL) {
     338           0 :         return FALSE; /* no reverse case folding data, or no string */
     339             :     }
     340           0 :     if(length<=1) {
     341             :         /* the string is too short to find any match */
     342             :         /*
     343             :          * more precise would be:
     344             :          * if(!u_strHasMoreChar32Than(s, length, 1))
     345             :          * but this does not make much practical difference because
     346             :          * a single supplementary code point would just not be found
     347             :          */
     348           0 :         return FALSE;
     349             :     }
     350             : 
     351           0 :     const uint16_t *unfold=ucase_props_singleton.unfold;
     352           0 :     unfoldRows=unfold[UCASE_UNFOLD_ROWS]; /* the first row of the table is a header with these three values */
     353           0 :     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
     354           0 :     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
     355           0 :     unfold+=unfoldRowWidth; /* skip the header row; unfold now points at data row 0 */
     356             : 
     357           0 :     if(length>unfoldStringWidth) {
     358             :         /* the string is too long to find any match */
     359           0 :         return FALSE;
     360             :     }
     361             : 
     362             :     /* do a binary search for the string over the half-open row range [start, limit) */
     363           0 :     start=0;
     364           0 :     limit=unfoldRows;
     365           0 :     while(start<limit) {
     366           0 :         i=(start+limit)/2;
     367           0 :         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth)); /* start of row i */
     368           0 :         result=strcmpMax(s, length, p, unfoldStringWidth); /* compares s with the string column of row i */
     369             : 
     370           0 :         if(result==0) {
     371             :             /* found the string: add each code point, and its case closure */
     372             :             UChar32 c;
     373             : 
     374           0 :             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) { /* i is reused as the index into the row; code points follow the string column and end at a NUL or the row end */
     375           0 :                 U16_NEXT_UNSAFE(p, i, c);
     376           0 :                 sa->add(sa->set, c);
     377           0 :                 ucase_addCaseClosure(c, sa);
     378             :             }
     379           0 :             return TRUE;
     380           0 :         } else if(result<0) {
     381           0 :             limit=i;
     382             :         } else /* result>0 */ {
     383           0 :             start=i+1;
     384             :         }
     385             :     }
     386             : 
     387           0 :     return FALSE; /* string not found */
     388             : }
     389             : 
     390             : U_NAMESPACE_BEGIN
     391             : 
     392           0 : FullCaseFoldingIterator::FullCaseFoldingIterator() // Sets up iteration over the unfold table, positioned at the first code point of the first data row.
     393           0 :         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)), // NOTE(review): dereferenced below without the NULL check that ucase_addStringCaseClosure() performs
     394           0 :           unfoldRows(unfold[UCASE_UNFOLD_ROWS]), // these initializers read the member unfold, so it presumably is declared before them in the class (confirm in ucase.h)
     395           0 :           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
     396           0 :           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
     397             :           currentRow(0),
     398           0 :           rowCpIndex(unfoldStringWidth) { // code points start right after the string column of a row
     399           0 :     unfold+=unfoldRowWidth; // skip the header row; unfold now points at data row 0
     400           0 : }
     401             : 
     402             : UChar32 // Returns the next code point of the unfold table and sets full to its row's string; returns U_SENTINEL when all rows are done.
     403           0 : FullCaseFoldingIterator::next(UnicodeString &full) {
     404             :     // Advance past the last-delivered code point: if the current row has no more code points, move to the next row.
     405           0 :     const UChar *p=unfold+(currentRow*unfoldRowWidth);
     406           0 :     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
     407           0 :         ++currentRow;
     408           0 :         p+=unfoldRowWidth;
     409           0 :         rowCpIndex=unfoldStringWidth;
     410             :     }
     411           0 :     if(currentRow>=unfoldRows) { return U_SENTINEL; }
     412             :     // Set "full" to the NUL-terminated string in the first unfold column; trailing NUL padding is trimmed first.
     413           0 :     int32_t length=unfoldStringWidth;
     414           0 :     while(length>0 && p[length-1]==0) { --length; }
     415           0 :     full.setTo(FALSE, p, length); // NOTE(review): presumably aliases the table data instead of copying it; confirm against UnicodeString::setTo
     416             :     // Return the code point at rowCpIndex; U16_NEXT_UNSAFE also advances rowCpIndex past it.
     417             :     UChar32 c;
     418           0 :     U16_NEXT_UNSAFE(p, rowCpIndex, c);
     419           0 :     return c;
     420             : }
     421             : 
     422             : U_NAMESPACE_END
     423             : 
     424             : /** Looks up the props word of c in the trie. @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
     425             : U_CAPI int32_t U_EXPORT2
     426           0 : ucase_getType(UChar32 c) {
     427           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     428           0 :     return UCASE_GET_TYPE(props); /* the type is read from the props word directly, for exception and non-exception characters alike */
     429             : }
     430             : 
     431             : /** Looks up the props word of c in the trie. @return same as ucase_getType() and set bit 2 if c is case-ignorable */
     432             : U_CAPI int32_t U_EXPORT2
     433           0 : ucase_getTypeOrIgnorable(UChar32 c) {
     434           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     435           0 :     return UCASE_GET_TYPE_AND_IGNORABLE(props); /* extracted from the props word by the macro; no exceptions lookup is done here */
     436             : }
     437             : 
     438             : /** Reads the dot-type bits of c from its props word, or from its first exceptions word. @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
     439             : static inline int32_t
     440           0 : getDotType(UChar32 c) {
     441           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     442           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
     443           0 :         return props&UCASE_DOT_MASK; /* no exception: the dot bits are in the props word itself */
     444             :     } else {
     445           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
     446           0 :         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; /* exception: the dot bits are in the first exceptions word, shifted by UCASE_EXC_DOT_SHIFT */
     447             :     }
     448             : }
     449             : 
     450             : U_CAPI UBool U_EXPORT2 /* Returns whether the dot type of c, as computed by getDotType(), is UCASE_SOFT_DOTTED. */
     451           0 : ucase_isSoftDotted(UChar32 c) {
     452           0 :     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
     453             : }
     454             : 
     455             : U_CAPI UBool U_EXPORT2 /* Returns whether the UCASE_SENSITIVE bit is set in the props word of c. */
     456           0 : ucase_isCaseSensitive(UChar32 c) {
     457           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     458           0 :     return (UBool)((props&UCASE_SENSITIVE)!=0); /* the bit is tested in the props word directly; no exceptions lookup is done here */
     459             : }
     460             : 
     461             : /* string casing ------------------------------------------------------------ */
     462             : 
     463             : /*
     464             :  * These internal functions form the core of string case mappings.
     465             :  * They map single code points to result code points or strings and take
     466             :  * all necessary conditions (context, locale ID, options) into account.
     467             :  *
     468             :  * They do not iterate over the source or write to the destination
     469             :  * so that the same functions are useful for non-standard string storage,
     470             :  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
     471             :  * For the same reason, the "surrounding text" context is passed in as a
     472             :  * UCaseContextIterator which does not make any assumptions about
     473             :  * the underlying storage.
     474             :  *
     475             :  * This section contains helper functions that check for conditions
     476             :  * in the input text surrounding the current code point
     477             :  * according to SpecialCasing.txt.
     478             :  *
     479             :  * Each helper function gets the index
     480             :  * - after the current code point if it looks at following text
     481             :  * - before the current code point if it looks at preceding text
     482             :  *
     483             :  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
     484             :  *
     485             :  * Final_Sigma
     486             :  *   C is preceded by a sequence consisting of
     487             :  *     a cased letter and a case-ignorable sequence,
     488             :  *   and C is not followed by a sequence consisting of
     489             :  *     an ignorable sequence and then a cased letter.
     490             :  *
     491             :  * More_Above
     492             :  *   C is followed by one or more characters of combining class 230 (ABOVE)
     493             :  *   in the combining character sequence.
     494             :  *
     495             :  * After_Soft_Dotted
     496             :  *   The last preceding character with combining class of zero before C
     497             :  *   was Soft_Dotted,
     498             :  *   and there is no intervening combining character class 230 (ABOVE).
     499             :  *
     500             :  * Before_Dot
     501             :  *   C is followed by combining dot above (U+0307).
     502             :  *   Any sequence of characters with a combining class that is neither 0 nor 230
     503             :  *   may intervene between the current character and the combining dot above.
     504             :  *
     505             :  * The erratum from 2002-10-31 adds the condition
     506             :  *
     507             :  * After_I
     508             :  *   The last preceding base character was an uppercase I, and there is no
     509             :  *   intervening combining character class 230 (ABOVE).
     510             :  *
     511             :  *   (See Jitterbug 2344 and the comments on After_I below.)
     512             :  *
     513             :  * Helper definitions in Unicode 3.2 UAX 21:
     514             :  *
     515             :  * D1. A character C is defined to be cased
     516             :  *     if it meets any of the following criteria:
     517             :  *
     518             :  *   - The general category of C is Titlecase Letter (Lt)
     519             :  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
     520             :  *   - Given D = NFD(C), then it is not the case that:
     521             :  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
     522             :  *     (This third criterion does not add any characters to the list
     523             :  *      for Unicode 3.2. Ignored.)
     524             :  *
     525             :  * D2. A character C is defined to be case-ignorable
     526             :  *     if it meets either of the following criteria:
     527             :  *
     528             :  *   - The general category of C is
     529             :  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
     530             :  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
     531             :  *   - C is one of the following characters 
     532             :  *     U+0027 APOSTROPHE
     533             :  *     U+00AD SOFT HYPHEN (SHY)
     534             :  *     U+2019 RIGHT SINGLE QUOTATION MARK
     535             :  *            (the preferred character for apostrophe)
     536             :  *
     537             :  * D3. A case-ignorable sequence is a sequence of
     538             :  *     zero or more case-ignorable characters.
     539             :  */
     540             : 
     541             : #define is_d(c) ((c)=='d' || (c)=='D') /* is_X(c): true if c is the letter X in either case; each of these macros evaluates c twice */
     542             : #define is_e(c) ((c)=='e' || (c)=='E')
     543             : #define is_i(c) ((c)=='i' || (c)=='I')
     544             : #define is_l(c) ((c)=='l' || (c)=='L')
     545             : #define is_r(c) ((c)=='r' || (c)=='R')
     546             : #define is_t(c) ((c)=='t' || (c)=='T')
     547             : #define is_u(c) ((c)=='u' || (c)=='U')
     548             : #define is_z(c) ((c)=='z' || (c)=='Z')
     549             : 
     550             : /* separator? true for '_', '-' and the terminating NUL, i.e. where a language code in a locale ID ends; evaluates c up to three times */
     551             : #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
     552             : 
     553             : /**
     554             :  * Requires non-NULL locale ID but otherwise does the equivalent of
     555             :  * checking for language codes as if uloc_getLanguage() were called:
     556             :  * Accepts both 2- and 3-letter codes and accepts case variants.
                     :  *
                     :  * Only the leading language code is examined. A code is recognized only
                     :  * when it is directly followed by '_', '-' or the end of the string (is_sep).
                     :  *
                     :  * @param locale NUL-terminated locale ID; dereferenced without a NULL check
                     :  * @return UCASE_LOC_GREEK for el/ell,
                     :  *         UCASE_LOC_TURKISH for tr/tur and az/aze,
                     :  *         UCASE_LOC_LITHUANIAN for lt/lit,
                     :  *         UCASE_LOC_DUTCH for nl/nld,
                     :  *         UCASE_LOC_ROOT for everything else
     557             :  */
     558             : U_CFUNC int32_t
     559           0 : ucase_getCaseLocale(const char *locale) {
     560             :     /*
     561             :      * This function used to use uloc_getLanguage(), but the current code
     562             :      * removes the dependency of this low-level code on uloc implementation code
     563             :      * and is faster because not the whole locale ID has to be
     564             :      * examined and copied/transformed.
     565             :      *
     566             :      * Because this code does not want to depend on uloc, the caller must
     567             :      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
     568             :      */
     569           0 :     char c=*locale++; /* first letter of the language code; selects the branch below */
     570             :     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
     571             :     // and for Chinese "zh": Very common but no special case mapping behavior.
     572             :     // Then check lowercase vs. uppercase to reduce the number of comparisons
     573             :     // for other locales without special behavior.
     574           0 :     if(c=='e') {
     575             :         /* el or ell? */
     576           0 :         c=*locale++;
     577           0 :         if(is_l(c)) {
     578           0 :             c=*locale++;
     579           0 :             if(is_l(c)) {
     580           0 :                 c=*locale; /* 3-letter form: look at the character after the code */
     581             :             }
     582           0 :             if(is_sep(c)) {
     583           0 :                 return UCASE_LOC_GREEK;
     584             :             }
     585             :         }
     586             :         // en, es, ... -> root
     587           0 :     } else if(c=='z') {
     588           0 :         return UCASE_LOC_ROOT; /* every ID that starts with a lowercase 'z' maps to root */
     589             : #if U_CHARSET_FAMILY==U_ASCII_FAMILY
     590           0 :     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
     591             : #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
     592             :     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
     593             : #else
     594             : #   error Unknown charset family!
     595             : #endif
     596             :         // lowercase c ('e' and 'z' were already handled above)
     597           0 :         if(c=='t') {
     598             :             /* tr or tur? */
     599           0 :             c=*locale++;
     600           0 :             if(is_u(c)) {
     601           0 :                 c=*locale++;
     602             :             }
     603           0 :             if(is_r(c)) {
     604           0 :                 c=*locale;
     605           0 :                 if(is_sep(c)) {
     606           0 :                     return UCASE_LOC_TURKISH;
     607             :                 }
     608             :             }
     609           0 :         } else if(c=='a') {
     610             :             /* az or aze? */
     611           0 :             c=*locale++;
     612           0 :             if(is_z(c)) {
     613           0 :                 c=*locale++;
     614           0 :                 if(is_e(c)) {
     615           0 :                     c=*locale;
     616             :                 }
     617           0 :                 if(is_sep(c)) {
     618           0 :                     return UCASE_LOC_TURKISH;
     619             :                 }
     620             :             }
     621           0 :         } else if(c=='l') {
     622             :             /* lt or lit? */
     623           0 :             c=*locale++;
     624           0 :             if(is_i(c)) {
     625           0 :                 c=*locale++;
     626             :             }
     627           0 :             if(is_t(c)) {
     628           0 :                 c=*locale;
     629           0 :                 if(is_sep(c)) {
     630           0 :                     return UCASE_LOC_LITHUANIAN;
     631             :                 }
     632             :             }
     633           0 :         } else if(c=='n') {
     634             :             /* nl or nld? */
     635           0 :             c=*locale++;
     636           0 :             if(is_l(c)) {
     637           0 :                 c=*locale++;
     638           0 :                 if(is_d(c)) {
     639           0 :                     c=*locale;
     640             :                 }
     641           0 :                 if(is_sep(c)) {
     642           0 :                     return UCASE_LOC_DUTCH;
     643             :                 }
     644             :             }
     645             :         }
     646             :     } else {
     647             :         // uppercase c (or any other first character that failed the range test above)
     648             :         // Same code as for lowercase c but also check for 'E'.
     649           0 :         if(c=='T') {
     650             :             /* tr or tur? */
     651           0 :             c=*locale++;
     652           0 :             if(is_u(c)) {
     653           0 :                 c=*locale++;
     654             :             }
     655           0 :             if(is_r(c)) {
     656           0 :                 c=*locale;
     657           0 :                 if(is_sep(c)) {
     658           0 :                     return UCASE_LOC_TURKISH;
     659             :                 }
     660             :             }
     661           0 :         } else if(c=='A') {
     662             :             /* az or aze? */
     663           0 :             c=*locale++;
     664           0 :             if(is_z(c)) {
     665           0 :                 c=*locale++;
     666           0 :                 if(is_e(c)) {
     667           0 :                     c=*locale;
     668             :                 }
     669           0 :                 if(is_sep(c)) {
     670           0 :                     return UCASE_LOC_TURKISH;
     671             :                 }
     672             :             }
     673           0 :         } else if(c=='L') {
     674             :             /* lt or lit? */
     675           0 :             c=*locale++;
     676           0 :             if(is_i(c)) {
     677           0 :                 c=*locale++;
     678             :             }
     679           0 :             if(is_t(c)) {
     680           0 :                 c=*locale;
     681           0 :                 if(is_sep(c)) {
     682           0 :                     return UCASE_LOC_LITHUANIAN;
     683             :                 }
     684             :             }
     685           0 :         } else if(c=='E') {
     686             :             /* el or ell? */
     687           0 :             c=*locale++;
     688           0 :             if(is_l(c)) {
     689           0 :                 c=*locale++;
     690           0 :                 if(is_l(c)) {
     691           0 :                     c=*locale;
     692             :                 }
     693           0 :                 if(is_sep(c)) {
     694           0 :                     return UCASE_LOC_GREEK;
     695             :                 }
     696             :             }
     697           0 :         } else if(c=='N') {
     698             :             /* nl or nld? */
     699           0 :             c=*locale++;
     700           0 :             if(is_l(c)) {
     701           0 :                 c=*locale++;
     702           0 :                 if(is_d(c)) {
     703           0 :                     c=*locale;
     704             :                 }
     705           0 :                 if(is_sep(c)) {
     706           0 :                     return UCASE_LOC_DUTCH;
     707             :                 }
     708             :             }
     709             :         }
     710             :     }
     711           0 :     return UCASE_LOC_ROOT; /* no language with special case mappings was recognized */
     712             : }
     713             : 
     714             : /*
     715             :  * Is followed by
     716             :  *   {case-ignorable}* cased
     717             :  * ?
     718             :  * (dir determines looking forward/backward)
     719             :  * If a character is case-ignorable, it is skipped regardless of whether
     720             :  * it is also cased or not.
                     :  *
                     :  * iter is called with dir on the first step and with 0 on every later step;
                     :  * a negative return value from iter ends the scan.
                     :  * Returns FALSE when iter is NULL, when the scan ends without reaching a
                     :  * cased character, or when an uncased, non-ignorable character comes first.
     721             :  */
     722             : static UBool
     723           0 : isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
     724             :     UChar32 c;
     725             : 
     726           0 :     if(iter==NULL) {
     727           0 :         return FALSE;
     728             :     }
     729             : 
     730           0 :     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
     731           0 :         int32_t type=ucase_getTypeOrIgnorable(c);
     732           0 :         if(type&4) { /* the bit with value 4 marks c as case-ignorable */
     733             :             /* case-ignorable, continue with the loop */
     734           0 :         } else if(type!=UCASE_NONE) {
     735           0 :             return TRUE; /* followed by cased letter */
     736             :         } else {
     737           0 :             return FALSE; /* uncased and not case-ignorable */
     738             :         }
     739             :     }
     740             : 
     741           0 :     return FALSE; /* not followed by cased letter */
     742             : }
     743             : 
     744             : /* Is preceded by Soft_Dotted character with no intervening cc=230 ?
                     :  *
                     :  * Scans the preceding context: iter is called with dir=-1 first, then with 0,
                     :  * until it returns a negative value.
                     :  * Characters whose getDotType() is UCASE_OTHER_ACCENT are skipped.
                     :  * Returns TRUE on the first UCASE_SOFT_DOTTED character; returns FALSE for any
                     :  * other dot type, for a NULL iter, or when the context is exhausted.
                     :  */
     745             : static UBool
     746           0 : isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
     747             :     UChar32 c;
     748             :     int32_t dotType;
     749             :     int8_t dir;
     750             : 
     751           0 :     if(iter==NULL) {
     752           0 :         return FALSE;
     753             :     }
     754             : 
     755           0 :     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
     756           0 :         dotType=getDotType(c);
     757           0 :         if(dotType==UCASE_SOFT_DOTTED) {
     758           0 :             return TRUE; /* preceded by TYPE_i */
     759           0 :         } else if(dotType!=UCASE_OTHER_ACCENT) {
     760           0 :             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
     761             :         }
     762             :     }
     763             : 
     764           0 :     return FALSE; /* not preceded by TYPE_i */
     765             : }
     766             : 
     767             : /*
     768             :  * See Jitterbug 2344:
     769             :  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
     770             :  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
     771             :  * we made those releases compatible with Unicode 3.2 which had not fixed
     772             :  * a related bug in SpecialCasing.txt.
     773             :  *
     774             :  * From the Jitterbug 2344 text:
     775             :  * ... this bug is listed as a Unicode erratum
     776             :  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
     777             :  * <quote>
     778             :  * There are two errors in SpecialCasing.txt.
     779             :  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
     780             :  * 2. An incorrect context definition. Correct as follows:
     781             :  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
     782             :  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
     783             :  * ---
     784             :  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
     785             :  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
     786             :  * where the context After_I is defined as:
     787             :  * The last preceding base character was an uppercase I, and there is no
     788             :  * intervening combining character class 230 (ABOVE).
     789             :  * </quote>
     790             :  *
     791             :  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
     792             :  *
     793             :  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
     794             :  * # This matches the behavior of the canonically equivalent I-dot_above
     795             :  *
     796             :  * See also the description in this place in older versions of uchar.c (revision 1.100).
     797             :  *
     798             :  * Markus W. Scherer 2003-feb-15
     799             :  */
     800             : 
     801             : /* Is preceded by base character 'I' with no intervening cc=230 ?
                     :  *
                     :  * Scans the preceding context: iter is called with dir=-1 first, then with 0,
                     :  * until it returns a negative value.
                     :  * Returns TRUE as soon as U+0049 is seen. Characters whose getDotType() is
                     :  * UCASE_OTHER_ACCENT are skipped; any other character ends the scan with FALSE.
                     :  * Also returns FALSE for a NULL iter or when the context is exhausted.
                     :  */
     802             : static UBool
     803           0 : isPrecededBy_I(UCaseContextIterator *iter, void *context) {
     804             :     UChar32 c;
     805             :     int32_t dotType;
     806             :     int8_t dir;
     807             : 
     808           0 :     if(iter==NULL) {
     809           0 :         return FALSE;
     810             :     }
     811             : 
     812           0 :     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
     813           0 :         if(c==0x49) { /* U+0049 LATIN CAPITAL LETTER I, tested before the dot type */
     814           0 :             return TRUE; /* preceded by I */
     815             :         }
     816           0 :         dotType=getDotType(c);
     817           0 :         if(dotType!=UCASE_OTHER_ACCENT) {
     818           0 :             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
     819             :         }
     820             :     }
     821             : 
     822           0 :     return FALSE; /* not preceded by I */
     823             : }
     824             : 
     825             : /* Is followed by one or more cc==230 ?
                     :  *
                     :  * Scans the following context: iter is called with dir=1 first, then with 0,
                     :  * until it returns a negative value.
                     :  * Characters whose getDotType() is UCASE_OTHER_ACCENT are skipped.
                     :  * Returns TRUE on the first UCASE_ABOVE character; returns FALSE for any
                     :  * other dot type, for a NULL iter, or when the context is exhausted.
                     :  */
     826             : static UBool
     827           0 : isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
     828             :     UChar32 c;
     829             :     int32_t dotType;
     830             :     int8_t dir;
     831             : 
     832           0 :     if(iter==NULL) {
     833           0 :         return FALSE;
     834             :     }
     835             : 
     836           0 :     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
     837           0 :         dotType=getDotType(c);
     838           0 :         if(dotType==UCASE_ABOVE) {
     839           0 :             return TRUE; /* at least one cc==230 following */
     840           0 :         } else if(dotType!=UCASE_OTHER_ACCENT) {
     841           0 :             return FALSE; /* next base character, no more cc==230 following */
     842             :         }
     843             :     }
     844             : 
     845           0 :     return FALSE; /* no more cc==230 following */
     846             : }
     847             : 
     848             : /* Is followed by a dot above (without cc==230 in between) ?
                     :  *
                     :  * Scans the following context: iter is called with dir=1 first, then with 0,
                     :  * until it returns a negative value.
                     :  * Returns TRUE as soon as U+0307 is seen. Characters whose getDotType() is
                     :  * UCASE_OTHER_ACCENT are skipped; any other character ends the scan with FALSE.
                     :  * Also returns FALSE for a NULL iter or when the context is exhausted.
                     :  */
     849             : static UBool
     850           0 : isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
     851             :     UChar32 c;
     852             :     int32_t dotType;
     853             :     int8_t dir;
     854             : 
     855           0 :     if(iter==NULL) {
     856           0 :         return FALSE;
     857             :     }
     858             : 
     859           0 :     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
     860           0 :         if(c==0x307) { /* U+0307 COMBINING DOT ABOVE, tested before the dot type */
     861           0 :             return TRUE;
     862             :         }
     863           0 :         dotType=getDotType(c);
     864           0 :         if(dotType!=UCASE_OTHER_ACCENT) {
     865           0 :             return FALSE; /* next base character or cc==230 in between */
     866             :         }
     867             :     }
     868             : 
     869           0 :     return FALSE; /* no dot above following */
     870             : }
     871             : 
     872             : U_CAPI int32_t U_EXPORT2
     873           0 : ucase_toFullLower(UChar32 c,
     874             :                   UCaseContextIterator *iter, void *context,
     875             :                   const UChar **pString,
     876             :                   int32_t loc) {
     877             :     /* The sign of the result has meaning, input must be non-negative so that it can be returned as is.
                     :      *
                     :      * Full lowercase mapping of c. iter/context give access to the surrounding
                     :      * text for the conditional mappings; loc is a UCASE_LOC_* value.
                     :      *
                     :      * Result forms produced by this function:
                     :      *   - ~c (negative): the mapping leaves c unchanged
                     :      *   - a code point different from c: single-code-point lowercase mapping
                     :      *   - a string length, with *pString pointing at the mapping string
                     :      *   - 0: c is removed from the output (Turkic U+0307 after I)
                     :      * NOTE(review): how callers tell a string length from a code point is not
                     :      * visible here -- confirm against ucase.h.
                     :      */
     878           0 :     U_ASSERT(c >= 0);
     879           0 :     UChar32 result=c;
     880           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
     881           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
     882           0 :         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
     883           0 :             result=c+UCASE_GET_DELTA(props); /* simple mapping stored as a delta in the trie word */
     884             :         }
     885             :     } else {
     886           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
     887           0 :         uint16_t excWord=*pe++;
     888             :         int32_t full;
     889             : 
     890           0 :         pe2=pe; /* keep the position after excWord for the UCASE_EXC_LOWER lookup at the end; pe itself is used for the full-mappings slot */
     891             : 
     892           0 :         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
     893             :             /* use hardcoded conditions and mappings */
     894             : 
     895             :             /*
     896             :              * Test for conditional mappings first
     897             :              *   (otherwise the unconditional default mappings are always taken),
     898             :              * then test for characters that have unconditional mappings in SpecialCasing.txt,
     899             :              * then get the UnicodeData.txt mappings.
     900             :              */
     901           0 :             if( loc==UCASE_LOC_LITHUANIAN &&
     902             :                     /* base characters, find accents above */
     903           0 :                     (((c==0x49 || c==0x4a || c==0x12e) &&
     904           0 :                         isFollowedByMoreAbove(iter, context)) ||
     905             :                     /* precomposed with accent above, no need to find one */
     906           0 :                     (c==0xcc || c==0xcd || c==0x128))
     907             :             ) {
     908             :                 /*
     909             :                     # Lithuanian
     910             : 
     911             :                     # Lithuanian retains the dot in a lowercase i when followed by accents.
     912             : 
     913             :                     # Introduce an explicit dot above when lowercasing capital I's and J's
     914             :                     # whenever there are more accents above.
     915             :                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
     916             : 
     917             :                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
     918             :                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
     919             :                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
     920             :                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
     921             :                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
     922             :                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
     923             :                  */
     924           0 :                 switch(c) { /* each case returns the UChar length of the hardcoded string it selects */
     925             :                 case 0x49:  /* LATIN CAPITAL LETTER I */
     926           0 :                     *pString=iDot;
     927           0 :                     return 2;
     928             :                 case 0x4a:  /* LATIN CAPITAL LETTER J */
     929           0 :                     *pString=jDot;
     930           0 :                     return 2;
     931             :                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
     932           0 :                     *pString=iOgonekDot;
     933           0 :                     return 2;
     934             :                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
     935           0 :                     *pString=iDotGrave;
     936           0 :                     return 3;
     937             :                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
     938           0 :                     *pString=iDotAcute;
     939           0 :                     return 3;
     940             :                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
     941           0 :                     *pString=iDotTilde;
     942           0 :                     return 3;
     943             :                 default:
     944           0 :                     return 0; /* will not occur */
     945             :                 }
     946             :             /* # Turkish and Azeri */
     947           0 :             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
     948             :                 /*
     949             :                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
     950             :                     # The following rules handle those cases.
     951             : 
     952             :                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
     953             :                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
     954             :                  */
     955           0 :                 return 0x69;
     956           0 :             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
     957             :                 /*
     958             :                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
     959             :                     # This matches the behavior of the canonically equivalent I-dot_above
     960             : 
     961             :                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
     962             :                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
     963             :                  */
     964           0 :                 return 0; /* remove the dot (continue without output) */
     965           0 :             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
     966             :                 /*
     967             :                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
     968             : 
     969             :                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
     970             :                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
     971             :                  */
     972           0 :                 return 0x131;
     973           0 :             } else if(c==0x130) { /* reached only when loc is not Turkic: that case returned above */
     974             :                 /*
     975             :                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
     976             : 
     977             :                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
     978             :                  */
     979           0 :                 *pString=iDot;
     980           0 :                 return 2;
     981           0 :             } else if(  c==0x3a3 &&
     982           0 :                         !isFollowedByCasedLetter(iter, context, 1) &&
     983           0 :                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
     984             :             ) {
     985             :                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
     986             :                 /*
     987             :                     # Special case for final form of sigma
     988             : 
     989             :                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
     990             :                  */
     991           0 :                 return 0x3c2; /* greek small final sigma */
     992             :             } else {
     993             :                 /* no known conditional special case mapping, use a normal mapping */
     994             :             }
     995           0 :         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
     996           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
     997           0 :             full&=UCASE_FULL_LOWER; /* keep only the length of the lowercase string */
     998           0 :             if(full!=0) {
     999             :                 /* set the output pointer to the lowercase mapping */
    1000           0 :                 *pString=reinterpret_cast<const UChar *>(pe+1);
    1001             : 
    1002             :                 /* return the string length */
    1003           0 :                 return full;
    1004             :             }
    1005             :         }
    1006             : 
    1007           0 :         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { /* fallback: simple lowercase mapping from the exception slots */
    1008           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
    1009             :         }
    1010             :     }
    1011             : 
    1012           0 :     return (result==c) ? ~result : result; /* unchanged -> bitwise complement (negative), else the mapped code point */
    1013             : }
    1014             : 
    1015             : /* internal
                     :  *
                     :  * Shared implementation of ucase_toFullUpper() (upperNotTitle=TRUE) and
                     :  * ucase_toFullTitle() (upperNotTitle=FALSE).
                     :  * iter/context give access to the surrounding text for the conditional
                     :  * mappings; loc is a UCASE_LOC_* value.
                     :  *
                     :  * Result forms produced by this function:
                     :  *   - ~c (negative): the mapping leaves c unchanged
                     :  *   - a code point different from c: single-code-point mapping
                     :  *   - a string length, with *pString pointing at the mapping string
                     :  *   - 0: c is removed from the output (Lithuanian U+0307 after a soft-dotted character)
                     :  */
    1016             : static int32_t
    1017           0 : toUpperOrTitle(UChar32 c,
    1018             :                UCaseContextIterator *iter, void *context,
    1019             :                const UChar **pString,
    1020             :                int32_t loc,
    1021             :                UBool upperNotTitle) {
    1022             :     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    1023           0 :     U_ASSERT(c >= 0);
    1024           0 :     UChar32 result=c;
    1025           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    1026           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
    1027           0 :         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    1028           0 :             result=c+UCASE_GET_DELTA(props); /* simple mapping stored as a delta in the trie word */
    1029             :         }
    1030             :     } else {
    1031           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
    1032           0 :         uint16_t excWord=*pe++;
    1033             :         int32_t full, idx;
    1034             : 
    1035           0 :         pe2=pe; /* keep the position after excWord for the simple-mapping slot lookup at the end */
    1036             : 
    1037           0 :         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
    1038             :             /* use hardcoded conditions and mappings */
    1039           0 :             if(loc==UCASE_LOC_TURKISH && c==0x69) {
    1040             :                 /*
    1041             :                     # Turkish and Azeri
    1042             : 
    1043             :                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    1044             :                     # The following rules handle those cases.
    1045             : 
    1046             :                     # When uppercasing, i turns into a dotted capital I
    1047             : 
    1048             :                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
    1049             :                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
    1050             :                 */
    1051           0 :                 return 0x130;
    1052           0 :             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
    1053             :                 /*
    1054             :                     # Lithuanian
    1055             : 
    1056             :                     # Lithuanian retains the dot in a lowercase i when followed by accents.
    1057             : 
    1058             :                     # Remove DOT ABOVE after "i" with upper or titlecase
    1059             : 
    1060             :                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
    1061             :                  */
    1062           0 :                 return 0; /* remove the dot (continue without output) */
    1063             :             } else {
    1064             :                 /* no known conditional special case mapping, use a normal mapping */
    1065             :             }
    1066           0 :         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    1067           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
    1068             : 
    1069             :             /* start of full case mapping strings */
    1070           0 :             ++pe;
    1071             : 
    1072             :             /* skip the lowercase and case-folding result strings */
    1073           0 :             pe+=full&UCASE_FULL_LOWER; /* full packs one 4-bit length per string: lower, fold, upper, title (low to high) */
    1074           0 :             full>>=4;
    1075           0 :             pe+=full&0xf;
    1076           0 :             full>>=4;
    1077             : 
    1078           0 :             if(upperNotTitle) {
    1079           0 :                 full&=0xf; /* length of the uppercase string, which starts at pe */
    1080             :             } else {
    1081             :                 /* skip the uppercase result string */
    1082           0 :                 pe+=full&0xf;
    1083           0 :                 full=(full>>4)&0xf; /* length of the titlecase string */
    1084             :             }
    1085             : 
    1086           0 :             if(full!=0) {
    1087             :                 /* set the output pointer to the result string */
    1088           0 :                 *pString=reinterpret_cast<const UChar *>(pe);
    1089             : 
    1090             :                 /* return the string length */
    1091           0 :                 return full;
    1092             :             }
    1093             :         }
    1094             : 
    1095           0 :         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { /* a dedicated titlecase slot wins when titlecasing */
    1096           0 :             idx=UCASE_EXC_TITLE;
    1097           0 :         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    1098             :             /* here, titlecase is same as uppercase */
    1099           0 :             idx=UCASE_EXC_UPPER;
    1100             :         } else {
    1101           0 :             return ~c; /* no simple mapping slot: c maps to itself */
    1102             :         }
    1103           0 :         GET_SLOT_VALUE(excWord, idx, pe2, result);
    1104             :     }
    1105             : 
    1106           0 :     return (result==c) ? ~result : result; /* unchanged -> bitwise complement (negative), else the mapped code point */
    1107             : }
    1108             : 
    1109             : U_CAPI int32_t U_EXPORT2 /* Full uppercase mapping: delegates to toUpperOrTitle() with upperNotTitle=TRUE. */
    1110           0 : ucase_toFullUpper(UChar32 c,
    1111             :                   UCaseContextIterator *iter, void *context,
    1112             :                   const UChar **pString,
    1113             :                   int32_t caseLocale) {
    1114           0 :     return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
    1115             : }
    1116             : 
    1117             : U_CAPI int32_t U_EXPORT2 /* Full titlecase mapping: delegates to toUpperOrTitle() with upperNotTitle=FALSE. */
    1118           0 : ucase_toFullTitle(UChar32 c,
    1119             :                   UCaseContextIterator *iter, void *context,
    1120             :                   const UChar **pString,
    1121             :                   int32_t caseLocale) {
    1122           0 :     return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
    1123             : }
    1124             : 
    1125             : /* case folding ------------------------------------------------------------- */
    1126             : 
    1127             : /*
    1128             :  * Case folding is similar to lowercasing.
    1129             :  * The result may be a simple mapping, i.e., a single code point, or
    1130             :  * a full mapping, i.e., a string.
    1131             :  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
    1132             :  * then only the lowercase mapping is stored.
    1133             :  *
    1134             :  * Some special cases are hardcoded because their conditions cannot be
    1135             :  * parsed and processed from CaseFolding.txt.
    1136             :  *
    1137             :  * Unicode 3.2 CaseFolding.txt specifies for its status field:
    1138             : 
    1139             : # C: common case folding, common mappings shared by both simple and full mappings.
    1140             : # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
    1141             : # S: simple case folding, mappings to single characters where different from F.
    1142             : # T: special case for uppercase I and dotted uppercase I
    1143             : #    - For non-Turkic languages, this mapping is normally not used.
    1144             : #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
    1145             : #
    1146             : # Usage:
    1147             : #  A. To do a simple case folding, use the mappings with status C + S.
    1148             : #  B. To do a full case folding, use the mappings with status C + F.
    1149             : #
    1150             : #    The mappings with status T can be used or omitted depending on the desired case-folding
    1151             : #    behavior. (The default option is to exclude them.)
    1152             : 
    1153             :  * Unicode 3.2 has 'T' mappings as follows:
    1154             : 
    1155             : 0049; T; 0131; # LATIN CAPITAL LETTER I
    1156             : 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    1157             : 
    1158             :  * while the default mappings for these code points are:
    1159             : 
    1160             : 0049; C; 0069; # LATIN CAPITAL LETTER I
    1161             : 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    1162             : 
    1163             :  * U+0130 has no simple case folding (simple-case-folds to itself).
    1164             :  */
    1165             : 
    1166             : /*
                     :  * Return the simple (single code point) case folding mapping for c.
                     :  *
                     :  * c        code point to fold
                     :  * options  masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects
                     :  *          the default mappings, any other masked value the Turkic ones
                     :  *
                     :  * Returns the folded code point, or c itself when no mapping applies.
                     :  *
                     :  * Without exception data the delta stored in props is added for types at
                     :  * or above UCASE_UPPER. With exception data the hardcoded U+0049/U+0130
                     :  * mappings are tried first (only when the conditional-fold flag is set),
                     :  * then the fold slot, then the lowercase slot.
                     :  */
    1167             : U_CAPI UChar32 U_EXPORT2
    1168           0 : ucase_fold(UChar32 c, uint32_t options) {
    1169           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    1170           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
    1171           0 :         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    1172           0 :             c+=UCASE_GET_DELTA(props);
    1173             :         }
    1174             :     } else {
    1175           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    1176           0 :         uint16_t excWord=*pe++; /* first exception word: tested below for the conditional-fold flag and for slot presence */
    1177             :         int32_t idx;
    1178           0 :         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
    1179             :             /* special case folding mappings, hardcoded: only U+0049 and U+0130 return here; any other code point with this flag falls through to the slot lookup below */
    1180           0 :             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
    1181             :                 /* default mappings */
    1182           0 :                 if(c==0x49) {
    1183             :                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
    1184           0 :                     return 0x69;
    1185           0 :                 } else if(c==0x130) {
    1186             :                     /* no simple case folding for U+0130 */
    1187           0 :                     return c;
    1188             :                 }
    1189             :             } else {
    1190             :                 /* Turkic mappings */
    1191           0 :                 if(c==0x49) {
    1192             :                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
    1193           0 :                     return 0x131;
    1194           0 :                 } else if(c==0x130) {
    1195             :                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
    1196           0 :                     return 0x69;
    1197             :                 }
    1198             :             }
    1199             :         }
    1200           0 :         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { /* a dedicated fold slot takes precedence */
    1201           0 :             idx=UCASE_EXC_FOLD;
    1202           0 :         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { /* otherwise the lowercase slot is used as the folding */
    1203           0 :             idx=UCASE_EXC_LOWER;
    1204             :         } else {
    1205           0 :             return c; /* neither slot present: c folds to itself */
    1206             :         }
    1207           0 :         GET_SLOT_VALUE(excWord, idx, pe, c); /* overwrite c with the value stored in the chosen slot */
    1208             :     }
    1209           0 :     return c;
    1210             : }
    1211             : 
    1212             : /*
    1213             :  * Issue for canonical caseless match (UAX #21):
    1214             :  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
    1215             :  * canonical equivalence, unlike default-option casefolding.
    1216             :  * For example, I-grave and I + grave fold to strings that are not canonically
    1217             :  * equivalent.
    1218             :  * For more details, see the comment in unorm_compare() in unorm.cpp
    1219             :  * and the intermediate prototype changes for Jitterbug 2021.
    1220             :  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
    1221             :  *
    1222             :  * This did not get fixed because it appears that it is not possible to fix
    1223             :  * it for uppercase and lowercase characters (I-grave vs. i-grave)
    1224             :  * together in a way that they still fold to common result strings.
    1225             :  */
    1226             : /*
                     :  * Full case folding of c; the result may be a string.
                     :  *
                     :  * c        code point to fold; must be non-negative (asserted below)
                     :  * pString  output; written only when the result is a string
                     :  * options  masked with _FOLD_CASE_OPTIONS_MASK; U_FOLD_CASE_DEFAULT selects
                     :  *          the default mappings, any other masked value the Turkic ones
                     :  *
                     :  * Return value, as produced by the code below:
                     :  *   - the bitwise complement of c (negative) when c folds to itself;
                     :  *   - the folded code point when there is a single code point mapping;
                     :  *   - the length of the folded string, with *pString pointing at it:
                     :  *     2 for U+0130 under the default option, otherwise a nonzero value
                     :  *     read from a 4-bit field of the full-mappings slot.
                     :  * NOTE(review): how callers tell a string length from a code point is not
                     :  * visible in this function - confirm against the declaration in ucase.h.
                     :  */
    1227             : U_CAPI int32_t U_EXPORT2
    1228           0 : ucase_toFullFolding(UChar32 c,
    1229             :                     const UChar **pString,
    1230             :                     uint32_t options) {
    1231             :     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    1232           0 :     U_ASSERT(c >= 0);
    1233           0 :     UChar32 result=c;
    1234           0 :     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    1235           0 :     if(!PROPS_HAS_EXCEPTION(props)) {
    1236           0 :         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    1237           0 :             result=c+UCASE_GET_DELTA(props);
    1238             :         }
    1239             :     } else {
    1240           0 :         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
    1241           0 :         uint16_t excWord=*pe++;
    1242             :         int32_t full, idx;
    1243             : 
    1244           0 :         pe2=pe; /* keep the position just after the exception word: pe is advanced in the full-mappings branch, the simple-mapping lookup at the end uses pe2 */
    1245             : 
    1246           0 :         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
    1247             :             /* use hardcoded conditions and mappings; code points other than U+0049 and U+0130 fall through to the simple-mapping lookup below */
    1248           0 :             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
    1249             :                 /* default mappings */
    1250           0 :                 if(c==0x49) {
    1251             :                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
    1252           0 :                     return 0x69;
    1253           0 :                 } else if(c==0x130) {
    1254             :                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
    1255           0 :                     *pString=iDot;
    1256           0 :                     return 2;
    1257             :                 }
    1258             :             } else {
    1259             :                 /* Turkic mappings */
    1260           0 :                 if(c==0x49) {
    1261             :                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
    1262           0 :                     return 0x131;
    1263           0 :                 } else if(c==0x130) {
    1264             :                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
    1265           0 :                     return 0x69;
    1266             :                 }
    1267             :             }
    1268           0 :         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    1269           0 :             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
    1270             : 
    1271             :             /* start of full case mapping strings */
    1272           0 :             ++pe;
    1273             : 
    1274             :             /* skip the lowercase result string */
    1275           0 :             pe+=full&UCASE_FULL_LOWER;
    1276           0 :             full=(full>>4)&0xf; /* next 4-bit field: used below as the length of the case folding string */
    1277             : 
    1278           0 :             if(full!=0) {
    1279             :                 /* set the output pointer to the result string */
    1280           0 :                 *pString=reinterpret_cast<const UChar *>(pe);
    1281             : 
    1282             :                 /* return the string length */
    1283           0 :                 return full;
    1284             :             }
    1285             :         }
    1286             : 
    1287           0 :         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { /* no string result: fall back to a single code point, fold slot first */
    1288           0 :             idx=UCASE_EXC_FOLD;
    1289           0 :         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    1290           0 :             idx=UCASE_EXC_LOWER;
    1291             :         } else {
    1292           0 :             return ~c; /* neither slot present: report c as unchanged */
    1293             :         }
    1294           0 :         GET_SLOT_VALUE(excWord, idx, pe2, result);
    1295             :     }
    1296             : 
    1297           0 :     return (result==c) ? ~result : result; /* complement marks a mapping that leaves c unchanged */
    1298             : }
    1299             : 
    1300             : /* case mapping properties API ---------------------------------------------- */
    1301             : 
    1302             : /* public API (see uchar.h) */
    1303             : /*
                     :  * Public API: returns TRUE exactly when ucase_getType(c) is UCASE_LOWER,
                     :  * FALSE otherwise.
                     :  */
    1304             : U_CAPI UBool U_EXPORT2
    1305           0 : u_isULowercase(UChar32 c) {
    1306           0 :     return (UBool)(UCASE_LOWER==ucase_getType(c));
    1307             : }
    1308             : /*
                     :  * Public API: returns TRUE exactly when ucase_getType(c) is UCASE_UPPER,
                     :  * FALSE otherwise.
                     :  */
    1309             : U_CAPI UBool U_EXPORT2
    1310           0 : u_isUUppercase(UChar32 c) {
    1311           0 :     return (UBool)(UCASE_UPPER==ucase_getType(c));
    1312             : }
    1313             : 
    1314             : /*
                     :  * Public API: transforms the Unicode character to its lower case equivalent.
                     :  * Thin wrapper that returns the result of ucase_tolower(c) unchanged.
                     :  */
    1315             : U_CAPI UChar32 U_EXPORT2
    1316           0 : u_tolower(UChar32 c) {
    1317           0 :     return ucase_tolower(c);
    1318             : }
    1319             :     
    1320             : /*
                     :  * Public API: transforms the Unicode character to its upper case equivalent.
                     :  * Thin wrapper that returns the result of ucase_toupper(c) unchanged.
                     :  */
    1321             : U_CAPI UChar32 U_EXPORT2
    1322           0 : u_toupper(UChar32 c) {
    1323           0 :     return ucase_toupper(c);
    1324             : }
    1325             : 
    1326             : /*
                     :  * Public API: transforms the Unicode character to its title case equivalent.
                     :  * Thin wrapper that returns the result of ucase_totitle(c) unchanged.
                     :  */
    1327             : U_CAPI UChar32 U_EXPORT2
    1328           0 : u_totitle(UChar32 c) {
    1329           0 :     return ucase_totitle(c);
    1330             : }
    1331             : 
    1332             : /*
                     :  * Public API: return the simple case folding mapping for c.
                     :  * Thin wrapper that passes c and options through to ucase_fold() and
                     :  * returns its result unchanged.
                     :  */
    1333             : U_CAPI UChar32 U_EXPORT2
    1334           0 : u_foldCase(UChar32 c, uint32_t options) {
    1335           0 :     return ucase_fold(c, options);
    1336             : }
    1337             : /*
                     :  * Evaluate one case-related binary property of c.
                     :  *
                     :  * c      code point to test
                     :  * which  property selector; only the UCHAR_ constants listed in the switch
                     :  *        below are handled
                     :  *
                     :  * Returns the property value (each handled case yields a UBool or the
                     :  * result of a ucase_isXyz() helper), or FALSE for any other selector.
                     :  *
                     :  * The Changes_When_Xyz cases call ucase_toFullXyz() with the root locale
                     :  * and no context iterator, and treat a non-negative return as a change.
                     :  * NOTE(review): this relies on the toFull functions returning a negative
                     :  * (complemented) value when c is unchanged - confirm for ucase_toFullLower.
                     :  */
    1338             : U_CFUNC int32_t U_EXPORT2
    1339           0 : ucase_hasBinaryProperty(UChar32 c, UProperty which) {
    1340             :     /* case mapping properties */
    1341             :     const UChar *resultString; /* output sink required by the ucase_toFullXyz() calls; never read here */
    1342           0 :     switch(which) {
    1343             :     case UCHAR_LOWERCASE:
    1344           0 :         return (UBool)(UCASE_LOWER==ucase_getType(c));
    1345             :     case UCHAR_UPPERCASE:
    1346           0 :         return (UBool)(UCASE_UPPER==ucase_getType(c));
    1347             :     case UCHAR_SOFT_DOTTED:
    1348           0 :         return ucase_isSoftDotted(c);
    1349             :     case UCHAR_CASE_SENSITIVE:
    1350           0 :         return ucase_isCaseSensitive(c);
    1351             :     case UCHAR_CASED:
    1352           0 :         return (UBool)(UCASE_NONE!=ucase_getType(c));
    1353             :     case UCHAR_CASE_IGNORABLE:
    1354           0 :         return (UBool)(ucase_getTypeOrIgnorable(c)>>2); /* NOTE(review): presumably the shift drops a 2-bit type and leaves the ignorable flag - confirm in ucase.h */
    1355             :     /*
    1356             :      * Note: The following Changes_When_Xyz are defined as testing whether
    1357             :      * the NFD form of the input changes when Xyz-case-mapped.
    1358             :      * However, this simpler implementation of these properties,
    1359             :      * ignoring NFD, passes the tests.
    1360             :      * The implementation needs to be changed if the tests start failing.
    1361             :      * When that happens, optimizations should be used to work with the
    1362             :      * per-single-code point ucase_toFullXyz() functions unless
    1363             :      * the NFD form has more than one code point,
    1364             :      * and the property starts set needs to be the union of the
    1365             :      * start sets for normalization and case mappings.
    1366             :      */
    1367             :     case UCHAR_CHANGES_WHEN_LOWERCASED:
    1368           0 :         return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
    1369             :     case UCHAR_CHANGES_WHEN_UPPERCASED:
    1370           0 :         return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
    1371             :     case UCHAR_CHANGES_WHEN_TITLECASED:
    1372           0 :         return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
    1373             :     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
    1374             :     case UCHAR_CHANGES_WHEN_CASEMAPPED:
    1375             :         return (UBool)(
    1376           0 :             ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
    1377           0 :             ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
    1378           0 :             ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
    1379             :     default:
    1380           0 :         return FALSE;
    1381             :     }
    1382             : }

Generated by: LCOV version 1.13