LCOV - output.info - intl/icu/source/i18n/uspoof

LCOV - code coverage report

Current view:	top level - intl/icu/source/i18n - uspoof_impl.cpp (source / functions)		Hit	Total	Coverage
Test:	output.info	Lines:	0	479	0.0 %
Date:	2017-07-14 16:53:18	Functions:	0	50	0.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : // © 2016 and later: Unicode, Inc. and others.
       2             : // License & terms of use: http://www.unicode.org/copyright.html
       3             : /*
       4             : **********************************************************************
       5             : *   Copyright (C) 2008-2016, International Business Machines
       6             : *   Corporation and others.  All Rights Reserved.
       7             : **********************************************************************
       8             : */
       9             : 
      10             : #include "unicode/utypes.h"
      11             : #include "unicode/uspoof.h"
      12             : #include "unicode/uchar.h"
      13             : #include "unicode/uniset.h"
      14             : #include "unicode/utf16.h"
      15             : #include "utrie2.h"
      16             : #include "cmemory.h"
      17             : #include "cstring.h"
      18             : #include "scriptset.h"
      19             : #include "umutex.h"
      20             : #include "udataswp.h"
      21             : #include "uassert.h"
      22             : #include "ucln_in.h"
      23             : #include "uspoof_impl.h"
      24             : 
      25             : #if !UCONFIG_NO_NORMALIZATION
      26             : 
      27             : 
      28             : U_NAMESPACE_BEGIN
      29             : 
      // Expands ICU's UObject RTTI implementation macro for SpoofImpl.
      // NOTE(review): presumably this supplies the class-ID accessors for the
      // class; confirm against the macro's definition.
      30           0 : UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
      31             : 
      32           0 : SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
      33           0 :     construct(status);
      34           0 :     fSpoofData = data;
      35           0 : }
      36             : 
      37           0 : SpoofImpl::SpoofImpl(UErrorCode& status) {
      38           0 :     construct(status);
      39             : 
      40             :     // TODO: Call this method where it is actually needed, instead of in the
      41             :     // constructor, to allow for lazy data loading.  See #12696.
      42           0 :     fSpoofData = SpoofData::getDefault(status);
      43           0 : }
      44             : 
      45           0 : SpoofImpl::SpoofImpl() {
      46           0 :     UErrorCode status = U_ZERO_ERROR;
      47           0 :     construct(status);
      48             : 
      49             :     // TODO: Call this method where it is actually needed, instead of in the
      50             :     // constructor, to allow for lazy data loading.  See #12696.
      51           0 :     fSpoofData = SpoofData::getDefault(status);
      52           0 : }
      53             : 
      54           0 : void SpoofImpl::construct(UErrorCode& status) {
      55           0 :     fMagic = USPOOF_MAGIC;
      56           0 :     fChecks = USPOOF_ALL_CHECKS;
      57           0 :     fSpoofData = NULL;
      58           0 :     fAllowedCharsSet = NULL;
      59           0 :     fAllowedLocales = NULL;
      60           0 :     fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
      61             : 
      62           0 :     if (U_FAILURE(status)) { return; }
      63             : 
      64           0 :     UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
      65           0 :     fAllowedCharsSet = allowedCharsSet;
      66           0 :     fAllowedLocales  = uprv_strdup("");
      67           0 :     if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
      68           0 :         status = U_MEMORY_ALLOCATION_ERROR;
      69           0 :         return;
      70             :     }
      71           0 :     allowedCharsSet->freeze();
      72             : }
      73             : 
      74             : 
      75             : // Copy Constructor, used by the user level clone() function.
      76           0 : SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
      77             :         fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , 
      78           0 :         fAllowedLocales(NULL) {
      79           0 :     if (U_FAILURE(status)) {
      80           0 :         return;
      81             :     }
      82           0 :     fMagic = src.fMagic;
      83           0 :     fChecks = src.fChecks;
      84           0 :     if (src.fSpoofData != NULL) {
      85           0 :         fSpoofData = src.fSpoofData->addReference();
      86             :     }
      87           0 :     fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
      88           0 :     fAllowedLocales = uprv_strdup(src.fAllowedLocales);
      89           0 :     if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
      90           0 :         status = U_MEMORY_ALLOCATION_ERROR;
      91             :     }
      92           0 :     fRestrictionLevel = src.fRestrictionLevel;
      93             : }
      94             : 
      95           0 : SpoofImpl::~SpoofImpl() {
      96           0 :     fMagic = 0;                // head off application errors by preventing use of
      97             :                                //    of deleted objects.
      98           0 :     if (fSpoofData != NULL) {
      99           0 :         fSpoofData->removeReference();   // Will delete if refCount goes to zero.
     100             :     }
     101           0 :     delete fAllowedCharsSet;
     102           0 :     uprv_free((void *)fAllowedLocales);
     103           0 : }
     104             : 
     105             : //  Cast this instance as a USpoofChecker for the C API.
     106           0 : USpoofChecker *SpoofImpl::asUSpoofChecker() {
     107           0 :     return reinterpret_cast<USpoofChecker*>(this);
     108             : }
     109             : 
     110             : //
     111             : //  Incoming parameter check on Status and the SpoofChecker object
     112             : //    received from the C API.
     113             : //
     114           0 : const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
     115           0 :     if (U_FAILURE(status)) {
     116           0 :         return NULL;
     117             :     }
     118           0 :     if (sc == NULL) {
     119           0 :         status = U_ILLEGAL_ARGUMENT_ERROR;
     120           0 :         return NULL;
     121             :     }
     122           0 :     SpoofImpl *This = (SpoofImpl *)sc;
     123           0 :     if (This->fMagic != USPOOF_MAGIC) {
     124           0 :         status = U_INVALID_FORMAT_ERROR;
     125           0 :         return NULL;
     126             :     }
     127           0 :     if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
     128           0 :         return NULL;
     129             :     }
     130           0 :     return This;
     131             : }
     132             : 
     133           0 : SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
     134             :     return const_cast<SpoofImpl *>
     135           0 :         (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
     136             : }
     137             : 
     138             : 
     139           0 : void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
     140           0 :     UnicodeSet    allowedChars;
     141           0 :     UnicodeSet    *tmpSet = NULL;
     142           0 :     const char    *locStart = localesList;
     143           0 :     const char    *locEnd = NULL;
     144           0 :     const char    *localesListEnd = localesList + uprv_strlen(localesList);
     145           0 :     int32_t        localeListCount = 0;   // Number of locales provided by caller.
     146             : 
     147             :     // Loop runs once per locale from the localesList, a comma separated list of locales.
     148           0 :     do {
     149           0 :         locEnd = uprv_strchr(locStart, ',');
     150           0 :         if (locEnd == NULL) {
     151           0 :             locEnd = localesListEnd;
     152             :         }
     153           0 :         while (*locStart == ' ') {
     154           0 :             locStart++;
     155             :         }
     156           0 :         const char *trimmedEnd = locEnd-1;
     157           0 :         while (trimmedEnd > locStart && *trimmedEnd == ' ') {
     158           0 :             trimmedEnd--;
     159             :         }
     160           0 :         if (trimmedEnd <= locStart) {
     161           0 :             break;
     162             :         }
     163           0 :         const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
     164           0 :         localeListCount++;
     165             : 
     166             :         // We have one locale from the locales list.
     167             :         // Add the script chars for this locale to the accumulating set of allowed chars.
     168             :         // If the locale is no good, we will be notified back via status.
     169           0 :         addScriptChars(locale, &allowedChars, status);
     170           0 :         uprv_free((void *)locale);
     171           0 :         if (U_FAILURE(status)) {
     172           0 :             break;
     173             :         }
     174           0 :         locStart = locEnd + 1;
     175           0 :     } while (locStart < localesListEnd);
     176             : 
     177             :     // If our caller provided an empty list of locales, we disable the allowed characters checking
     178           0 :     if (localeListCount == 0) {
     179           0 :         uprv_free((void *)fAllowedLocales);
     180           0 :         fAllowedLocales = uprv_strdup("");
     181           0 :         tmpSet = new UnicodeSet(0, 0x10ffff);
     182           0 :         if (fAllowedLocales == NULL || tmpSet == NULL) {
     183           0 :             status = U_MEMORY_ALLOCATION_ERROR;
     184           0 :             return;
     185             :         } 
     186           0 :         tmpSet->freeze();
     187           0 :         delete fAllowedCharsSet;
     188           0 :         fAllowedCharsSet = tmpSet;
     189           0 :         fChecks &= ~USPOOF_CHAR_LIMIT;
     190           0 :         return;
     191             :     }
     192             : 
     193             :         
     194             :     // Add all common and inherited characters to the set of allowed chars.
     195           0 :     UnicodeSet tempSet;
     196           0 :     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
     197           0 :     allowedChars.addAll(tempSet);
     198           0 :     tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
     199           0 :     allowedChars.addAll(tempSet);
     200             :     
     201             :     // If anything went wrong, we bail out without changing
     202             :     // the state of the spoof checker.
     203           0 :     if (U_FAILURE(status)) {
     204           0 :         return;
     205             :     }
     206             : 
     207             :     // Store the updated spoof checker state.
     208           0 :     tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
     209           0 :     const char *tmpLocalesList = uprv_strdup(localesList);
     210           0 :     if (tmpSet == NULL || tmpLocalesList == NULL) {
     211           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     212           0 :         return;
     213             :     }
     214           0 :     uprv_free((void *)fAllowedLocales);
     215           0 :     fAllowedLocales = tmpLocalesList;
     216           0 :     tmpSet->freeze();
     217           0 :     delete fAllowedCharsSet;
     218           0 :     fAllowedCharsSet = tmpSet;
     219           0 :     fChecks |= USPOOF_CHAR_LIMIT;
     220             : }
     221             : 
     222             : 
     223           0 : const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
     224           0 :     return fAllowedLocales;
     225             : }
     226             : 
     227             : 
     228             : // Given a locale (a language), add all the characters from all of the scripts used with that language
     229             : // to the allowedChars UnicodeSet
     230             : 
     231           0 : void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
     232             :     UScriptCode scripts[30];
     233             : 
     234           0 :     int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
     235           0 :     if (U_FAILURE(status)) {
     236           0 :         return;
     237             :     }
     238           0 :     if (status == U_USING_DEFAULT_WARNING) {
     239           0 :         status = U_ILLEGAL_ARGUMENT_ERROR;
     240           0 :         return;
     241             :     }
     242           0 :     UnicodeSet tmpSet;
     243             :     int32_t    i;
     244           0 :     for (i=0; i<numScripts; i++) {
     245           0 :         tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
     246           0 :         allowedChars->addAll(tmpSet);
     247             :     }
     248             : }
     249             : 
     250             : // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
     251           0 : void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
     252           0 :     result.resetAll();
     253           0 :     result.setScriptExtensions(codePoint, status);
     254           0 :     if (U_FAILURE(status)) { return; }
     255             : 
     256             :     // Section 5.1 step 1
     257           0 :     if (result.test(USCRIPT_HAN, status)) {
     258           0 :         result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
     259           0 :         result.set(USCRIPT_JAPANESE, status);
     260           0 :         result.set(USCRIPT_KOREAN, status);
     261             :     }
     262           0 :     if (result.test(USCRIPT_HIRAGANA, status)) {
     263           0 :         result.set(USCRIPT_JAPANESE, status);
     264             :     }
     265           0 :     if (result.test(USCRIPT_KATAKANA, status)) {
     266           0 :         result.set(USCRIPT_JAPANESE, status);
     267             :     }
     268           0 :     if (result.test(USCRIPT_HANGUL, status)) {
     269           0 :         result.set(USCRIPT_KOREAN, status);
     270             :     }
     271           0 :     if (result.test(USCRIPT_BOPOMOFO, status)) {
     272           0 :         result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
     273             :     }
     274             : 
     275             :     // Section 5.1 step 2
     276           0 :     if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
     277           0 :         result.setAll();
     278             :     }
     279             : }
     280             : 
     281             : // Computes the resolved script set for a string, according to UTS 39 section 5.1.
     282           0 : void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
     283           0 :     getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
     284           0 : }
     285             : 
     286             : // Computes the resolved script set for a string, omitting characters having the specified script.
     287             : // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
     288           0 : void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
     289           0 :     result.setAll();
     290             : 
     291           0 :     ScriptSet temp;
     292             :     UChar32 codePoint;
     293           0 :     for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
     294           0 :         codePoint = input.char32At(i);
     295             : 
     296             :         // Compute the augmented script set for the character
     297           0 :         getAugmentedScriptSet(codePoint, temp, status);
     298           0 :         if (U_FAILURE(status)) { return; }
     299             : 
     300             :         // Intersect the augmented script set with the resolved script set, but only if the character doesn't
     301             :         // have the script specified in the function call
     302           0 :         if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
     303           0 :             result.intersect(temp);
     304             :         }
     305             :     }
     306             : }
     307             : 
     308             : // Computes the set of numerics for a string, according to UTS 39 section 5.3.
     309           0 : void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
     310           0 :     result.clear();
     311             : 
     312             :     UChar32 codePoint;
     313           0 :     for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
     314           0 :         codePoint = input.char32At(i);
     315             : 
     316             :         // Store a representative character for each kind of decimal digit
     317           0 :         if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
     318             :             // Store the zero character as a representative for comparison.
     319             :             // Unicode guarantees it is codePoint - value
     320           0 :             result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
     321             :         }
     322             :     }
     323           0 : }
     324             : 
     325             : // Computes the restriction level of a string, according to UTS 39 section 5.2.
     326           0 : URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
     327             :     // Section 5.2 step 1:
     328           0 :     if (!fAllowedCharsSet->containsAll(input)) {
     329           0 :         return USPOOF_UNRESTRICTIVE;
     330             :     }
     331             : 
     332             :     // Section 5.2 step 2
     333             :     // Java use a static UnicodeSet for this test.  In C++, avoid the static variable
     334             :     // and just do a simple for loop.
     335           0 :     UBool allASCII = TRUE;
     336           0 :     for (int32_t i=0, length=input.length(); i<length; i++) {
     337           0 :         if (input.charAt(i) > 0x7f) {
     338           0 :             allASCII = FALSE;
     339           0 :             break;
     340             :         }
     341             :     }
     342           0 :     if (allASCII) {
     343           0 :         return USPOOF_ASCII;
     344             :     }
     345             : 
     346             :     // Section 5.2 steps 3:
     347           0 :     ScriptSet resolvedScriptSet;
     348           0 :     getResolvedScriptSet(input, resolvedScriptSet, status);
     349           0 :     if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
     350             : 
     351             :     // Section 5.2 step 4:
     352           0 :     if (!resolvedScriptSet.isEmpty()) {
     353           0 :         return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
     354             :     }
     355             : 
     356             :     // Section 5.2 step 5:
     357           0 :     ScriptSet resolvedNoLatn;
     358           0 :     getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
     359           0 :     if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
     360             : 
     361             :     // Section 5.2 step 6:
     362           0 :     if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
     363           0 :             || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
     364           0 :             || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
     365           0 :         return USPOOF_HIGHLY_RESTRICTIVE;
     366             :     }
     367             : 
     368             :     // Section 5.2 step 7:
     369           0 :     if (!resolvedNoLatn.isEmpty()
     370           0 :             && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
     371           0 :             && !resolvedNoLatn.test(USCRIPT_GREEK, status)
     372           0 :             && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
     373           0 :         return USPOOF_MODERATELY_RESTRICTIVE;
     374             :     }
     375             : 
     376             :     // Section 5.2 step 8:
     377           0 :     return USPOOF_MINIMALLY_RESTRICTIVE;
     378             : }
     379             : 
     380             : 
     381             : 
     382             : // Convert a text format hex number.  Utility function used by builder code.  Static.
     383             : // Input: UChar *string text.  Output: a UChar32
     384             : // Input has been pre-checked, and will have no non-hex chars.
     385             : // The number must fall in the code point range of 0..0x10ffff
     386             : // Static Function.
     387           0 : UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
     388           0 :     if (U_FAILURE(status)) {
     389           0 :         return 0;
     390             :     }
     391           0 :     U_ASSERT(limit-start > 0);
     392           0 :     uint32_t val = 0;
     393             :     int i;
     394           0 :     for (i=start; i<limit; i++) {
     395           0 :         int digitVal = s[i] - 0x30;
     396           0 :         if (digitVal>9) {
     397           0 :             digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
     398             :         }
     399           0 :         if (digitVal>15) {
     400           0 :             digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
     401             :         }
     402           0 :         U_ASSERT(digitVal <= 0xf);
     403           0 :         val <<= 4;
     404           0 :         val += digitVal;
     405             :     }
     406           0 :     if (val > 0x10ffff) {
     407           0 :         status = U_PARSE_ERROR;
     408           0 :         val = 0;
     409             :     }
     410           0 :     return (UChar32)val;
     411             : }
     412             : 
     413             : 
     414             : //-----------------------------------------
     415             : //
     416             : //   class CheckResult Implementation
     417             : //
     418             : //-----------------------------------------
     419             : 
     420           0 : CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
     421           0 :     clear();
     422           0 : }
     423             : 
     424           0 : USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
     425           0 :     return reinterpret_cast<USpoofCheckResult*>(this);
     426             : }
     427             : 
     428             : //
     429             : //  Incoming parameter check on Status and the CheckResult object
     430             : //    received from the C API.
     431             : //
     432           0 : const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
     433           0 :     if (U_FAILURE(status)) { return NULL; }
     434           0 :     if (ptr == NULL) {
     435           0 :         status = U_ILLEGAL_ARGUMENT_ERROR;
     436           0 :         return NULL;
     437             :     }
     438           0 :     CheckResult *This = (CheckResult*) ptr;
     439           0 :     if (This->fMagic != USPOOF_CHECK_MAGIC) {
     440           0 :         status = U_INVALID_FORMAT_ERROR;
     441           0 :         return NULL;
     442             :     }
     443           0 :     return This;
     444             : }
     445             : 
     446           0 : CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
     447             :     return const_cast<CheckResult *>
     448           0 :         (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
     449             : }
     450             : 
     451           0 : void CheckResult::clear() {
     452           0 :     fChecks = 0;
     453           0 :     fNumerics.clear();
     454           0 :     fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
     455           0 : }
     456             : 
     457           0 : int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
     458           0 :     if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
     459           0 :         return fChecks | fRestrictionLevel;
     460             :     } else {
     461           0 :         return fChecks;
     462             :     }
     463             : }
     464             : 
     465           0 : CheckResult::~CheckResult() {
     466           0 : }
     467             : 
     468             : //----------------------------------------------------------------------------------------------
     469             : //
     470             : //   class SpoofData Implementation
     471             : //
     472             : //----------------------------------------------------------------------------------------------
     473             : 
     474             : 
     475           0 : UBool SpoofData::validateDataVersion(UErrorCode &status) const {
     476           0 :     if (U_FAILURE(status) ||
     477           0 :         fRawData == NULL ||
     478           0 :         fRawData->fMagic != USPOOF_MAGIC ||
     479           0 :         fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
     480           0 :         fRawData->fFormatVersion[1] != 0 ||
     481           0 :         fRawData->fFormatVersion[2] != 0 ||
     482           0 :         fRawData->fFormatVersion[3] != 0) {
     483           0 :             status = U_INVALID_FORMAT_ERROR;
     484           0 :             return FALSE;
     485             :     }
     486           0 :     return TRUE;
     487             : }
     488             : 
     489             : static UBool U_CALLCONV
     490           0 : spoofDataIsAcceptable(void *context,
     491             :                         const char * /* type */, const char * /*name*/,
     492             :                         const UDataInfo *pInfo) {
     493           0 :     if(
     494           0 :         pInfo->size >= 20 &&
     495           0 :         pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
     496           0 :         pInfo->charsetFamily == U_CHARSET_FAMILY &&
     497           0 :         pInfo->dataFormat[0] == 0x43 &&  // dataFormat="Cfu "
     498           0 :         pInfo->dataFormat[1] == 0x66 &&
     499           0 :         pInfo->dataFormat[2] == 0x75 &&
     500           0 :         pInfo->dataFormat[3] == 0x20 &&
     501           0 :         pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
     502             :     ) {
     503           0 :         UVersionInfo *version = static_cast<UVersionInfo *>(context);
     504           0 :         if(version != NULL) {
     505           0 :             uprv_memcpy(version, pInfo->dataVersion, 4);
     506             :         }
     507           0 :         return TRUE;
     508             :     } else {
     509           0 :         return FALSE;
     510             :     }
     511             : }
     512             : 
     513             : //  Methods for the loading of the default confusables data file.  The confusable
     514             : //  data is loaded only when it is needed.
     515             : //
     516             : //  SpoofData::getDefault() - Return the default confusables data, and call the
     517             : //                            initOnce() if it is not available.  Adds a reference
     518             : //                            to the SpoofData that the caller is responsible for
     519             : //                            decrementing when they are done with the data.
     520             : //
     521             : //  uspoof_loadDefaultData - Called once, from initOnce().  The resulting SpoofData
     522             : //                           is shared by all spoof checkers using the default data.
     523             : //
     524             : //  uspoof_cleanupDefaultData - Called during cleanup.
     525             : //
     526             : 
     // One-time-initialization guard for the shared default data: handed to
     // umtx_initOnce() by SpoofData::getDefault() and reset during cleanup.
     527             : static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
     // The shared default SpoofData: assigned in uspoof_loadDefaultData(), and
     // released and set back to NULL in uspoof_cleanupDefaultData().
     528             : static SpoofData* gDefaultSpoofData;
     529             : 
     530             : static UBool U_CALLCONV
     531           0 : uspoof_cleanupDefaultData(void) {
     532           0 :     if (gDefaultSpoofData) {
     533             :         // Will delete, assuming all user-level spoof checkers were closed.
     534           0 :         gDefaultSpoofData->removeReference();
     535           0 :         gDefaultSpoofData = NULL;
     536           0 :         gSpoofInitDefaultOnce.reset();
     537             :     }
     538           0 :     return TRUE;
     539             : }
     540             : 
     541           0 : static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
     542             :     UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
     543             :                                         spoofDataIsAcceptable, 
     544             :                                         NULL,       // context, would receive dataVersion if supplied.
     545           0 :                                         &status);
     546           0 :     if (U_FAILURE(status)) { return; }
     547           0 :     gDefaultSpoofData = new SpoofData(udm, status);
     548           0 :     if (U_FAILURE(status)) {
     549           0 :         delete gDefaultSpoofData;
     550           0 :         return;
     551             :     }
     552           0 :     if (gDefaultSpoofData == NULL) {
     553           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     554           0 :         return;
     555             :     }
     556           0 :     ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
     557             : }
     558             : 
     559           0 : SpoofData* SpoofData::getDefault(UErrorCode& status) {   // Returns the shared default data with one reference added for the caller, or NULL if status is a failure.
     560           0 :     umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);   // the loader is invoked through the init-once guard
     561           0 :     if (U_FAILURE(status)) { return NULL; }
     562           0 :     gDefaultSpoofData->addReference();   // the caller is expected to balance this with removeReference()
     563           0 :     return gDefaultSpoofData;
     564             : }
     565             : 
     566             : 
     567             : 
     568           0 : SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)   // Wraps data opened through udata; once stored, udm is closed by the destructor.
     569             : {
     570           0 :     reset();
     571           0 :     if (U_FAILURE(status)) {
     572           0 :         return;   // udm is not stored on this path, so this object will not close it
     573             :     }
     574           0 :     fUDM = udm;
     575             :     // fRawData is non-const because it may be constructed by the data builder.
     576           0 :     fRawData = reinterpret_cast<SpoofDataHeader *>(
     577           0 :             const_cast<void *>(udata_getMemory(udm)));
     578           0 :     validateDataVersion(status);   // defined outside this view; presumably sets status when the header is unacceptable
     579           0 :     initPtrs(status);   // leaves the section pointers NULL when status is already a failure
     580             : }
     581             : 
     582             : 
     583           0 : SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)   // Wraps caller-supplied serialized data in place; the buffer is neither copied nor freed by this object.
     584             : {
     585           0 :     reset();
     586           0 :     if (U_FAILURE(status)) {
     587           0 :         return;
     588             :     }
     589           0 :     if (length < 0 || (size_t)length < sizeof(SpoofDataHeader)) {   // test the sign first: the size_t cast would turn a negative length into a huge value
     590           0 :         status = U_INVALID_FORMAT_ERROR;
     591           0 :         return;
     592             :     }
     593           0 :     void *ncData = const_cast<void *>(data);
     594           0 :     fRawData = static_cast<SpoofDataHeader *>(ncData);
     595           0 :     if (length < fRawData->fLength) {   // the header claims more bytes than the caller supplied
     596           0 :         status = U_INVALID_FORMAT_ERROR;
     597           0 :         return;
     598             :     }
     599           0 :     validateDataVersion(status);
     600           0 :     initPtrs(status);   // leaves the section pointers NULL when status is already a failure
     601             : }
     602             : 
     603             : 
     604             : // Spoof Data constructor for use from data builder.
     605             : //   Initializes a new, empty data area (a zeroed header only) that will be populated later.
     606           0 : SpoofData::SpoofData(UErrorCode &status) {
     607           0 :     reset();
     608           0 :     if (U_FAILURE(status)) {
     609           0 :         return;
     610             :     }
     611           0 :     fDataOwned = true;   // makes the destructor uprv_free() fRawData
     612             : 
     613             :     // The spoof header should already be sized to be a multiple of 16 bytes.
     614             :     // Just in case it's not, round it up.
     615           0 :     uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
     616           0 :     U_ASSERT(initialSize == sizeof(SpoofDataHeader));
     617             :     
     618           0 :     fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
     619           0 :     fMemLimit = initialSize;   // assigned before the NULL check below; the value is not used on the failure path
     620           0 :     if (fRawData == NULL) {
     621           0 :         status = U_MEMORY_ALLOCATION_ERROR;
     622           0 :         return;
     623             :     }
     624           0 :     uprv_memset(fRawData, 0, initialSize);   // every header field, including fLength and the section offsets, starts at 0
     625             : 
     626           0 :     fRawData->fMagic = USPOOF_MAGIC;
     627           0 :     fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
     628           0 :     fRawData->fFormatVersion[1] = 0;
     629           0 :     fRawData->fFormatVersion[2] = 0;
     630           0 :     fRawData->fFormatVersion[3] = 0;
     631           0 :     initPtrs(status);   // all section offsets are 0 here, so the section pointers stay NULL
     632             : }
     633             : 
     634             : // reset() - initialize all fields.
     635             : //           Should be updated if any new fields are added.
     636             : //           Called by constructors to put things in a known initial state; it frees nothing.
     637           0 : void SpoofData::reset() {
     638           0 :    fRawData = NULL;
     639           0 :    fDataOwned = FALSE;   // the destructor frees fRawData only when this is set
     640           0 :    fUDM      = NULL;
     641           0 :    fMemLimit = 0;
     642           0 :    fRefCount = 1;        // a new object starts with one reference
     643           0 :    fCFUKeys = NULL;
     644           0 :    fCFUValues = NULL;
     645           0 :    fCFUStrings = NULL;
     646           0 : }
     647             : 
     648             : 
     649             : //  SpoofData::initPtrs()
     650             : //            Initialize the pointers to the various sections of the raw data.
     651             : //
     652             : //            This function is used both during the Trie building process (multiple
     653             : //            times, as the individual data sections are added), and
     654             : //            during the opening of a Spoof Checker from prebuilt data.
     655             : //
     656             : //            The pointers for non-existent data sections (identified by an offset of 0)
     657             : //            are set to NULL.  All three are also left NULL if status is a failure on entry.
     658             : //
     659             : //            Note:  During building the data, adding each new data section
     660             : //            reallocs the raw data area, which likely relocates it, which
     661             : //            in turn requires reinitializing all of the pointers into it, hence
     662             : //            multiple calls to this function during building.
     663             : //
     664           0 : void SpoofData::initPtrs(UErrorCode &status) {
     665           0 :     fCFUKeys = NULL;
     666           0 :     fCFUValues = NULL;
     667           0 :     fCFUStrings = NULL;
     668           0 :     if (U_FAILURE(status)) {
     669           0 :         return;
     670             :     }
     671           0 :     if (fRawData->fCFUKeys != 0) {   // header fields hold byte offsets from the start of fRawData
     672           0 :         fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
     673             :     }
     674           0 :     if (fRawData->fCFUStringIndex != 0) {
     675           0 :         fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
     676             :     }
     677           0 :     if (fRawData->fCFUStringTable != 0) {
     678           0 :         fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
     679             :     }
     680             : }
     681             : 
     682             : 
     683           0 : SpoofData::~SpoofData() {   // Releases whichever backing store this object owns.
     684           0 :     if (fDataOwned) {
     685           0 :         uprv_free(fRawData);   // only data this object allocated itself is freed
     686             :     }
     687           0 :     fRawData = NULL;
     688           0 :     if (fUDM != NULL) {
     689           0 :         udata_close(fUDM);   // data that came from udata is released by closing the UDataMemory
     690             :     }
     691           0 :     fUDM = NULL;
     692           0 : }
     693             : 
     694             : 
     695           0 : void SpoofData::removeReference() {   // Drops one reference; the object deletes itself when the count reaches zero.
     696           0 :     if (umtx_atomic_dec(&fRefCount) == 0) {
     697           0 :         delete this;   // `this` must not be touched after this line
     698             :     }
     699           0 : }
     700             : 
     701             : 
     702           0 : SpoofData *SpoofData::addReference() {   // Adds one reference and returns this, so the call can be used inline.
     703           0 :     umtx_atomic_inc(&fRefCount);
     704           0 :     return this;
     705             : }
     706             : 
     707             : 
     708           0 : void *SpoofData::reserveSpace(int32_t numBytes,  UErrorCode &status) {   // Builder only: grows the owned raw data by numBytes (rounded up to 16) and returns the new zeroed area, or NULL with status set.
     709           0 :     if (U_FAILURE(status)) { return NULL; }
     710           0 :     if (!fDataOwned) {   // data borrowed from a caller or from udata must never be reallocated
     711           0 :         U_ASSERT(FALSE);
     712             :         status = U_INTERNAL_PROGRAM_ERROR;
     713             :         return NULL;
     714             :     }
     715             : 
     716           0 :     numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
     717           0 :     uint32_t returnOffset = fMemLimit;   // the new area starts where the old data ended
     718           0 :     void *grown = uprv_realloc(fRawData, fMemLimit + numBytes);   // keep the result separate: on failure the old block must stay valid and owned
     719           0 :     if (grown == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return NULL; }   // fRawData and fMemLimit are left unchanged
     720           0 :     fRawData = static_cast<SpoofDataHeader *>(grown);
     721           0 :     fMemLimit += numBytes;
     722           0 :     fRawData->fLength = fMemLimit;
     723           0 :     uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
     724           0 :     initPtrs(status);   // the block may have moved, so the section pointers are recomputed
     725           0 :     return (char *)fRawData + returnOffset;
     726             : }
     727             : 
     728           0 : int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {   // Copies the raw data into buf; always returns the number of bytes required.
     729           0 :     int32_t dataSize = fRawData->fLength;   // NOTE(review): an incoming failure in status is not checked here
     730           0 :     if (capacity < dataSize) {
     731           0 :         status = U_BUFFER_OVERFLOW_ERROR;   // nothing is written; the return value is the capacity needed
     732           0 :         return dataSize;
     733             :     }
     734           0 :     uprv_memcpy(buf, fRawData, dataSize);   // NOTE(review): buf is not NULL-checked in this function
     735           0 :     return dataSize;
     736             : }
     737             : 
     738           0 : int32_t SpoofData::size() const {   // Length recorded in the raw data header (the same value serialize() copies).
     739           0 :     return fRawData->fLength;
     740             : }
     741             : 
     742             : //-------------------------------
     743             : //
     744             : // Front-end APIs for SpoofData
     745             : //
     746             : //-------------------------------
     747             : 
     748           0 : int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {   // Appends the mapping for inChar (or inChar itself if there is none) to dest.
     749             :     // Perform a binary search; this relies on the keys being sorted by code point, which is not checked here.
     750             :     // [lo, hi), i.e lo is inclusive, hi is exclusive.
     751             :     // The result after the loop will be in lo.
     752           0 :     int32_t lo = 0;
     753           0 :     int32_t hi = length();
     754           0 :     do {   // NOTE(review): the body runs at least once, so with length() == 0 it would still read key 0 - confirm the table is never empty
     755           0 :         int32_t mid = (lo + hi) / 2;
     756           0 :         if (codePointAt(mid) > inChar) {
     757           0 :             hi = mid;
     758           0 :         } else if (codePointAt(mid) < inChar) {
     759           0 :             lo = mid;   // mid stays inside the range because lo is inclusive
     760             :         } else {
     761             :             // Found result.  Break early.
     762           0 :             lo = mid;
     763           0 :             break;
     764             :         }
     765           0 :     } while (hi - lo > 1);
     766             : 
     767             :     // Did we find an entry?  If not, the char maps to itself and 1 is returned.
     768           0 :     if (codePointAt(lo) != inChar) {
     769           0 :         dest.append(inChar);
     770           0 :         return 1;
     771             :     }
     772             : 
     773             :     // Add the element to the string builder and return the length stored in its key.
     774           0 :     return appendValueTo(lo, dest);
     775             : }
     776             : 
     777           0 : int32_t SpoofData::length() const {   // Number of entries in the key table, as recorded in the header.
     778           0 :     return fRawData->fCFUKeysSize;
     779             : }
     780             : 
     781           0 : UChar32 SpoofData::codePointAt(int32_t index) const {   // Code point part of key number `index`; index is not range-checked.
     782           0 :     return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
     783             : }
     784             : 
     785           0 : int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {   // Appends the mapping of entry `index` to dest and returns its length.
     786           0 :     int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);   // the key carries the length of the mapped string
     787             : 
     788             :     // Value is either a char (for strings of length 1) or
     789             :     // an index into the string table (for longer strings)
     790           0 :     uint16_t value = fCFUValues[index];
     791           0 :     if (stringLength == 1) {
     792           0 :         dest.append((UChar)value);
     793             :     } else {
     794           0 :         dest.append(fCFUStrings + value, stringLength);   // value is an offset in UChars from the start of the string table
     795             :     }
     796             : 
     797           0 :     return stringLength;
     798             : }
     799             : 
     800             : 
     801             : U_NAMESPACE_END
     802             : 
     803             : U_NAMESPACE_USE
     804             : 
     805             : //-----------------------------------------------------------------------------
     806             : //
     807             : //  uspoof_swap   -  byte swap and char encoding swap of spoof data
     808             : //                   Returns the size of ICU data header + spoof data, or 0 with *status set on error.
     809             : //-----------------------------------------------------------------------------
     810             : U_CAPI int32_t U_EXPORT2
     811           0 : uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
     812             :            UErrorCode *status) {
     813             : 
     814           0 :     if (status == NULL || U_FAILURE(*status)) {
     815           0 :         return 0;
     816             :     }
     817           0 :     if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {   // length == -1 requests a size-only preflight
     818           0 :         *status=U_ILLEGAL_ARGUMENT_ERROR;
     819           0 :         return 0;
     820             :     }
     821             : 
     822             :     //
     823             :     //  Check that the data header is for spoof data.
     824             :     //    (Header contents are defined in gencfu.cpp)
     825             :     //
     826           0 :     const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
     827           0 :     if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
     828           0 :            pInfo->dataFormat[1]==0x66 &&
     829           0 :            pInfo->dataFormat[2]==0x75 &&
     830           0 :            pInfo->dataFormat[3]==0x20 &&
     831           0 :            pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
     832           0 :            pInfo->formatVersion[1]==0 &&
     833           0 :            pInfo->formatVersion[2]==0 &&
     834           0 :            pInfo->formatVersion[3]==0  )) {
     835           0 :         udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
     836             :                              "(format version %02x %02x %02x %02x) is not recognized\n",
     837           0 :                          pInfo->dataFormat[0], pInfo->dataFormat[1],
     838           0 :                          pInfo->dataFormat[2], pInfo->dataFormat[3],
     839           0 :                          pInfo->formatVersion[0], pInfo->formatVersion[1],
     840           0 :                          pInfo->formatVersion[2], pInfo->formatVersion[3]);
     841           0 :         *status=U_UNSUPPORTED_ERROR;
     842           0 :         return 0;
     843             :     }
     844             : 
     845             :     //
     846             :     // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
     847             :     //                         header).  This swap also conveniently gets us
     848             :     //                         the size of the ICU d.h., which lets us locate the start
     849             :     //                         of the uspoof specific data.
     850             :     //
     851           0 :     int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
     852           0 :     if (U_FAILURE(*status)) { return 0; }   // do not parse past a header that failed to swap, and keep its error code
     853             : 
     854             :     //
     855             :     // Get the Spoof Data Header, and check that it appears to be OK.
     856             :     //
     857             :     //
     858           0 :     const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
     859           0 :     SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
     860           0 :     if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
     861           0 :         ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader)) 
     862             :     {
     863           0 :         udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
     864           0 :         *status=U_UNSUPPORTED_ERROR;
     865           0 :         return 0;
     866             :     }
     867             : 
     868             :     //
     869             :     // Preflight operation?  Just return the size
     870             :     //
     871           0 :     int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
     872           0 :     int32_t totalSize = headerSize + spoofDataLength;
     873           0 :     if (length < 0) {
     874           0 :         return totalSize;
     875             :     }
     876             : 
     877             :     //
     878             :     // Check that length passed in is consistent with length from Spoof data header.
     879             :     //
     880           0 :     if (length < totalSize) {
     881             :         udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
     882           0 :                             spoofDataLength);
     883           0 :         *status=U_INDEX_OUTOFBOUNDS_ERROR;
     884           0 :         return 0;
     885             :     }
     886             : 
     887             : 
     888             :     //
     889             :     // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
     890             :     //                 we need to reference the header to locate the data, and an
     891             :     //                 inplace swap of the header leaves it unusable.
     892             :     //
     893           0 :     uint8_t          *outBytes = (uint8_t *)outData + headerSize;
     894           0 :     SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;
     895             : 
     896             :     int32_t   sectionStart;    // byte offset of a section from the start of the spoof data
     897             :     int32_t   sectionLength;   // byte length of that section
     898             : 
     899             :     //
     900             :     // If not swapping in place, zero out the output buffer before starting.
     901             :     //    Gaps may exist between the individual sections, and these must be zeroed in
     902             :     //    the output buffer.  The simplest way to do that is to just zero the whole thing.
     903             :     //
     904           0 :     if (inBytes != outBytes) {
     905           0 :         uprv_memset(outBytes, 0, spoofDataLength);
     906             :     }
     907             : 
     908             :     // Confusables Keys Section   (fCFUKeys); NOTE(review): section offsets and sizes are not checked against spoofDataLength.
     909           0 :     sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
     910           0 :     sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;   // element count -> bytes (32-bit keys)
     911           0 :     ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
     912             : 
     913             :     // String Index Section
     914           0 :     sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
     915           0 :     sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;   // element count -> bytes (16-bit values)
     916           0 :     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
     917             : 
     918             :     // String Table Section
     919           0 :     sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
     920           0 :     sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;   // element count -> bytes (16-bit code units)
     921           0 :     ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
     922             : 
     923             :     // And, last, swap the header itself.
     924             :     //   int32_t   fMagic             // swap this
     925             :     //   uint8_t   fFormatVersion[4]  // Do not swap this, just copy
     926             :     //   int32_t   fLength and all the rest       // Swap the rest, all is 32 bit stuff.
     927             :     //
     928           0 :     uint32_t magic = ds->readUInt32(spoofDH->fMagic);
     929           0 :     ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
     930             : 
     931           0 :     if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {   // address comparison: skip the copy when swapping in place
     932           0 :         uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
     933             :     }
     934             :     // swap starting at fLength
     935           0 :     ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
     936             : 
     937           0 :     return totalSize;
     938             : }
     939             : 
     940             : #endif
     941             : 
     942             :

Generated by: LCOV version 1.13