Line data Source code
1 : // © 2016 and later: Unicode, Inc. and others.
2 : // License & terms of use: http://www.unicode.org/copyright.html
3 : /*
4 : **********************************************************************
5 : * Copyright (C) 2008-2016, International Business Machines
6 : * Corporation and others. All Rights Reserved.
7 : **********************************************************************
8 : */
9 :
10 : #include "unicode/utypes.h"
11 : #include "unicode/uspoof.h"
12 : #include "unicode/uchar.h"
13 : #include "unicode/uniset.h"
14 : #include "unicode/utf16.h"
15 : #include "utrie2.h"
16 : #include "cmemory.h"
17 : #include "cstring.h"
18 : #include "scriptset.h"
19 : #include "umutex.h"
20 : #include "udataswp.h"
21 : #include "uassert.h"
22 : #include "ucln_in.h"
23 : #include "uspoof_impl.h"
24 :
25 : #if !UCONFIG_NO_NORMALIZATION
26 :
27 :
28 : U_NAMESPACE_BEGIN
29 :
30 0 : UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
31 :
32 0 : SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
33 0 : construct(status);
34 0 : fSpoofData = data;
35 0 : }
36 :
37 0 : SpoofImpl::SpoofImpl(UErrorCode& status) {
38 0 : construct(status);
39 :
40 : // TODO: Call this method where it is actually needed, instead of in the
41 : // constructor, to allow for lazy data loading. See #12696.
42 0 : fSpoofData = SpoofData::getDefault(status);
43 0 : }
44 :
45 0 : SpoofImpl::SpoofImpl() {
46 0 : UErrorCode status = U_ZERO_ERROR;
47 0 : construct(status);
48 :
49 : // TODO: Call this method where it is actually needed, instead of in the
50 : // constructor, to allow for lazy data loading. See #12696.
51 0 : fSpoofData = SpoofData::getDefault(status);
52 0 : }
53 :
54 0 : void SpoofImpl::construct(UErrorCode& status) {
55 0 : fMagic = USPOOF_MAGIC;
56 0 : fChecks = USPOOF_ALL_CHECKS;
57 0 : fSpoofData = NULL;
58 0 : fAllowedCharsSet = NULL;
59 0 : fAllowedLocales = NULL;
60 0 : fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
61 :
62 0 : if (U_FAILURE(status)) { return; }
63 :
64 0 : UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
65 0 : fAllowedCharsSet = allowedCharsSet;
66 0 : fAllowedLocales = uprv_strdup("");
67 0 : if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
68 0 : status = U_MEMORY_ALLOCATION_ERROR;
69 0 : return;
70 : }
71 0 : allowedCharsSet->freeze();
72 : }
73 :
74 :
75 : // Copy Constructor, used by the user level clone() function.
76 0 : SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
77 : fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
78 0 : fAllowedLocales(NULL) {
79 0 : if (U_FAILURE(status)) {
80 0 : return;
81 : }
82 0 : fMagic = src.fMagic;
83 0 : fChecks = src.fChecks;
84 0 : if (src.fSpoofData != NULL) {
85 0 : fSpoofData = src.fSpoofData->addReference();
86 : }
87 0 : fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
88 0 : fAllowedLocales = uprv_strdup(src.fAllowedLocales);
89 0 : if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
90 0 : status = U_MEMORY_ALLOCATION_ERROR;
91 : }
92 0 : fRestrictionLevel = src.fRestrictionLevel;
93 : }
94 :
95 0 : SpoofImpl::~SpoofImpl() {
96 0 : fMagic = 0; // head off application errors by preventing use of
97 : // of deleted objects.
98 0 : if (fSpoofData != NULL) {
99 0 : fSpoofData->removeReference(); // Will delete if refCount goes to zero.
100 : }
101 0 : delete fAllowedCharsSet;
102 0 : uprv_free((void *)fAllowedLocales);
103 0 : }
104 :
105 : // Cast this instance as a USpoofChecker for the C API.
106 0 : USpoofChecker *SpoofImpl::asUSpoofChecker() {
107 0 : return reinterpret_cast<USpoofChecker*>(this);
108 : }
109 :
110 : //
111 : // Incoming parameter check on Status and the SpoofChecker object
112 : // received from the C API.
113 : //
114 0 : const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
115 0 : if (U_FAILURE(status)) {
116 0 : return NULL;
117 : }
118 0 : if (sc == NULL) {
119 0 : status = U_ILLEGAL_ARGUMENT_ERROR;
120 0 : return NULL;
121 : }
122 0 : SpoofImpl *This = (SpoofImpl *)sc;
123 0 : if (This->fMagic != USPOOF_MAGIC) {
124 0 : status = U_INVALID_FORMAT_ERROR;
125 0 : return NULL;
126 : }
127 0 : if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
128 0 : return NULL;
129 : }
130 0 : return This;
131 : }
132 :
133 0 : SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
134 : return const_cast<SpoofImpl *>
135 0 : (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
136 : }
137 :
138 :
139 0 : void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
140 0 : UnicodeSet allowedChars;
141 0 : UnicodeSet *tmpSet = NULL;
142 0 : const char *locStart = localesList;
143 0 : const char *locEnd = NULL;
144 0 : const char *localesListEnd = localesList + uprv_strlen(localesList);
145 0 : int32_t localeListCount = 0; // Number of locales provided by caller.
146 :
147 : // Loop runs once per locale from the localesList, a comma separated list of locales.
148 0 : do {
149 0 : locEnd = uprv_strchr(locStart, ',');
150 0 : if (locEnd == NULL) {
151 0 : locEnd = localesListEnd;
152 : }
153 0 : while (*locStart == ' ') {
154 0 : locStart++;
155 : }
156 0 : const char *trimmedEnd = locEnd-1;
157 0 : while (trimmedEnd > locStart && *trimmedEnd == ' ') {
158 0 : trimmedEnd--;
159 : }
160 0 : if (trimmedEnd <= locStart) {
161 0 : break;
162 : }
163 0 : const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
164 0 : localeListCount++;
165 :
166 : // We have one locale from the locales list.
167 : // Add the script chars for this locale to the accumulating set of allowed chars.
168 : // If the locale is no good, we will be notified back via status.
169 0 : addScriptChars(locale, &allowedChars, status);
170 0 : uprv_free((void *)locale);
171 0 : if (U_FAILURE(status)) {
172 0 : break;
173 : }
174 0 : locStart = locEnd + 1;
175 0 : } while (locStart < localesListEnd);
176 :
177 : // If our caller provided an empty list of locales, we disable the allowed characters checking
178 0 : if (localeListCount == 0) {
179 0 : uprv_free((void *)fAllowedLocales);
180 0 : fAllowedLocales = uprv_strdup("");
181 0 : tmpSet = new UnicodeSet(0, 0x10ffff);
182 0 : if (fAllowedLocales == NULL || tmpSet == NULL) {
183 0 : status = U_MEMORY_ALLOCATION_ERROR;
184 0 : return;
185 : }
186 0 : tmpSet->freeze();
187 0 : delete fAllowedCharsSet;
188 0 : fAllowedCharsSet = tmpSet;
189 0 : fChecks &= ~USPOOF_CHAR_LIMIT;
190 0 : return;
191 : }
192 :
193 :
194 : // Add all common and inherited characters to the set of allowed chars.
195 0 : UnicodeSet tempSet;
196 0 : tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
197 0 : allowedChars.addAll(tempSet);
198 0 : tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
199 0 : allowedChars.addAll(tempSet);
200 :
201 : // If anything went wrong, we bail out without changing
202 : // the state of the spoof checker.
203 0 : if (U_FAILURE(status)) {
204 0 : return;
205 : }
206 :
207 : // Store the updated spoof checker state.
208 0 : tmpSet = static_cast<UnicodeSet *>(allowedChars.clone());
209 0 : const char *tmpLocalesList = uprv_strdup(localesList);
210 0 : if (tmpSet == NULL || tmpLocalesList == NULL) {
211 0 : status = U_MEMORY_ALLOCATION_ERROR;
212 0 : return;
213 : }
214 0 : uprv_free((void *)fAllowedLocales);
215 0 : fAllowedLocales = tmpLocalesList;
216 0 : tmpSet->freeze();
217 0 : delete fAllowedCharsSet;
218 0 : fAllowedCharsSet = tmpSet;
219 0 : fChecks |= USPOOF_CHAR_LIMIT;
220 : }
221 :
222 :
223 0 : const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
224 0 : return fAllowedLocales;
225 : }
226 :
227 :
228 : // Given a locale (a language), add all the characters from all of the scripts used with that language
229 : // to the allowedChars UnicodeSet
230 :
231 0 : void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
232 : UScriptCode scripts[30];
233 :
234 0 : int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
235 0 : if (U_FAILURE(status)) {
236 0 : return;
237 : }
238 0 : if (status == U_USING_DEFAULT_WARNING) {
239 0 : status = U_ILLEGAL_ARGUMENT_ERROR;
240 0 : return;
241 : }
242 0 : UnicodeSet tmpSet;
243 : int32_t i;
244 0 : for (i=0; i<numScripts; i++) {
245 0 : tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
246 0 : allowedChars->addAll(tmpSet);
247 : }
248 : }
249 :
250 : // Computes the augmented script set for a code point, according to UTS 39 section 5.1.
251 0 : void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
252 0 : result.resetAll();
253 0 : result.setScriptExtensions(codePoint, status);
254 0 : if (U_FAILURE(status)) { return; }
255 :
256 : // Section 5.1 step 1
257 0 : if (result.test(USCRIPT_HAN, status)) {
258 0 : result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
259 0 : result.set(USCRIPT_JAPANESE, status);
260 0 : result.set(USCRIPT_KOREAN, status);
261 : }
262 0 : if (result.test(USCRIPT_HIRAGANA, status)) {
263 0 : result.set(USCRIPT_JAPANESE, status);
264 : }
265 0 : if (result.test(USCRIPT_KATAKANA, status)) {
266 0 : result.set(USCRIPT_JAPANESE, status);
267 : }
268 0 : if (result.test(USCRIPT_HANGUL, status)) {
269 0 : result.set(USCRIPT_KOREAN, status);
270 : }
271 0 : if (result.test(USCRIPT_BOPOMOFO, status)) {
272 0 : result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
273 : }
274 :
275 : // Section 5.1 step 2
276 0 : if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
277 0 : result.setAll();
278 : }
279 : }
280 :
281 : // Computes the resolved script set for a string, according to UTS 39 section 5.1.
282 0 : void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
283 0 : getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
284 0 : }
285 :
286 : // Computes the resolved script set for a string, omitting characters having the specified script.
287 : // If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
288 0 : void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
289 0 : result.setAll();
290 :
291 0 : ScriptSet temp;
292 : UChar32 codePoint;
293 0 : for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
294 0 : codePoint = input.char32At(i);
295 :
296 : // Compute the augmented script set for the character
297 0 : getAugmentedScriptSet(codePoint, temp, status);
298 0 : if (U_FAILURE(status)) { return; }
299 :
300 : // Intersect the augmented script set with the resolved script set, but only if the character doesn't
301 : // have the script specified in the function call
302 0 : if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
303 0 : result.intersect(temp);
304 : }
305 : }
306 : }
307 :
308 : // Computes the set of numerics for a string, according to UTS 39 section 5.3.
309 0 : void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
310 0 : result.clear();
311 :
312 : UChar32 codePoint;
313 0 : for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
314 0 : codePoint = input.char32At(i);
315 :
316 : // Store a representative character for each kind of decimal digit
317 0 : if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
318 : // Store the zero character as a representative for comparison.
319 : // Unicode guarantees it is codePoint - value
320 0 : result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
321 : }
322 : }
323 0 : }
324 :
325 : // Computes the restriction level of a string, according to UTS 39 section 5.2.
326 0 : URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
327 : // Section 5.2 step 1:
328 0 : if (!fAllowedCharsSet->containsAll(input)) {
329 0 : return USPOOF_UNRESTRICTIVE;
330 : }
331 :
332 : // Section 5.2 step 2
333 : // Java use a static UnicodeSet for this test. In C++, avoid the static variable
334 : // and just do a simple for loop.
335 0 : UBool allASCII = TRUE;
336 0 : for (int32_t i=0, length=input.length(); i<length; i++) {
337 0 : if (input.charAt(i) > 0x7f) {
338 0 : allASCII = FALSE;
339 0 : break;
340 : }
341 : }
342 0 : if (allASCII) {
343 0 : return USPOOF_ASCII;
344 : }
345 :
346 : // Section 5.2 steps 3:
347 0 : ScriptSet resolvedScriptSet;
348 0 : getResolvedScriptSet(input, resolvedScriptSet, status);
349 0 : if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
350 :
351 : // Section 5.2 step 4:
352 0 : if (!resolvedScriptSet.isEmpty()) {
353 0 : return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
354 : }
355 :
356 : // Section 5.2 step 5:
357 0 : ScriptSet resolvedNoLatn;
358 0 : getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
359 0 : if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
360 :
361 : // Section 5.2 step 6:
362 0 : if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
363 0 : || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
364 0 : || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
365 0 : return USPOOF_HIGHLY_RESTRICTIVE;
366 : }
367 :
368 : // Section 5.2 step 7:
369 0 : if (!resolvedNoLatn.isEmpty()
370 0 : && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
371 0 : && !resolvedNoLatn.test(USCRIPT_GREEK, status)
372 0 : && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
373 0 : return USPOOF_MODERATELY_RESTRICTIVE;
374 : }
375 :
376 : // Section 5.2 step 8:
377 0 : return USPOOF_MINIMALLY_RESTRICTIVE;
378 : }
379 :
380 :
381 :
382 : // Convert a text format hex number. Utility function used by builder code. Static.
383 : // Input: UChar *string text. Output: a UChar32
384 : // Input has been pre-checked, and will have no non-hex chars.
385 : // The number must fall in the code point range of 0..0x10ffff
386 : // Static Function.
387 0 : UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
388 0 : if (U_FAILURE(status)) {
389 0 : return 0;
390 : }
391 0 : U_ASSERT(limit-start > 0);
392 0 : uint32_t val = 0;
393 : int i;
394 0 : for (i=start; i<limit; i++) {
395 0 : int digitVal = s[i] - 0x30;
396 0 : if (digitVal>9) {
397 0 : digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
398 : }
399 0 : if (digitVal>15) {
400 0 : digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
401 : }
402 0 : U_ASSERT(digitVal <= 0xf);
403 0 : val <<= 4;
404 0 : val += digitVal;
405 : }
406 0 : if (val > 0x10ffff) {
407 0 : status = U_PARSE_ERROR;
408 0 : val = 0;
409 : }
410 0 : return (UChar32)val;
411 : }
412 :
413 :
414 : //-----------------------------------------
415 : //
416 : // class CheckResult Implementation
417 : //
418 : //-----------------------------------------
419 :
420 0 : CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) {
421 0 : clear();
422 0 : }
423 :
424 0 : USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
425 0 : return reinterpret_cast<USpoofCheckResult*>(this);
426 : }
427 :
428 : //
429 : // Incoming parameter check on Status and the CheckResult object
430 : // received from the C API.
431 : //
432 0 : const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
433 0 : if (U_FAILURE(status)) { return NULL; }
434 0 : if (ptr == NULL) {
435 0 : status = U_ILLEGAL_ARGUMENT_ERROR;
436 0 : return NULL;
437 : }
438 0 : CheckResult *This = (CheckResult*) ptr;
439 0 : if (This->fMagic != USPOOF_CHECK_MAGIC) {
440 0 : status = U_INVALID_FORMAT_ERROR;
441 0 : return NULL;
442 : }
443 0 : return This;
444 : }
445 :
446 0 : CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
447 : return const_cast<CheckResult *>
448 0 : (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status));
449 : }
450 :
451 0 : void CheckResult::clear() {
452 0 : fChecks = 0;
453 0 : fNumerics.clear();
454 0 : fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
455 0 : }
456 :
457 0 : int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
458 0 : if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
459 0 : return fChecks | fRestrictionLevel;
460 : } else {
461 0 : return fChecks;
462 : }
463 : }
464 :
465 0 : CheckResult::~CheckResult() {
466 0 : }
467 :
468 : //----------------------------------------------------------------------------------------------
469 : //
470 : // class SpoofData Implementation
471 : //
472 : //----------------------------------------------------------------------------------------------
473 :
474 :
475 0 : UBool SpoofData::validateDataVersion(UErrorCode &status) const {
476 0 : if (U_FAILURE(status) ||
477 0 : fRawData == NULL ||
478 0 : fRawData->fMagic != USPOOF_MAGIC ||
479 0 : fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
480 0 : fRawData->fFormatVersion[1] != 0 ||
481 0 : fRawData->fFormatVersion[2] != 0 ||
482 0 : fRawData->fFormatVersion[3] != 0) {
483 0 : status = U_INVALID_FORMAT_ERROR;
484 0 : return FALSE;
485 : }
486 0 : return TRUE;
487 : }
488 :
489 : static UBool U_CALLCONV
490 0 : spoofDataIsAcceptable(void *context,
491 : const char * /* type */, const char * /*name*/,
492 : const UDataInfo *pInfo) {
493 0 : if(
494 0 : pInfo->size >= 20 &&
495 0 : pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
496 0 : pInfo->charsetFamily == U_CHARSET_FAMILY &&
497 0 : pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
498 0 : pInfo->dataFormat[1] == 0x66 &&
499 0 : pInfo->dataFormat[2] == 0x75 &&
500 0 : pInfo->dataFormat[3] == 0x20 &&
501 0 : pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
502 : ) {
503 0 : UVersionInfo *version = static_cast<UVersionInfo *>(context);
504 0 : if(version != NULL) {
505 0 : uprv_memcpy(version, pInfo->dataVersion, 4);
506 : }
507 0 : return TRUE;
508 : } else {
509 0 : return FALSE;
510 : }
511 : }
512 :
513 : // Methods for the loading of the default confusables data file. The confusable
514 : // data is loaded only when it is needed.
515 : //
516 : // SpoofData::getDefault() - Return the default confusables data, and call the
517 : // initOnce() if it is not available. Adds a reference
518 : // to the SpoofData that the caller is responsible for
519 : // decrementing when they are done with the data.
520 : //
521 : // uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
522 : // is shared by all spoof checkers using the default data.
523 : //
524 : // uspoof_cleanupDefaultData - Called during cleanup.
525 : //
526 :
527 : static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
528 : static SpoofData* gDefaultSpoofData;
529 :
530 : static UBool U_CALLCONV
531 0 : uspoof_cleanupDefaultData(void) {
532 0 : if (gDefaultSpoofData) {
533 : // Will delete, assuming all user-level spoof checkers were closed.
534 0 : gDefaultSpoofData->removeReference();
535 0 : gDefaultSpoofData = NULL;
536 0 : gSpoofInitDefaultOnce.reset();
537 : }
538 0 : return TRUE;
539 : }
540 :
541 0 : static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
542 : UDataMemory *udm = udata_openChoice(NULL, "cfu", "confusables",
543 : spoofDataIsAcceptable,
544 : NULL, // context, would receive dataVersion if supplied.
545 0 : &status);
546 0 : if (U_FAILURE(status)) { return; }
547 0 : gDefaultSpoofData = new SpoofData(udm, status);
548 0 : if (U_FAILURE(status)) {
549 0 : delete gDefaultSpoofData;
550 0 : return;
551 : }
552 0 : if (gDefaultSpoofData == NULL) {
553 0 : status = U_MEMORY_ALLOCATION_ERROR;
554 0 : return;
555 : }
556 0 : ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
557 : }
558 :
559 0 : SpoofData* SpoofData::getDefault(UErrorCode& status) {
560 0 : umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
561 0 : if (U_FAILURE(status)) { return NULL; }
562 0 : gDefaultSpoofData->addReference();
563 0 : return gDefaultSpoofData;
564 : }
565 :
566 :
567 :
568 0 : SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
569 : {
570 0 : reset();
571 0 : if (U_FAILURE(status)) {
572 0 : return;
573 : }
574 0 : fUDM = udm;
575 : // fRawData is non-const because it may be constructed by the data builder.
576 0 : fRawData = reinterpret_cast<SpoofDataHeader *>(
577 0 : const_cast<void *>(udata_getMemory(udm)));
578 0 : validateDataVersion(status);
579 0 : initPtrs(status);
580 : }
581 :
582 :
583 0 : SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
584 : {
585 0 : reset();
586 0 : if (U_FAILURE(status)) {
587 0 : return;
588 : }
589 0 : if ((size_t)length < sizeof(SpoofDataHeader)) {
590 0 : status = U_INVALID_FORMAT_ERROR;
591 0 : return;
592 : }
593 0 : void *ncData = const_cast<void *>(data);
594 0 : fRawData = static_cast<SpoofDataHeader *>(ncData);
595 0 : if (length < fRawData->fLength) {
596 0 : status = U_INVALID_FORMAT_ERROR;
597 0 : return;
598 : }
599 0 : validateDataVersion(status);
600 0 : initPtrs(status);
601 : }
602 :
603 :
604 : // Spoof Data constructor for use from data builder.
605 : // Initializes a new, empty data area that will be populated later.
606 0 : SpoofData::SpoofData(UErrorCode &status) {
607 0 : reset();
608 0 : if (U_FAILURE(status)) {
609 0 : return;
610 : }
611 0 : fDataOwned = true;
612 :
613 : // The spoof header should already be sized to be a multiple of 16 bytes.
614 : // Just in case it's not, round it up.
615 0 : uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
616 0 : U_ASSERT(initialSize == sizeof(SpoofDataHeader));
617 :
618 0 : fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
619 0 : fMemLimit = initialSize;
620 0 : if (fRawData == NULL) {
621 0 : status = U_MEMORY_ALLOCATION_ERROR;
622 0 : return;
623 : }
624 0 : uprv_memset(fRawData, 0, initialSize);
625 :
626 0 : fRawData->fMagic = USPOOF_MAGIC;
627 0 : fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
628 0 : fRawData->fFormatVersion[1] = 0;
629 0 : fRawData->fFormatVersion[2] = 0;
630 0 : fRawData->fFormatVersion[3] = 0;
631 0 : initPtrs(status);
632 : }
633 :
634 : // reset() - initialize all fields.
635 : // Should be updated if any new fields are added.
636 : // Called by constructors to put things in a known initial state.
637 0 : void SpoofData::reset() {
638 0 : fRawData = NULL;
639 0 : fDataOwned = FALSE;
640 0 : fUDM = NULL;
641 0 : fMemLimit = 0;
642 0 : fRefCount = 1;
643 0 : fCFUKeys = NULL;
644 0 : fCFUValues = NULL;
645 0 : fCFUStrings = NULL;
646 0 : }
647 :
648 :
649 : // SpoofData::initPtrs()
650 : // Initialize the pointers to the various sections of the raw data.
651 : //
652 : // This function is used both during the Trie building process (multiple
653 : // times, as the individual data sections are added), and
654 : // during the opening of a Spoof Checker from prebuilt data.
655 : //
656 : // The pointers for non-existent data sections (identified by an offset of 0)
657 : // are set to NULL.
658 : //
659 : // Note: During building the data, adding each new data section
660 : // reallocs the raw data area, which likely relocates it, which
661 : // in turn requires reinitializing all of the pointers into it, hence
662 : // multiple calls to this function during building.
663 : //
664 0 : void SpoofData::initPtrs(UErrorCode &status) {
665 0 : fCFUKeys = NULL;
666 0 : fCFUValues = NULL;
667 0 : fCFUStrings = NULL;
668 0 : if (U_FAILURE(status)) {
669 0 : return;
670 : }
671 0 : if (fRawData->fCFUKeys != 0) {
672 0 : fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
673 : }
674 0 : if (fRawData->fCFUStringIndex != 0) {
675 0 : fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
676 : }
677 0 : if (fRawData->fCFUStringTable != 0) {
678 0 : fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable);
679 : }
680 : }
681 :
682 :
683 0 : SpoofData::~SpoofData() {
684 0 : if (fDataOwned) {
685 0 : uprv_free(fRawData);
686 : }
687 0 : fRawData = NULL;
688 0 : if (fUDM != NULL) {
689 0 : udata_close(fUDM);
690 : }
691 0 : fUDM = NULL;
692 0 : }
693 :
694 :
695 0 : void SpoofData::removeReference() {
696 0 : if (umtx_atomic_dec(&fRefCount) == 0) {
697 0 : delete this;
698 : }
699 0 : }
700 :
701 :
702 0 : SpoofData *SpoofData::addReference() {
703 0 : umtx_atomic_inc(&fRefCount);
704 0 : return this;
705 : }
706 :
707 :
708 0 : void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
709 0 : if (U_FAILURE(status)) {
710 0 : return NULL;
711 : }
712 0 : if (!fDataOwned) {
713 0 : U_ASSERT(FALSE);
714 : status = U_INTERNAL_PROGRAM_ERROR;
715 : return NULL;
716 : }
717 :
718 0 : numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
719 0 : uint32_t returnOffset = fMemLimit;
720 0 : fMemLimit += numBytes;
721 0 : fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
722 0 : fRawData->fLength = fMemLimit;
723 0 : uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
724 0 : initPtrs(status);
725 0 : return (char *)fRawData + returnOffset;
726 : }
727 :
728 0 : int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
729 0 : int32_t dataSize = fRawData->fLength;
730 0 : if (capacity < dataSize) {
731 0 : status = U_BUFFER_OVERFLOW_ERROR;
732 0 : return dataSize;
733 : }
734 0 : uprv_memcpy(buf, fRawData, dataSize);
735 0 : return dataSize;
736 : }
737 :
738 0 : int32_t SpoofData::size() const {
739 0 : return fRawData->fLength;
740 : }
741 :
742 : //-------------------------------
743 : //
744 : // Front-end APIs for SpoofData
745 : //
746 : //-------------------------------
747 :
748 0 : int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
749 : // Perform a binary search.
750 : // [lo, hi), i.e lo is inclusive, hi is exclusive.
751 : // The result after the loop will be in lo.
752 0 : int32_t lo = 0;
753 0 : int32_t hi = length();
754 0 : do {
755 0 : int32_t mid = (lo + hi) / 2;
756 0 : if (codePointAt(mid) > inChar) {
757 0 : hi = mid;
758 0 : } else if (codePointAt(mid) < inChar) {
759 0 : lo = mid;
760 : } else {
761 : // Found result. Break early.
762 0 : lo = mid;
763 0 : break;
764 : }
765 0 : } while (hi - lo > 1);
766 :
767 : // Did we find an entry? If not, the char maps to itself.
768 0 : if (codePointAt(lo) != inChar) {
769 0 : dest.append(inChar);
770 0 : return 1;
771 : }
772 :
773 : // Add the element to the string builder and return.
774 0 : return appendValueTo(lo, dest);
775 : }
776 :
777 0 : int32_t SpoofData::length() const {
778 0 : return fRawData->fCFUKeysSize;
779 : }
780 :
781 0 : UChar32 SpoofData::codePointAt(int32_t index) const {
782 0 : return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
783 : }
784 :
785 0 : int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
786 0 : int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
787 :
788 : // Value is either a char (for strings of length 1) or
789 : // an index into the string table (for longer strings)
790 0 : uint16_t value = fCFUValues[index];
791 0 : if (stringLength == 1) {
792 0 : dest.append((UChar)value);
793 : } else {
794 0 : dest.append(fCFUStrings + value, stringLength);
795 : }
796 :
797 0 : return stringLength;
798 : }
799 :
800 :
801 : U_NAMESPACE_END
802 :
803 : U_NAMESPACE_USE
804 :
805 : //-----------------------------------------------------------------------------
806 : //
807 : // uspoof_swap - byte swap and char encoding swap of spoof data
808 : //
809 : //-----------------------------------------------------------------------------
810 : U_CAPI int32_t U_EXPORT2
811 0 : uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
812 : UErrorCode *status) {
813 :
814 0 : if (status == NULL || U_FAILURE(*status)) {
815 0 : return 0;
816 : }
817 0 : if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
818 0 : *status=U_ILLEGAL_ARGUMENT_ERROR;
819 0 : return 0;
820 : }
821 :
822 : //
823 : // Check that the data header is for spoof data.
824 : // (Header contents are defined in gencfu.cpp)
825 : //
826 0 : const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
827 0 : if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
828 0 : pInfo->dataFormat[1]==0x66 &&
829 0 : pInfo->dataFormat[2]==0x75 &&
830 0 : pInfo->dataFormat[3]==0x20 &&
831 0 : pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
832 0 : pInfo->formatVersion[1]==0 &&
833 0 : pInfo->formatVersion[2]==0 &&
834 0 : pInfo->formatVersion[3]==0 )) {
835 0 : udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
836 : "(format version %02x %02x %02x %02x) is not recognized\n",
837 0 : pInfo->dataFormat[0], pInfo->dataFormat[1],
838 0 : pInfo->dataFormat[2], pInfo->dataFormat[3],
839 0 : pInfo->formatVersion[0], pInfo->formatVersion[1],
840 0 : pInfo->formatVersion[2], pInfo->formatVersion[3]);
841 0 : *status=U_UNSUPPORTED_ERROR;
842 0 : return 0;
843 : }
844 :
845 : //
846 : // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
847 : // header). This swap also conveniently gets us
848 : // the size of the ICU d.h., which lets us locate the start
849 : // of the uspoof specific data.
850 : //
851 0 : int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
852 :
853 :
854 : //
855 : // Get the Spoof Data Header, and check that it appears to be OK.
856 : //
857 : //
858 0 : const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
859 0 : SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
860 0 : if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
861 0 : ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
862 : {
863 0 : udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
864 0 : *status=U_UNSUPPORTED_ERROR;
865 0 : return 0;
866 : }
867 :
868 : //
869 : // Prefight operation? Just return the size
870 : //
871 0 : int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
872 0 : int32_t totalSize = headerSize + spoofDataLength;
873 0 : if (length < 0) {
874 0 : return totalSize;
875 : }
876 :
877 : //
878 : // Check that length passed in is consistent with length from Spoof data header.
879 : //
880 0 : if (length < totalSize) {
881 : udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
882 0 : spoofDataLength);
883 0 : *status=U_INDEX_OUTOFBOUNDS_ERROR;
884 0 : return 0;
885 : }
886 :
887 :
888 : //
889 : // Swap the Data. Do the data itself first, then the Spoof Data Header, because
890 : // we need to reference the header to locate the data, and an
891 : // inplace swap of the header leaves it unusable.
892 : //
893 0 : uint8_t *outBytes = (uint8_t *)outData + headerSize;
894 0 : SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
895 :
896 : int32_t sectionStart;
897 : int32_t sectionLength;
898 :
899 : //
900 : // If not swapping in place, zero out the output buffer before starting.
901 : // Gaps may exist between the individual sections, and these must be zeroed in
902 : // the output buffer. The simplest way to do that is to just zero the whole thing.
903 : //
904 0 : if (inBytes != outBytes) {
905 0 : uprv_memset(outBytes, 0, spoofDataLength);
906 : }
907 :
908 : // Confusables Keys Section (fCFUKeys)
909 0 : sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
910 0 : sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
911 0 : ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
912 :
913 : // String Index Section
914 0 : sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
915 0 : sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
916 0 : ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
917 :
918 : // String Table Section
919 0 : sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
920 0 : sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
921 0 : ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
922 :
923 : // And, last, swap the header itself.
924 : // int32_t fMagic // swap this
925 : // uint8_t fFormatVersion[4] // Do not swap this, just copy
926 : // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
927 : //
928 0 : uint32_t magic = ds->readUInt32(spoofDH->fMagic);
929 0 : ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
930 :
931 0 : if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
932 0 : uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
933 : }
934 : // swap starting at fLength
935 0 : ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
936 :
937 0 : return totalSize;
938 : }
939 :
940 : #endif
941 :
942 :
|