1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (C) 2008-2016, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8*/
9
10#include "unicode/utypes.h"
11#include "unicode/uspoof.h"
12#include "unicode/uchar.h"
13#include "unicode/uniset.h"
14#include "unicode/utf16.h"
15#include "utrie2.h"
16#include "cmemory.h"
17#include "cstring.h"
18#include "scriptset.h"
19#include "umutex.h"
20#include "udataswp.h"
21#include "uassert.h"
22#include "ucln_in.h"
23#include "uspoof_impl.h"
24
25#if !UCONFIG_NO_NORMALIZATION
26
27
28U_NAMESPACE_BEGIN
29
30UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
31
32SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
33 construct(status);
34 fSpoofData = data;
35}
36
37SpoofImpl::SpoofImpl(UErrorCode& status) {
38 construct(status);
39
40 // TODO: Call this method where it is actually needed, instead of in the
41 // constructor, to allow for lazy data loading. See #12696.
42 fSpoofData = SpoofData::getDefault(status);
43}
44
45SpoofImpl::SpoofImpl() {
46 UErrorCode status = U_ZERO_ERROR;
47 construct(status);
48
49 // TODO: Call this method where it is actually needed, instead of in the
50 // constructor, to allow for lazy data loading. See #12696.
51 fSpoofData = SpoofData::getDefault(status);
52}
53
54void SpoofImpl::construct(UErrorCode& status) {
55 fChecks = USPOOF_ALL_CHECKS;
56 fSpoofData = nullptr;
57 fAllowedCharsSet = nullptr;
58 fAllowedLocales = nullptr;
59 fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
60
61 if (U_FAILURE(status)) { return; }
62
63 UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
64 fAllowedCharsSet = allowedCharsSet;
65 fAllowedLocales = uprv_strdup("");
66 if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
67 status = U_MEMORY_ALLOCATION_ERROR;
68 return;
69 }
70 allowedCharsSet->freeze();
71}
72
73
74// Copy Constructor, used by the user level clone() function.
75SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
76 fChecks(USPOOF_ALL_CHECKS), fSpoofData(nullptr), fAllowedCharsSet(nullptr) ,
77 fAllowedLocales(nullptr) {
78 if (U_FAILURE(status)) {
79 return;
80 }
81 fChecks = src.fChecks;
82 if (src.fSpoofData != nullptr) {
83 fSpoofData = src.fSpoofData->addReference();
84 }
85 fAllowedCharsSet = src.fAllowedCharsSet->clone();
86 fAllowedLocales = uprv_strdup(src.fAllowedLocales);
87 if (fAllowedCharsSet == nullptr || fAllowedLocales == nullptr) {
88 status = U_MEMORY_ALLOCATION_ERROR;
89 }
90 fRestrictionLevel = src.fRestrictionLevel;
91}
92
93SpoofImpl::~SpoofImpl() {
94 if (fSpoofData != nullptr) {
95 fSpoofData->removeReference(); // Will delete if refCount goes to zero.
96 }
97 delete fAllowedCharsSet;
98 uprv_free((void *)fAllowedLocales);
99}
100
101// Cast this instance as a USpoofChecker for the C API.
102USpoofChecker *SpoofImpl::asUSpoofChecker() {
103 return exportForC();
104}
105
106//
107// Incoming parameter check on Status and the SpoofChecker object
108// received from the C API.
109//
110const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
111 auto* This = validate(sc, status);
112 if (U_FAILURE(status)) {
113 return nullptr;
114 }
115 if (This->fSpoofData != nullptr && !This->fSpoofData->validateDataVersion(status)) {
116 return nullptr;
117 }
118 return This;
119}
120
121SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
122 return const_cast<SpoofImpl *>
123 (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
124}
125
126
127void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
128 UnicodeSet allowedChars;
129 UnicodeSet *tmpSet = nullptr;
130 const char *locStart = localesList;
131 const char *locEnd = nullptr;
132 const char *localesListEnd = localesList + uprv_strlen(localesList);
133 int32_t localeListCount = 0; // Number of locales provided by caller.
134
135 // Loop runs once per locale from the localesList, a comma separated list of locales.
136 do {
137 locEnd = uprv_strchr(locStart, ',');
138 if (locEnd == nullptr) {
139 locEnd = localesListEnd;
140 }
141 while (*locStart == ' ') {
142 locStart++;
143 }
144 const char *trimmedEnd = locEnd-1;
145 while (trimmedEnd > locStart && *trimmedEnd == ' ') {
146 trimmedEnd--;
147 }
148 if (trimmedEnd <= locStart) {
149 break;
150 }
151 const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart));
152 localeListCount++;
153
154 // We have one locale from the locales list.
155 // Add the script chars for this locale to the accumulating set of allowed chars.
156 // If the locale is no good, we will be notified back via status.
157 addScriptChars(locale, &allowedChars, status);
158 uprv_free((void *)locale);
159 if (U_FAILURE(status)) {
160 break;
161 }
162 locStart = locEnd + 1;
163 } while (locStart < localesListEnd);
164
165 // If our caller provided an empty list of locales, we disable the allowed characters checking
166 if (localeListCount == 0) {
167 uprv_free((void *)fAllowedLocales);
168 fAllowedLocales = uprv_strdup("");
169 tmpSet = new UnicodeSet(0, 0x10ffff);
170 if (fAllowedLocales == nullptr || tmpSet == nullptr) {
171 status = U_MEMORY_ALLOCATION_ERROR;
172 return;
173 }
174 tmpSet->freeze();
175 delete fAllowedCharsSet;
176 fAllowedCharsSet = tmpSet;
177 fChecks &= ~USPOOF_CHAR_LIMIT;
178 return;
179 }
180
181
182 // Add all common and inherited characters to the set of allowed chars.
183 UnicodeSet tempSet;
184 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
185 allowedChars.addAll(tempSet);
186 tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
187 allowedChars.addAll(tempSet);
188
189 // If anything went wrong, we bail out without changing
190 // the state of the spoof checker.
191 if (U_FAILURE(status)) {
192 return;
193 }
194
195 // Store the updated spoof checker state.
196 tmpSet = allowedChars.clone();
197 const char *tmpLocalesList = uprv_strdup(localesList);
198 if (tmpSet == nullptr || tmpLocalesList == nullptr) {
199 status = U_MEMORY_ALLOCATION_ERROR;
200 return;
201 }
202 uprv_free((void *)fAllowedLocales);
203 fAllowedLocales = tmpLocalesList;
204 tmpSet->freeze();
205 delete fAllowedCharsSet;
206 fAllowedCharsSet = tmpSet;
207 fChecks |= USPOOF_CHAR_LIMIT;
208}
209
210
211const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) {
212 return fAllowedLocales;
213}
214
215
216// Given a locale (a language), add all the characters from all of the scripts used with that language
217// to the allowedChars UnicodeSet
218
219void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) {
220 UScriptCode scripts[30];
221
222 int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
223 if (U_FAILURE(status)) {
224 return;
225 }
226 if (status == U_USING_DEFAULT_WARNING) {
227 status = U_ILLEGAL_ARGUMENT_ERROR;
228 return;
229 }
230 UnicodeSet tmpSet;
231 int32_t i;
232 for (i=0; i<numScripts; i++) {
233 tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
234 allowedChars->addAll(tmpSet);
235 }
236}
237
238// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
239void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
240 result.resetAll();
241 result.setScriptExtensions(codePoint, status);
242 if (U_FAILURE(status)) { return; }
243
244 // Section 5.1 step 1
245 if (result.test(USCRIPT_HAN, status)) {
246 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
247 result.set(USCRIPT_JAPANESE, status);
248 result.set(USCRIPT_KOREAN, status);
249 }
250 if (result.test(USCRIPT_HIRAGANA, status)) {
251 result.set(USCRIPT_JAPANESE, status);
252 }
253 if (result.test(USCRIPT_KATAKANA, status)) {
254 result.set(USCRIPT_JAPANESE, status);
255 }
256 if (result.test(USCRIPT_HANGUL, status)) {
257 result.set(USCRIPT_KOREAN, status);
258 }
259 if (result.test(USCRIPT_BOPOMOFO, status)) {
260 result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
261 }
262
263 // Section 5.1 step 2
264 if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) {
265 result.setAll();
266 }
267}
268
269// Computes the resolved script set for a string, according to UTS 39 section 5.1.
270void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
271 getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
272}
273
274// Computes the resolved script set for a string, omitting characters having the specified script.
275// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
276void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
277 result.setAll();
278
279 ScriptSet temp;
280 UChar32 codePoint;
281 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
282 codePoint = input.char32At(i);
283
284 // Compute the augmented script set for the character
285 getAugmentedScriptSet(codePoint, temp, status);
286 if (U_FAILURE(status)) { return; }
287
288 // Intersect the augmented script set with the resolved script set, but only if the character doesn't
289 // have the script specified in the function call
290 if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) {
291 result.intersect(temp);
292 }
293 }
294}
295
296// Computes the set of numerics for a string, according to UTS 39 section 5.3.
297void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const {
298 result.clear();
299
300 UChar32 codePoint;
301 for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) {
302 codePoint = input.char32At(i);
303
304 // Store a representative character for each kind of decimal digit
305 if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
306 // Store the zero character as a representative for comparison.
307 // Unicode guarantees it is codePoint - value
308 result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
309 }
310 }
311}
312
313// Computes the restriction level of a string, according to UTS 39 section 5.2.
314URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
315 // Section 5.2 step 1:
316 if (!fAllowedCharsSet->containsAll(input)) {
317 return USPOOF_UNRESTRICTIVE;
318 }
319
320 // Section 5.2 step 2
321 // Java use a static UnicodeSet for this test. In C++, avoid the static variable
322 // and just do a simple for loop.
323 UBool allASCII = true;
324 for (int32_t i=0, length=input.length(); i<length; i++) {
325 if (input.charAt(i) > 0x7f) {
326 allASCII = false;
327 break;
328 }
329 }
330 if (allASCII) {
331 return USPOOF_ASCII;
332 }
333
334 // Section 5.2 steps 3:
335 ScriptSet resolvedScriptSet;
336 getResolvedScriptSet(input, resolvedScriptSet, status);
337 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
338
339 // Section 5.2 step 4:
340 if (!resolvedScriptSet.isEmpty()) {
341 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
342 }
343
344 // Section 5.2 step 5:
345 ScriptSet resolvedNoLatn;
346 getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
347 if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
348
349 // Section 5.2 step 6:
350 if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
351 || resolvedNoLatn.test(USCRIPT_JAPANESE, status)
352 || resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
353 return USPOOF_HIGHLY_RESTRICTIVE;
354 }
355
356 // Section 5.2 step 7:
357 if (!resolvedNoLatn.isEmpty()
358 && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
359 && !resolvedNoLatn.test(USCRIPT_GREEK, status)
360 && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
361 return USPOOF_MODERATELY_RESTRICTIVE;
362 }
363
364 // Section 5.2 step 8:
365 return USPOOF_MINIMALLY_RESTRICTIVE;
366}
367
368int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
369 bool sawLeadCharacter = false;
370 for (int32_t i=0; i<input.length();) {
371 UChar32 cp = input.char32At(i);
372 if (sawLeadCharacter && cp == 0x0307) {
373 return i;
374 }
375 uint8_t combiningClass = u_getCombiningClass(cp);
376 // Skip over characters except for those with combining class 0 (non-combining characters) or with
377 // combining class 230 (same class as U+0307)
378 U_ASSERT(u_getCombiningClass(0x0307) == 230);
379 if (combiningClass == 0 || combiningClass == 230) {
380 sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
381 }
382 i += U16_LENGTH(cp);
383 }
384 return -1;
385}
386
387static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
388 return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' ||
389 u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
390}
391
392bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
393 if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
394 return true;
395 }
396 UnicodeString skelStr;
397 fSpoofData->confusableLookup(cp, skelStr);
398 UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1));
399 if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
400 return true;
401 }
402 return false;
403}
404
405
406
407// Convert a text format hex number. Utility function used by builder code. Static.
408// Input: char16_t *string text. Output: a UChar32
409// Input has been pre-checked, and will have no non-hex chars.
410// The number must fall in the code point range of 0..0x10ffff
411// Static Function.
412UChar32 SpoofImpl::ScanHex(const char16_t *s, int32_t start, int32_t limit, UErrorCode &status) {
413 if (U_FAILURE(status)) {
414 return 0;
415 }
416 U_ASSERT(limit-start > 0);
417 uint32_t val = 0;
418 int i;
419 for (i=start; i<limit; i++) {
420 int digitVal = s[i] - 0x30;
421 if (digitVal>9) {
422 digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A'
423 }
424 if (digitVal>15) {
425 digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a'
426 }
427 U_ASSERT(digitVal <= 0xf);
428 val <<= 4;
429 val += digitVal;
430 }
431 if (val > 0x10ffff) {
432 status = U_PARSE_ERROR;
433 val = 0;
434 }
435 return (UChar32)val;
436}
437
438
439//-----------------------------------------
440//
441// class CheckResult Implementation
442//
443//-----------------------------------------
444
445CheckResult::CheckResult() {
446 clear();
447}
448
449USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
450 return exportForC();
451}
452
453//
454// Incoming parameter check on Status and the CheckResult object
455// received from the C API.
456//
457const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
458 return validate(ptr, status);
459}
460
461CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
462 return validate(ptr, status);
463}
464
465void CheckResult::clear() {
466 fChecks = 0;
467 fNumerics.clear();
468 fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
469}
470
471int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
472 if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
473 return fChecks | fRestrictionLevel;
474 } else {
475 return fChecks;
476 }
477}
478
479CheckResult::~CheckResult() {
480}
481
482//----------------------------------------------------------------------------------------------
483//
484// class SpoofData Implementation
485//
486//----------------------------------------------------------------------------------------------
487
488
489UBool SpoofData::validateDataVersion(UErrorCode &status) const {
490 if (U_FAILURE(status) ||
491 fRawData == nullptr ||
492 fRawData->fMagic != USPOOF_MAGIC ||
493 fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION ||
494 fRawData->fFormatVersion[1] != 0 ||
495 fRawData->fFormatVersion[2] != 0 ||
496 fRawData->fFormatVersion[3] != 0) {
497 status = U_INVALID_FORMAT_ERROR;
498 return false;
499 }
500 return true;
501}
502
503static UBool U_CALLCONV
504spoofDataIsAcceptable(void *context,
505 const char * /* type */, const char * /*name*/,
506 const UDataInfo *pInfo) {
507 if(
508 pInfo->size >= 20 &&
509 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
510 pInfo->charsetFamily == U_CHARSET_FAMILY &&
511 pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu "
512 pInfo->dataFormat[1] == 0x66 &&
513 pInfo->dataFormat[2] == 0x75 &&
514 pInfo->dataFormat[3] == 0x20 &&
515 pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
516 ) {
517 UVersionInfo *version = static_cast<UVersionInfo *>(context);
518 if(version != nullptr) {
519 uprv_memcpy(version, pInfo->dataVersion, 4);
520 }
521 return true;
522 } else {
523 return false;
524 }
525}
526
527// Methods for the loading of the default confusables data file. The confusable
528// data is loaded only when it is needed.
529//
530// SpoofData::getDefault() - Return the default confusables data, and call the
531// initOnce() if it is not available. Adds a reference
532// to the SpoofData that the caller is responsible for
533// decrementing when they are done with the data.
534//
535// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
536// is shared by all spoof checkers using the default data.
537//
538// uspoof_cleanupDefaultData - Called during cleanup.
539//
540
// Init-once guard passed to umtx_initOnce() by SpoofData::getDefault();
// reset by uspoof_cleanupDefaultData().
static UInitOnce gSpoofInitDefaultOnce {};
// The shared default confusables data. Set by uspoof_loadDefaultData() and
// set back to nullptr by uspoof_cleanupDefaultData().
static SpoofData* gDefaultSpoofData;
543
544static UBool U_CALLCONV
545uspoof_cleanupDefaultData() {
546 if (gDefaultSpoofData) {
547 // Will delete, assuming all user-level spoof checkers were closed.
548 gDefaultSpoofData->removeReference();
549 gDefaultSpoofData = nullptr;
550 gSpoofInitDefaultOnce.reset();
551 }
552 return true;
553}
554
555static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
556 UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables",
557 spoofDataIsAcceptable,
558 nullptr, // context, would receive dataVersion if supplied.
559 &status);
560 if (U_FAILURE(status)) { return; }
561 gDefaultSpoofData = new SpoofData(udm, status);
562 if (U_FAILURE(status)) {
563 delete gDefaultSpoofData;
564 gDefaultSpoofData = nullptr;
565 return;
566 }
567 if (gDefaultSpoofData == nullptr) {
568 status = U_MEMORY_ALLOCATION_ERROR;
569 return;
570 }
571 ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
572}
573
574SpoofData* SpoofData::getDefault(UErrorCode& status) {
575 umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
576 if (U_FAILURE(status)) { return nullptr; }
577 gDefaultSpoofData->addReference();
578 return gDefaultSpoofData;
579}
580
581
582
583SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
584{
585 reset();
586 if (U_FAILURE(status)) {
587 return;
588 }
589 fUDM = udm;
590 // fRawData is non-const because it may be constructed by the data builder.
591 fRawData = reinterpret_cast<SpoofDataHeader *>(
592 const_cast<void *>(udata_getMemory(udm)));
593 validateDataVersion(status);
594 initPtrs(status);
595}
596
597
598SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
599{
600 reset();
601 if (U_FAILURE(status)) {
602 return;
603 }
604 if ((size_t)length < sizeof(SpoofDataHeader)) {
605 status = U_INVALID_FORMAT_ERROR;
606 return;
607 }
608 if (data == nullptr) {
609 status = U_ILLEGAL_ARGUMENT_ERROR;
610 return;
611 }
612 void *ncData = const_cast<void *>(data);
613 fRawData = static_cast<SpoofDataHeader *>(ncData);
614 if (length < fRawData->fLength) {
615 status = U_INVALID_FORMAT_ERROR;
616 return;
617 }
618 validateDataVersion(status);
619 initPtrs(status);
620}
621
622
623// Spoof Data constructor for use from data builder.
624// Initializes a new, empty data area that will be populated later.
625SpoofData::SpoofData(UErrorCode &status) {
626 reset();
627 if (U_FAILURE(status)) {
628 return;
629 }
630 fDataOwned = true;
631
632 // The spoof header should already be sized to be a multiple of 16 bytes.
633 // Just in case it's not, round it up.
634 uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
635 U_ASSERT(initialSize == sizeof(SpoofDataHeader));
636
637 fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
638 fMemLimit = initialSize;
639 if (fRawData == nullptr) {
640 status = U_MEMORY_ALLOCATION_ERROR;
641 return;
642 }
643 uprv_memset(fRawData, 0, initialSize);
644
645 fRawData->fMagic = USPOOF_MAGIC;
646 fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
647 fRawData->fFormatVersion[1] = 0;
648 fRawData->fFormatVersion[2] = 0;
649 fRawData->fFormatVersion[3] = 0;
650 initPtrs(status);
651}
652
653// reset() - initialize all fields.
654// Should be updated if any new fields are added.
655// Called by constructors to put things in a known initial state.
656void SpoofData::reset() {
657 fRawData = nullptr;
658 fDataOwned = false;
659 fUDM = nullptr;
660 fMemLimit = 0;
661 fRefCount = 1;
662 fCFUKeys = nullptr;
663 fCFUValues = nullptr;
664 fCFUStrings = nullptr;
665}
666
667
668// SpoofData::initPtrs()
669// Initialize the pointers to the various sections of the raw data.
670//
671// This function is used both during the Trie building process (multiple
672// times, as the individual data sections are added), and
673// during the opening of a Spoof Checker from prebuilt data.
674//
675// The pointers for non-existent data sections (identified by an offset of 0)
676// are set to nullptr.
677//
678// Note: During building the data, adding each new data section
679// reallocs the raw data area, which likely relocates it, which
680// in turn requires reinitializing all of the pointers into it, hence
681// multiple calls to this function during building.
682//
683void SpoofData::initPtrs(UErrorCode &status) {
684 fCFUKeys = nullptr;
685 fCFUValues = nullptr;
686 fCFUStrings = nullptr;
687 if (U_FAILURE(status)) {
688 return;
689 }
690 if (fRawData->fCFUKeys != 0) {
691 fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys);
692 }
693 if (fRawData->fCFUStringIndex != 0) {
694 fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex);
695 }
696 if (fRawData->fCFUStringTable != 0) {
697 fCFUStrings = (char16_t *)((char *)fRawData + fRawData->fCFUStringTable);
698 }
699}
700
701
702SpoofData::~SpoofData() {
703 if (fDataOwned) {
704 uprv_free(fRawData);
705 }
706 fRawData = nullptr;
707 if (fUDM != nullptr) {
708 udata_close(fUDM);
709 }
710 fUDM = nullptr;
711}
712
713
714void SpoofData::removeReference() {
715 if (umtx_atomic_dec(&fRefCount) == 0) {
716 delete this;
717 }
718}
719
720
721SpoofData *SpoofData::addReference() {
722 umtx_atomic_inc(&fRefCount);
723 return this;
724}
725
726
727void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
728 if (U_FAILURE(status)) {
729 return nullptr;
730 }
731 if (!fDataOwned) {
732 UPRV_UNREACHABLE_EXIT;
733 }
734
735 numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16
736 uint32_t returnOffset = fMemLimit;
737 fMemLimit += numBytes;
738 fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
739 fRawData->fLength = fMemLimit;
740 uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
741 initPtrs(status);
742 return (char *)fRawData + returnOffset;
743}
744
745int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const {
746 int32_t dataSize = fRawData->fLength;
747 if (capacity < dataSize) {
748 status = U_BUFFER_OVERFLOW_ERROR;
749 return dataSize;
750 }
751 uprv_memcpy(buf, fRawData, dataSize);
752 return dataSize;
753}
754
755int32_t SpoofData::size() const {
756 return fRawData->fLength;
757}
758
759//-------------------------------
760//
761// Front-end APIs for SpoofData
762//
763//-------------------------------
764
765int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
766 // Perform a binary search.
767 // [lo, hi), i.e lo is inclusive, hi is exclusive.
768 // The result after the loop will be in lo.
769 int32_t lo = 0;
770 int32_t hi = length();
771 do {
772 int32_t mid = (lo + hi) / 2;
773 if (codePointAt(mid) > inChar) {
774 hi = mid;
775 } else if (codePointAt(mid) < inChar) {
776 lo = mid;
777 } else {
778 // Found result. Break early.
779 lo = mid;
780 break;
781 }
782 } while (hi - lo > 1);
783
784 // Did we find an entry? If not, the char maps to itself.
785 if (codePointAt(lo) != inChar) {
786 dest.append(inChar);
787 return 1;
788 }
789
790 // Add the element to the string builder and return.
791 return appendValueTo(lo, dest);
792}
793
794int32_t SpoofData::length() const {
795 return fRawData->fCFUKeysSize;
796}
797
798UChar32 SpoofData::codePointAt(int32_t index) const {
799 return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
800}
801
802int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
803 int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
804
805 // Value is either a char (for strings of length 1) or
806 // an index into the string table (for longer strings)
807 uint16_t value = fCFUValues[index];
808 if (stringLength == 1) {
809 dest.append((char16_t)value);
810 } else {
811 dest.append(fCFUStrings + value, stringLength);
812 }
813
814 return stringLength;
815}
816
817
818U_NAMESPACE_END
819
820U_NAMESPACE_USE
821
822//-----------------------------------------------------------------------------
823//
824// uspoof_swap - byte swap and char encoding swap of spoof data
825//
826//-----------------------------------------------------------------------------
827U_CAPI int32_t U_EXPORT2
828uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
829 UErrorCode *status) {
830
831 if (status == nullptr || U_FAILURE(*status)) {
832 return 0;
833 }
834 if(ds==nullptr || inData==nullptr || length<-1 || (length>0 && outData==nullptr)) {
835 *status=U_ILLEGAL_ARGUMENT_ERROR;
836 return 0;
837 }
838
839 //
840 // Check that the data header is for spoof data.
841 // (Header contents are defined in gencfu.cpp)
842 //
843 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
844 if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */
845 pInfo->dataFormat[1]==0x66 &&
846 pInfo->dataFormat[2]==0x75 &&
847 pInfo->dataFormat[3]==0x20 &&
848 pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
849 pInfo->formatVersion[1]==0 &&
850 pInfo->formatVersion[2]==0 &&
851 pInfo->formatVersion[3]==0 )) {
852 udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
853 "(format version %02x %02x %02x %02x) is not recognized\n",
854 pInfo->dataFormat[0], pInfo->dataFormat[1],
855 pInfo->dataFormat[2], pInfo->dataFormat[3],
856 pInfo->formatVersion[0], pInfo->formatVersion[1],
857 pInfo->formatVersion[2], pInfo->formatVersion[3]);
858 *status=U_UNSUPPORTED_ERROR;
859 return 0;
860 }
861
862 //
863 // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
864 // header). This swap also conveniently gets us
865 // the size of the ICU d.h., which lets us locate the start
866 // of the uspoof specific data.
867 //
868 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
869
870
871 //
872 // Get the Spoof Data Header, and check that it appears to be OK.
873 //
874 //
875 const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
876 SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
877 if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
878 ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
879 {
880 udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
881 *status=U_UNSUPPORTED_ERROR;
882 return 0;
883 }
884
885 //
886 // Prefight operation? Just return the size
887 //
888 int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
889 int32_t totalSize = headerSize + spoofDataLength;
890 if (length < 0) {
891 return totalSize;
892 }
893
894 //
895 // Check that length passed in is consistent with length from Spoof data header.
896 //
897 if (length < totalSize) {
898 udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
899 spoofDataLength);
900 *status=U_INDEX_OUTOFBOUNDS_ERROR;
901 return 0;
902 }
903
904
905 //
906 // Swap the Data. Do the data itself first, then the Spoof Data Header, because
907 // we need to reference the header to locate the data, and an
908 // inplace swap of the header leaves it unusable.
909 //
910 uint8_t *outBytes = (uint8_t *)outData + headerSize;
911 SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;
912
913 int32_t sectionStart;
914 int32_t sectionLength;
915
916 //
917 // If not swapping in place, zero out the output buffer before starting.
918 // Gaps may exist between the individual sections, and these must be zeroed in
919 // the output buffer. The simplest way to do that is to just zero the whole thing.
920 //
921 if (inBytes != outBytes) {
922 uprv_memset(outBytes, 0, spoofDataLength);
923 }
924
925 // Confusables Keys Section (fCFUKeys)
926 sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
927 sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
928 ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
929
930 // String Index Section
931 sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
932 sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
933 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
934
935 // String Table Section
936 sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
937 sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
938 ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
939
940 // And, last, swap the header itself.
941 // int32_t fMagic // swap this
942 // uint8_t fFormatVersion[4] // Do not swap this, just copy
943 // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
944 //
945 uint32_t magic = ds->readUInt32(spoofDH->fMagic);
946 ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
947
948 if (inBytes != outBytes) {
949 uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
950 }
951 // swap starting at fLength
952 ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status);
953
954 return totalSize;
955}
956
957#endif
958
959
960