uspoof_impl.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/uspoof_impl.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2008-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11	#include "unicode/uspoof.h"
12	#include "unicode/uchar.h"
13	#include "unicode/uniset.h"
14	#include "unicode/utf16.h"
15	#include "utrie2.h"
16	#include "cmemory.h"
17	#include "cstring.h"
18	#include "scriptset.h"
19	#include "umutex.h"
20	#include "udataswp.h"
21	#include "uassert.h"
22	#include "ucln_in.h"
23	#include "uspoof_impl.h"
24
25	#if !UCONFIG_NO_NORMALIZATION
26
27
28	U_NAMESPACE_BEGIN
29
30	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
31
32	SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) {
33	construct(status);
34	fSpoofData = data;
35	}
36
37	SpoofImpl::SpoofImpl(UErrorCode& status) {
38	construct(status);
39
40	// TODO: Call this method where it is actually needed, instead of in the
41	// constructor, to allow for lazy data loading. See #12696.
42	fSpoofData = SpoofData::getDefault(status);
43	}
44
45	SpoofImpl::SpoofImpl() {
46	UErrorCode status = U_ZERO_ERROR;
47	construct(status);
48
49	// TODO: Call this method where it is actually needed, instead of in the
50	// constructor, to allow for lazy data loading. See #12696.
51	fSpoofData = SpoofData::getDefault(status);
52	}
53
54	void SpoofImpl::construct(UErrorCode& status) {
55	fChecks = USPOOF_ALL_CHECKS;
56	fSpoofData = NULL;
57	fAllowedCharsSet = NULL;
58	fAllowedLocales = NULL;
59	fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE;
60
61	if (U_FAILURE(status)) { return; }
62
63	UnicodeSet allowedCharsSet = new* UnicodeSet (`0`, `0x10ffff`);
64	fAllowedCharsSet = allowedCharsSet;
65	fAllowedLocales = uprv_strdup("");
66	if (fAllowedCharsSet == NULL \|\| fAllowedLocales == NULL) {
67	status = U_MEMORY_ALLOCATION_ERROR;
68	return;
69	}
70	allowedCharsSet->freeze();
71	}
72
73
74	// Copy Constructor, used by the user level clone() function.
75	SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
76	fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) ,
77	fAllowedLocales(NULL) {
78	if (U_FAILURE(status)) {
79	return;
80	}
81	fChecks = src.fChecks;
82	if (src.fSpoofData != NULL) {
83	fSpoofData = src.fSpoofData->addReference();
84	}
85	fAllowedCharsSet = src.fAllowedCharsSet->clone();
86	fAllowedLocales = uprv_strdup(src.fAllowedLocales);
87	if (fAllowedCharsSet == NULL \|\| fAllowedLocales == NULL) {
88	status = U_MEMORY_ALLOCATION_ERROR;
89	}
90	fRestrictionLevel = src.fRestrictionLevel;
91	}
92
93	SpoofImpl::~SpoofImpl() {
94	if (fSpoofData != NULL) {
95	fSpoofData->removeReference(); // Will delete if refCount goes to zero.
96	}
97	delete fAllowedCharsSet;
98	uprv_free((void *)fAllowedLocales);
99	}
100
101	// Cast this instance as a USpoofChecker for the C API.
102	USpoofChecker *SpoofImpl::asUSpoofChecker() {
103	return exportForC();
104	}
105
106	//
107	// Incoming parameter check on Status and the SpoofChecker object
108	// received from the C API.
109	//
110	const SpoofImpl SpoofImpl::validateThis(const* USpoofChecker *sc, UErrorCode &status) {
111	auto* This = validate(sc, status);
112	if (U_FAILURE(status)) {
113	return NULL;
114	}
115	if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) {
116	return NULL;
117	}
118	return This;
119	}
120
121	SpoofImpl SpoofImpl::validateThis(USpoofChecker sc, UErrorCode &status) {
122	return const_cast<SpoofImpl *>
123	(SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status));
124	}
125
126
127	void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) {
128	UnicodeSet allowedChars;
129	UnicodeSet *tmpSet = NULL;
130	const char *locStart = localesList;
131	const char *locEnd = NULL;
132	const char *localesListEnd = localesList + uprv_strlen(localesList);
133	int32_t localeListCount = `0`; // Number of locales provided by caller.
134
135	// Loop runs once per locale from the localesList, a comma separated list of locales.
136	do {
137	locEnd = uprv_strchr(locStart, `','`);
138	if (locEnd == NULL) {
139	locEnd = localesListEnd;
140	}
141	while (*locStart == `' '`) {
142	locStart++;
143	}
144	const char *trimmedEnd = locEnd-`1`;
145	while (trimmedEnd > locStart && *trimmedEnd == `' '`) {
146	trimmedEnd--;
147	}
148	if (trimmedEnd <= locStart) {
149	break;
150	}
151	const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + `1` - locStart));
152	localeListCount++;
153
154	// We have one locale from the locales list.
155	// Add the script chars for this locale to the accumulating set of allowed chars.
156	// If the locale is no good, we will be notified back via status.
157	addScriptChars(locale, &allowedChars, status);
158	uprv_free((void *)locale);
159	if (U_FAILURE(status)) {
160	break;
161	}
162	locStart = locEnd + `1`;
163	} while (locStart < localesListEnd);
164
165	// If our caller provided an empty list of locales, we disable the allowed characters checking
166	if (localeListCount == `0`) {
167	uprv_free((void *)fAllowedLocales);
168	fAllowedLocales = uprv_strdup("");
169	tmpSet = new UnicodeSet (`0`, `0x10ffff`);
170	if (fAllowedLocales == NULL \|\| tmpSet == NULL) {
171	status = U_MEMORY_ALLOCATION_ERROR;
172	return;
173	}
174	tmpSet->freeze();
175	delete fAllowedCharsSet;
176	fAllowedCharsSet = tmpSet;
177	fChecks &= ~USPOOF_CHAR_LIMIT;
178	return;
179	}
180
181
182	// Add all common and inherited characters to the set of allowed chars.
183	UnicodeSet tempSet;
184	tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
185	allowedChars.addAll(tempSet);
186	tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
187	allowedChars.addAll(tempSet);
188
189	// If anything went wrong, we bail out without changing
190	// the state of the spoof checker.
191	if (U_FAILURE(status)) {
192	return;
193	}
194
195	// Store the updated spoof checker state.
196	tmpSet = allowedChars.clone();
197	const char *tmpLocalesList = uprv_strdup(localesList);
198	if (tmpSet == NULL \|\| tmpLocalesList == NULL) {
199	status = U_MEMORY_ALLOCATION_ERROR;
200	return;
201	}
202	uprv_free((void *)fAllowedLocales);
203	fAllowedLocales = tmpLocalesList;
204	tmpSet->freeze();
205	delete fAllowedCharsSet;
206	fAllowedCharsSet = tmpSet;
207	fChecks \|= USPOOF_CHAR_LIMIT;
208	}
209
210
211	const char * SpoofImpl::getAllowedLocales(UErrorCode &/status/) {
212	return fAllowedLocales;
213	}
214
215
216	// Given a locale (a language), add all the characters from all of the scripts used with that language
217	// to the allowedChars UnicodeSet
218
219	void SpoofImpl::addScriptChars(const char locale, UnicodeSet allowedChars, UErrorCode &status) {
220	UScriptCode scripts[`30`];
221
222	int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status);
223	if (U_FAILURE(status)) {
224	return;
225	}
226	if (status == U_USING_DEFAULT_WARNING) {
227	status = U_ILLEGAL_ARGUMENT_ERROR;
228	return;
229	}
230	UnicodeSet tmpSet;
231	int32_t i;
232	for (i=`0`; i<numScripts; i++) {
233	tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status);
234	allowedChars->addAll(tmpSet);
235	}
236	}
237
238	// Computes the augmented script set for a code point, according to UTS 39 section 5.1.
239	void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) {
240	result.resetAll();
241	result.setScriptExtensions(codePoint, status);
242	if (U_FAILURE(status)) { return; }
243
244	// Section 5.1 step 1
245	if (result.test(USCRIPT_HAN, status)) {
246	result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
247	result.set(USCRIPT_JAPANESE, status);
248	result.set(USCRIPT_KOREAN, status);
249	}
250	if (result.test(USCRIPT_HIRAGANA, status)) {
251	result.set(USCRIPT_JAPANESE, status);
252	}
253	if (result.test(USCRIPT_KATAKANA, status)) {
254	result.set(USCRIPT_JAPANESE, status);
255	}
256	if (result.test(USCRIPT_HANGUL, status)) {
257	result.set(USCRIPT_KOREAN, status);
258	}
259	if (result.test(USCRIPT_BOPOMOFO, status)) {
260	result.set(USCRIPT_HAN_WITH_BOPOMOFO, status);
261	}
262
263	// Section 5.1 step 2
264	if (result.test(USCRIPT_COMMON, status) \|\| result.test(USCRIPT_INHERITED, status)) {
265	result.setAll();
266	}
267	}
268
269	// Computes the resolved script set for a string, according to UTS 39 section 5.1.
270	void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const {
271	getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status);
272	}
273
274	// Computes the resolved script set for a string, omitting characters having the specified script.
275	// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included.
276	void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const {
277	result.setAll();
278
279	ScriptSet temp;
280	UChar32 codePoint;
281	for (int32_t i = `0`; i < input.length(); i += U16_LENGTH(codePoint)) {
282	codePoint = input.char32At(i);
283
284	// Compute the augmented script set for the character
285	getAugmentedScriptSet(codePoint, temp, status);
286	if (U_FAILURE(status)) { return; }
287
288	// Intersect the augmented script set with the resolved script set, but only if the character doesn't
289	// have the script specified in the function call
290	if (script == USCRIPT_CODE_LIMIT \|\| !temp.test(script, status)) {
291	result.intersect(temp);
292	}
293	}
294	}
295
296	// Computes the set of numerics for a string, according to UTS 39 section 5.3.
297	void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /status/) const {
298	result.clear();
299
300	UChar32 codePoint;
301	for (int32_t i = `0`; i < input.length(); i += U16_LENGTH(codePoint)) {
302	codePoint = input.char32At(i);
303
304	// Store a representative character for each kind of decimal digit
305	if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) {
306	// Store the zero character as a representative for comparison.
307	// Unicode guarantees it is codePoint - value
308	result.add(codePoint - (UChar32)u_getNumericValue(codePoint));
309	}
310	}
311	}
312
313	// Computes the restriction level of a string, according to UTS 39 section 5.2.
314	URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const {
315	// Section 5.2 step 1:
316	if (!fAllowedCharsSet->containsAll(input)) {
317	return USPOOF_UNRESTRICTIVE;
318	}
319
320	// Section 5.2 step 2
321	// Java use a static UnicodeSet for this test. In C++, avoid the static variable
322	// and just do a simple for loop.
323	UBool allASCII = TRUE;
324	for (int32_t i=`0`, length=input.length(); i<length; i++) {
325	if (input.charAt(i) > `0x7f`) {
326	allASCII = FALSE;
327	break;
328	}
329	}
330	if (allASCII) {
331	return USPOOF_ASCII;
332	}
333
334	// Section 5.2 steps 3:
335	ScriptSet resolvedScriptSet;
336	getResolvedScriptSet(input, resolvedScriptSet, status);
337	if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
338
339	// Section 5.2 step 4:
340	if (!resolvedScriptSet.isEmpty()) {
341	return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
342	}
343
344	// Section 5.2 step 5:
345	ScriptSet resolvedNoLatn;
346	getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status);
347	if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; }
348
349	// Section 5.2 step 6:
350	if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status)
351	\|\| resolvedNoLatn.test(USCRIPT_JAPANESE, status)
352	\|\| resolvedNoLatn.test(USCRIPT_KOREAN, status)) {
353	return USPOOF_HIGHLY_RESTRICTIVE;
354	}
355
356	// Section 5.2 step 7:
357	if (!resolvedNoLatn.isEmpty()
358	&& !resolvedNoLatn.test(USCRIPT_CYRILLIC, status)
359	&& !resolvedNoLatn.test(USCRIPT_GREEK, status)
360	&& !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) {
361	return USPOOF_MODERATELY_RESTRICTIVE;
362	}
363
364	// Section 5.2 step 8:
365	return USPOOF_MINIMALLY_RESTRICTIVE;
366	}
367
368	int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const {
369	bool sawLeadCharacter = false;
370	for (int32_t i=`0`; i<input.length();) {
371	UChar32 cp = input.char32At(i);
372	if (sawLeadCharacter && cp == `0x0307`) {
373	return i;
374	}
375	uint8_t combiningClass = u_getCombiningClass(cp);
376	// Skip over characters except for those with combining class 0 (non-combining characters) or with
377	// combining class 230 (same class as U+0307)
378	U_ASSERT(u_getCombiningClass(`0x0307`) == `230`);
379	if (combiningClass == `0` \|\| combiningClass == `230`) {
380	sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp);
381	}
382	i += U16_LENGTH(cp);
383	}
384	return -`1`;
385	}
386
387	static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) {
388	return cp == u`'i'` \|\| cp == u`'j'` \|\| cp == u`'ı'` \|\| cp == u`'ȷ'` \|\| cp == u`'l'` \|\|
389	u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED);
390	}
391
392	bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const {
393	if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
394	return true;
395	}
396	UnicodeString skelStr;
397	fSpoofData->confusableLookup(cp, skelStr);
398	UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -`1`));
399	if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
400	return true;
401	}
402	return false;
403	}
404
405
406
407	// Convert a text format hex number. Utility function used by builder code. Static.
408	// Input: UChar string text. Output: a UChar32*
409	// Input has been pre-checked, and will have no non-hex chars.
410	// The number must fall in the code point range of 0..0x10ffff
411	// Static Function.
412	UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
413	if (U_FAILURE(status)) {
414	return `0`;
415	}
416	U_ASSERT(limit-start > `0`);
417	uint32_t val = `0`;
418	int i;
419	for (i=start; i<limit; i++) {
420	int digitVal = s[i] - `0x30`;
421	if (digitVal>`9`) {
422	digitVal = `0xa` + (s[i] - `0x41`); // Upper Case 'A'
423	}
424	if (digitVal>`15`) {
425	digitVal = `0xa` + (s[i] - `0x61`); // Lower Case 'a'
426	}
427	U_ASSERT(digitVal <= `0xf`);
428	val <<= `4`;
429	val += digitVal;
430	}
431	if (val > `0x10ffff`) {
432	status = U_PARSE_ERROR;
433	val = `0`;
434	}
435	return (UChar32)val;
436	}
437
438
439	//-----------------------------------------
440	//
441	// class CheckResult Implementation
442	//
443	//-----------------------------------------
444
445	CheckResult::CheckResult() {
446	clear();
447	}
448
449	USpoofCheckResult* CheckResult::asUSpoofCheckResult() {
450	return exportForC();
451	}
452
453	//
454	// Incoming parameter check on Status and the CheckResult object
455	// received from the C API.
456	//
457	const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) {
458	return validate(ptr, status);
459	}
460
461	CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) {
462	return validate(ptr, status);
463	}
464
465	void CheckResult::clear() {
466	fChecks = `0`;
467	fNumerics.clear();
468	fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE;
469	}
470
471	int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) {
472	if ((enabledChecks & USPOOF_AUX_INFO) != `0` && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) {
473	return fChecks \| fRestrictionLevel;
474	} else {
475	return fChecks;
476	}
477	}
478
479	CheckResult::~CheckResult() {
480	}
481
482	//----------------------------------------------------------------------------------------------
483	//
484	// class SpoofData Implementation
485	//
486	//----------------------------------------------------------------------------------------------
487
488
489	UBool SpoofData::validateDataVersion(UErrorCode &status) const {
490	if (U_FAILURE(status) \|\|
491	fRawData == NULL \|\|
492	fRawData->fMagic != USPOOF_MAGIC \|\|
493	fRawData->fFormatVersion[`0`] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION \|\|
494	fRawData->fFormatVersion[`1`] != `0` \|\|
495	fRawData->fFormatVersion[`2`] != `0` \|\|
496	fRawData->fFormatVersion[`3`] != `0`) {
497	status = U_INVALID_FORMAT_ERROR;
498	return FALSE;
499	}
500	return TRUE;
501	}
502
503	static UBool U_CALLCONV
504	spoofDataIsAcceptable(void *context,
505	const char * / type /, const char * /name/,
506	const UDataInfo *pInfo) {
507	if(
508	pInfo->size >= `20` &&
509	pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
510	pInfo->charsetFamily == U_CHARSET_FAMILY &&
511	pInfo->dataFormat[`0`] == `0x43` && // dataFormat="Cfu "
512	pInfo->dataFormat[`1`] == `0x66` &&
513	pInfo->dataFormat[`2`] == `0x75` &&
514	pInfo->dataFormat[`3`] == `0x20` &&
515	pInfo->formatVersion[`0`] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION
516	) {
517	UVersionInfo version = static_cast<UVersionInfo >(context);
518	if(version != NULL) {
519	uprv_memcpy(version, pInfo->dataVersion, `4`);
520	}
521	return TRUE;
522	} else {
523	return FALSE;
524	}
525	}
526
527	// Methods for the loading of the default confusables data file. The confusable
528	// data is loaded only when it is needed.
529	//
530	// SpoofData::getDefault() - Return the default confusables data, and call the
531	// initOnce() if it is not available. Adds a reference
532	// to the SpoofData that the caller is responsible for
533	// decrementing when they are done with the data.
534	//
535	// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData
536	// is shared by all spoof checkers using the default data.
537	//
538	// uspoof_cleanupDefaultData - Called during cleanup.
539	//
540
541	static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER;
542	static SpoofData* gDefaultSpoofData;
543
544	static UBool U_CALLCONV
545	uspoof_cleanupDefaultData(void) {
546	if (gDefaultSpoofData) {
547	// Will delete, assuming all user-level spoof checkers were closed.
548	gDefaultSpoofData->removeReference();
549	gDefaultSpoofData = nullptr;
550	gSpoofInitDefaultOnce.reset();
551	}
552	return TRUE;
553	}
554
555	static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) {
556	UDataMemory udm = udata_openChoice(nullptr*, "cfu", "confusables",
557	spoofDataIsAcceptable,
558	nullptr, // context, would receive dataVersion if supplied.
559	&status);
560	if (U_FAILURE(status)) { return; }
561	gDefaultSpoofData = new SpoofData (udm, status);
562	if (U_FAILURE(status)) {
563	delete gDefaultSpoofData;
564	gDefaultSpoofData = nullptr;
565	return;
566	}
567	if (gDefaultSpoofData == nullptr) {
568	status = U_MEMORY_ALLOCATION_ERROR;
569	return;
570	}
571	ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData);
572	}
573
574	SpoofData* SpoofData::getDefault(UErrorCode& status) {
575	umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status);
576	if (U_FAILURE(status)) { return NULL; }
577	gDefaultSpoofData->addReference();
578	return gDefaultSpoofData;
579	}
580
581
582
583	SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
584	{
585	reset();
586	if (U_FAILURE(status)) {
587	return;
588	}
589	fUDM = udm;
590	// fRawData is non-const because it may be constructed by the data builder.
591	fRawData = reinterpret_cast<SpoofDataHeader *>(
592	const_cast<void *>(udata_getMemory(udm)));
593	validateDataVersion(status);
594	initPtrs(status);
595	}
596
597
598	SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
599	{
600	reset();
601	if (U_FAILURE(status)) {
602	return;
603	}
604	if ((size_t)length < sizeof(SpoofDataHeader)) {
605	status = U_INVALID_FORMAT_ERROR;
606	return;
607	}
608	if (data == NULL) {
609	status = U_ILLEGAL_ARGUMENT_ERROR;
610	return;
611	}
612	void ncData = const_cast<void* *>(data);
613	fRawData = static_cast<SpoofDataHeader *>(ncData);
614	if (length < fRawData->fLength) {
615	status = U_INVALID_FORMAT_ERROR;
616	return;
617	}
618	validateDataVersion(status);
619	initPtrs(status);
620	}
621
622
623	// Spoof Data constructor for use from data builder.
624	// Initializes a new, empty data area that will be populated later.
625	SpoofData::SpoofData(UErrorCode &status) {
626	reset();
627	if (U_FAILURE(status)) {
628	return;
629	}
630	fDataOwned = true;
631
632	// The spoof header should already be sized to be a multiple of 16 bytes.
633	// Just in case it's not, round it up.
634	uint32_t initialSize = (sizeof(SpoofDataHeader) + `15`) & ~`15`;
635	U_ASSERT(initialSize == sizeof(SpoofDataHeader));
636
637	fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
638	fMemLimit = initialSize;
639	if (fRawData == NULL) {
640	status = U_MEMORY_ALLOCATION_ERROR;
641	return;
642	}
643	uprv_memset(fRawData, `0`, initialSize);
644
645	fRawData->fMagic = USPOOF_MAGIC;
646	fRawData->fFormatVersion[`0`] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION;
647	fRawData->fFormatVersion[`1`] = `0`;
648	fRawData->fFormatVersion[`2`] = `0`;
649	fRawData->fFormatVersion[`3`] = `0`;
650	initPtrs(status);
651	}
652
653	// reset() - initialize all fields.
654	// Should be updated if any new fields are added.
655	// Called by constructors to put things in a known initial state.
656	void SpoofData::reset() {
657	fRawData = NULL;
658	fDataOwned = FALSE;
659	fUDM = NULL;
660	fMemLimit = `0`;
661	fRefCount = `1`;
662	fCFUKeys = NULL;
663	fCFUValues = NULL;
664	fCFUStrings = NULL;
665	}
666
667
668	// SpoofData::initPtrs()
669	// Initialize the pointers to the various sections of the raw data.
670	//
671	// This function is used both during the Trie building process (multiple
672	// times, as the individual data sections are added), and
673	// during the opening of a Spoof Checker from prebuilt data.
674	//
675	// The pointers for non-existent data sections (identified by an offset of 0)
676	// are set to NULL.
677	//
678	// Note: During building the data, adding each new data section
679	// reallocs the raw data area, which likely relocates it, which
680	// in turn requires reinitializing all of the pointers into it, hence
681	// multiple calls to this function during building.
682	//
683	void SpoofData::initPtrs(UErrorCode &status) {
684	fCFUKeys = NULL;
685	fCFUValues = NULL;
686	fCFUStrings = NULL;
687	if (U_FAILURE(status)) {
688	return;
689	}
690	if (fRawData->fCFUKeys != `0`) {
691	fCFUKeys = (int32_t )((char* *)fRawData + fRawData->fCFUKeys);
692	}
693	if (fRawData->fCFUStringIndex != `0`) {
694	fCFUValues = (uint16_t )((char* *)fRawData + fRawData->fCFUStringIndex);
695	}
696	if (fRawData->fCFUStringTable != `0`) {
697	fCFUStrings = (UChar )((char* *)fRawData + fRawData->fCFUStringTable);
698	}
699	}
700
701
702	SpoofData::~SpoofData() {
703	if (fDataOwned) {
704	uprv_free(fRawData);
705	}
706	fRawData = NULL;
707	if (fUDM != NULL) {
708	udata_close(fUDM);
709	}
710	fUDM = NULL;
711	}
712
713
714	void SpoofData::removeReference() {
715	if (umtx_atomic_dec(&fRefCount) == `0`) {
716	delete this;
717	}
718	}
719
720
721	SpoofData *SpoofData::addReference() {
722	umtx_atomic_inc(&fRefCount);
723	return this;
724	}
725
726
727	void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
728	if (U_FAILURE(status)) {
729	return NULL;
730	}
731	if (!fDataOwned) {
732	UPRV_UNREACHABLE;
733	}
734
735	numBytes = (numBytes + `15`) & ~`15`; // Round up to a multiple of 16
736	uint32_t returnOffset = fMemLimit;
737	fMemLimit += numBytes;
738	fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit));
739	fRawData->fLength = fMemLimit;
740	uprv_memset((char *)fRawData + returnOffset, `0`, numBytes);
741	initPtrs(status);
742	return (char *)fRawData + returnOffset;
743	}
744
745	int32_t SpoofData::serialize(void buf, int32_t capacity, UErrorCode &status) const* {
746	int32_t dataSize = fRawData->fLength;
747	if (capacity < dataSize) {
748	status = U_BUFFER_OVERFLOW_ERROR;
749	return dataSize;
750	}
751	uprv_memcpy(buf, fRawData, dataSize);
752	return dataSize;
753	}
754
755	int32_t SpoofData::size() const {
756	return fRawData->fLength;
757	}
758
759	//-------------------------------
760	//
761	// Front-end APIs for SpoofData
762	//
763	//-------------------------------
764
765	int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const {
766	// Perform a binary search.
767	// [lo, hi), i.e lo is inclusive, hi is exclusive.
768	// The result after the loop will be in lo.
769	int32_t lo = `0`;
770	int32_t hi = length();
771	do {
772	int32_t mid = (lo + hi) / `2`;
773	if (codePointAt(mid) > inChar) {
774	hi = mid;
775	} else if (codePointAt(mid) < inChar) {
776	lo = mid;
777	} else {
778	// Found result. Break early.
779	lo = mid;
780	break;
781	}
782	} while (hi - lo > `1`);
783
784	// Did we find an entry? If not, the char maps to itself.
785	if (codePointAt(lo) != inChar) {
786	dest.append(inChar);
787	return `1`;
788	}
789
790	// Add the element to the string builder and return.
791	return appendValueTo(lo, dest);
792	}
793
794	int32_t SpoofData::length() const {
795	return fRawData->fCFUKeysSize;
796	}
797
798	UChar32 SpoofData::codePointAt(int32_t index) const {
799	return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]);
800	}
801
802	int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const {
803	int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]);
804
805	// Value is either a char (for strings of length 1) or
806	// an index into the string table (for longer strings)
807	uint16_t value = fCFUValues[index];
808	if (stringLength == `1`) {
809	dest.append((UChar)value);
810	} else {
811	dest.append(fCFUStrings + value, stringLength);
812	}
813
814	return stringLength;
815	}
816
817
818	U_NAMESPACE_END
819
820	U_NAMESPACE_USE
821
822	//-----------------------------------------------------------------------------
823	//
824	// uspoof_swap - byte swap and char encoding swap of spoof data
825	//
826	//-----------------------------------------------------------------------------
827	U_CAPI int32_t U_EXPORT2
828	uspoof_swap(const UDataSwapper ds, const* void inData, int32_t length, void* *outData,
829	UErrorCode *status) {
830
831	if (status == NULL \|\| U_FAILURE(*status)) {
832	return `0`;
833	}
834	if(ds==NULL \|\| inData==NULL \|\| length<-`1` \|\| (length>`0` && outData==NULL)) {
835	*status=U_ILLEGAL_ARGUMENT_ERROR;
836	return `0`;
837	}
838
839	//
840	// Check that the data header is for spoof data.
841	// (Header contents are defined in gencfu.cpp)
842	//
843	const UDataInfo pInfo = (const* UDataInfo )((const* char *)inData+`4`);
844	if(!( pInfo->dataFormat[`0`]==`0x43` && / dataFormat="Cfu " /
845	pInfo->dataFormat[`1`]==`0x66` &&
846	pInfo->dataFormat[`2`]==`0x75` &&
847	pInfo->dataFormat[`3`]==`0x20` &&
848	pInfo->formatVersion[`0`]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION &&
849	pInfo->formatVersion[`1`]==`0` &&
850	pInfo->formatVersion[`2`]==`0` &&
851	pInfo->formatVersion[`3`]==`0` )) {
852	udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
853	"(format version %02x %02x %02x %02x) is not recognized\n",
854	pInfo->dataFormat[`0`], pInfo->dataFormat[`1`],
855	pInfo->dataFormat[`2`], pInfo->dataFormat[`3`],
856	pInfo->formatVersion[`0`], pInfo->formatVersion[`1`],
857	pInfo->formatVersion[`2`], pInfo->formatVersion[`3`]);
858	*status=U_UNSUPPORTED_ERROR;
859	return `0`;
860	}
861
862	//
863	// Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific
864	// header). This swap also conveniently gets us
865	// the size of the ICU d.h., which lets us locate the start
866	// of the uspoof specific data.
867	//
868	int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
869
870
871	//
872	// Get the Spoof Data Header, and check that it appears to be OK.
873	//
874	//
875	const uint8_t inBytes =(const* uint8_t *)inData+headerSize;
876	SpoofDataHeader spoofDH = (SpoofDataHeader )inBytes;
877	if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC \|\|
878	ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
879	{
880	udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
881	*status=U_UNSUPPORTED_ERROR;
882	return `0`;
883	}
884
885	//
886	// Prefight operation? Just return the size
887	//
888	int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
889	int32_t totalSize = headerSize + spoofDataLength;
890	if (length < `0`) {
891	return totalSize;
892	}
893
894	//
895	// Check that length passed in is consistent with length from Spoof data header.
896	//
897	if (length < totalSize) {
898	udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
899	spoofDataLength);
900	*status=U_INDEX_OUTOFBOUNDS_ERROR;
901	return `0`;
902	}
903
904
905	//
906	// Swap the Data. Do the data itself first, then the Spoof Data Header, because
907	// we need to reference the header to locate the data, and an
908	// inplace swap of the header leaves it unusable.
909	//
910	uint8_t outBytes = (uint8_t )outData + headerSize;
911	SpoofDataHeader outputDH = (SpoofDataHeader )outBytes;
912
913	int32_t sectionStart;
914	int32_t sectionLength;
915
916	//
917	// If not swapping in place, zero out the output buffer before starting.
918	// Gaps may exist between the individual sections, and these must be zeroed in
919	// the output buffer. The simplest way to do that is to just zero the whole thing.
920	//
921	if (inBytes != outBytes) {
922	uprv_memset(outBytes, `0`, spoofDataLength);
923	}
924
925	// Confusables Keys Section (fCFUKeys)
926	sectionStart = ds->readUInt32(spoofDH->fCFUKeys);
927	sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * `4`;
928	ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
929
930	// String Index Section
931	sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex);
932	sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * `2`;
933	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
934
935	// String Table Section
936	sectionStart = ds->readUInt32(spoofDH->fCFUStringTable);
937	sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * `2`;
938	ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);
939
940	// And, last, swap the header itself.
941	// int32_t fMagic // swap this
942	// uint8_t fFormatVersion[4] // Do not swap this, just copy
943	// int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff.
944	//
945	uint32_t magic = ds->readUInt32(spoofDH->fMagic);
946	ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);
947
948	if (outputDH->fFormatVersion != spoofDH->fFormatVersion) {
949	uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion));
950	}
951	// swap starting at fLength
952	ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-`8` / minus magic and fFormatVersion[4] /, &outputDH->fLength, status);
953
954	return totalSize;
955	}
956
957	#endif
958
959
960

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/uspoof_impl.cpp