uspoof_conf.cpp source code [engine/third_party/icu/source/i18n/uspoof_conf.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	******************************************************************************
5	*
6	* Copyright (C) 2008-2015, International Business Machines
7	* Corporation and others. All Rights Reserved.
8	*
9	******************************************************************************
10	* file name: uspoof_conf.cpp
11	* encoding: UTF-8
12	* tab size: 8 (not used)
13	* indentation:4
14	*
15	* created on: 2009Jan05 (refactoring earlier files)
16	* created by: Andy Heninger
17	*
18	* Internal classes for compililing confusable data into its binary (runtime) form.
19	*/
20
21	#include "unicode/utypes.h"
22	#include "unicode/uspoof.h"
23	#if !UCONFIG_NO_REGULAR_EXPRESSIONS
24	#if !UCONFIG_NO_NORMALIZATION
25
26	#include "unicode/unorm.h"
27	#include "unicode/uregex.h"
28	#include "unicode/ustring.h"
29	#include "cmemory.h"
30	#include "uspoof_impl.h"
31	#include "uhash.h"
32	#include "uvector.h"
33	#include "uassert.h"
34	#include "uarrsort.h"
35	#include "uspoof_conf.h"
36
37	U_NAMESPACE_USE
38
39
40	//---------------------------------------------------------------------
41	//
42	// buildConfusableData Compile the source confusable data, as defined by
43	// the Unicode data file confusables.txt, into the binary
44	// structures used by the confusable detector.
45	//
46	// The binary structures are described in uspoof_impl.h
47	//
48	// 1. Parse the data, making a hash table mapping from a UChar32 to a String.
49	//
50	// 2. Sort all of the strings encountered by length, since they will need to
51	// be stored in that order in the final string table.
52	// TODO: Sorting these strings by length is no longer needed since the removal of
53	// the string lengths table. This logic can be removed to save processing time
54	// when building confusables data.
55	//
56	// 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
57	// list because that will be the ordering of our runtime table.
58	//
59	// 4. Generate the run time string table. This is generated before the key & value
60	// tables because we need the string indexes when building those tables.
61	//
62	// 5. Build the run-time key and value tables. These are parallel tables, and are built
63	// at the same time
64	//
65
66	SPUString::SPUString(UnicodeString *s) {
67	fStr = s;
68	fCharOrStrTableIndex = `0`;
69	}
70
71
72	SPUString::~SPUString() {
73	delete fStr;
74	}
75
76
77	SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
78	fVec = new UVector (status);
79	if (fVec == NULL) {
80	status = U_MEMORY_ALLOCATION_ERROR;
81	return;
82	}
83	fHash = uhash_open(uhash_hashUnicodeString, // key hash function
84	uhash_compareUnicodeString, // Key Comparator
85	NULL, // Value Comparator
86	&status);
87	}
88
89
90	SPUStringPool::~SPUStringPool() {
91	int i;
92	for (i=fVec->size()-`1`; i>=`0`; i--) {
93	SPUString s = static_cast<SPUString >(fVec->elementAt(i));
94	delete s;
95	}
96	delete fVec;
97	uhash_close(fHash);
98	}
99
100
101	int32_t SPUStringPool::size() {
102	return fVec->size();
103	}
104
105	SPUString *SPUStringPool::getByIndex(int32_t index) {
106	SPUString retString = (SPUString )fVec->elementAt(index);
107	return retString;
108	}
109
110
111	// Comparison function for ordering strings in the string pool.
112	// Compare by length first, then, within a group of the same length,
113	// by code point order.
114	// Conforms to the type signature for a USortComparator in uvector.h
115
116	static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
117	const SPUString sL = const_cast<const* SPUString *>(
118	static_cast<SPUString *>(left.pointer));
119	const SPUString sR = const_cast<const* SPUString *>(
120	static_cast<SPUString *>(right.pointer));
121	int32_t lenL = sL->fStr->length();
122	int32_t lenR = sR->fStr->length();
123	if (lenL < lenR) {
124	return -`1`;
125	} else if (lenL > lenR) {
126	return `1`;
127	} else {
128	return sL->fStr->compare(*(sR->fStr));
129	}
130	}
131
132	void SPUStringPool::sort(UErrorCode &status) {
133	fVec->sort(SPUStringCompare, status);
134	}
135
136
137	SPUString SPUStringPool::addString(UnicodeString src, UErrorCode &status) {
138	SPUString hashedString = static_cast<SPUString >(uhash_get(fHash, src));
139	if (hashedString != NULL) {
140	delete src;
141	} else {
142	hashedString = new SPUString (src);
143	if (hashedString == NULL) {
144	status = U_MEMORY_ALLOCATION_ERROR;
145	return NULL;
146	}
147	uhash_put(fHash, src, hashedString, &status);
148	fVec->addElement(hashedString, status);
149	}
150	return hashedString;
151	}
152
153
154
155	ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
156	fSpoofImpl(spImpl),
157	fInput(NULL),
158	fTable(NULL),
159	fKeySet(NULL),
160	fKeyVec(NULL),
161	fValueVec(NULL),
162	fStringTable(NULL),
163	stringPool(NULL),
164	fParseLine(NULL),
165	fParseHexNum(NULL),
166	fLineNum(`0`)
167	{
168	if (U_FAILURE(status)) {
169	return;
170	}
171
172	fTable = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
173
174	fKeySet = new UnicodeSet ();
175	if (fKeySet == NULL) {
176	status = U_MEMORY_ALLOCATION_ERROR;
177	return;
178	}
179
180	fKeyVec = new UVector (status);
181	if (fKeyVec == NULL) {
182	status = U_MEMORY_ALLOCATION_ERROR;
183	return;
184	}
185
186	fValueVec = new UVector (status);
187	if (fValueVec == NULL) {
188	status = U_MEMORY_ALLOCATION_ERROR;
189	return;
190	}
191
192	stringPool = new SPUStringPool (status);
193	if (stringPool == NULL) {
194	status = U_MEMORY_ALLOCATION_ERROR;
195	return;
196	}
197	}
198
199
200	ConfusabledataBuilder::~ConfusabledataBuilder() {
201	uprv_free(fInput);
202	uregex_close(fParseLine);
203	uregex_close(fParseHexNum);
204	uhash_close(fTable);
205	delete fKeySet;
206	delete fKeyVec;
207	delete fStringTable;
208	delete fValueVec;
209	delete stringPool;
210	}
211
212
213	void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
214	int32_t confusablesLen, int32_t errorType, UParseError pe, UErrorCode &status) {
215
216	if (U_FAILURE(status)) {
217	return;
218	}
219	ConfusabledataBuilder builder(spImpl, status);
220	builder.build(confusables, confusablesLen, status);
221	if (U_FAILURE(status) && errorType != NULL) {
222	*errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
223	pe->line = builder.fLineNum;
224	}
225	}
226
227
228	void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
229	UErrorCode &status) {
230
231	// Convert the user input data from UTF-8 to UChar (UTF-16)
232	int32_t inputLen = `0`;
233	if (U_FAILURE(status)) {
234	return;
235	}
236	u_strFromUTF8(NULL, `0`, &inputLen, confusables, confusablesLen, &status);
237	if (status != U_BUFFER_OVERFLOW_ERROR) {
238	return;
239	}
240	status = U_ZERO_ERROR;
241	fInput = static_cast<UChar >(uprv_malloc((inputLen+`1`) sizeof(UChar)));
242	if (fInput == NULL) {
243	status = U_MEMORY_ALLOCATION_ERROR;
244	return;
245	}
246	u_strFromUTF8(fInput, inputLen+`1`, NULL, confusables, confusablesLen, &status);
247
248
249	// Regular Expression to parse a line from Confusables.txt. The expression will match
250	// any line. What was matched is determined by examining which capture groups have a match.
251	// Capture Group 1: the source char
252	// Capture Group 2: the replacement chars
253	// Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
254	// Capture Group 7: A blank or comment only line.
255	// Capture Group 8: A syntactically invalid line. Anything that didn't match before.
256	// Example Line from the confusables.txt source file:
257	// "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
258	UnicodeString pattern(
259	"(?m)^[ \\t]([0-9A-Fa-f]+)[ \\t]+;" // Match the source char*
260	"[ \\t]([0-9A-Fa-f]+" // Match the replacement char(s)*
261	"(?:[ \\t]+[0-9A-Fa-f]+))[ \\t];" // (continued)
262	"\\s(?:(SL)\|(SA)\|(ML)\|(MA))" // Match the table type*
263	"[ \\t](?:#.?)?$" // Match any trailing #comment
264	"\|^([ \\t](?:#.?)?)$" // OR match empty lines or lines with only a #comment
265	"\|^(.?)$", -`1`, US_INV); // OR match any line, which catches illegal lines.*
266	// TODO: Why are we using the regex C API here? C++ would just take UnicodeString...
267	fParseLine = uregex_open(pattern.getBuffer(), pattern.length(), `0`, NULL, &status);
268
269	// Regular expression for parsing a hex number out of a space-separated list of them.
270	// Capture group 1 gets the number, with spaces removed.
271	pattern = UNICODE_STRING_SIMPLE("\\s*([0-9A-F]+)");
272	fParseHexNum = uregex_open(pattern.getBuffer(), pattern.length(), `0`, NULL, &status);
273
274	// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
275	// given the syntax of the input.
276	if (*fInput == `0xfeff`) {
277	*fInput = `0x20`;
278	}
279
280	// Parse the input, one line per iteration of this loop.
281	uregex_setText(fParseLine, fInput, inputLen, &status);
282	while (uregex_findNext(fParseLine, &status)) {
283	fLineNum++;
284	if (uregex_start(fParseLine, `7`, &status) >= `0`) {
285	// this was a blank or comment line.
286	continue;
287	}
288	if (uregex_start(fParseLine, `8`, &status) >= `0`) {
289	// input file syntax error.
290	status = U_PARSE_ERROR;
291	return;
292	}
293
294	// We have a good input line. Extract the key character and mapping string, and
295	// put them into the appropriate mapping table.
296	UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, `1`, &status),
297	uregex_end(fParseLine, `1`, &status), status);
298
299	int32_t mapStringStart = uregex_start(fParseLine, `2`, &status);
300	int32_t mapStringLength = uregex_end(fParseLine, `2`, &status) - mapStringStart;
301	uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
302
303	UnicodeString mapString = new* UnicodeString ();
304	if (mapString == NULL) {
305	status = U_MEMORY_ALLOCATION_ERROR;
306	return;
307	}
308	while (uregex_findNext(fParseHexNum, &status)) {
309	UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, `1`, &status),
310	uregex_end(fParseHexNum, `1`, &status), status);
311	mapString->append(c);
312	}
313	U_ASSERT(mapString->length() >= `1`);
314
315	// Put the map (value) string into the string pool
316	// This a little like a Java intern() - any duplicates will be eliminated.
317	SPUString *smapString = stringPool->addString(mapString, status);
318
319	// Add the UChar32 -> string mapping to the table.
320	// For Unicode 8, the SL, SA and ML tables have been discontinued.
321	// All input data from confusables.txt is tagged MA.
322	uhash_iput(fTable, keyChar, smapString, &status);
323	if (U_FAILURE(status)) { return; }
324	fKeySet->add(keyChar);
325	}
326
327	// Input data is now all parsed and collected.
328	// Now create the run-time binary form of the data.
329	//
330	// This is done in two steps. First the data is assembled into vectors and strings,
331	// for ease of construction, then the contents of these collections are dumped
332	// into the actual raw-bytes data storage.
333
334	// Build up the string array, and record the index of each string therein
335	// in the (build time only) string pool.
336	// Strings of length one are not entered into the strings array.
337	// (Strings in the table are sorted by length)
338	stringPool->sort(status);
339	fStringTable = new UnicodeString ();
340	int32_t poolSize = stringPool->size();
341	int32_t i;
342	for (i=`0`; i<poolSize; i++) {
343	SPUString *s = stringPool->getByIndex(i);
344	int32_t strLen = s->fStr->length();
345	int32_t strIndex = fStringTable->length();
346	if (strLen == `1`) {
347	// strings of length one do not get an entry in the string table.
348	// Keep the single string character itself here, which is the same
349	// convention that is used in the final run-time string table index.
350	s->fCharOrStrTableIndex = s->fStr->charAt(`0`);
351	} else {
352	s->fCharOrStrTableIndex = strIndex;
353	fStringTable->append(*(s->fStr));
354	}
355	}
356
357	// Construct the compile-time Key and Value tables
358	//
359	// For each key code point, check which mapping tables it applies to,
360	// and create the final data for the key & value structures.
361	//
362	// The four logical mapping tables are conflated into one combined table.
363	// If multiple logical tables have the same mapping for some key, they
364	// share a single entry in the combined table.
365	// If more than one mapping exists for the same key code point, multiple
366	// entries will be created in the table
367
368	for (int32_t range=`0`; range<fKeySet->getRangeCount(); range++) {
369	// It is an oddity of the UnicodeSet API that simply enumerating the contained
370	// code points requires a nested loop.
371	for (UChar32 keyChar=fKeySet->getRangeStart(range);
372	keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
373	SPUString targetMapping = static_cast<SPUString >(uhash_iget(fTable, keyChar));
374	U_ASSERT(targetMapping != NULL);
375
376	// Set an error code if trying to consume a long string. Otherwise,
377	// codePointAndLengthToKey will abort on a U_ASSERT.
378	if (targetMapping->fStr->length() > `256`) {
379	status = U_ILLEGAL_ARGUMENT_ERROR;
380	return;
381	}
382
383	int32_t key = ConfusableDataUtils::codePointAndLengthToKey(keyChar,
384	targetMapping->fStr->length());
385	int32_t value = targetMapping->fCharOrStrTableIndex;
386
387	fKeyVec->addElement(key, status);
388	fValueVec->addElement(value, status);
389	}
390	}
391
392	// Put the assembled data into the flat runtime array
393	outputData(status);
394
395	// All of the intermediate allocated data belongs to the ConfusabledataBuilder
396	// object (this), and is deleted in the destructor.
397	return;
398	}
399
400	//
401	// outputData The confusable data has been compiled and stored in intermediate
402	// collections and strings. Copy it from there to the final flat
403	// binary array.
404	//
405	// Note that as each section is added to the output data, the
406	// expand (reserveSpace() function will likely relocate it in memory.
407	// Be careful with pointers.
408	//
409	void ConfusabledataBuilder::outputData(UErrorCode &status) {
410
411	U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
412
413	// The Key Table
414	// While copying the keys to the runtime array,
415	// also sanity check that they are sorted.
416
417	int32_t numKeys = fKeyVec->size();
418	int32_t *keys =
419	static_cast<int32_t >(fSpoofImpl->fSpoofData->reserveSpace(numKeyssizeof(int32_t), status));
420	if (U_FAILURE(status)) {
421	return;
422	}
423	int i;
424	UChar32 previousCodePoint = `0`;
425	for (i=`0`; i<numKeys; i++) {
426	int32_t key = fKeyVec->elementAti(i);
427	UChar32 codePoint = ConfusableDataUtils::keyToCodePoint(key);
428	(void)previousCodePoint; // Suppress unused variable warning.
429	// strictly greater because there can be only one entry per code point
430	U_ASSERT(codePoint > previousCodePoint);
431	keys[i] = key;
432	previousCodePoint = codePoint;
433	}
434	SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
435	rawData->fCFUKeys = (int32_t)((char )keys - (char* *)rawData);
436	rawData->fCFUKeysSize = numKeys;
437	fSpoofImpl->fSpoofData->fCFUKeys = keys;
438
439
440	// The Value Table, parallels the key table
441	int32_t numValues = fValueVec->size();
442	U_ASSERT(numKeys == numValues);
443	uint16_t *values =
444	static_cast<uint16_t >(fSpoofImpl->fSpoofData->reserveSpace(numKeyssizeof(uint16_t), status));
445	if (U_FAILURE(status)) {
446	return;
447	}
448	for (i=`0`; i<numValues; i++) {
449	uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
450	U_ASSERT(value < `0xffff`);
451	values[i] = static_cast<uint16_t>(value);
452	}
453	rawData = fSpoofImpl->fSpoofData->fRawData;
454	rawData->fCFUStringIndex = (int32_t)((char )values - (char* *)rawData);
455	rawData->fCFUStringIndexSize = numValues;
456	fSpoofImpl->fSpoofData->fCFUValues = values;
457
458	// The Strings Table.
459
460	uint32_t stringsLength = fStringTable->length();
461	// Reserve an extra space so the string will be nul-terminated. This is
462	// only a convenience, for when debugging; it is not needed otherwise.
463	UChar *strings =
464	static_cast<UChar >(fSpoofImpl->fSpoofData->reserveSpace(stringsLengthsizeof(UChar)+`2`, status));
465	if (U_FAILURE(status)) {
466	return;
467	}
468	fStringTable->extract(strings, stringsLength+`1`, status);
469	rawData = fSpoofImpl->fSpoofData->fRawData;
470	U_ASSERT(rawData->fCFUStringTable == `0`);
471	rawData->fCFUStringTable = (int32_t)((char )strings - (char* *)rawData);
472	rawData->fCFUStringTableLen = stringsLength;
473	fSpoofImpl->fSpoofData->fCFUStrings = strings;
474	}
475
476	#endif
477	#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
478
479

Browse the source code of engine/third_party/icu/source/i18n/uspoof_conf.cpp