caniter.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/caniter.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*****************************************************************************
5	* Copyright (C) 1996-2015, International Business Machines Corporation and
6	* others. All Rights Reserved.
7	*****************************************************************************
8	*/
9
10	#include "unicode/utypes.h"
11
12	#if !UCONFIG_NO_NORMALIZATION
13
14	#include "unicode/caniter.h"
15	#include "unicode/normalizer2.h"
16	#include "unicode/uchar.h"
17	#include "unicode/uniset.h"
18	#include "unicode/usetiter.h"
19	#include "unicode/ustring.h"
20	#include "unicode/utf16.h"
21	#include "cmemory.h"
22	#include "hash.h"
23	#include "normalizer2impl.h"
24
25	/**
26	* This class allows one to iterate through all the strings that are canonically equivalent to a given
27	* string. For example, here are some sample results:
28	Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
29	1: \u0041\u030A\u0064\u0307\u0327
30	= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
31	2: \u0041\u030A\u0064\u0327\u0307
32	= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
33	3: \u0041\u030A\u1E0B\u0327
34	= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
35	4: \u0041\u030A\u1E11\u0307
36	= {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
37	5: \u00C5\u0064\u0307\u0327
38	= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
39	6: \u00C5\u0064\u0327\u0307
40	= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
41	7: \u00C5\u1E0B\u0327
42	= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
43	8: \u00C5\u1E11\u0307
44	= {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
45	9: \u212B\u0064\u0307\u0327
46	= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
47	10: \u212B\u0064\u0327\u0307
48	= {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
49	11: \u212B\u1E0B\u0327
50	= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
51	12: \u212B\u1E11\u0307
52	= {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
53	*<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
54	* since it has not been optimized for that situation.
55	*@author M. Davis
56	*@draft
57	*/
58
59	// public
60
61	U_NAMESPACE_BEGIN
62
63	// TODO: add boilerplate methods.
64
65	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
66
67	/**
68	*@param source string to get results for
69	*/
70	CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
71	pieces(NULL),
72	pieces_length(`0`),
73	pieces_lengths(NULL),
74	current(NULL),
75	current_length(`0`),
76	nfd(*Normalizer2::getNFDInstance(status)),
77	nfcImpl(*Normalizer2Factory::getNFCImpl(status))
78	{
79	if(U_SUCCESS(status) && nfcImpl.ensureCanonIterData(status)) {
80	setSource(sourceStr, status);
81	}
82	}
83
84	CanonicalIterator::~CanonicalIterator() {
85	cleanPieces();
86	}
87
88	void CanonicalIterator::cleanPieces() {
89	int32_t i = `0`;
90	if(pieces != NULL) {
91	for(i = `0`; i < pieces_length; i++) {
92	if(pieces[i] != NULL) {
93	delete[] pieces[i];
94	}
95	}
96	uprv_free(pieces);
97	pieces = NULL;
98	pieces_length = `0`;
99	}
100	if(pieces_lengths != NULL) {
101	uprv_free(pieces_lengths);
102	pieces_lengths = NULL;
103	}
104	if(current != NULL) {
105	uprv_free(current);
106	current = NULL;
107	current_length = `0`;
108	}
109	}
110
111	/**
112	*@return gets the source: NOTE: it is the NFD form of source
113	*/
114	UnicodeString CanonicalIterator::getSource() {
115	return source;
116	}
117
118	/**
119	* Resets the iterator so that one can start again from the beginning.
120	*/
121	void CanonicalIterator::reset() {
122	done = FALSE;
123	for (int i = `0`; i < current_length; ++i) {
124	current[i] = `0`;
125	}
126	}
127
128	/**
129	*@return the next string that is canonically equivalent. The value null is returned when
130	* the iteration is done.
131	*/
132	UnicodeString CanonicalIterator::next() {
133	int32_t i = `0`;
134
135	if (done) {
136	buffer.setToBogus();
137	return buffer;
138	}
139
140	// delete old contents
141	buffer.remove();
142
143	// construct return value
144
145	for (i = `0`; i < pieces_length; ++i) {
146	buffer.append(pieces[i][current[i]]);
147	}
148	//String result = buffer.toString(); // not needed
149
150	// find next value for next time
151
152	for (i = current_length - `1`; ; --i) {
153	if (i < `0`) {
154	done = TRUE;
155	break;
156	}
157	current[i]++;
158	if (current[i] < pieces_lengths[i]) break; // got sequence
159	current[i] = `0`;
160	}
161	return buffer;
162	}
163
164	/**
165	*@param set the source string to iterate against. This allows the same iterator to be used
166	* while changing the source string, saving object creation.
167	*/
168	void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
169	int32_t list_length = `0`;
170	UChar32 cp = `0`;
171	int32_t start = `0`;
172	int32_t i = `0`;
173	UnicodeString *list = NULL;
174
175	nfd.normalize(newSource, source, status);
176	if(U_FAILURE(status)) {
177	return;
178	}
179	done = FALSE;
180
181	cleanPieces();
182
183	// catch degenerate case
184	if (newSource.length() == `0`) {
185	pieces = (UnicodeString )uprv_malloc(sizeof*(UnicodeString ));
186	pieces_lengths = (int32_t)uprv_malloc(`1` sizeof(int32_t));
187	pieces_length = `1`;
188	current = (int32_t)uprv_malloc(`1` sizeof(int32_t));
189	current_length = `1`;
190	if (pieces == NULL \|\| pieces_lengths == NULL \|\| current == NULL) {
191	status = U_MEMORY_ALLOCATION_ERROR;
192	goto CleanPartialInitialization;
193	}
194	current[`0`] = `0`;
195	pieces[`0`] = new UnicodeString[`1`];
196	pieces_lengths[`0`] = `1`;
197	if (pieces[`0`] == `0`) {
198	status = U_MEMORY_ALLOCATION_ERROR;
199	goto CleanPartialInitialization;
200	}
201	return;
202	}
203
204
205	list = new UnicodeString[source.length()];
206	if (list == `0`) {
207	status = U_MEMORY_ALLOCATION_ERROR;
208	goto CleanPartialInitialization;
209	}
210
211	// i should initialy be the number of code units at the
212	// start of the string
213	i = U16_LENGTH(source.char32At(`0`));
214	//int32_t i = 1;
215	// find the segments
216	// This code iterates through the source string and
217	// extracts segments that end up on a codepoint that
218	// doesn't start any decompositions. (Analysis is done
219	// on the NFD form - see above).
220	for (; i < source.length(); i += U16_LENGTH(cp)) {
221	cp = source.char32At(i);
222	if (nfcImpl.isCanonSegmentStarter(cp)) {
223	source.extract(start, i-start, list[list_length++]); // add up to i
224	start = i;
225	}
226	}
227	source.extract(start, i-start, list[list_length++]); // add last one
228
229
230	// allocate the arrays, and find the strings that are CE to each segment
231	pieces = (UnicodeString *)uprv_malloc(list_length sizeof(UnicodeString *));
232	pieces_length = list_length;
233	pieces_lengths = (int32_t)uprv_malloc(list_length sizeof(int32_t));
234	current = (int32_t)uprv_malloc(list_length sizeof(int32_t));
235	current_length = list_length;
236	if (pieces == NULL \|\| pieces_lengths == NULL \|\| current == NULL) {
237	status = U_MEMORY_ALLOCATION_ERROR;
238	goto CleanPartialInitialization;
239	}
240
241	for (i = `0`; i < current_length; i++) {
242	current[i] = `0`;
243	}
244	// for each segment, get all the combinations that can produce
245	// it after NFD normalization
246	for (i = `0`; i < pieces_length; ++i) {
247	//if (PROGRESS) printf("SEGMENT\n");
248	pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
249	}
250
251	delete[] list;
252	return;
253	// Common section to cleanup all local variables and reset object variables.
254	CleanPartialInitialization:
255	if (list != NULL) {
256	delete[] list;
257	}
258	cleanPieces();
259	}
260
261	/**
262	* Dumb recursive implementation of permutation.
263	* TODO: optimize
264	* @param source the string to find permutations for
265	* @return the results in a set.
266	*/
267	void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
268	if(U_FAILURE(status)) {
269	return;
270	}
271	//if (PROGRESS) printf("Permute: %s\n", UToS(Tr(source)));
272	int32_t i = `0`;
273
274	// optimization:
275	// if zero or one character, just return a set with it
276	// we check for length < 2 to keep from counting code points all the time
277	if (source.length() <= `2` && source.countChar32() <= `1`) {
278	UnicodeString toPut = new* UnicodeString (source);
279	/ test for NULL /
280	if (toPut == `0`) {
281	status = U_MEMORY_ALLOCATION_ERROR;
282	return;
283	}
284	result->put(source, toPut, status);
285	return;
286	}
287
288	// otherwise iterate through the string, and recursively permute all the other characters
289	UChar32 cp;
290	Hashtable subpermute(status);
291	if(U_FAILURE(status)) {
292	return;
293	}
294	subpermute.setValueDeleter(uprv_deleteUObject);
295
296	for (i = `0`; i < source.length(); i += U16_LENGTH(cp)) {
297	cp = source.char32At(i);
298	const UHashElement *ne = NULL;
299	int32_t el = UHASH_FIRST;
300	UnicodeString subPermuteString = source;
301
302	// optimization:
303	// if the character is canonical combining class zero,
304	// don't permute it
305	if (skipZeros && i != `0` && u_getCombiningClass(cp) == `0`) {
306	//System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i)));
307	continue;
308	}
309
310	subpermute.removeAll();
311
312	// see what the permutations of the characters before and after this one are
313	//Hashtable subpermute = permute(source.substring(0,i) + source.substring(i + UTF16.getCharCount(cp)));*
314	permute(subPermuteString.remove(i, U16_LENGTH(cp)), skipZeros, &subpermute, status);
315	/ Test for buffer overflows /
316	if(U_FAILURE(status)) {
317	return;
318	}
319	// The upper remove is destructive. The question is do we have to make a copy, or we don't care about the contents
320	// of source at this point.
321
322	// prefix this character to all of them
323	ne = subpermute.nextElement(el);
324	while (ne != NULL) {
325	UnicodeString permRes = (UnicodeString )(ne->value.pointer);
326	UnicodeString chStr = new* UnicodeString (cp);
327	//test for NULL
328	if (chStr == NULL) {
329	status = U_MEMORY_ALLOCATION_ERROR;
330	return;
331	}
332	chStr->append(permRes); //((UnicodeString )(ne->value.pointer));*
333	//if (PROGRESS) printf(" Piece: %s\n", UToS(chStr));*
334	result->put(*chStr, chStr, status);
335	ne = subpermute.nextElement(el);
336	}
337	}
338	//return result;
339	}
340
341	// privates
342
343	// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
344	UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
345	Hashtable result(status);
346	Hashtable permutations(status);
347	Hashtable basic(status);
348	if (U_FAILURE(status)) {
349	return `0`;
350	}
351	result.setValueDeleter(uprv_deleteUObject);
352	permutations.setValueDeleter(uprv_deleteUObject);
353	basic.setValueDeleter(uprv_deleteUObject);
354
355	UChar USeg[`256`];
356	int32_t segLen = segment.extract(USeg, `256`, status);
357	getEquivalents2(&basic, USeg, segLen, status);
358
359	// now get all the permutations
360	// add only the ones that are canonically equivalent
361	// TODO: optimize by not permuting any class zero.
362
363	const UHashElement *ne = NULL;
364	int32_t el = UHASH_FIRST;
365	//Iterator it = basic.iterator();
366	ne = basic.nextElement(el);
367	//while (it.hasNext())
368	while (ne != NULL) {
369	//String item = (String) it.next();
370	UnicodeString item = ((UnicodeString )(ne->value.pointer));
371
372	permutations.removeAll();
373	permute(item, CANITER_SKIP_ZEROES, &permutations, status);
374	const UHashElement *ne2 = NULL;
375	int32_t el2 = UHASH_FIRST;
376	//Iterator it2 = permutations.iterator();
377	ne2 = permutations.nextElement(el2);
378	//while (it2.hasNext())
379	while (ne2 != NULL) {
380	//String possible = (String) it2.next();
381	//UnicodeString possible = new UnicodeString(((UnicodeString )(ne2->value.pointer)));*
382	UnicodeString possible(((UnicodeString )(ne2->value.pointer)));
383	UnicodeString attempt;
384	nfd.normalize(possible, attempt, status);
385
386	// TODO: check if operator == is semanticaly the same as attempt.equals(segment)
387	if (attempt ==segment) {
388	//if (PROGRESS) printf("Adding Permutation: %s\n", UToS(Tr(possible)));*
389	// TODO: use the hashtable just to catch duplicates - store strings directly (somehow).
390	result.put(possible, new UnicodeString (possible), status); //add(possible);
391	} else {
392	//if (PROGRESS) printf("-Skipping Permutation: %s\n", UToS(Tr(possible)));*
393	}
394
395	ne2 = permutations.nextElement(el2);
396	}
397	ne = basic.nextElement(el);
398	}
399
400	/ Test for buffer overflows /
401	if(U_FAILURE(status)) {
402	return `0`;
403	}
404	// convert into a String[] to clean up storage
405	//String[] finalResult = new String[result.size()];
406	UnicodeString *finalResult = NULL;
407	int32_t resultCount;
408	if((resultCount = result.count()) != `0`) {
409	finalResult = new UnicodeString[resultCount];
410	if (finalResult == `0`) {
411	status = U_MEMORY_ALLOCATION_ERROR;
412	return NULL;
413	}
414	}
415	else {
416	status = U_ILLEGAL_ARGUMENT_ERROR;
417	return NULL;
418	}
419	//result.toArray(finalResult);
420	result_len = `0`;
421	el = UHASH_FIRST;
422	ne = result.nextElement(el);
423	while(ne != NULL) {
424	finalResult[result_len++] = ((UnicodeString )(ne->value.pointer));
425	ne = result.nextElement(el);
426	}
427
428
429	return finalResult;
430	}
431
432	Hashtable CanonicalIterator::getEquivalents2(Hashtable fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {
433
434	if (U_FAILURE(status)) {
435	return NULL;
436	}
437
438	//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));
439
440	UnicodeString toPut(segment, segLen);
441
442	fillinResult->put(toPut, new UnicodeString (toPut), status);
443
444	UnicodeSet starts;
445
446	// cycle through all the characters
447	UChar32 cp;
448	for (int32_t i = `0`; i < segLen; i += U16_LENGTH(cp)) {
449	// see if any character is at the start of some decomposition
450	U16_GET(segment, `0`, i, segLen, cp);
451	if (!nfcImpl.getCanonStartSet(cp, starts)) {
452	continue;
453	}
454	// if so, see which decompositions match
455	UnicodeSetIterator iter(starts);
456	while (iter.next()) {
457	UChar32 cp2 = iter.getCodepoint();
458	Hashtable remainder(status);
459	remainder.setValueDeleter(uprv_deleteUObject);
460	if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) {
461	continue;
462	}
463
464	// there were some matches, so add all the possibilities to the set.
465	UnicodeString prefix(segment, i);
466	prefix += cp2;
467
468	int32_t el = UHASH_FIRST;
469	const UHashElement *ne = remainder.nextElement(el);
470	while (ne != NULL) {
471	UnicodeString item = ((UnicodeString )(ne->value.pointer));
472	UnicodeString toAdd = new* UnicodeString (prefix);
473	/ test for NULL /
474	if (toAdd == `0`) {
475	status = U_MEMORY_ALLOCATION_ERROR;
476	return NULL;
477	}
478	*toAdd += item;
479	fillinResult->put(*toAdd, toAdd, status);
480
481	//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(toAdd)));*
482
483	ne = remainder.nextElement(el);
484	}
485	}
486	}
487
488	/ Test for buffer overflows /
489	if(U_FAILURE(status)) {
490	return NULL;
491	}
492	return fillinResult;
493	}
494
495	/**
496	* See if the decomposition of cp2 is at segment starting at segmentPos
497	* (with canonical rearrangment!)
498	* If so, take the remainder, and return the equivalents
499	*/
500	Hashtable CanonicalIterator::extract(Hashtable fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
501	//Hashtable CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {*
502	//if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
503	//if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);
504
505	if (U_FAILURE(status)) {
506	return NULL;
507	}
508
509	UnicodeString temp(comp);
510	int32_t inputLen=temp.length();
511	UnicodeString decompString;
512	nfd.normalize(temp, decompString, status);
513	if (U_FAILURE(status)) {
514	return NULL;
515	}
516	if (decompString.isBogus()) {
517	status = U_MEMORY_ALLOCATION_ERROR;
518	return NULL;
519	}
520	const UChar *decomp=decompString.getBuffer();
521	int32_t decompLen=decompString.length();
522
523	// See if it matches the start of segment (at segmentPos)
524	UBool ok = FALSE;
525	UChar32 cp;
526	int32_t decompPos = `0`;
527	UChar32 decompCp;
528	U16_NEXT(decomp, decompPos, decompLen, decompCp);
529
530	int32_t i = segmentPos;
531	while(i < segLen) {
532	U16_NEXT(segment, i, segLen, cp);
533
534	if (cp == decompCp) { // if equal, eat another cp from decomp
535
536	//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
537
538	if (decompPos == decompLen) { // done, have all decomp characters!
539	temp.append(segment+i, segLen-i);
540	ok = TRUE;
541	break;
542	}
543	U16_NEXT(decomp, decompPos, decompLen, decompCp);
544	} else {
545	//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
546
547	// brute force approach
548	temp.append(cp);
549
550	/ TODO: optimize*
551	// since we know that the classes are monotonically increasing, after zero
552	// e.g. 0 5 7 9 0 3
553	// we can do an optimization
554	// there are only a few cases that work: zero, less, same, greater
555	// if both classes are the same, we fail
556	// if the decomp class < the segment class, we fail
557
558	segClass = getClass(cp);
559	if (decompClass <= segClass) return null;
560	*/
561	}
562	}
563	if (!ok)
564	return NULL; // we failed, characters left over
565
566	//if (PROGRESS) printf("Matches\n");
567
568	if (inputLen == temp.length()) {
569	fillinResult->put(UnicodeString (), new UnicodeString (), status);
570	return fillinResult; // succeed, but no remainder
571	}
572
573	// brute force approach
574	// check to make sure result is canonically equivalent
575	UnicodeString trial;
576	nfd.normalize(temp, trial, status);
577	if(U_FAILURE(status) \|\| trial.compare(segment+segmentPos, segLen - segmentPos) != `0`) {
578	return NULL;
579	}
580
581	return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
582	}
583
584	U_NAMESPACE_END
585
586	#endif /* #if !UCONFIG_NO_NORMALIZATION */
587

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/caniter.cpp