normlzr.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/normlzr.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	*************************************************************************
5	* COPYRIGHT:
6	* Copyright (c) 1996-2012, International Business Machines Corporation and
7	* others. All Rights Reserved.
8	*************************************************************************
9	*/
10
11	#include "unicode/utypes.h"
12
13	#if !UCONFIG_NO_NORMALIZATION
14
15	#include "unicode/uniset.h"
16	#include "unicode/unistr.h"
17	#include "unicode/chariter.h"
18	#include "unicode/schriter.h"
19	#include "unicode/uchriter.h"
20	#include "unicode/normlzr.h"
21	#include "unicode/utf16.h"
22	#include "cmemory.h"
23	#include "normalizer2impl.h"
24	#include "uprops.h" // for uniset_getUnicode32Instance()
25
26	#if defined(move32)
27	// System can define move32 intrinsics, but the char iters define move32 method
28	// using same undef trick in headers, so undef here to re-enable the method.
29	#undef move32
30	#endif
31
32	U_NAMESPACE_BEGIN
33
34	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
35
36	//-------------------------------------------------------------------------
37	// Constructors and other boilerplate
38	//-------------------------------------------------------------------------
39
40	Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
41	UObject (), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(`0`),
42	text(new StringCharacterIterator (str)),
43	currentIndex(`0`), nextIndex(`0`),
44	buffer (), bufferPos(`0`)
45	{
46	init();
47	}
48
49	Normalizer::Normalizer(ConstChar16Ptr str, int32_t length, UNormalizationMode mode) :
50	UObject (), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(`0`),
51	text(new UCharCharacterIterator (str, length)),
52	currentIndex(`0`), nextIndex(`0`),
53	buffer (), bufferPos(`0`)
54	{
55	init();
56	}
57
58	Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
59	UObject (), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(`0`),
60	text(iter.clone()),
61	currentIndex(`0`), nextIndex(`0`),
62	buffer (), bufferPos(`0`)
63	{
64	init();
65	}
66
67	Normalizer::Normalizer(const Normalizer &copy) :
68	UObject (copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
69	text(copy.text->clone()),
70	currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
71	buffer (copy.buffer), bufferPos(copy.bufferPos)
72	{
73	init();
74	}
75
76	void
77	Normalizer::init() {
78	UErrorCode errorCode=U_ZERO_ERROR;
79	fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
80	if(fOptions&UNORM_UNICODE_3_2) {
81	delete fFilteredNorm2;
82	fNorm2=fFilteredNorm2=
83	new FilteredNormalizer2 (fNorm2, uniset_getUnicode32Instance(errorCode));
84	}
85	if(U_FAILURE(errorCode)) {
86	errorCode=U_ZERO_ERROR;
87	fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
88	}
89	}
90
91	Normalizer::~Normalizer()
92	{
93	delete fFilteredNorm2;
94	delete text;
95	}
96
97	Normalizer*
98	Normalizer::clone() const
99	{
100	return new Normalizer (*this);
101	}
102
103	/**
104	* Generates a hash code for this iterator.
105	*/
106	int32_t Normalizer::hashCode() const
107	{
108	return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
109	}
110
111	UBool Normalizer::operator==(const Normalizer& that) const
112	{
113	return
114	this==&that \|\|
115	(fUMode==that.fUMode &&
116	fOptions==that.fOptions &&
117	text ==that.text &&
118	buffer ==that.buffer &&
119	bufferPos==that.bufferPos &&
120	nextIndex==that.nextIndex);
121	}
122
123	//-------------------------------------------------------------------------
124	// Static utility methods
125	//-------------------------------------------------------------------------
126
127	void U_EXPORT2
128	Normalizer::normalize(const UnicodeString& source,
129	UNormalizationMode mode, int32_t options,
130	UnicodeString& result,
131	UErrorCode &status) {
132	if(source.isBogus() \|\| U_FAILURE(status)) {
133	result.setToBogus();
134	if(U_SUCCESS(status)) {
135	status=U_ILLEGAL_ARGUMENT_ERROR;
136	}
137	} else {
138	UnicodeString localDest;
139	UnicodeString *dest;
140
141	if(&source!=&result) {
142	dest=&result;
143	} else {
144	// the source and result strings are the same object, use a temporary one
145	dest=&localDest;
146	}
147	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
148	if(U_SUCCESS(status)) {
149	if(options&UNORM_UNICODE_3_2) {
150	FilteredNormalizer2 (n2, uniset_getUnicode32Instance(status)).
151	normalize(source, *dest, status);
152	} else {
153	n2->normalize(source, *dest, status);
154	}
155	}
156	if(dest==&localDest && U_SUCCESS(status)) {
157	result =*dest;
158	}
159	}
160	}
161
162	void U_EXPORT2
163	Normalizer::compose(const UnicodeString& source,
164	UBool compat, int32_t options,
165	UnicodeString& result,
166	UErrorCode &status) {
167	normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
168	}
169
170	void U_EXPORT2
171	Normalizer::decompose(const UnicodeString& source,
172	UBool compat, int32_t options,
173	UnicodeString& result,
174	UErrorCode &status) {
175	normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
176	}
177
178	UNormalizationCheckResult
179	Normalizer::quickCheck(const UnicodeString& source,
180	UNormalizationMode mode, int32_t options,
181	UErrorCode &status) {
182	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
183	if(U_SUCCESS(status)) {
184	if(options&UNORM_UNICODE_3_2) {
185	return FilteredNormalizer2 (n2, uniset_getUnicode32Instance(status)).
186	quickCheck(source, status);
187	} else {
188	return n2->quickCheck(source, status);
189	}
190	} else {
191	return UNORM_MAYBE;
192	}
193	}
194
195	UBool
196	Normalizer::isNormalized(const UnicodeString& source,
197	UNormalizationMode mode, int32_t options,
198	UErrorCode &status) {
199	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
200	if(U_SUCCESS(status)) {
201	if(options&UNORM_UNICODE_3_2) {
202	return FilteredNormalizer2 (n2, uniset_getUnicode32Instance(status)).
203	isNormalized(source, status);
204	} else {
205	return n2->isNormalized(source, status);
206	}
207	} else {
208	return FALSE;
209	}
210	}
211
212	UnicodeString & U_EXPORT2
213	Normalizer::concatenate(const UnicodeString &left, const UnicodeString &right,
214	UnicodeString &result,
215	UNormalizationMode mode, int32_t options,
216	UErrorCode &errorCode) {
217	if(left.isBogus() \|\| right.isBogus() \|\| U_FAILURE(errorCode)) {
218	result.setToBogus();
219	if(U_SUCCESS(errorCode)) {
220	errorCode=U_ILLEGAL_ARGUMENT_ERROR;
221	}
222	} else {
223	UnicodeString localDest;
224	UnicodeString *dest;
225
226	if(&right!=&result) {
227	dest=&result;
228	} else {
229	// the right and result strings are the same object, use a temporary one
230	dest=&localDest;
231	}
232	*dest =left;
233	const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
234	if(U_SUCCESS(errorCode)) {
235	if(options&UNORM_UNICODE_3_2) {
236	FilteredNormalizer2 (n2, uniset_getUnicode32Instance(errorCode)).
237	append(*dest, right, errorCode);
238	} else {
239	n2->append(*dest, right, errorCode);
240	}
241	}
242	if(dest==&localDest && U_SUCCESS(errorCode)) {
243	result =*dest;
244	}
245	}
246	return result;
247	}
248
249	//-------------------------------------------------------------------------
250	// Iteration API
251	//-------------------------------------------------------------------------
252
253	/**
254	* Return the current character in the normalized text.
255	*/
256	UChar32 Normalizer::current() {
257	if(bufferPos<buffer.length() \|\| nextNormalize()) {
258	return buffer.char32At(bufferPos);
259	} else {
260	return DONE;
261	}
262	}
263
264	/**
265	* Return the next character in the normalized text and advance
266	* the iteration position by one. If the end
267	* of the text has already been reached, {@link #DONE} is returned.
268	*/
269	UChar32 Normalizer::next() {
270	if(bufferPos<buffer.length() \|\| nextNormalize()) {
271	UChar32 c=buffer.char32At(bufferPos);
272	bufferPos+=U16_LENGTH(c);
273	return c;
274	} else {
275	return DONE;
276	}
277	}
278
279	/**
280	* Return the previous character in the normalized text and decrement
281	* the iteration position by one. If the beginning
282	* of the text has already been reached, {@link #DONE} is returned.
283	*/
284	UChar32 Normalizer::previous() {
285	if(bufferPos>`0` \|\| previousNormalize()) {
286	UChar32 c=buffer.char32At(bufferPos-`1`);
287	bufferPos-=U16_LENGTH(c);
288	return c;
289	} else {
290	return DONE;
291	}
292	}
293
294	void Normalizer::reset() {
295	currentIndex=nextIndex=text->setToStart();
296	clearBuffer();
297	}
298
299	void
300	Normalizer::setIndexOnly(int32_t index) {
301	text->setIndex(index); // pins index
302	currentIndex=nextIndex=text->getIndex();
303	clearBuffer();
304	}
305
306	/**
307	* Return the first character in the normalized text. This resets
308	* the <tt>Normalizer's</tt> position to the beginning of the text.
309	*/
310	UChar32 Normalizer::first() {
311	reset();
312	return next();
313	}
314
315	/**
316	* Return the last character in the normalized text. This resets
317	* the <tt>Normalizer's</tt> position to be just before the
318	* the input text corresponding to that normalized character.
319	*/
320	UChar32 Normalizer::last() {
321	currentIndex=nextIndex=text->setToEnd();
322	clearBuffer();
323	return previous();
324	}
325
326	/**
327	* Retrieve the current iteration position in the input text that is
328	* being normalized. This method is useful in applications such as
329	* searching, where you need to be able to determine the position in
330	* the input text that corresponds to a given normalized output character.
331	* <p>
332	* <b>Note:</b> This method sets the position in the <em>input</em>, while
333	* {@link #next} and {@link #previous} iterate through characters in the
334	* <em>output</em>. This means that there is not necessarily a one-to-one
335	* correspondence between characters returned by <tt>next</tt> and
336	* <tt>previous</tt> and the indices passed to and returned from
337	* <tt>setIndex</tt> and {@link #getIndex}.
338	*
339	*/
340	int32_t Normalizer::getIndex() const {
341	if(bufferPos<buffer.length()) {
342	return currentIndex;
343	} else {
344	return nextIndex;
345	}
346	}
347
348	/**
349	* Retrieve the index of the start of the input text. This is the begin index
350	* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
351	* over which this <tt>Normalizer</tt> is iterating
352	*/
353	int32_t Normalizer::startIndex() const {
354	return text->startIndex();
355	}
356
357	/**
358	* Retrieve the index of the end of the input text. This is the end index
359	* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
360	* over which this <tt>Normalizer</tt> is iterating
361	*/
362	int32_t Normalizer::endIndex() const {
363	return text->endIndex();
364	}
365
366	//-------------------------------------------------------------------------
367	// Property access methods
368	//-------------------------------------------------------------------------
369
370	void
371	Normalizer::setMode(UNormalizationMode newMode)
372	{
373	fUMode = newMode;
374	init();
375	}
376
377	UNormalizationMode
378	Normalizer::getUMode() const
379	{
380	return fUMode;
381	}
382
383	void
384	Normalizer::setOption(int32_t option,
385	UBool value)
386	{
387	if (value) {
388	fOptions \|= option;
389	} else {
390	fOptions &= (~option);
391	}
392	init();
393	}
394
395	UBool
396	Normalizer::getOption(int32_t option) const
397	{
398	return (fOptions & option) != `0`;
399	}
400
401	/**
402	* Set the input text over which this <tt>Normalizer</tt> will iterate.
403	* The iteration position is set to the beginning of the input text.
404	*/
405	void
406	Normalizer::setText(const UnicodeString& newText,
407	UErrorCode &status)
408	{
409	if (U_FAILURE(status)) {
410	return;
411	}
412	CharacterIterator newIter = new* StringCharacterIterator (newText);
413	if (newIter == NULL) {
414	status = U_MEMORY_ALLOCATION_ERROR;
415	return;
416	}
417	delete text;
418	text = newIter;
419	reset();
420	}
421
422	/**
423	* Set the input text over which this <tt>Normalizer</tt> will iterate.
424	* The iteration position is set to the beginning of the string.
425	*/
426	void
427	Normalizer::setText(const CharacterIterator& newText,
428	UErrorCode &status)
429	{
430	if (U_FAILURE(status)) {
431	return;
432	}
433	CharacterIterator *newIter = newText.clone();
434	if (newIter == NULL) {
435	status = U_MEMORY_ALLOCATION_ERROR;
436	return;
437	}
438	delete text;
439	text = newIter;
440	reset();
441	}
442
443	void
444	Normalizer::setText(ConstChar16Ptr newText,
445	int32_t length,
446	UErrorCode &status)
447	{
448	if (U_FAILURE(status)) {
449	return;
450	}
451	CharacterIterator newIter = new* UCharCharacterIterator (newText, length);
452	if (newIter == NULL) {
453	status = U_MEMORY_ALLOCATION_ERROR;
454	return;
455	}
456	delete text;
457	text = newIter;
458	reset();
459	}
460
461	/**
462	* Copies the text under iteration into the UnicodeString referred to by "result".
463	* @param result Receives a copy of the text under iteration.
464	*/
465	void
466	Normalizer::getText(UnicodeString& result)
467	{
468	text->getText(result);
469	}
470
471	//-------------------------------------------------------------------------
472	// Private utility methods
473	//-------------------------------------------------------------------------
474
475	void Normalizer::clearBuffer() {
476	buffer.remove();
477	bufferPos=`0`;
478	}
479
480	UBool
481	Normalizer::nextNormalize() {
482	clearBuffer();
483	currentIndex=nextIndex;
484	text->setIndex(nextIndex);
485	if(!text->hasNext()) {
486	return FALSE;
487	}
488	// Skip at least one character so we make progress.
489	UnicodeString segment(text->next32PostInc());
490	while(text->hasNext()) {
491	UChar32 c;
492	if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
493	text->move32(-`1`, CharacterIterator::kCurrent);
494	break;
495	}
496	segment.append(c);
497	}
498	nextIndex=text->getIndex();
499	UErrorCode errorCode=U_ZERO_ERROR;
500	fNorm2->normalize(segment, buffer, errorCode);
501	return U_SUCCESS(errorCode) && !buffer.isEmpty();
502	}
503
504	UBool
505	Normalizer::previousNormalize() {
506	clearBuffer();
507	nextIndex=currentIndex;
508	text->setIndex(currentIndex);
509	if(!text->hasPrevious()) {
510	return FALSE;
511	}
512	UnicodeString segment;
513	while(text->hasPrevious()) {
514	UChar32 c=text->previous32();
515	segment.insert(`0`, c);
516	if(fNorm2->hasBoundaryBefore(c)) {
517	break;
518	}
519	}
520	currentIndex=text->getIndex();
521	UErrorCode errorCode=U_ZERO_ERROR;
522	fNorm2->normalize(segment, buffer, errorCode);
523	bufferPos=buffer.length();
524	return U_SUCCESS(errorCode) && !buffer.isEmpty();
525	}
526
527	U_NAMESPACE_END
528
529	#endif /* #if !UCONFIG_NO_NORMALIZATION */
530

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/normlzr.cpp