nortrans.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/nortrans.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (C) 2001-2011, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 07/03/01 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "unicode/normalizer2.h"
18	#include "unicode/utf16.h"
19	#include "cstring.h"
20	#include "nortrans.h"
21
22	U_NAMESPACE_BEGIN
23
24	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
25
26	static inline Transliterator::Token cstrToken(const char *s) {
27	return Transliterator::pointerToken((void *)s);
28	}
29
30	/**
31	* System registration hook.
32	*/
33	void NormalizationTransliterator::registerIDs() {
34	// In the Token, the byte after the NUL is the UNormalization2Mode.
35	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
36	_create, cstrToken("nfc\0\0"));
37	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
38	_create, cstrToken("nfkc\0\0"));
39	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
40	_create, cstrToken("nfc\0\1"));
41	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
42	_create, cstrToken("nfkc\0\1"));
43	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
44	_create, cstrToken("nfc\0\2"));
45	Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
46	_create, cstrToken("nfc\0\3"));
47	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
48	UNICODE_STRING_SIMPLE("NFD"), TRUE);
49	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
50	UNICODE_STRING_SIMPLE("NFKD"), TRUE);
51	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
52	UNICODE_STRING_SIMPLE("NFD"), FALSE);
53	Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
54	UNICODE_STRING_SIMPLE("FCD"), FALSE);
55	}
56
57	/**
58	* Factory methods
59	*/
60	Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
61	Token context) {
62	const char name = (const* char *)context.pointer;
63	UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, `0`)[`1`];
64	UErrorCode errorCode = U_ZERO_ERROR;
65	const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
66	if(U_SUCCESS(errorCode)) {
67	return new NormalizationTransliterator (ID, *norm2);
68	} else {
69	return NULL;
70	}
71	}
72
73	/**
74	* Constructs a transliterator.
75	*/
76	NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
77	const Normalizer2 &norm2) :
78	Transliterator (id, `0`), fNorm2(norm2) {}
79
80	/**
81	* Destructor.
82	*/
83	NormalizationTransliterator::~NormalizationTransliterator() {
84	}
85
86	/**
87	* Copy constructor.
88	*/
89	NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
90	Transliterator (o), fNorm2(o.fNorm2) {}
91
92	/**
93	* Transliterator API.
94	*/
95	NormalizationTransliterator* NormalizationTransliterator::clone() const {
96	return new NormalizationTransliterator (*this);
97	}
98
99	/**
100	* Implements {@link Transliterator#handleTransliterate}.
101	*/
102	void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
103	UBool isIncremental) const {
104	// start and limit of the input range
105	int32_t start = offsets.start;
106	int32_t limit = offsets.limit;
107	if(start >= limit) {
108	return;
109	}
110
111	/*
112	* Normalize as short chunks at a time as possible even in
113	* bulk mode, so that styled text is minimally disrupted.
114	* In incremental mode, a chunk that ends with offsets.limit
115	* must not be normalized.
116	*
117	* If it was known that the input text is not styled, then
118	* a bulk mode normalization could look like this:
119
120	UnicodeString input, normalized;
121	int32_t length = limit - start;
122	_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
123	input.releaseBuffer(length);
124
125	UErrorCode status = U_ZERO_ERROR;
126	fNorm2.normalize(input, normalized, status);
127
128	text.handleReplaceBetween(start, limit, normalized);
129
130	int32_t delta = normalized.length() - length;
131	offsets.contextLimit += delta;
132	offsets.limit += delta;
133	offsets.start = limit + delta;
134
135	*/
136	UErrorCode errorCode = U_ZERO_ERROR;
137	UnicodeString segment;
138	UnicodeString normalized;
139	UChar32 c = text.char32At(start);
140	do {
141	int32_t prev = start;
142	// Skip at least one character so we make progress.
143	// c holds the character at start.
144	segment.remove();
145	do {
146	segment.append(c);
147	start += U16_LENGTH(c);
148	} while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
149	if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
150	// stop in incremental mode when we reach the input limit
151	// in case there are additional characters that could change the
152	// normalization result
153	start=prev;
154	break;
155	}
156	fNorm2.normalize(segment, normalized, errorCode);
157	if(U_FAILURE(errorCode)) {
158	break;
159	}
160	if(segment != normalized) {
161	// replace the input chunk with its normalized form
162	text.handleReplaceBetween(prev, start, normalized);
163
164	// update all necessary indexes accordingly
165	int32_t delta = normalized.length() - (start - prev);
166	start += delta;
167	limit += delta;
168	}
169	} while(start < limit);
170
171	offsets.start = start;
172	offsets.contextLimit += limit - offsets.limit;
173	offsets.limit = limit;
174	}
175
176	U_NAMESPACE_END
177
178	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
179

Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/nortrans.cpp