// strmatch.cpp source code [ClickHouse/contrib/icu/icu4c/source/i18n/strmatch.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (c) 2001-2012, International Business Machines Corporation
6	* and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 07/23/01 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/utypes.h"
14
15	#if !UCONFIG_NO_TRANSLITERATION
16
17	#include "strmatch.h"
18	#include "rbt_data.h"
19	#include "util.h"
20	#include "unicode/uniset.h"
21	#include "unicode/utf16.h"
22
23	U_NAMESPACE_BEGIN
24
25	UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
26
27	StringMatcher::StringMatcher(const UnicodeString& theString,
28	int32_t start,
29	int32_t limit,
30	int32_t segmentNum,
31	const TransliterationRuleData& theData) :
32	data(&theData),
33	segmentNumber(segmentNum),
34	matchStart(-`1`),
35	matchLimit(-`1`)
36	{
37	theString.extractBetween(start, limit, pattern);
38	}
39
40	StringMatcher::StringMatcher(const StringMatcher& o) :
41	UnicodeFunctor (o),
42	UnicodeMatcher (o),
43	UnicodeReplacer (o),
44	pattern (o.pattern),
45	data(o.data),
46	segmentNumber(o.segmentNumber),
47	matchStart(o.matchStart),
48	matchLimit(o.matchLimit)
49	{
50	}
51
52	/**
53	* Destructor
54	*/
55	StringMatcher::~StringMatcher() {
56	}
57
58	/**
59	* Implement UnicodeFunctor
60	*/
61	StringMatcher* StringMatcher::clone() const {
62	return new StringMatcher (*this);
63	}
64
65	/**
66	* UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
67	* and return the pointer.
68	*/
69	UnicodeMatcher* StringMatcher::toMatcher() const {
70	StringMatcher nonconst_this = const_cast<StringMatcher >(this);
71	UnicodeMatcher nonconst_base = static_cast<UnicodeMatcher >(nonconst_this);
72
73	return nonconst_base;
74	}
75
76	/**
77	* UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
78	* and return the pointer.
79	*/
80	UnicodeReplacer* StringMatcher::toReplacer() const {
81	StringMatcher nonconst_this = const_cast<StringMatcher >(this);
82	UnicodeReplacer nonconst_base = static_cast<UnicodeReplacer >(nonconst_this);
83
84	return nonconst_base;
85	}
86
87	/**
88	* Implement UnicodeMatcher
89	*/
90	UMatchDegree StringMatcher::matches(const Replaceable& text,
91	int32_t& offset,
92	int32_t limit,
93	UBool incremental) {
94	int32_t i;
95	int32_t cursor = offset;
96	if (limit < cursor) {
97	// Match in the reverse direction
98	for (i=pattern.length()-`1`; i>=`0`; --i) {
99	UChar keyChar = pattern.charAt(i);
100	UnicodeMatcher* subm = data->lookupMatcher(keyChar);
101	if (subm == `0`) {
102	if (cursor > limit &&
103	keyChar == text.charAt(cursor)) {
104	--cursor;
105	} else {
106	return U_MISMATCH;
107	}
108	} else {
109	UMatchDegree m =
110	subm->matches(text, cursor, limit, incremental);
111	if (m != U_MATCH) {
112	return m;
113	}
114	}
115	}
116	// Record the match position, but adjust for a normal
117	// forward start, limit, and only if a prior match does not
118	// exist -- we want the rightmost match.
119	if (matchStart < `0`) {
120	matchStart = cursor+`1`;
121	matchLimit = offset+`1`;
122	}
123	} else {
124	for (i=`0`; i<pattern.length(); ++i) {
125	if (incremental && cursor == limit) {
126	// We've reached the context limit without a mismatch and
127	// without completing our match.
128	return U_PARTIAL_MATCH;
129	}
130	UChar keyChar = pattern.charAt(i);
131	UnicodeMatcher* subm = data->lookupMatcher(keyChar);
132	if (subm == `0`) {
133	// Don't need the cursor < limit check if
134	// incremental is TRUE (because it's done above); do need
135	// it otherwise.
136	if (cursor < limit &&
137	keyChar == text.charAt(cursor)) {
138	++cursor;
139	} else {
140	return U_MISMATCH;
141	}
142	} else {
143	UMatchDegree m =
144	subm->matches(text, cursor, limit, incremental);
145	if (m != U_MATCH) {
146	return m;
147	}
148	}
149	}
150	// Record the match position
151	matchStart = offset;
152	matchLimit = cursor;
153	}
154
155	offset = cursor;
156	return U_MATCH;
157	}
158
159	/**
160	* Implement UnicodeMatcher
161	*/
162	UnicodeString& StringMatcher::toPattern(UnicodeString& result,
163	UBool escapeUnprintable) const
164	{
165	result.truncate(`0`);
166	UnicodeString str, quoteBuf;
167	if (segmentNumber > `0`) {
168	result.append((UChar)`40`); /(/
169	}
170	for (int32_t i=`0`; i<pattern.length(); ++i) {
171	UChar keyChar = pattern.charAt(i);
172	const UnicodeMatcher* m = data->lookupMatcher(keyChar);
173	if (m == `0`) {
174	ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
175	} else {
176	ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
177	TRUE, escapeUnprintable, quoteBuf);
178	}
179	}
180	if (segmentNumber > `0`) {
181	result.append((UChar)`41`); /)/
182	}
183	// Flush quoteBuf out to result
184	ICU_Utility::appendToRule(result, -`1`,
185	TRUE, escapeUnprintable, quoteBuf);
186	return result;
187	}
188
189	/**
190	* Implement UnicodeMatcher
191	*/
192	UBool StringMatcher::matchesIndexValue(uint8_t v) const {
193	if (pattern.length() == `0`) {
194	return TRUE;
195	}
196	UChar32 c = pattern.char32At(`0`);
197	const UnicodeMatcher *m = data->lookupMatcher(c);
198	return (m == `0`) ? ((c & `0xFF`) == v) : m->matchesIndexValue(v);
199	}
200
201	/**
202	* Implement UnicodeMatcher
203	*/
204	void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
205	UChar32 ch;
206	for (int32_t i=`0`; i<pattern.length(); i+=U16_LENGTH(ch)) {
207	ch = pattern.char32At(i);
208	const UnicodeMatcher* matcher = data->lookupMatcher(ch);
209	if (matcher == NULL) {
210	toUnionTo.add(ch);
211	} else {
212	matcher->addMatchSetTo(toUnionTo);
213	}
214	}
215	}
216
217	/**
218	* UnicodeReplacer API
219	*/
220	int32_t StringMatcher::replace(Replaceable& text,
221	int32_t start,
222	int32_t limit,
223	int32_t& /cursor/) {
224
225	int32_t outLen = `0`;
226
227	// Copy segment with out-of-band data
228	int32_t dest = limit;
229	// If there was no match, that means that a quantifier
230	// matched zero-length. E.g., x (a) y matched "xy".*
231	if (matchStart >= `0`) {
232	if (matchStart != matchLimit) {
233	text.copy(matchStart, matchLimit, dest);
234	outLen = matchLimit - matchStart;
235	}
236	}
237
238	text.handleReplaceBetween(start, limit, UnicodeString ()); // delete original text
239
240	return outLen;
241	}
242
243	/**
244	* UnicodeReplacer API
245	*/
246	UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
247	UBool /escapeUnprintable/) const {
248	// assert(segmentNumber > 0);
249	rule.truncate(`0`);
250	rule.append((UChar)`0x0024` /$/);
251	ICU_Utility::appendNumber(rule, segmentNumber, `10`, `1`);
252	return rule;
253	}
254
255	/**
256	* Remove any match info. This must be called before performing a
257	* set of matches with this segment.
258	*/
259	void StringMatcher::resetMatch() {
260	matchStart = matchLimit = -`1`;
261	}
262
263	/**
264	* Union the set of all characters that may output by this object
265	* into the given set.
266	* @param toUnionTo the set into which to union the output characters
267	*/
268	void StringMatcher::addReplacementSetTo(UnicodeSet& /toUnionTo/) const {
269	// The output of this replacer varies; it is the source text between
270	// matchStart and matchLimit. Since this varies depending on the
271	// input text, we can't compute it here. We can either do nothing
272	// or we can add ALL characters to the set. It's probably more useful
273	// to do nothing.
274	}
275
276	/**
277	* Implement UnicodeFunctor
278	*/
279	void StringMatcher::setData(const TransliterationRuleData* d) {
280	data = d;
281	int32_t i = `0`;
282	while (i<pattern.length()) {
283	UChar32 c = pattern.char32At(i);
284	UnicodeFunctor* f = data->lookup(c);
285	if (f != NULL) {
286	f->setData(data);
287	}
288	i += U16_LENGTH(c);
289	}
290	}
291
292	U_NAMESPACE_END
293
294	#endif /* #if !UCONFIG_NO_TRANSLITERATION */
295
296	//eof
297

// Browse the source code of ClickHouse/contrib/icu/icu4c/source/i18n/strmatch.cpp