1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (c) 2001-2012, International Business Machines Corporation
6* and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 07/23/01 aliu Creation.
10**********************************************************************
11*/
12
13#include "unicode/utypes.h"
14
15#if !UCONFIG_NO_TRANSLITERATION
16
17#include "strmatch.h"
18#include "rbt_data.h"
19#include "util.h"
20#include "unicode/uniset.h"
21#include "unicode/utf16.h"
22
23U_NAMESPACE_BEGIN
24
25UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
26
27StringMatcher::StringMatcher(const UnicodeString& theString,
28 int32_t start,
29 int32_t limit,
30 int32_t segmentNum,
31 const TransliterationRuleData& theData) :
32 data(&theData),
33 segmentNumber(segmentNum),
34 matchStart(-1),
35 matchLimit(-1)
36{
37 theString.extractBetween(start, limit, pattern);
38}
39
40StringMatcher::StringMatcher(const StringMatcher& o) :
41 UnicodeFunctor(o),
42 UnicodeMatcher(o),
43 UnicodeReplacer(o),
44 pattern(o.pattern),
45 data(o.data),
46 segmentNumber(o.segmentNumber),
47 matchStart(o.matchStart),
48 matchLimit(o.matchLimit)
49{
50}
51
52/**
53 * Destructor
54 */
55StringMatcher::~StringMatcher() {
56}
57
58/**
59 * Implement UnicodeFunctor
60 */
61StringMatcher* StringMatcher::clone() const {
62 return new StringMatcher(*this);
63}
64
65/**
66 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
67 * and return the pointer.
68 */
69UnicodeMatcher* StringMatcher::toMatcher() const {
70 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
71 UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
72
73 return nonconst_base;
74}
75
76/**
77 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
78 * and return the pointer.
79 */
80UnicodeReplacer* StringMatcher::toReplacer() const {
81 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
82 UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
83
84 return nonconst_base;
85}
86
87/**
88 * Implement UnicodeMatcher
89 */
90UMatchDegree StringMatcher::matches(const Replaceable& text,
91 int32_t& offset,
92 int32_t limit,
93 UBool incremental) {
94 int32_t i;
95 int32_t cursor = offset;
96 if (limit < cursor) {
97 // Match in the reverse direction
98 for (i=pattern.length()-1; i>=0; --i) {
99 UChar keyChar = pattern.charAt(i);
100 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
101 if (subm == 0) {
102 if (cursor > limit &&
103 keyChar == text.charAt(cursor)) {
104 --cursor;
105 } else {
106 return U_MISMATCH;
107 }
108 } else {
109 UMatchDegree m =
110 subm->matches(text, cursor, limit, incremental);
111 if (m != U_MATCH) {
112 return m;
113 }
114 }
115 }
116 // Record the match position, but adjust for a normal
117 // forward start, limit, and only if a prior match does not
118 // exist -- we want the rightmost match.
119 if (matchStart < 0) {
120 matchStart = cursor+1;
121 matchLimit = offset+1;
122 }
123 } else {
124 for (i=0; i<pattern.length(); ++i) {
125 if (incremental && cursor == limit) {
126 // We've reached the context limit without a mismatch and
127 // without completing our match.
128 return U_PARTIAL_MATCH;
129 }
130 UChar keyChar = pattern.charAt(i);
131 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
132 if (subm == 0) {
133 // Don't need the cursor < limit check if
134 // incremental is TRUE (because it's done above); do need
135 // it otherwise.
136 if (cursor < limit &&
137 keyChar == text.charAt(cursor)) {
138 ++cursor;
139 } else {
140 return U_MISMATCH;
141 }
142 } else {
143 UMatchDegree m =
144 subm->matches(text, cursor, limit, incremental);
145 if (m != U_MATCH) {
146 return m;
147 }
148 }
149 }
150 // Record the match position
151 matchStart = offset;
152 matchLimit = cursor;
153 }
154
155 offset = cursor;
156 return U_MATCH;
157}
158
159/**
160 * Implement UnicodeMatcher
161 */
162UnicodeString& StringMatcher::toPattern(UnicodeString& result,
163 UBool escapeUnprintable) const
164{
165 result.truncate(0);
166 UnicodeString str, quoteBuf;
167 if (segmentNumber > 0) {
168 result.append((UChar)40); /*(*/
169 }
170 for (int32_t i=0; i<pattern.length(); ++i) {
171 UChar keyChar = pattern.charAt(i);
172 const UnicodeMatcher* m = data->lookupMatcher(keyChar);
173 if (m == 0) {
174 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
175 } else {
176 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
177 TRUE, escapeUnprintable, quoteBuf);
178 }
179 }
180 if (segmentNumber > 0) {
181 result.append((UChar)41); /*)*/
182 }
183 // Flush quoteBuf out to result
184 ICU_Utility::appendToRule(result, -1,
185 TRUE, escapeUnprintable, quoteBuf);
186 return result;
187}
188
189/**
190 * Implement UnicodeMatcher
191 */
192UBool StringMatcher::matchesIndexValue(uint8_t v) const {
193 if (pattern.length() == 0) {
194 return TRUE;
195 }
196 UChar32 c = pattern.char32At(0);
197 const UnicodeMatcher *m = data->lookupMatcher(c);
198 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
199}
200
201/**
202 * Implement UnicodeMatcher
203 */
204void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
205 UChar32 ch;
206 for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
207 ch = pattern.char32At(i);
208 const UnicodeMatcher* matcher = data->lookupMatcher(ch);
209 if (matcher == NULL) {
210 toUnionTo.add(ch);
211 } else {
212 matcher->addMatchSetTo(toUnionTo);
213 }
214 }
215}
216
217/**
218 * UnicodeReplacer API
219 */
220int32_t StringMatcher::replace(Replaceable& text,
221 int32_t start,
222 int32_t limit,
223 int32_t& /*cursor*/) {
224
225 int32_t outLen = 0;
226
227 // Copy segment with out-of-band data
228 int32_t dest = limit;
229 // If there was no match, that means that a quantifier
230 // matched zero-length. E.g., x (a)* y matched "xy".
231 if (matchStart >= 0) {
232 if (matchStart != matchLimit) {
233 text.copy(matchStart, matchLimit, dest);
234 outLen = matchLimit - matchStart;
235 }
236 }
237
238 text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
239
240 return outLen;
241}
242
243/**
244 * UnicodeReplacer API
245 */
246UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
247 UBool /*escapeUnprintable*/) const {
248 // assert(segmentNumber > 0);
249 rule.truncate(0);
250 rule.append((UChar)0x0024 /*$*/);
251 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
252 return rule;
253}
254
255/**
256 * Remove any match info. This must be called before performing a
257 * set of matches with this segment.
258 */
259 void StringMatcher::resetMatch() {
260 matchStart = matchLimit = -1;
261}
262
263/**
264 * Union the set of all characters that may output by this object
265 * into the given set.
266 * @param toUnionTo the set into which to union the output characters
267 */
268void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
269 // The output of this replacer varies; it is the source text between
270 // matchStart and matchLimit. Since this varies depending on the
271 // input text, we can't compute it here. We can either do nothing
272 // or we can add ALL characters to the set. It's probably more useful
273 // to do nothing.
274}
275
276/**
277 * Implement UnicodeFunctor
278 */
279void StringMatcher::setData(const TransliterationRuleData* d) {
280 data = d;
281 int32_t i = 0;
282 while (i<pattern.length()) {
283 UChar32 c = pattern.char32At(i);
284 UnicodeFunctor* f = data->lookup(c);
285 if (f != NULL) {
286 f->setData(data);
287 }
288 i += U16_LENGTH(c);
289 }
290}
291
292U_NAMESPACE_END
293
294#endif /* #if !UCONFIG_NO_TRANSLITERATION */
295
296//eof
297