1// © 2017 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4#include "unicode/utypes.h"
5
6#if !UCONFIG_NO_FORMATTING
7
8#include "number_affixutils.h"
9#include "unicode/utf16.h"
10#include "unicode/uniset.h"
11
12using namespace icu;
13using namespace icu::number;
14using namespace icu::number::impl;
15
// Out-of-line definitions of the TokenConsumer and SymbolProvider destructors. Both use the
// compiler-generated behavior; they are merely defined in this translation unit.
TokenConsumer::~TokenConsumer() = default;
SymbolProvider::~SymbolProvider() = default;
18
19int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) {
20 AffixPatternState state = STATE_BASE;
21 int32_t offset = 0;
22 int32_t length = 0;
23 for (; offset < patternString.length();) {
24 UChar32 cp = patternString.char32At(offset);
25
26 switch (state) {
27 case STATE_BASE:
28 if (cp == u'\'') {
29 // First quote
30 state = STATE_FIRST_QUOTE;
31 } else {
32 // Unquoted symbol
33 length++;
34 }
35 break;
36 case STATE_FIRST_QUOTE:
37 if (cp == u'\'') {
38 // Repeated quote
39 length++;
40 state = STATE_BASE;
41 } else {
42 // Quoted code point
43 length++;
44 state = STATE_INSIDE_QUOTE;
45 }
46 break;
47 case STATE_INSIDE_QUOTE:
48 if (cp == u'\'') {
49 // End of quoted sequence
50 state = STATE_AFTER_QUOTE;
51 } else {
52 // Quoted code point
53 length++;
54 }
55 break;
56 case STATE_AFTER_QUOTE:
57 if (cp == u'\'') {
58 // Double quote inside of quoted sequence
59 length++;
60 state = STATE_INSIDE_QUOTE;
61 } else {
62 // Unquoted symbol
63 length++;
64 }
65 break;
66 default:
67 UPRV_UNREACHABLE;
68 }
69
70 offset += U16_LENGTH(cp);
71 }
72
73 switch (state) {
74 case STATE_FIRST_QUOTE:
75 case STATE_INSIDE_QUOTE:
76 status = U_ILLEGAL_ARGUMENT_ERROR;
77 break;
78 default:
79 break;
80 }
81
82 return length;
83}
84
85UnicodeString AffixUtils::escape(const UnicodeString &input) {
86 AffixPatternState state = STATE_BASE;
87 int32_t offset = 0;
88 UnicodeString output;
89 for (; offset < input.length();) {
90 UChar32 cp = input.char32At(offset);
91
92 switch (cp) {
93 case u'\'':
94 output.append(u"''", -1);
95 break;
96
97 case u'-':
98 case u'+':
99 case u'%':
100 case u'‰':
101 case u'¤':
102 if (state == STATE_BASE) {
103 output.append(u'\'');
104 output.append(cp);
105 state = STATE_INSIDE_QUOTE;
106 } else {
107 output.append(cp);
108 }
109 break;
110
111 default:
112 if (state == STATE_INSIDE_QUOTE) {
113 output.append(u'\'');
114 output.append(cp);
115 state = STATE_BASE;
116 } else {
117 output.append(cp);
118 }
119 break;
120 }
121 offset += U16_LENGTH(cp);
122 }
123
124 if (state == STATE_INSIDE_QUOTE) {
125 output.append(u'\'');
126 }
127
128 return output;
129}
130
131Field AffixUtils::getFieldForType(AffixPatternType type) {
132 switch (type) {
133 case TYPE_MINUS_SIGN:
134 return UNUM_SIGN_FIELD;
135 case TYPE_PLUS_SIGN:
136 return UNUM_SIGN_FIELD;
137 case TYPE_PERCENT:
138 return UNUM_PERCENT_FIELD;
139 case TYPE_PERMILLE:
140 return UNUM_PERMILL_FIELD;
141 case TYPE_CURRENCY_SINGLE:
142 return UNUM_CURRENCY_FIELD;
143 case TYPE_CURRENCY_DOUBLE:
144 return UNUM_CURRENCY_FIELD;
145 case TYPE_CURRENCY_TRIPLE:
146 return UNUM_CURRENCY_FIELD;
147 case TYPE_CURRENCY_QUAD:
148 return UNUM_CURRENCY_FIELD;
149 case TYPE_CURRENCY_QUINT:
150 return UNUM_CURRENCY_FIELD;
151 case TYPE_CURRENCY_OVERFLOW:
152 return UNUM_CURRENCY_FIELD;
153 default:
154 UPRV_UNREACHABLE;
155 }
156}
157
158int32_t
159AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position,
160 const SymbolProvider &provider, Field field, UErrorCode &status) {
161 int32_t length = 0;
162 AffixTag tag;
163 while (hasNext(tag, affixPattern)) {
164 tag = nextToken(tag, affixPattern, status);
165 if (U_FAILURE(status)) { return length; }
166 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
167 // Don't go to the provider for this special case
168 length += output.insertCodePoint(position + length, 0xFFFD, UNUM_CURRENCY_FIELD, status);
169 } else if (tag.type < 0) {
170 length += output.insert(
171 position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status);
172 } else {
173 length += output.insertCodePoint(position + length, tag.codePoint, field, status);
174 }
175 }
176 return length;
177}
178
179int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern,
180 const SymbolProvider &provider, UErrorCode &status) {
181 int32_t length = 0;
182 AffixTag tag;
183 while (hasNext(tag, affixPattern)) {
184 tag = nextToken(tag, affixPattern, status);
185 if (U_FAILURE(status)) { return length; }
186 if (tag.type == TYPE_CURRENCY_OVERFLOW) {
187 length += 1;
188 } else if (tag.type < 0) {
189 length += provider.getSymbol(tag.type).length();
190 } else {
191 length += U16_LENGTH(tag.codePoint);
192 }
193 }
194 return length;
195}
196
197bool
198AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) {
199 if (affixPattern.length() == 0) {
200 return false;
201 }
202 AffixTag tag;
203 while (hasNext(tag, affixPattern)) {
204 tag = nextToken(tag, affixPattern, status);
205 if (U_FAILURE(status)) { return false; }
206 if (tag.type == type) {
207 return true;
208 }
209 }
210 return false;
211}
212
213bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) {
214 if (affixPattern.length() == 0) {
215 return false;
216 }
217 AffixTag tag;
218 while (hasNext(tag, affixPattern)) {
219 tag = nextToken(tag, affixPattern, status);
220 if (U_FAILURE(status)) { return false; }
221 if (tag.type < 0 && getFieldForType(tag.type) == UNUM_CURRENCY_FIELD) {
222 return true;
223 }
224 }
225 return false;
226}
227
228UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type,
229 char16_t replacementChar, UErrorCode &status) {
230 UnicodeString output(affixPattern); // copy
231 if (affixPattern.length() == 0) {
232 return output;
233 }
234 AffixTag tag;
235 while (hasNext(tag, affixPattern)) {
236 tag = nextToken(tag, affixPattern, status);
237 if (U_FAILURE(status)) { return output; }
238 if (tag.type == type) {
239 output.replace(tag.offset - 1, 1, replacementChar);
240 }
241 }
242 return output;
243}
244
245bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern,
246 const UnicodeSet& ignorables, UErrorCode& status) {
247 if (affixPattern.length() == 0) {
248 return true;
249 }
250 AffixTag tag;
251 while (hasNext(tag, affixPattern)) {
252 tag = nextToken(tag, affixPattern, status);
253 if (U_FAILURE(status)) { return false; }
254 if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) {
255 return false;
256 }
257 }
258 return true;
259}
260
261void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer,
262 UErrorCode& status) {
263 if (affixPattern.length() == 0) {
264 return;
265 }
266 AffixTag tag;
267 while (hasNext(tag, affixPattern)) {
268 tag = nextToken(tag, affixPattern, status);
269 if (U_FAILURE(status)) { return; }
270 consumer.consumeToken(tag.type, tag.codePoint, status);
271 if (U_FAILURE(status)) { return; }
272 }
273}
274
275AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) {
276 int32_t offset = tag.offset;
277 int32_t state = tag.state;
278 for (; offset < patternString.length();) {
279 UChar32 cp = patternString.char32At(offset);
280 int32_t count = U16_LENGTH(cp);
281
282 switch (state) {
283 case STATE_BASE:
284 switch (cp) {
285 case u'\'':
286 state = STATE_FIRST_QUOTE;
287 offset += count;
288 // continue to the next code point
289 break;
290 case u'-':
291 return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0);
292 case u'+':
293 return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0);
294 case u'%':
295 return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0);
296 case u'‰':
297 return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0);
298 case u'¤':
299 state = STATE_FIRST_CURR;
300 offset += count;
301 // continue to the next code point
302 break;
303 default:
304 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
305 }
306 break;
307 case STATE_FIRST_QUOTE:
308 if (cp == u'\'') {
309 return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp);
310 } else {
311 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
312 }
313 case STATE_INSIDE_QUOTE:
314 if (cp == u'\'') {
315 state = STATE_AFTER_QUOTE;
316 offset += count;
317 // continue to the next code point
318 break;
319 } else {
320 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
321 }
322 case STATE_AFTER_QUOTE:
323 if (cp == u'\'') {
324 return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp);
325 } else {
326 state = STATE_BASE;
327 // re-evaluate this code point
328 break;
329 }
330 case STATE_FIRST_CURR:
331 if (cp == u'¤') {
332 state = STATE_SECOND_CURR;
333 offset += count;
334 // continue to the next code point
335 break;
336 } else {
337 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
338 }
339 case STATE_SECOND_CURR:
340 if (cp == u'¤') {
341 state = STATE_THIRD_CURR;
342 offset += count;
343 // continue to the next code point
344 break;
345 } else {
346 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
347 }
348 case STATE_THIRD_CURR:
349 if (cp == u'¤') {
350 state = STATE_FOURTH_CURR;
351 offset += count;
352 // continue to the next code point
353 break;
354 } else {
355 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
356 }
357 case STATE_FOURTH_CURR:
358 if (cp == u'¤') {
359 state = STATE_FIFTH_CURR;
360 offset += count;
361 // continue to the next code point
362 break;
363 } else {
364 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
365 }
366 case STATE_FIFTH_CURR:
367 if (cp == u'¤') {
368 state = STATE_OVERFLOW_CURR;
369 offset += count;
370 // continue to the next code point
371 break;
372 } else {
373 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
374 }
375 case STATE_OVERFLOW_CURR:
376 if (cp == u'¤') {
377 offset += count;
378 // continue to the next code point and loop back to this state
379 break;
380 } else {
381 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
382 }
383 default:
384 UPRV_UNREACHABLE;
385 }
386 }
387 // End of string
388 switch (state) {
389 case STATE_BASE:
390 // No more tokens in string.
391 return {-1};
392 case STATE_FIRST_QUOTE:
393 case STATE_INSIDE_QUOTE:
394 // For consistent behavior with the JDK and ICU 58, set an error here.
395 status = U_ILLEGAL_ARGUMENT_ERROR;
396 return {-1};
397 case STATE_AFTER_QUOTE:
398 // No more tokens in string.
399 return {-1};
400 case STATE_FIRST_CURR:
401 return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0);
402 case STATE_SECOND_CURR:
403 return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0);
404 case STATE_THIRD_CURR:
405 return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0);
406 case STATE_FOURTH_CURR:
407 return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0);
408 case STATE_FIFTH_CURR:
409 return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0);
410 case STATE_OVERFLOW_CURR:
411 return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0);
412 default:
413 UPRV_UNREACHABLE;
414 }
415}
416
417bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) {
418 // First check for the {-1} and default initializer syntax.
419 if (tag.offset < 0) {
420 return false;
421 } else if (tag.offset == 0) {
422 return string.length() > 0;
423 }
424 // The rest of the fields are safe to use now.
425 // Special case: the last character in string is an end quote.
426 if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 &&
427 string.charAt(tag.offset) == u'\'') {
428 return false;
429 } else if (tag.state != STATE_BASE) {
430 return true;
431 } else {
432 return tag.offset < string.length();
433 }
434}
435
436#endif /* #if !UCONFIG_NO_FORMATTING */
437