1 | // © 2017 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | #include "unicode/utypes.h" |
5 | |
6 | #if !UCONFIG_NO_FORMATTING |
7 | |
8 | #include "number_affixutils.h" |
9 | #include "unicode/utf16.h" |
10 | #include "unicode/uniset.h" |
11 | |
12 | using namespace icu; |
13 | using namespace icu::number; |
14 | using namespace icu::number::impl; |
15 | |
// Out-of-line destructor definitions for TokenConsumer and SymbolProvider.
// Both classes are declared elsewhere (presumably in number_affixutils.h);
// the destructors simply use the compiler-generated behavior.
TokenConsumer::~TokenConsumer() = default;
SymbolProvider::~SymbolProvider() = default;
18 | |
19 | int32_t AffixUtils::estimateLength(const UnicodeString &patternString, UErrorCode &status) { |
20 | AffixPatternState state = STATE_BASE; |
21 | int32_t offset = 0; |
22 | int32_t length = 0; |
23 | for (; offset < patternString.length();) { |
24 | UChar32 cp = patternString.char32At(offset); |
25 | |
26 | switch (state) { |
27 | case STATE_BASE: |
28 | if (cp == u'\'') { |
29 | // First quote |
30 | state = STATE_FIRST_QUOTE; |
31 | } else { |
32 | // Unquoted symbol |
33 | length++; |
34 | } |
35 | break; |
36 | case STATE_FIRST_QUOTE: |
37 | if (cp == u'\'') { |
38 | // Repeated quote |
39 | length++; |
40 | state = STATE_BASE; |
41 | } else { |
42 | // Quoted code point |
43 | length++; |
44 | state = STATE_INSIDE_QUOTE; |
45 | } |
46 | break; |
47 | case STATE_INSIDE_QUOTE: |
48 | if (cp == u'\'') { |
49 | // End of quoted sequence |
50 | state = STATE_AFTER_QUOTE; |
51 | } else { |
52 | // Quoted code point |
53 | length++; |
54 | } |
55 | break; |
56 | case STATE_AFTER_QUOTE: |
57 | if (cp == u'\'') { |
58 | // Double quote inside of quoted sequence |
59 | length++; |
60 | state = STATE_INSIDE_QUOTE; |
61 | } else { |
62 | // Unquoted symbol |
63 | length++; |
64 | } |
65 | break; |
66 | default: |
67 | UPRV_UNREACHABLE; |
68 | } |
69 | |
70 | offset += U16_LENGTH(cp); |
71 | } |
72 | |
73 | switch (state) { |
74 | case STATE_FIRST_QUOTE: |
75 | case STATE_INSIDE_QUOTE: |
76 | status = U_ILLEGAL_ARGUMENT_ERROR; |
77 | break; |
78 | default: |
79 | break; |
80 | } |
81 | |
82 | return length; |
83 | } |
84 | |
85 | UnicodeString AffixUtils::escape(const UnicodeString &input) { |
86 | AffixPatternState state = STATE_BASE; |
87 | int32_t offset = 0; |
88 | UnicodeString output; |
89 | for (; offset < input.length();) { |
90 | UChar32 cp = input.char32At(offset); |
91 | |
92 | switch (cp) { |
93 | case u'\'': |
94 | output.append(u"''" , -1); |
95 | break; |
96 | |
97 | case u'-': |
98 | case u'+': |
99 | case u'%': |
100 | case u'‰': |
101 | case u'¤': |
102 | if (state == STATE_BASE) { |
103 | output.append(u'\''); |
104 | output.append(cp); |
105 | state = STATE_INSIDE_QUOTE; |
106 | } else { |
107 | output.append(cp); |
108 | } |
109 | break; |
110 | |
111 | default: |
112 | if (state == STATE_INSIDE_QUOTE) { |
113 | output.append(u'\''); |
114 | output.append(cp); |
115 | state = STATE_BASE; |
116 | } else { |
117 | output.append(cp); |
118 | } |
119 | break; |
120 | } |
121 | offset += U16_LENGTH(cp); |
122 | } |
123 | |
124 | if (state == STATE_INSIDE_QUOTE) { |
125 | output.append(u'\''); |
126 | } |
127 | |
128 | return output; |
129 | } |
130 | |
131 | Field AffixUtils::getFieldForType(AffixPatternType type) { |
132 | switch (type) { |
133 | case TYPE_MINUS_SIGN: |
134 | return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; |
135 | case TYPE_PLUS_SIGN: |
136 | return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD}; |
137 | case TYPE_PERCENT: |
138 | return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD}; |
139 | case TYPE_PERMILLE: |
140 | return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD}; |
141 | case TYPE_CURRENCY_SINGLE: |
142 | return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
143 | case TYPE_CURRENCY_DOUBLE: |
144 | return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
145 | case TYPE_CURRENCY_TRIPLE: |
146 | return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
147 | case TYPE_CURRENCY_QUAD: |
148 | return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
149 | case TYPE_CURRENCY_QUINT: |
150 | return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
151 | case TYPE_CURRENCY_OVERFLOW: |
152 | return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}; |
153 | default: |
154 | UPRV_UNREACHABLE; |
155 | } |
156 | } |
157 | |
158 | int32_t |
159 | AffixUtils::unescape(const UnicodeString &affixPattern, FormattedStringBuilder &output, int32_t position, |
160 | const SymbolProvider &provider, Field field, UErrorCode &status) { |
161 | int32_t length = 0; |
162 | AffixTag tag; |
163 | while (hasNext(tag, affixPattern)) { |
164 | tag = nextToken(tag, affixPattern, status); |
165 | if (U_FAILURE(status)) { return length; } |
166 | if (tag.type == TYPE_CURRENCY_OVERFLOW) { |
167 | // Don't go to the provider for this special case |
168 | length += output.insertCodePoint( |
169 | position + length, |
170 | 0xFFFD, |
171 | {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD}, |
172 | status); |
173 | } else if (tag.type < 0) { |
174 | length += output.insert( |
175 | position + length, provider.getSymbol(tag.type), getFieldForType(tag.type), status); |
176 | } else { |
177 | length += output.insertCodePoint(position + length, tag.codePoint, field, status); |
178 | } |
179 | } |
180 | return length; |
181 | } |
182 | |
183 | int32_t AffixUtils::unescapedCodePointCount(const UnicodeString &affixPattern, |
184 | const SymbolProvider &provider, UErrorCode &status) { |
185 | int32_t length = 0; |
186 | AffixTag tag; |
187 | while (hasNext(tag, affixPattern)) { |
188 | tag = nextToken(tag, affixPattern, status); |
189 | if (U_FAILURE(status)) { return length; } |
190 | if (tag.type == TYPE_CURRENCY_OVERFLOW) { |
191 | length += 1; |
192 | } else if (tag.type < 0) { |
193 | length += provider.getSymbol(tag.type).length(); |
194 | } else { |
195 | length += U16_LENGTH(tag.codePoint); |
196 | } |
197 | } |
198 | return length; |
199 | } |
200 | |
201 | bool |
202 | AffixUtils::containsType(const UnicodeString &affixPattern, AffixPatternType type, UErrorCode &status) { |
203 | if (affixPattern.length() == 0) { |
204 | return false; |
205 | } |
206 | AffixTag tag; |
207 | while (hasNext(tag, affixPattern)) { |
208 | tag = nextToken(tag, affixPattern, status); |
209 | if (U_FAILURE(status)) { return false; } |
210 | if (tag.type == type) { |
211 | return true; |
212 | } |
213 | } |
214 | return false; |
215 | } |
216 | |
217 | bool AffixUtils::hasCurrencySymbols(const UnicodeString &affixPattern, UErrorCode &status) { |
218 | if (affixPattern.length() == 0) { |
219 | return false; |
220 | } |
221 | AffixTag tag; |
222 | while (hasNext(tag, affixPattern)) { |
223 | tag = nextToken(tag, affixPattern, status); |
224 | if (U_FAILURE(status)) { return false; } |
225 | if (tag.type < 0 && getFieldForType(tag.type) == Field(UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD)) { |
226 | return true; |
227 | } |
228 | } |
229 | return false; |
230 | } |
231 | |
232 | UnicodeString AffixUtils::replaceType(const UnicodeString &affixPattern, AffixPatternType type, |
233 | char16_t replacementChar, UErrorCode &status) { |
234 | UnicodeString output(affixPattern); // copy |
235 | if (affixPattern.length() == 0) { |
236 | return output; |
237 | } |
238 | AffixTag tag; |
239 | while (hasNext(tag, affixPattern)) { |
240 | tag = nextToken(tag, affixPattern, status); |
241 | if (U_FAILURE(status)) { return output; } |
242 | if (tag.type == type) { |
243 | output.replace(tag.offset - 1, 1, replacementChar); |
244 | } |
245 | } |
246 | return output; |
247 | } |
248 | |
249 | bool AffixUtils::containsOnlySymbolsAndIgnorables(const UnicodeString& affixPattern, |
250 | const UnicodeSet& ignorables, UErrorCode& status) { |
251 | if (affixPattern.length() == 0) { |
252 | return true; |
253 | } |
254 | AffixTag tag; |
255 | while (hasNext(tag, affixPattern)) { |
256 | tag = nextToken(tag, affixPattern, status); |
257 | if (U_FAILURE(status)) { return false; } |
258 | if (tag.type == TYPE_CODEPOINT && !ignorables.contains(tag.codePoint)) { |
259 | return false; |
260 | } |
261 | } |
262 | return true; |
263 | } |
264 | |
265 | void AffixUtils::iterateWithConsumer(const UnicodeString& affixPattern, TokenConsumer& consumer, |
266 | UErrorCode& status) { |
267 | if (affixPattern.length() == 0) { |
268 | return; |
269 | } |
270 | AffixTag tag; |
271 | while (hasNext(tag, affixPattern)) { |
272 | tag = nextToken(tag, affixPattern, status); |
273 | if (U_FAILURE(status)) { return; } |
274 | consumer.consumeToken(tag.type, tag.codePoint, status); |
275 | if (U_FAILURE(status)) { return; } |
276 | } |
277 | } |
278 | |
279 | AffixTag AffixUtils::nextToken(AffixTag tag, const UnicodeString &patternString, UErrorCode &status) { |
280 | int32_t offset = tag.offset; |
281 | int32_t state = tag.state; |
282 | for (; offset < patternString.length();) { |
283 | UChar32 cp = patternString.char32At(offset); |
284 | int32_t count = U16_LENGTH(cp); |
285 | |
286 | switch (state) { |
287 | case STATE_BASE: |
288 | switch (cp) { |
289 | case u'\'': |
290 | state = STATE_FIRST_QUOTE; |
291 | offset += count; |
292 | // continue to the next code point |
293 | break; |
294 | case u'-': |
295 | return makeTag(offset + count, TYPE_MINUS_SIGN, STATE_BASE, 0); |
296 | case u'+': |
297 | return makeTag(offset + count, TYPE_PLUS_SIGN, STATE_BASE, 0); |
298 | case u'%': |
299 | return makeTag(offset + count, TYPE_PERCENT, STATE_BASE, 0); |
300 | case u'‰': |
301 | return makeTag(offset + count, TYPE_PERMILLE, STATE_BASE, 0); |
302 | case u'¤': |
303 | state = STATE_FIRST_CURR; |
304 | offset += count; |
305 | // continue to the next code point |
306 | break; |
307 | default: |
308 | return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); |
309 | } |
310 | break; |
311 | case STATE_FIRST_QUOTE: |
312 | if (cp == u'\'') { |
313 | return makeTag(offset + count, TYPE_CODEPOINT, STATE_BASE, cp); |
314 | } else { |
315 | return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); |
316 | } |
317 | case STATE_INSIDE_QUOTE: |
318 | if (cp == u'\'') { |
319 | state = STATE_AFTER_QUOTE; |
320 | offset += count; |
321 | // continue to the next code point |
322 | break; |
323 | } else { |
324 | return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); |
325 | } |
326 | case STATE_AFTER_QUOTE: |
327 | if (cp == u'\'') { |
328 | return makeTag(offset + count, TYPE_CODEPOINT, STATE_INSIDE_QUOTE, cp); |
329 | } else { |
330 | state = STATE_BASE; |
331 | // re-evaluate this code point |
332 | break; |
333 | } |
334 | case STATE_FIRST_CURR: |
335 | if (cp == u'¤') { |
336 | state = STATE_SECOND_CURR; |
337 | offset += count; |
338 | // continue to the next code point |
339 | break; |
340 | } else { |
341 | return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); |
342 | } |
343 | case STATE_SECOND_CURR: |
344 | if (cp == u'¤') { |
345 | state = STATE_THIRD_CURR; |
346 | offset += count; |
347 | // continue to the next code point |
348 | break; |
349 | } else { |
350 | return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); |
351 | } |
352 | case STATE_THIRD_CURR: |
353 | if (cp == u'¤') { |
354 | state = STATE_FOURTH_CURR; |
355 | offset += count; |
356 | // continue to the next code point |
357 | break; |
358 | } else { |
359 | return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); |
360 | } |
361 | case STATE_FOURTH_CURR: |
362 | if (cp == u'¤') { |
363 | state = STATE_FIFTH_CURR; |
364 | offset += count; |
365 | // continue to the next code point |
366 | break; |
367 | } else { |
368 | return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); |
369 | } |
370 | case STATE_FIFTH_CURR: |
371 | if (cp == u'¤') { |
372 | state = STATE_OVERFLOW_CURR; |
373 | offset += count; |
374 | // continue to the next code point |
375 | break; |
376 | } else { |
377 | return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); |
378 | } |
379 | case STATE_OVERFLOW_CURR: |
380 | if (cp == u'¤') { |
381 | offset += count; |
382 | // continue to the next code point and loop back to this state |
383 | break; |
384 | } else { |
385 | return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); |
386 | } |
387 | default: |
388 | UPRV_UNREACHABLE; |
389 | } |
390 | } |
391 | // End of string |
392 | switch (state) { |
393 | case STATE_BASE: |
394 | // No more tokens in string. |
395 | return {-1}; |
396 | case STATE_FIRST_QUOTE: |
397 | case STATE_INSIDE_QUOTE: |
398 | // For consistent behavior with the JDK and ICU 58, set an error here. |
399 | status = U_ILLEGAL_ARGUMENT_ERROR; |
400 | return {-1}; |
401 | case STATE_AFTER_QUOTE: |
402 | // No more tokens in string. |
403 | return {-1}; |
404 | case STATE_FIRST_CURR: |
405 | return makeTag(offset, TYPE_CURRENCY_SINGLE, STATE_BASE, 0); |
406 | case STATE_SECOND_CURR: |
407 | return makeTag(offset, TYPE_CURRENCY_DOUBLE, STATE_BASE, 0); |
408 | case STATE_THIRD_CURR: |
409 | return makeTag(offset, TYPE_CURRENCY_TRIPLE, STATE_BASE, 0); |
410 | case STATE_FOURTH_CURR: |
411 | return makeTag(offset, TYPE_CURRENCY_QUAD, STATE_BASE, 0); |
412 | case STATE_FIFTH_CURR: |
413 | return makeTag(offset, TYPE_CURRENCY_QUINT, STATE_BASE, 0); |
414 | case STATE_OVERFLOW_CURR: |
415 | return makeTag(offset, TYPE_CURRENCY_OVERFLOW, STATE_BASE, 0); |
416 | default: |
417 | UPRV_UNREACHABLE; |
418 | } |
419 | } |
420 | |
421 | bool AffixUtils::hasNext(const AffixTag &tag, const UnicodeString &string) { |
422 | // First check for the {-1} and default initializer syntax. |
423 | if (tag.offset < 0) { |
424 | return false; |
425 | } else if (tag.offset == 0) { |
426 | return string.length() > 0; |
427 | } |
428 | // The rest of the fields are safe to use now. |
429 | // Special case: the last character in string is an end quote. |
430 | if (tag.state == STATE_INSIDE_QUOTE && tag.offset == string.length() - 1 && |
431 | string.charAt(tag.offset) == u'\'') { |
432 | return false; |
433 | } else if (tag.state != STATE_BASE) { |
434 | return true; |
435 | } else { |
436 | return tag.offset < string.length(); |
437 | } |
438 | } |
439 | |
440 | #endif /* #if !UCONFIG_NO_FORMATTING */ |
441 | |