1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5* Copyright (c) 2001-2011, International Business Machines
6* Corporation and others. All Rights Reserved.
7**********************************************************************
8* Date Name Description
9* 11/19/2001 aliu Creation.
10**********************************************************************
11*/
12
13#include "unicode/unimatch.h"
14#include "unicode/utf16.h"
15#include "patternprops.h"
16#include "util.h"
17
// Character constants are spelled as numeric code points rather than
// character literals so that they stay correct on EBCDIC platforms.

static const char16_t BACKSLASH  = 0x5C; // '\\'
static const char16_t UPPER_U    = 0x55; // 'U'
static const char16_t LOWER_U    = 0x75; // 'u'
static const char16_t APOSTROPHE = 0x27; // '\''
static const char16_t SPACE      = 0x20; // ' '

// Digit characters for radixes up to 36, indexed by digit value:
// "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
static const char16_t DIGITS[] = {
    // '0'..'9'
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
    // 'A'..'M'
    0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D,
    // 'N'..'Z'
    0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A
};
33
34U_NAMESPACE_BEGIN
35
36UnicodeString& ICU_Utility::appendNumber(UnicodeString& result, int32_t n,
37 int32_t radix, int32_t minDigits) {
38 if (radix < 2 || radix > 36) {
39 // Bogus radix
40 return result.append((char16_t)63/*?*/);
41 }
42 // Handle negatives
43 if (n < 0) {
44 n = -n;
45 result.append((char16_t)45/*-*/);
46 }
47 // First determine the number of digits
48 int32_t nn = n;
49 int32_t r = 1;
50 while (nn >= radix) {
51 nn /= radix;
52 r *= radix;
53 --minDigits;
54 }
55 // Now generate the digits
56 while (--minDigits > 0) {
57 result.append(DIGITS[0]);
58 }
59 while (r > 0) {
60 int32_t digit = n / r;
61 result.append(DIGITS[digit]);
62 n -= digit * r;
63 r /= radix;
64 }
65 return result;
66}
67
68UBool ICU_Utility::isUnprintable(UChar32 c) {
69 return !(c >= 0x20 && c <= 0x7E);
70}
71
72UBool ICU_Utility::shouldAlwaysBeEscaped(UChar32 c) {
73 if (c < 0x20) {
74 return true; // C0 control codes
75 } else if (c <= 0x7e) {
76 return false; // printable ASCII
77 } else if (c <= 0x9f) {
78 return true; // C1 control codes
79 } else if (c < 0xd800) {
80 return false; // most of the BMP
81 } else if (c <= 0xdfff || (0xfdd0 <= c && c <= 0xfdef) || (c & 0xfffe) == 0xfffe) {
82 return true; // surrogate or noncharacter code points
83 } else if (c <= 0x10ffff) {
84 return false; // all else
85 } else {
86 return true; // not a code point
87 }
88}
89
90UBool ICU_Utility::escapeUnprintable(UnicodeString& result, UChar32 c) {
91 if (isUnprintable(c)) {
92 escape(result, c);
93 return true;
94 }
95 return false;
96}
97
98UnicodeString &ICU_Utility::escape(UnicodeString& result, UChar32 c) {
99 result.append(BACKSLASH);
100 if (c & ~0xFFFF) {
101 result.append(UPPER_U);
102 result.append(DIGITS[0xF&(c>>28)]);
103 result.append(DIGITS[0xF&(c>>24)]);
104 result.append(DIGITS[0xF&(c>>20)]);
105 result.append(DIGITS[0xF&(c>>16)]);
106 } else {
107 result.append(LOWER_U);
108 }
109 result.append(DIGITS[0xF&(c>>12)]);
110 result.append(DIGITS[0xF&(c>>8)]);
111 result.append(DIGITS[0xF&(c>>4)]);
112 result.append(DIGITS[0xF&c]);
113 return result;
114}
115
116/**
117 * Returns the index of a character, ignoring quoted text.
118 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
119 * found by a search for 'h'.
120 */
121// FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
122/*
123int32_t ICU_Utility::quotedIndexOf(const UnicodeString& text,
124 int32_t start, int32_t limit,
125 char16_t charToFind) {
126 for (int32_t i=start; i<limit; ++i) {
127 char16_t c = text.charAt(i);
128 if (c == BACKSLASH) {
129 ++i;
130 } else if (c == APOSTROPHE) {
131 while (++i < limit
132 && text.charAt(i) != APOSTROPHE) {}
133 } else if (c == charToFind) {
134 return i;
135 }
136 }
137 return -1;
138}
139*/
140
141/**
142 * Skip over a sequence of zero or more white space characters at pos.
143 * @param advance if true, advance pos to the first non-white-space
144 * character at or after pos, or str.length(), if there is none.
145 * Otherwise leave pos unchanged.
146 * @return the index of the first non-white-space character at or
147 * after pos, or str.length(), if there is none.
148 */
149int32_t ICU_Utility::skipWhitespace(const UnicodeString& str, int32_t& pos,
150 UBool advance) {
151 int32_t p = pos;
152 const char16_t* s = str.getBuffer();
153 p = (int32_t)(PatternProps::skipWhiteSpace(s + p, str.length() - p) - s);
154 if (advance) {
155 pos = p;
156 }
157 return p;
158}
159
160/**
161 * Skip over Pattern_White_Space in a Replaceable.
162 * Skipping may be done in the forward or
163 * reverse direction. In either case, the leftmost index will be
164 * inclusive, and the rightmost index will be exclusive. That is,
165 * given a range defined as [start, limit), the call
166 * skipWhitespace(text, start, limit) will advance start past leading
167 * whitespace, whereas the call skipWhitespace(text, limit, start),
168 * will back up limit past trailing whitespace.
169 * @param text the text to be analyzed
170 * @param pos either the start or limit of a range of 'text', to skip
171 * leading or trailing whitespace, respectively
172 * @param stop either the limit or start of a range of 'text', to skip
173 * leading or trailing whitespace, respectively
174 * @return the new start or limit, depending on what was passed in to
175 * 'pos'
176 */
177//?FOR FUTURE USE. DISABLE FOR NOW for coverage reasons.
178//?int32_t ICU_Utility::skipWhitespace(const Replaceable& text,
179//? int32_t pos, int32_t stop) {
180//? UChar32 c;
181//? UBool isForward = (stop >= pos);
182//?
183//? if (!isForward) {
184//? --pos; // pos is a limit, so back up by one
185//? }
186//?
187//? while (pos != stop &&
188//? PatternProps::isWhiteSpace(c = text.char32At(pos))) {
189//? if (isForward) {
190//? pos += U16_LENGTH(c);
191//? } else {
192//? pos -= U16_LENGTH(c);
193//? }
194//? }
195//?
196//? if (!isForward) {
197//? ++pos; // make pos back into a limit
198//? }
199//?
200//? return pos;
201//?}
202
203/**
204 * Parse a single non-whitespace character 'ch', optionally
205 * preceded by whitespace.
206 * @param id the string to be parsed
207 * @param pos INPUT-OUTPUT parameter. On input, pos[0] is the
208 * offset of the first character to be parsed. On output, pos[0]
209 * is the index after the last parsed character. If the parse
210 * fails, pos[0] will be unchanged.
211 * @param ch the non-whitespace character to be parsed.
212 * @return true if 'ch' is seen preceded by zero or more
213 * whitespace characters.
214 */
215UBool ICU_Utility::parseChar(const UnicodeString& id, int32_t& pos, char16_t ch) {
216 int32_t start = pos;
217 skipWhitespace(id, pos, true);
218 if (pos == id.length() ||
219 id.charAt(pos) != ch) {
220 pos = start;
221 return false;
222 }
223 ++pos;
224 return true;
225}
226
227/**
228 * Parse a pattern string within the given Replaceable and a parsing
229 * pattern. Characters are matched literally and case-sensitively
230 * except for the following special characters:
231 *
232 * ~ zero or more Pattern_White_Space chars
233 *
234 * If end of pattern is reached with all matches along the way,
235 * pos is advanced to the first unparsed index and returned.
236 * Otherwise -1 is returned.
237 * @param pat pattern that controls parsing
238 * @param text text to be parsed, starting at index
239 * @param index offset to first character to parse
240 * @param limit offset after last character to parse
241 * @return index after last parsed character, or -1 on parse failure.
242 */
243int32_t ICU_Utility::parsePattern(const UnicodeString& pat,
244 const Replaceable& text,
245 int32_t index,
246 int32_t limit) {
247 int32_t ipat = 0;
248
249 // empty pattern matches immediately
250 if (ipat == pat.length()) {
251 return index;
252 }
253
254 UChar32 cpat = pat.char32At(ipat);
255
256 while (index < limit) {
257 UChar32 c = text.char32At(index);
258
259 // parse \s*
260 if (cpat == 126 /*~*/) {
261 if (PatternProps::isWhiteSpace(c)) {
262 index += U16_LENGTH(c);
263 continue;
264 } else {
265 if (++ipat == pat.length()) {
266 return index; // success; c unparsed
267 }
268 // fall thru; process c again with next cpat
269 }
270 }
271
272 // parse literal
273 else if (c == cpat) {
274 index += U16_LENGTH(c);
275 ipat += U16_LENGTH(cpat);
276 if (ipat == pat.length()) {
277 return index; // success; c parsed
278 }
279 // fall thru; get next cpat
280 }
281
282 // match failure of literal
283 else {
284 return -1;
285 }
286
287 cpat = pat.char32At(ipat);
288 }
289
290 return -1; // text ended before end of pat
291}
292
293int32_t ICU_Utility::parseAsciiInteger(const UnicodeString& str, int32_t& pos) {
294 int32_t result = 0;
295 char16_t c;
296 while (pos < str.length() && (c = str.charAt(pos)) >= u'0' && c <= u'9') {
297 result = result * 10 + (c - u'0');
298 pos++;
299 }
300 return result;
301}
302
303/**
304 * Append a character to a rule that is being built up. To flush
305 * the quoteBuf to rule, make one final call with isLiteral == true.
306 * If there is no final character, pass in (UChar32)-1 as c.
307 * @param rule the string to append the character to
308 * @param c the character to append, or (UChar32)-1 if none.
309 * @param isLiteral if true, then the given character should not be
310 * quoted or escaped. Usually this means it is a syntactic element
311 * such as > or $
312 * @param escapeUnprintable if true, then unprintable characters
313 * should be escaped using \uxxxx or \Uxxxxxxxx. These escapes will
314 * appear outside of quotes.
315 * @param quoteBuf a buffer which is used to build up quoted
316 * substrings. The caller should initially supply an empty buffer,
317 * and thereafter should not modify the buffer. The buffer should be
318 * cleared out by, at the end, calling this method with a literal
319 * character.
320 */
321void ICU_Utility::appendToRule(UnicodeString& rule,
322 UChar32 c,
323 UBool isLiteral,
324 UBool escapeUnprintable,
325 UnicodeString& quoteBuf) {
326 // If we are escaping unprintables, then escape them outside
327 // quotes. \u and \U are not recognized within quotes. The same
328 // logic applies to literals, but literals are never escaped.
329 if (isLiteral ||
330 (escapeUnprintable && ICU_Utility::isUnprintable(c))) {
331 if (quoteBuf.length() > 0) {
332 // We prefer backslash APOSTROPHE to double APOSTROPHE
333 // (more readable, less similar to ") so if there are
334 // double APOSTROPHEs at the ends, we pull them outside
335 // of the quote.
336
337 // If the first thing in the quoteBuf is APOSTROPHE
338 // (doubled) then pull it out.
339 while (quoteBuf.length() >= 2 &&
340 quoteBuf.charAt(0) == APOSTROPHE &&
341 quoteBuf.charAt(1) == APOSTROPHE) {
342 rule.append(BACKSLASH).append(APOSTROPHE);
343 quoteBuf.remove(0, 2);
344 }
345 // If the last thing in the quoteBuf is APOSTROPHE
346 // (doubled) then remove and count it and add it after.
347 int32_t trailingCount = 0;
348 while (quoteBuf.length() >= 2 &&
349 quoteBuf.charAt(quoteBuf.length()-2) == APOSTROPHE &&
350 quoteBuf.charAt(quoteBuf.length()-1) == APOSTROPHE) {
351 quoteBuf.truncate(quoteBuf.length()-2);
352 ++trailingCount;
353 }
354 if (quoteBuf.length() > 0) {
355 rule.append(APOSTROPHE);
356 rule.append(quoteBuf);
357 rule.append(APOSTROPHE);
358 quoteBuf.truncate(0);
359 }
360 while (trailingCount-- > 0) {
361 rule.append(BACKSLASH).append(APOSTROPHE);
362 }
363 }
364 if (c != (UChar32)-1) {
365 /* Since spaces are ignored during parsing, they are
366 * emitted only for readability. We emit one here
367 * only if there isn't already one at the end of the
368 * rule.
369 */
370 if (c == SPACE) {
371 int32_t len = rule.length();
372 if (len > 0 && rule.charAt(len-1) != c) {
373 rule.append(c);
374 }
375 } else if (!escapeUnprintable || !ICU_Utility::escapeUnprintable(rule, c)) {
376 rule.append(c);
377 }
378 }
379 }
380
381 // Escape ' and '\' and don't begin a quote just for them
382 else if (quoteBuf.length() == 0 &&
383 (c == APOSTROPHE || c == BACKSLASH)) {
384 rule.append(BACKSLASH);
385 rule.append(c);
386 }
387
388 // Specials (printable ascii that isn't [0-9a-zA-Z]) and
389 // whitespace need quoting. Also append stuff to quotes if we are
390 // building up a quoted substring already.
391 else if (quoteBuf.length() > 0 ||
392 (c >= 0x0021 && c <= 0x007E &&
393 !((c >= 0x0030/*'0'*/ && c <= 0x0039/*'9'*/) ||
394 (c >= 0x0041/*'A'*/ && c <= 0x005A/*'Z'*/) ||
395 (c >= 0x0061/*'a'*/ && c <= 0x007A/*'z'*/))) ||
396 PatternProps::isWhiteSpace(c)) {
397 quoteBuf.append(c);
398 // Double ' within a quote
399 if (c == APOSTROPHE) {
400 quoteBuf.append(c);
401 }
402 }
403
404 // Otherwise just append
405 else {
406 rule.append(c);
407 }
408}
409
410void ICU_Utility::appendToRule(UnicodeString& rule,
411 const UnicodeString& text,
412 UBool isLiteral,
413 UBool escapeUnprintable,
414 UnicodeString& quoteBuf) {
415 for (int32_t i=0; i<text.length(); ++i) {
416 appendToRule(rule, text[i], isLiteral, escapeUnprintable, quoteBuf);
417 }
418}
419
420/**
421 * Given a matcher reference, which may be null, append its
422 * pattern as a literal to the given rule.
423 */
424void ICU_Utility::appendToRule(UnicodeString& rule,
425 const UnicodeMatcher* matcher,
426 UBool escapeUnprintable,
427 UnicodeString& quoteBuf) {
428 if (matcher != nullptr) {
429 UnicodeString pat;
430 appendToRule(rule, matcher->toPattern(pat, escapeUnprintable),
431 true, escapeUnprintable, quoteBuf);
432 }
433}
434
435U_NAMESPACE_END
436