util_props.cpp source code [ClickHouse/contrib/icu/icu4c/source/common/util_props.cpp]

1	// © 2016 and later: Unicode, Inc. and others.
2	// License & terms of use: http://www.unicode.org/copyright.html
3	/*
4	**********************************************************************
5	* Copyright (c) 2001-2016, International Business Machines
6	* Corporation and others. All Rights Reserved.
7	**********************************************************************
8	* Date Name Description
9	* 11/19/2001 aliu Creation.
10	**********************************************************************
11	*/
12
13	#include "unicode/uchar.h"
14	#include "unicode/utf16.h"
15	#include "patternprops.h"
16	#include "util.h"
17
18	U_NAMESPACE_BEGIN
19
20	/**
21	* Parse an integer at pos, either of the form \d+ or of the form
22	* 0x[0-9A-Fa-f]+ or 0[0-7]+, that is, in standard decimal, hex,
23	* or octal format.
24	* @param pos INPUT-OUTPUT parameter. On input, the first
25	* character to parse. On output, the character after the last
26	* parsed character.
27	*/
28	int32_t ICU_Utility::parseInteger(const UnicodeString& rule, int32_t& pos, int32_t limit) {
29	int32_t count = `0`;
30	int32_t value = `0`;
31	int32_t p = pos;
32	int8_t radix = `10`;
33
34	if (p < limit && rule.charAt(p) == `48` /0/) {
35	if (p+`1` < limit && (rule.charAt(p+`1`) == `0x78` /x/ \|\| rule.charAt(p+`1`) == `0x58` /X/)) {
36	p += `2`;
37	radix = `16`;
38	}
39	else {
40	p++;
41	count = `1`;
42	radix = `8`;
43	}
44	}
45
46	while (p < limit) {
47	int32_t d = u_digit(rule.charAt(p++), radix);
48	if (d < `0`) {
49	--p;
50	break;
51	}
52	++count;
53	int32_t v = (value * radix) + d;
54	if (v <= value) {
55	// If there are too many input digits, at some point
56	// the value will go negative, e.g., if we have seen
57	// "0x8000000" already and there is another '0', when
58	// we parse the next 0 the value will go negative.
59	return `0`;
60	}
61	value = v;
62	}
63	if (count > `0`) {
64	pos = p;
65	}
66	return value;
67	}
68
69	/**
70	* Parse a pattern string starting at offset pos. Keywords are
71	* matched case-insensitively. Spaces may be skipped and may be
72	* optional or required. Integer values may be parsed, and if
73	* they are, they will be returned in the given array. If
74	* successful, the offset of the next non-space character is
75	* returned. On failure, -1 is returned.
76	* @param pattern must only contain lowercase characters, which
77	* will match their uppercase equivalents as well. A space
78	* character matches one or more required spaces. A '~' character
79	* matches zero or more optional spaces. A '#' character matches
80	* an integer and stores it in parsedInts, which the caller must
81	* ensure has enough capacity.
82	* @param parsedInts array to receive parsed integers. Caller
83	* must ensure that parsedInts.length is >= the number of '#'
84	* signs in 'pattern'.
85	* @return the position after the last character parsed, or -1 if
86	* the parse failed
87	*/
88	int32_t ICU_Utility::parsePattern(const UnicodeString& rule, int32_t pos, int32_t limit,
89	const UnicodeString& pattern, int32_t* parsedInts) {
90	// TODO Update this to handle surrogates
91	int32_t p;
92	int32_t intCount = `0`; // number of integers parsed
93	for (int32_t i=`0`; i<pattern.length(); ++i) {
94	UChar cpat = pattern.charAt(i);
95	UChar c;
96	switch (cpat) {
97	case `32` /' '/:
98	if (pos >= limit) {
99	return -`1`;
100	}
101	c = rule.charAt(pos++);
102	if (!PatternProps::isWhiteSpace(c)) {
103	return -`1`;
104	}
105	// FALL THROUGH to skipWhitespace
106	U_FALLTHROUGH;
107	case `126` /'~'/:
108	pos = skipWhitespace(rule, pos);
109	break;
110	case `35` /'#'/:
111	p = pos;
112	parsedInts[intCount++] = parseInteger(rule, p, limit);
113	if (p == pos) {
114	// Syntax error; failed to parse integer
115	return -`1`;
116	}
117	pos = p;
118	break;
119	default:
120	if (pos >= limit) {
121	return -`1`;
122	}
123	c = (UChar) u_tolower(rule.charAt(pos++));
124	if (c != cpat) {
125	return -`1`;
126	}
127	break;
128	}
129	}
130	return pos;
131	}
132
133	/**
134	* Parse a Unicode identifier from the given string at the given
135	* position. Return the identifier, or an empty string if there
136	* is no identifier.
137	* @param str the string to parse
138	* @param pos INPUT-OUPUT parameter. On INPUT, pos is the
139	* first character to examine. It must be less than str.length(),
140	* and it must not point to a whitespace character. That is, must
141	* have pos < str.length(). On
142	* OUTPUT, the position after the last parsed character.
143	* @return the Unicode identifier, or an empty string if there is
144	* no valid identifier at pos.
145	*/
146	UnicodeString ICU_Utility::parseUnicodeIdentifier(const UnicodeString& str, int32_t& pos) {
147	// assert(pos < str.length());
148	UnicodeString buf;
149	int p = pos;
150	while (p < str.length()) {
151	UChar32 ch = str.char32At(p);
152	if (buf.length() == `0`) {
153	if (u_isIDStart(ch)) {
154	buf.append(ch);
155	} else {
156	buf.truncate(`0`);
157	return buf;
158	}
159	} else {
160	if (u_isIDPart(ch)) {
161	buf.append(ch);
162	} else {
163	break;
164	}
165	}
166	p += U16_LENGTH(ch);
167	}
168	pos = p;
169	return buf;
170	}
171
172	/**
173	* Parse an unsigned 31-bit integer at the given offset. Use
174	* UCharacter.digit() to parse individual characters into digits.
175	* @param text the text to be parsed
176	* @param pos INPUT-OUTPUT parameter. On entry, pos[0] is the
177	* offset within text at which to start parsing; it should point
178	* to a valid digit. On exit, pos[0] is the offset after the last
179	* parsed character. If the parse failed, it will be unchanged on
180	* exit. Must be >= 0 on entry.
181	* @param radix the radix in which to parse; must be >= 2 and <=
182	* 36.
183	* @return a non-negative parsed number, or -1 upon parse failure.
184	* Parse fails if there are no digits, that is, if pos[0] does not
185	* point to a valid digit on entry, or if the number to be parsed
186	* does not fit into a 31-bit unsigned integer.
187	*/
188	int32_t ICU_Utility::parseNumber(const UnicodeString& text,
189	int32_t& pos, int8_t radix) {
190	// assert(pos[0] >= 0);
191	// assert(radix >= 2);
192	// assert(radix <= 36);
193	int32_t n = `0`;
194	int32_t p = pos;
195	while (p < text.length()) {
196	UChar32 ch = text.char32At(p);
197	int32_t d = u_digit(ch, radix);
198	if (d < `0`) {
199	break;
200	}
201	n = radix*n + d;
202	// ASSUME that when a 32-bit integer overflows it becomes
203	// negative. E.g., 214748364 10 + 8 => negative value.*
204	if (n < `0`) {
205	return -`1`;
206	}
207	++p;
208	}
209	if (p == pos) {
210	return -`1`;
211	}
212	pos = p;
213	return n;
214	}
215
216	U_NAMESPACE_END
217
218

Browse the source code of ClickHouse/contrib/icu/icu4c/source/common/util_props.cpp