icu.cpp source code [ClickHouse/contrib/boost/libs/regex/src/icu.cpp]

1	/*
2	*
3	* Copyright (c) 2004
4	* John Maddock
5	*
6	* Use, modification and distribution are subject to the
7	* Boost Software License, Version 1.0. (See accompanying file
8	* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9	*
10	*/
11
12	/*
13	* LOCATION: see http://www.boost.org for most recent version.
14	* FILE icu.cpp
15	* VERSION see <boost/version.hpp>
16	* DESCRIPTION: Unicode regular expressions on top of the ICU Library.
17	*/
18	#define BOOST_REGEX_SOURCE
19
20	#include <boost/regex/config.hpp>
21	#ifdef BOOST_HAS_ICU
22	#define BOOST_REGEX_ICU_INSTANTIATE
23	#include <boost/regex/icu.hpp>
24
25	#ifdef BOOST_INTEL
26	#pragma warning(disable:981 2259 383)
27	#endif
28
29	namespace boost{
30
31	namespace BOOST_REGEX_DETAIL_NS{
32
33	icu_regex_traits_implementation::string_type icu_regex_traits_implementation::do_transform(const char_type* p1, const char_type* p2, const U_NAMESPACE_QUALIFIER Collator* pcoll) const
34	{
35	// TODO make thread safe!!!! :
36	typedef u32_to_u16_iterator<const char_type*, ::UChar> itt;
37	itt i(p1), j(p2);
38	#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
39	std::vector< ::UChar> t(i, j);
40	#else
41	std::vector< ::UChar> t;
42	while(i != j)
43	t.push_back(*i++);
44	#endif
45	::uint8_t result[`100`];
46	::int32_t len;
47	if(t.size())
48	len = pcoll->getSortKey(&t.begin(), static_cast< ::int32_t>(t.size()), result, sizeof*(result));
49	else
50	len = pcoll->getSortKey(static_cast<UChar const>(`0`), static_cast< ::int32_t>(`0`), result, sizeof*(result));
51	if(std::size_t(len) > sizeof(result))
52	{
53	scoped_array< ::uint8_t> presult(new ::uint8_t[len+`1`]);
54	if(t.size())
55	len = pcoll->getSortKey(&t.begin(), static_cast*< ::int32_t>(t.size()), presult.get(), len+`1`);
56	else
57	len = pcoll->getSortKey(static_cast<UChar const>(`0`), static_cast*< ::int32_t>(`0`), presult.get(), len+`1`);
58	if((`0` == presult[len-`1`]) && (len > `1`))
59	--len;
60	#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
61	return string_type(presult.get(), presult.get()+len);
62	#else
63	string_type sresult;
64	::uint8_t const* ia = presult.get();
65	::uint8_t const* ib = presult.get()+len;
66	while(ia != ib)
67	sresult.push_back(*ia++);
68	return sresult;
69	#endif
70	}
71	if((`0` == result[len-`1`]) && (len > `1`))
72	--len;
73	#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
74	return string_type(result, result+len);
75	#else
76	string_type sresult;
77	::uint8_t const* ia = result;
78	::uint8_t const* ib = result+len;
79	while(ia != ib)
80	sresult.push_back(*ia++);
81	return sresult;
82	#endif
83	}
84
85	}
86
87	icu_regex_traits::size_type icu_regex_traits::length(const char_type* p)
88	{
89	size_type result = `0`;
90	while(*p)
91	{
92	++p;
93	++result;
94	}
95	return result;
96	}
97
98	//
99	// define our bitmasks:
100	//
101	const icu_regex_traits::char_class_type icu_regex_traits::mask_blank = icu_regex_traits::char_class_type(`1`) << offset_blank;
102	const icu_regex_traits::char_class_type icu_regex_traits::mask_space = icu_regex_traits::char_class_type(`1`) << offset_space;
103	const icu_regex_traits::char_class_type icu_regex_traits::mask_xdigit = icu_regex_traits::char_class_type(`1`) << offset_xdigit;
104	const icu_regex_traits::char_class_type icu_regex_traits::mask_underscore = icu_regex_traits::char_class_type(`1`) << offset_underscore;
105	const icu_regex_traits::char_class_type icu_regex_traits::mask_unicode = icu_regex_traits::char_class_type(`1`) << offset_unicode;
106	const icu_regex_traits::char_class_type icu_regex_traits::mask_any = icu_regex_traits::char_class_type(`1`) << offset_any;
107	const icu_regex_traits::char_class_type icu_regex_traits::mask_ascii = icu_regex_traits::char_class_type(`1`) << offset_ascii;
108	const icu_regex_traits::char_class_type icu_regex_traits::mask_horizontal = icu_regex_traits::char_class_type(`1`) << offset_horizontal;
109	const icu_regex_traits::char_class_type icu_regex_traits::mask_vertical = icu_regex_traits::char_class_type(`1`) << offset_vertical;
110
111	icu_regex_traits::char_class_type icu_regex_traits::lookup_icu_mask(const ::UChar32* p1, const ::UChar32* p2)
112	{
113	static const ::UChar32 prop_name_table[] = {
114	/ any / `'a'`, `'n'`, `'y'`,
115	/ ascii / `'a'`, `'s'`, `'c'`, `'i'`, `'i'`,
116	/ assigned / `'a'`, `'s'`, `'s'`, `'i'`, `'g'`, `'n'`, `'e'`, `'d'`,
117	/ c* / `'c'`, `'*'`,
118	/ cc / `'c'`, `'c'`,
119	/ cf / `'c'`, `'f'`,
120	/ closepunctuation / `'c'`, `'l'`, `'o'`, `'s'`, `'e'`, `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
121	/ cn / `'c'`, `'n'`,
122	/ co / `'c'`, `'o'`,
123	/ connectorpunctuation / `'c'`, `'o'`, `'n'`, `'n'`, `'e'`, `'c'`, `'t'`, `'o'`, `'r'`, `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
124	/ control / `'c'`, `'o'`, `'n'`, `'t'`, `'r'`, `'o'`, `'l'`,
125	/ cs / `'c'`, `'s'`,
126	/ currencysymbol / `'c'`, `'u'`, `'r'`, `'r'`, `'e'`, `'n'`, `'c'`, `'y'`, `'s'`, `'y'`, `'m'`, `'b'`, `'o'`, `'l'`,
127	/ dashpunctuation / `'d'`, `'a'`, `'s'`, `'h'`, `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
128	/ decimaldigitnumber / `'d'`, `'e'`, `'c'`, `'i'`, `'m'`, `'a'`, `'l'`, `'d'`, `'i'`, `'g'`, `'i'`, `'t'`, `'n'`, `'u'`, `'m'`, `'b'`, `'e'`, `'r'`,
129	/ enclosingmark / `'e'`, `'n'`, `'c'`, `'l'`, `'o'`, `'s'`, `'i'`, `'n'`, `'g'`, `'m'`, `'a'`, `'r'`, `'k'`,
130	/ finalpunctuation / `'f'`, `'i'`, `'n'`, `'a'`, `'l'`, `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
131	/ format / `'f'`, `'o'`, `'r'`, `'m'`, `'a'`, `'t'`,
132	/ initialpunctuation / `'i'`, `'n'`, `'i'`, `'t'`, `'i'`, `'a'`, `'l'`, `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
133	/ l* / `'l'`, `'*'`,
134	/ letter / `'l'`, `'e'`, `'t'`, `'t'`, `'e'`, `'r'`,
135	/ letternumber / `'l'`, `'e'`, `'t'`, `'t'`, `'e'`, `'r'`, `'n'`, `'u'`, `'m'`, `'b'`, `'e'`, `'r'`,
136	/ lineseparator / `'l'`, `'i'`, `'n'`, `'e'`, `'s'`, `'e'`, `'p'`, `'a'`, `'r'`, `'a'`, `'t'`, `'o'`, `'r'`,
137	/ ll / `'l'`, `'l'`,
138	/ lm / `'l'`, `'m'`,
139	/ lo / `'l'`, `'o'`,
140	/ lowercaseletter / `'l'`, `'o'`, `'w'`, `'e'`, `'r'`, `'c'`, `'a'`, `'s'`, `'e'`, `'l'`, `'e'`, `'t'`, `'t'`, `'e'`, `'r'`,
141	/ lt / `'l'`, `'t'`,
142	/ lu / `'l'`, `'u'`,
143	/ m* / `'m'`, `'*'`,
144	/ mark / `'m'`, `'a'`, `'r'`, `'k'`,
145	/ mathsymbol / `'m'`, `'a'`, `'t'`, `'h'`, `'s'`, `'y'`, `'m'`, `'b'`, `'o'`, `'l'`,
146	/ mc / `'m'`, `'c'`,
147	/ me / `'m'`, `'e'`,
148	/ mn / `'m'`, `'n'`,
149	/ modifierletter / `'m'`, `'o'`, `'d'`, `'i'`, `'f'`, `'i'`, `'e'`, `'r'`, `'l'`, `'e'`, `'t'`, `'t'`, `'e'`, `'r'`,
150	/ modifiersymbol / `'m'`, `'o'`, `'d'`, `'i'`, `'f'`, `'i'`, `'e'`, `'r'`, `'s'`, `'y'`, `'m'`, `'b'`, `'o'`, `'l'`,
151	/ n* / `'n'`, `'*'`,
152	/ nd / `'n'`, `'d'`,
153	/ nl / `'n'`, `'l'`,
154	/ no / `'n'`, `'o'`,
155	/ nonspacingmark / `'n'`, `'o'`, `'n'`, `'s'`, `'p'`, `'a'`, `'c'`, `'i'`, `'n'`, `'g'`, `'m'`, `'a'`, `'r'`, `'k'`,
156	/ notassigned / `'n'`, `'o'`, `'t'`, `'a'`, `'s'`, `'s'`, `'i'`, `'g'`, `'n'`, `'e'`, `'d'`,
157	/ number / `'n'`, `'u'`, `'m'`, `'b'`, `'e'`, `'r'`,
158	/ openpunctuation / `'o'`, `'p'`, `'e'`, `'n'`, `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
159	/ other / `'o'`, `'t'`, `'h'`, `'e'`, `'r'`,
160	/ otherletter / `'o'`, `'t'`, `'h'`, `'e'`, `'r'`, `'l'`, `'e'`, `'t'`, `'t'`, `'e'`, `'r'`,
161	/ othernumber / `'o'`, `'t'`, `'h'`, `'e'`, `'r'`, `'n'`, `'u'`, `'m'`, `'b'`, `'e'`, `'r'`,
162	/ otherpunctuation / `'o'`, `'t'`, `'h'`, `'e'`, `'r'`, `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
163	/ othersymbol / `'o'`, `'t'`, `'h'`, `'e'`, `'r'`, `'s'`, `'y'`, `'m'`, `'b'`, `'o'`, `'l'`,
164	/ p* / `'p'`, `'*'`,
165	/ paragraphseparator / `'p'`, `'a'`, `'r'`, `'a'`, `'g'`, `'r'`, `'a'`, `'p'`, `'h'`, `'s'`, `'e'`, `'p'`, `'a'`, `'r'`, `'a'`, `'t'`, `'o'`, `'r'`,
166	/ pc / `'p'`, `'c'`,
167	/ pd / `'p'`, `'d'`,
168	/ pe / `'p'`, `'e'`,
169	/ pf / `'p'`, `'f'`,
170	/ pi / `'p'`, `'i'`,
171	/ po / `'p'`, `'o'`,
172	/ privateuse / `'p'`, `'r'`, `'i'`, `'v'`, `'a'`, `'t'`, `'e'`, `'u'`, `'s'`, `'e'`,
173	/ ps / `'p'`, `'s'`,
174	/ punctuation / `'p'`, `'u'`, `'n'`, `'c'`, `'t'`, `'u'`, `'a'`, `'t'`, `'i'`, `'o'`, `'n'`,
175	/ s* / `'s'`, `'*'`,
176	/ sc / `'s'`, `'c'`,
177	/ separator / `'s'`, `'e'`, `'p'`, `'a'`, `'r'`, `'a'`, `'t'`, `'o'`, `'r'`,
178	/ sk / `'s'`, `'k'`,
179	/ sm / `'s'`, `'m'`,
180	/ so / `'s'`, `'o'`,
181	/ spaceseparator / `'s'`, `'p'`, `'a'`, `'c'`, `'e'`, `'s'`, `'e'`, `'p'`, `'a'`, `'r'`, `'a'`, `'t'`, `'o'`, `'r'`,
182	/ spacingcombiningmark / `'s'`, `'p'`, `'a'`, `'c'`, `'i'`, `'n'`, `'g'`, `'c'`, `'o'`, `'m'`, `'b'`, `'i'`, `'n'`, `'i'`, `'n'`, `'g'`, `'m'`, `'a'`, `'r'`, `'k'`,
183	/ surrogate / `'s'`, `'u'`, `'r'`, `'r'`, `'o'`, `'g'`, `'a'`, `'t'`, `'e'`,
184	/ symbol / `'s'`, `'y'`, `'m'`, `'b'`, `'o'`, `'l'`,
185	/ titlecase / `'t'`, `'i'`, `'t'`, `'l'`, `'e'`, `'c'`, `'a'`, `'s'`, `'e'`,
186	/ titlecaseletter / `'t'`, `'i'`, `'t'`, `'l'`, `'e'`, `'c'`, `'a'`, `'s'`, `'e'`, `'l'`, `'e'`, `'t'`, `'t'`, `'e'`, `'r'`,
187	/ uppercaseletter / `'u'`, `'p'`, `'p'`, `'e'`, `'r'`, `'c'`, `'a'`, `'s'`, `'e'`, `'l'`, `'e'`, `'t'`, `'t'`, `'e'`, `'r'`,
188	/ z* / `'z'`, `'*'`,
189	/ zl / `'z'`, `'l'`,
190	/ zp / `'z'`, `'p'`,
191	/ zs / `'z'`, `'s'`,
192	};
193
194	static const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32> range_data[] = {
195	{ prop_name_table+`0`, prop_name_table+`3`, }, // any
196	{ prop_name_table+`3`, prop_name_table+`8`, }, // ascii
197	{ prop_name_table+`8`, prop_name_table+`16`, }, // assigned
198	{ prop_name_table+`16`, prop_name_table+`18`, }, // c*
199	{ prop_name_table+`18`, prop_name_table+`20`, }, // cc
200	{ prop_name_table+`20`, prop_name_table+`22`, }, // cf
201	{ prop_name_table+`22`, prop_name_table+`38`, }, // closepunctuation
202	{ prop_name_table+`38`, prop_name_table+`40`, }, // cn
203	{ prop_name_table+`40`, prop_name_table+`42`, }, // co
204	{ prop_name_table+`42`, prop_name_table+`62`, }, // connectorpunctuation
205	{ prop_name_table+`62`, prop_name_table+`69`, }, // control
206	{ prop_name_table+`69`, prop_name_table+`71`, }, // cs
207	{ prop_name_table+`71`, prop_name_table+`85`, }, // currencysymbol
208	{ prop_name_table+`85`, prop_name_table+`100`, }, // dashpunctuation
209	{ prop_name_table+`100`, prop_name_table+`118`, }, // decimaldigitnumber
210	{ prop_name_table+`118`, prop_name_table+`131`, }, // enclosingmark
211	{ prop_name_table+`131`, prop_name_table+`147`, }, // finalpunctuation
212	{ prop_name_table+`147`, prop_name_table+`153`, }, // format
213	{ prop_name_table+`153`, prop_name_table+`171`, }, // initialpunctuation
214	{ prop_name_table+`171`, prop_name_table+`173`, }, // l*
215	{ prop_name_table+`173`, prop_name_table+`179`, }, // letter
216	{ prop_name_table+`179`, prop_name_table+`191`, }, // letternumber
217	{ prop_name_table+`191`, prop_name_table+`204`, }, // lineseparator
218	{ prop_name_table+`204`, prop_name_table+`206`, }, // ll
219	{ prop_name_table+`206`, prop_name_table+`208`, }, // lm
220	{ prop_name_table+`208`, prop_name_table+`210`, }, // lo
221	{ prop_name_table+`210`, prop_name_table+`225`, }, // lowercaseletter
222	{ prop_name_table+`225`, prop_name_table+`227`, }, // lt
223	{ prop_name_table+`227`, prop_name_table+`229`, }, // lu
224	{ prop_name_table+`229`, prop_name_table+`231`, }, // m*
225	{ prop_name_table+`231`, prop_name_table+`235`, }, // mark
226	{ prop_name_table+`235`, prop_name_table+`245`, }, // mathsymbol
227	{ prop_name_table+`245`, prop_name_table+`247`, }, // mc
228	{ prop_name_table+`247`, prop_name_table+`249`, }, // me
229	{ prop_name_table+`249`, prop_name_table+`251`, }, // mn
230	{ prop_name_table+`251`, prop_name_table+`265`, }, // modifierletter
231	{ prop_name_table+`265`, prop_name_table+`279`, }, // modifiersymbol
232	{ prop_name_table+`279`, prop_name_table+`281`, }, // n*
233	{ prop_name_table+`281`, prop_name_table+`283`, }, // nd
234	{ prop_name_table+`283`, prop_name_table+`285`, }, // nl
235	{ prop_name_table+`285`, prop_name_table+`287`, }, // no
236	{ prop_name_table+`287`, prop_name_table+`301`, }, // nonspacingmark
237	{ prop_name_table+`301`, prop_name_table+`312`, }, // notassigned
238	{ prop_name_table+`312`, prop_name_table+`318`, }, // number
239	{ prop_name_table+`318`, prop_name_table+`333`, }, // openpunctuation
240	{ prop_name_table+`333`, prop_name_table+`338`, }, // other
241	{ prop_name_table+`338`, prop_name_table+`349`, }, // otherletter
242	{ prop_name_table+`349`, prop_name_table+`360`, }, // othernumber
243	{ prop_name_table+`360`, prop_name_table+`376`, }, // otherpunctuation
244	{ prop_name_table+`376`, prop_name_table+`387`, }, // othersymbol
245	{ prop_name_table+`387`, prop_name_table+`389`, }, // p*
246	{ prop_name_table+`389`, prop_name_table+`407`, }, // paragraphseparator
247	{ prop_name_table+`407`, prop_name_table+`409`, }, // pc
248	{ prop_name_table+`409`, prop_name_table+`411`, }, // pd
249	{ prop_name_table+`411`, prop_name_table+`413`, }, // pe
250	{ prop_name_table+`413`, prop_name_table+`415`, }, // pf
251	{ prop_name_table+`415`, prop_name_table+`417`, }, // pi
252	{ prop_name_table+`417`, prop_name_table+`419`, }, // po
253	{ prop_name_table+`419`, prop_name_table+`429`, }, // privateuse
254	{ prop_name_table+`429`, prop_name_table+`431`, }, // ps
255	{ prop_name_table+`431`, prop_name_table+`442`, }, // punctuation
256	{ prop_name_table+`442`, prop_name_table+`444`, }, // s*
257	{ prop_name_table+`444`, prop_name_table+`446`, }, // sc
258	{ prop_name_table+`446`, prop_name_table+`455`, }, // separator
259	{ prop_name_table+`455`, prop_name_table+`457`, }, // sk
260	{ prop_name_table+`457`, prop_name_table+`459`, }, // sm
261	{ prop_name_table+`459`, prop_name_table+`461`, }, // so
262	{ prop_name_table+`461`, prop_name_table+`475`, }, // spaceseparator
263	{ prop_name_table+`475`, prop_name_table+`495`, }, // spacingcombiningmark
264	{ prop_name_table+`495`, prop_name_table+`504`, }, // surrogate
265	{ prop_name_table+`504`, prop_name_table+`510`, }, // symbol
266	{ prop_name_table+`510`, prop_name_table+`519`, }, // titlecase
267	{ prop_name_table+`519`, prop_name_table+`534`, }, // titlecaseletter
268	{ prop_name_table+`534`, prop_name_table+`549`, }, // uppercaseletter
269	{ prop_name_table+`549`, prop_name_table+`551`, }, // z*
270	{ prop_name_table+`551`, prop_name_table+`553`, }, // zl
271	{ prop_name_table+`553`, prop_name_table+`555`, }, // zp
272	{ prop_name_table+`555`, prop_name_table+`557`, }, // zs
273	};
274
275	static const icu_regex_traits::char_class_type icu_class_map[] = {
276	icu_regex_traits::mask_any, // any
277	icu_regex_traits::mask_ascii, // ascii
278	(`0x3FFFFFFFu`) & ~(U_GC_CN_MASK), // assigned
279	U_GC_C_MASK, // c*
280	U_GC_CC_MASK, // cc
281	U_GC_CF_MASK, // cf
282	U_GC_PE_MASK, // closepunctuation
283	U_GC_CN_MASK, // cn
284	U_GC_CO_MASK, // co
285	U_GC_PC_MASK, // connectorpunctuation
286	U_GC_CC_MASK, // control
287	U_GC_CS_MASK, // cs
288	U_GC_SC_MASK, // currencysymbol
289	U_GC_PD_MASK, // dashpunctuation
290	U_GC_ND_MASK, // decimaldigitnumber
291	U_GC_ME_MASK, // enclosingmark
292	U_GC_PF_MASK, // finalpunctuation
293	U_GC_CF_MASK, // format
294	U_GC_PI_MASK, // initialpunctuation
295	U_GC_L_MASK, // l*
296	U_GC_L_MASK, // letter
297	U_GC_NL_MASK, // letternumber
298	U_GC_ZL_MASK, // lineseparator
299	U_GC_LL_MASK, // ll
300	U_GC_LM_MASK, // lm
301	U_GC_LO_MASK, // lo
302	U_GC_LL_MASK, // lowercaseletter
303	U_GC_LT_MASK, // lt
304	U_GC_LU_MASK, // lu
305	U_GC_M_MASK, // m*
306	U_GC_M_MASK, // mark
307	U_GC_SM_MASK, // mathsymbol
308	U_GC_MC_MASK, // mc
309	U_GC_ME_MASK, // me
310	U_GC_MN_MASK, // mn
311	U_GC_LM_MASK, // modifierletter
312	U_GC_SK_MASK, // modifiersymbol
313	U_GC_N_MASK, // n*
314	U_GC_ND_MASK, // nd
315	U_GC_NL_MASK, // nl
316	U_GC_NO_MASK, // no
317	U_GC_MN_MASK, // nonspacingmark
318	U_GC_CN_MASK, // notassigned
319	U_GC_N_MASK, // number
320	U_GC_PS_MASK, // openpunctuation
321	U_GC_C_MASK, // other
322	U_GC_LO_MASK, // otherletter
323	U_GC_NO_MASK, // othernumber
324	U_GC_PO_MASK, // otherpunctuation
325	U_GC_SO_MASK, // othersymbol
326	U_GC_P_MASK, // p*
327	U_GC_ZP_MASK, // paragraphseparator
328	U_GC_PC_MASK, // pc
329	U_GC_PD_MASK, // pd
330	U_GC_PE_MASK, // pe
331	U_GC_PF_MASK, // pf
332	U_GC_PI_MASK, // pi
333	U_GC_PO_MASK, // po
334	U_GC_CO_MASK, // privateuse
335	U_GC_PS_MASK, // ps
336	U_GC_P_MASK, // punctuation
337	U_GC_S_MASK, // s*
338	U_GC_SC_MASK, // sc
339	U_GC_Z_MASK, // separator
340	U_GC_SK_MASK, // sk
341	U_GC_SM_MASK, // sm
342	U_GC_SO_MASK, // so
343	U_GC_ZS_MASK, // spaceseparator
344	U_GC_MC_MASK, // spacingcombiningmark
345	U_GC_CS_MASK, // surrogate
346	U_GC_S_MASK, // symbol
347	U_GC_LT_MASK, // titlecase
348	U_GC_LT_MASK, // titlecaseletter
349	U_GC_LU_MASK, // uppercaseletter
350	U_GC_Z_MASK, // z*
351	U_GC_ZL_MASK, // zl
352	U_GC_ZP_MASK, // zp
353	U_GC_ZS_MASK, // zs
354	};
355
356
357	static const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32>* ranges_begin = range_data;
358	static const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32>* ranges_end = range_data + (sizeof(range_data)/sizeof(range_data[`0`]));
359
360	BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32> t = { p1, p2, };
361	const BOOST_REGEX_DETAIL_NS::character_pointer_range< ::UChar32>* p = std::lower_bound(ranges_begin, ranges_end, t);
362	if((p != ranges_end) && (t == *p))
363	return icu_class_map[p - ranges_begin];
364	return `0`;
365	}
366
367	icu_regex_traits::char_class_type icu_regex_traits::lookup_classname(const char_type* p1, const char_type* p2) const
368	{
369	static const char_class_type masks[] =
370	{
371	`0`,
372	U_GC_L_MASK \| U_GC_ND_MASK,
373	U_GC_L_MASK,
374	mask_blank,
375	U_GC_CC_MASK \| U_GC_CF_MASK \| U_GC_ZL_MASK \| U_GC_ZP_MASK,
376	U_GC_ND_MASK,
377	U_GC_ND_MASK,
378	(`0x3FFFFFFFu`) & ~(U_GC_CC_MASK \| U_GC_CF_MASK \| U_GC_CS_MASK \| U_GC_CN_MASK \| U_GC_Z_MASK),
379	mask_horizontal,
380	U_GC_LL_MASK,
381	U_GC_LL_MASK,
382	~(U_GC_C_MASK),
383	U_GC_P_MASK,
384	char_class_type(U_GC_Z_MASK) \| mask_space,
385	char_class_type(U_GC_Z_MASK) \| mask_space,
386	U_GC_LU_MASK,
387	mask_unicode,
388	U_GC_LU_MASK,
389	mask_vertical,
390	char_class_type(U_GC_L_MASK \| U_GC_ND_MASK \| U_GC_MN_MASK) \| mask_underscore,
391	char_class_type(U_GC_L_MASK \| U_GC_ND_MASK \| U_GC_MN_MASK) \| mask_underscore,
392	char_class_type(U_GC_ND_MASK) \| mask_xdigit,
393	};
394
395	int idx = ::boost::BOOST_REGEX_DETAIL_NS::get_default_class_id(p1, p2);
396	if(idx >= `0`)
397	return masks[idx+`1`];
398	char_class_type result = lookup_icu_mask(p1, p2);
399	if(result != `0`)
400	return result;
401
402	if(idx < `0`)
403	{
404	string_type s(p1, p2);
405	string_type::size_type i = `0`;
406	while(i < s.size())
407	{
408	s[i] = static_cast<char>((::u_tolower)(s[i]));
409	if(::u_isspace(s[i]) \|\| (s[i] == `'-'`) \|\| (s[i] == `'_'`))
410	s.erase(s.begin()+i, s.begin()+i+`1`);
411	else
412	{
413	s[i] = static_cast<char>((::u_tolower)(s[i]));
414	++i;
415	}
416	}
417	if(s.size())
418	idx = ::boost::BOOST_REGEX_DETAIL_NS::get_default_class_id(&s.begin(), &s.begin() + s.size());
419	if(idx >= `0`)
420	return masks[idx+`1`];
421	if(s.size())
422	result = lookup_icu_mask(&s.begin(), &s.begin() + s.size());
423	if(result != `0`)
424	return result;
425	}
426	BOOST_ASSERT(std::size_t(idx+`1`) < sizeof(masks) / sizeof(masks[`0`]));
427	return masks[idx+`1`];
428	}
429
430	icu_regex_traits::string_type icu_regex_traits::lookup_collatename(const char_type* p1, const char_type* p2) const
431	{
432	string_type result;
433	#ifdef BOOST_NO_CXX98_BINDERS
434	if(std::find_if(p1, p2, std::bind(std::greater< ::UChar32>(), std::placeholders::_1, `0x7f`)) == p2)
435	#else
436	if(std::find_if(p1, p2, std::bind2nd(std::greater< ::UChar32>(), `0x7f`)) == p2)
437	#endif
438	{
439	#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
440	std::string s(p1, p2);
441	#else
442	std::string s;
443	const char_type* p3 = p1;
444	while(p3 != p2)
445	s.append(`1`, *p3++);
446	#endif
447	// Try Unicode name:
448	UErrorCode err = U_ZERO_ERROR;
449	UChar32 c = ::u_charFromName(U_UNICODE_CHAR_NAME, s.c_str(), &err);
450	if(U_SUCCESS(err))
451	{
452	result.push_back(c);
453	return result;
454	}
455	// Try Unicode-extended name:
456	err = U_ZERO_ERROR;
457	c = ::u_charFromName(U_EXTENDED_CHAR_NAME, s.c_str(), &err);
458	if(U_SUCCESS(err))
459	{
460	result.push_back(c);
461	return result;
462	}
463	// try POSIX name:
464	s = ::boost::BOOST_REGEX_DETAIL_NS::lookup_default_collate_name(s);
465	#ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
466	result.assign(s.begin(), s.end());
467	#else
468	result.clear();
469	std::string::const_iterator si, sj;
470	si = s.begin();
471	sj = s.end();
472	while(si != sj)
473	result.push_back(*si++);
474	#endif
475	}
476	if(result.empty() && (p2-p1 == `1`))
477	result.push_back(*p1);
478	return result;
479	}
480
481	bool icu_regex_traits::isctype(char_type c, char_class_type f) const
482	{
483	// check for standard catagories first:
484	char_class_type m = char_class_type(static_cast<char_class_type>(`1`) << u_charType(c));
485	if((m & f) != `0`)
486	return true;
487	// now check for special cases:
488	if(((f & mask_blank) != `0`) && u_isblank(c))
489	return true;
490	if(((f & mask_space) != `0`) && u_isspace(c))
491	return true;
492	if(((f & mask_xdigit) != `0`) && (u_digit(c, `16`) >= `0`))
493	return true;
494	if(((f & mask_unicode) != `0`) && (c >= `0x100`))
495	return true;
496	if(((f & mask_underscore) != `0`) && (c == `'_'`))
497	return true;
498	if(((f & mask_any) != `0`) && (c <= `0x10FFFF`))
499	return true;
500	if(((f & mask_ascii) != `0`) && (c <= `0x7F`))
501	return true;
502	if(((f & mask_vertical) != `0`) && (::boost::BOOST_REGEX_DETAIL_NS::is_separator(c) \|\| (c == static_cast<char_type>(`'\v'`)) \|\| (m == U_GC_ZL_MASK) \|\| (m == U_GC_ZP_MASK)))
503	return true;
504	if(((f & mask_horizontal) != `0`) && !::boost::BOOST_REGEX_DETAIL_NS::is_separator(c) && u_isspace(c) && (c != static_cast<char_type>(`'\v'`)))
505	return true;
506	return false;
507	}
508
509	}
510
511	#endif // BOOST_HAS_ICU
512

Browse the source code of ClickHouse/contrib/boost/libs/regex/src/icu.cpp