pcre2_xclass.c source code [Godot/thirdparty/pcre2/src/pcre2_xclass.c]

1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
5	/* PCRE is a library of functions to support regular expressions whose syntax
6	and semantics are as close as possible to those of the Perl 5 language.
7
8	Written by Philip Hazel
9	Original API code Copyright (c) 1997-2012 University of Cambridge
10	New API code Copyright (c) 2016-2022 University of Cambridge
11
12	-----------------------------------------------------------------------------
13	Redistribution and use in source and binary forms, with or without
14	modification, are permitted provided that the following conditions are met:
15
16	* Redistributions of source code must retain the above copyright notice,
17	this list of conditions and the following disclaimer.
18
19	* Redistributions in binary form must reproduce the above copyright
20	notice, this list of conditions and the following disclaimer in the
21	documentation and/or other materials provided with the distribution.
22
23	* Neither the name of the University of Cambridge nor the names of its
24	contributors may be used to endorse or promote products derived from
25	this software without specific prior written permission.
26
27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37	POSSIBILITY OF SUCH DAMAGE.
38	-----------------------------------------------------------------------------
39	*/
40
41	/* This module contains an internal function that is used to match an extended
42	class. It is used by pcre2_auto_possessify() and by both pcre2_match() and
43	pcre2_dfa_match(). */
44
45
46	#ifdef HAVE_CONFIG_H
47	#include "config.h"
48	#endif
49
50
51	#include "pcre2_internal.h"
52
53	/*************************************************
54	* Match character against an XCLASS *
55	*************************************************/
56
57	/ This function is called to match a character against an extended class that*
58	might contain codepoints above 255 and/or Unicode properties.
59
60	Arguments:
61	c the character
62	data points to the flag code unit of the XCLASS data
63	utf TRUE if in UTF mode
64
65	Returns: TRUE if character matches, else FALSE
66	*/
67
68	BOOL
69	PRIV(xclass)(uint32_t c, PCRE2_SPTR data, BOOL utf)
70	{
71	PCRE2_UCHAR t;
72	BOOL negated = (*data & XCL_NOT) != `0`;
73
74	#if PCRE2_CODE_UNIT_WIDTH == 8
75	/ In 8 bit mode, this must always be TRUE. Help the compiler to know that. /
76	utf = TRUE;
77	#endif
78
79	/ Code points < 256 are matched against a bitmap, if one is present. If not,*
80	we still carry on, because there may be ranges that start below 256 in the
81	additional data. /*
82
83	if (c < `256`)
84	{
85	if ((*data & XCL_HASPROP) == `0`)
86	{
87	if ((data & XCL_MAP) == `0`) return* negated;
88	return (((uint8_t *)(data + `1`))[c/`8`] & (`1u` << (c&`7`))) != `0`;
89	}
90	if ((*data & XCL_MAP) != `0` &&
91	(((uint8_t *)(data + `1`))[c/`8`] & (`1u` << (c&`7`))) != `0`)
92	return !negated; / char found /
93	}
94
95	/ First skip the bit map if present. Then match against the list of Unicode*
96	properties or large chars or ranges that end with a large char. We won't ever
97	encounter XCL_PROP or XCL_NOTPROP when UTF support is not compiled. /*
98
99	if ((data++ & XCL_MAP) != `0`) data += `32` / sizeof*(PCRE2_UCHAR);
100
101	while ((t = *data++) != XCL_END)
102	{
103	uint32_t x, y;
104	if (t == XCL_SINGLE)
105	{
106	#ifdef SUPPORT_UNICODE
107	if (utf)
108	{
109	GETCHARINC(x, data); / macro generates multiple statements /
110	}
111	else
112	#endif
113	x = *data++;
114	if (c == x) return !negated;
115	}
116	else if (t == XCL_RANGE)
117	{
118	#ifdef SUPPORT_UNICODE
119	if (utf)
120	{
121	GETCHARINC(x, data); / macro generates multiple statements /
122	GETCHARINC(y, data); / macro generates multiple statements /
123	}
124	else
125	#endif
126	{
127	x = *data++;
128	y = *data++;
129	}
130	if (c >= x && c <= y) return !negated;
131	}
132
133	#ifdef SUPPORT_UNICODE
134	else / XCL_PROP & XCL_NOTPROP /
135	{
136	const ucd_record *prop = GET_UCD(c);
137	BOOL isprop = t == XCL_PROP;
138	BOOL ok;
139
140	switch(*data)
141	{
142	case PT_ANY:
143	if (isprop) return !negated;
144	break;
145
146	case PT_LAMP:
147	if ((prop->chartype == ucp_Lu \|\| prop->chartype == ucp_Ll \|\|
148	prop->chartype == ucp_Lt) == isprop) return !negated;
149	break;
150
151	case PT_GC:
152	if ((data[`1`] == PRIV(ucp_gentype)[prop->chartype]) == isprop)
153	return !negated;
154	break;
155
156	case PT_PC:
157	if ((data[`1`] == prop->chartype) == isprop) return !negated;
158	break;
159
160	case PT_SC:
161	if ((data[`1`] == prop->script) == isprop) return !negated;
162	break;
163
164	case PT_SCX:
165	ok = (data[`1`] == prop->script \|\|
166	MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), data[`1`]) != `0`);
167	if (ok == isprop) return !negated;
168	break;
169
170	case PT_ALNUM:
171	if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
172	PRIV(ucp_gentype)[prop->chartype] == ucp_N) == isprop)
173	return !negated;
174	break;
175
176	/ Perl space used to exclude VT, but from Perl 5.18 it is included,*
177	which means that Perl space and POSIX space are now identical. PCRE
178	was changed at release 8.34. /*
179
180	case PT_SPACE: / Perl space /
181	case PT_PXSPACE: / POSIX space /
182	switch(c)
183	{
184	HSPACE_CASES:
185	VSPACE_CASES:
186	if (isprop) return !negated;
187	break;
188
189	default:
190	if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) == isprop)
191	return !negated;
192	break;
193	}
194	break;
195
196	case PT_WORD:
197	if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L \|\|
198	PRIV(ucp_gentype)[prop->chartype] == ucp_N \|\| c == CHAR_UNDERSCORE)
199	== isprop)
200	return !negated;
201	break;
202
203	case PT_UCNC:
204	if (c < `0xa0`)
205	{
206	if ((c == CHAR_DOLLAR_SIGN \|\| c == CHAR_COMMERCIAL_AT \|\|
207	c == CHAR_GRAVE_ACCENT) == isprop)
208	return !negated;
209	}
210	else
211	{
212	if ((c < `0xd800` \|\| c > `0xdfff`) == isprop)
213	return !negated;
214	}
215	break;
216
217	case PT_BIDICL:
218	if ((UCD_BIDICLASS_PROP(prop) == data[`1`]) == isprop)
219	return !negated;
220	break;
221
222	case PT_BOOL:
223	ok = MAPBIT(PRIV(ucd_boolprop_sets) +
224	UCD_BPROPS_PROP(prop), data[`1`]) != `0`;
225	if (ok == isprop) return !negated;
226	break;
227
228	/ The following three properties can occur only in an XCLASS, as there*
229	is no \p or \P coding for them. /*
230
231	/ Graphic character. Implement this as not Z (space or separator) and*
232	not C (other), except for Cf (format) with a few exceptions. This seems
233	to be what Perl does. The exceptional characters are:
234
235	U+061C Arabic Letter Mark
236	U+180E Mongolian Vowel Separator
237	U+2066 - U+2069 Various "isolate"s
238	*/
239
240	case PT_PXGRAPH:
241	if ((PRIV(ucp_gentype)[prop->chartype] != ucp_Z &&
242	(PRIV(ucp_gentype)[prop->chartype] != ucp_C \|\|
243	(prop->chartype == ucp_Cf &&
244	c != `0x061c` && c != `0x180e` && (c < `0x2066` \|\| c > `0x2069`))
245	)) == isprop)
246	return !negated;
247	break;
248
249	/ Printable character: same as graphic, with the addition of Zs, i.e.*
250	not Zl and not Zp, and U+180E. /*
251
252	case PT_PXPRINT:
253	if ((prop->chartype != ucp_Zl &&
254	prop->chartype != ucp_Zp &&
255	(PRIV(ucp_gentype)[prop->chartype] != ucp_C \|\|
256	(prop->chartype == ucp_Cf &&
257	c != `0x061c` && (c < `0x2066` \|\| c > `0x2069`))
258	)) == isprop)
259	return !negated;
260	break;
261
262	/ Punctuation: all Unicode punctuation, plus ASCII characters that*
263	Unicode treats as symbols rather than punctuation, for Perl
264	compatibility (these are $+<=>^`\|~). /*
265
266	case PT_PXPUNCT:
267	if ((PRIV(ucp_gentype)[prop->chartype] == ucp_P \|\|
268	(c < `128` && PRIV(ucp_gentype)[prop->chartype] == ucp_S)) == isprop)
269	return !negated;
270	break;
271
272	/ This should never occur, but compilers may mutter if there is no*
273	default. /*
274
275	default:
276	return FALSE;
277	}
278
279	data += `2`;
280	}
281	#else
282	(void)utf; / Avoid compiler warning /
283	#endif /* SUPPORT_UNICODE */
284	}
285
286	return negated; / char did not match /
287	}
288
289	/* End of pcre2_xclass.c */
290

Browse the source code of Godot/thirdparty/pcre2/src/pcre2_xclass.c