pcre2_script_run.c source code [Godot/thirdparty/pcre2/src/pcre2_script_run.c]

1	/*************************************************
2	* Perl-Compatible Regular Expressions *
3	*************************************************/
4
/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
7
8	Written by Philip Hazel
9	Original API code Copyright (c) 1997-2012 University of Cambridge
10	New API code Copyright (c) 2016-2021 University of Cambridge
11
12	-----------------------------------------------------------------------------
13	Redistribution and use in source and binary forms, with or without
14	modification, are permitted provided that the following conditions are met:
15
16	* Redistributions of source code must retain the above copyright notice,
17	this list of conditions and the following disclaimer.
18
19	* Redistributions in binary form must reproduce the above copyright
20	notice, this list of conditions and the following disclaimer in the
21	documentation and/or other materials provided with the distribution.
22
23	* Neither the name of the University of Cambridge nor the names of its
24	contributors may be used to endorse or promote products derived from
25	this software without specific prior written permission.
26
27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37	POSSIBILITY OF SUCH DAMAGE.
38	-----------------------------------------------------------------------------
39	*/
40
/* This module contains the function for checking a script run. */
42
43	#ifdef HAVE_CONFIG_H
44	#include "config.h"
45	#endif
46
47	#include "pcre2_internal.h"
48
49
50	/*************************************************
51	* Check script run *
52	*************************************************/
53
/* A script run is conceptually a sequence of characters all in the same
55	Unicode script. However, it isn't quite that simple. There are special rules
56	for scripts that are commonly used together, and also special rules for digits.
57	This function implements the appropriate checks, which is possible only when
58	PCRE2 is compiled with Unicode support. The function returns TRUE if there is
59	no Unicode support; however, it should never be called in that circumstance
60	because an error is given by pcre2_compile() if a script run is called for in a
61	version of PCRE2 compiled without Unicode support.
62
63	Arguments:
  ptr       points to the first character
  endptr    points after the last character
66	utf TRUE if in UTF mode
67
68	Returns: TRUE if this is a valid script run
69	*/
70
/* These are states in the checking process. */
72
/* States of the script run checker. The value is held in require_state and
records what has been learned so far about the script(s) that the remaining
characters of the run must belong to. */

enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
       SCRIPT_HANPENDING,     /* Have had only Han characters */
       SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana or Katakana */
       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
       SCRIPT_HANHANGUL       /* Expect Han or Hangul */
       };
80
/* Sizes, in 32-bit words, of the two kinds of script bitmap used below.
UCD_MAPSIZE is the size of a map that has a bit for each script numbered up to
ucp_Unknown; it is the number of words copied from PRIV(ucd_script_sets).
FULL_MAPSIZE is the size of a map that has a bit for every script. */

#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
83
84	BOOL
85	PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
86	{
87	#ifdef SUPPORT_UNICODE
88	uint32_t require_state = SCRIPT_UNSET;
89	uint32_t require_map[FULL_MAPSIZE];
90	uint32_t map[FULL_MAPSIZE];
91	uint32_t require_digitset = `0`;
92	uint32_t c;
93
94	#if PCRE2_CODE_UNIT_WIDTH == 32
95	(void)utf; / Avoid compiler warning /
96	#endif
97
98	/ Any string containing fewer than 2 characters is a valid script run. /
99
100	if (ptr >= endptr) return TRUE;
101	GETCHARINCTEST(c, ptr);
102	if (ptr >= endptr) return TRUE;
103
104	/ Initialize the require map. This is a full-size bitmap that has a bit for*
105	every script, as opposed to the maps in ucd_script_sets, which only have bits
106	for scripts less than ucp_Unknown - those that appear in script extension
107	lists. /*
108
109	for (int i = `0`; i < FULL_MAPSIZE; i++) require_map[i] = `0`;
110
111	/ Scan strings of two or more characters, checking the Unicode characteristics*
112	of each code point. There is special code for scripts that can be combined with
113	characters from the Han Chinese script. This may be used in conjunction with
114	four other scripts in these combinations:
115
116	. Han with Hiragana and Katakana is allowed (for Japanese).
117	. Han with Bopomofo is allowed (for Taiwanese Mandarin).
118	. Han with Hangul is allowed (for Korean).
119
120	If the first significant character's script is one of the four, the required
121	script type is immediately known. However, if the first significant
122	character's script is Han, we have to keep checking for a non-Han character.
123	Hence the SCRIPT_HANPENDING state. /*
124
125	for (;;)
126	{
127	const ucd_record *ucd = GET_UCD(c);
128	uint32_t script = ucd->script;
129
130	/ If the script is Unknown, the string is not a valid script run. Such*
131	characters can only form script runs of length one (see test above). /*
132
133	if (script == ucp_Unknown) return FALSE;
134
135	/ A character without any script extensions whose script is Inherited or*
136	Common is always accepted with any script. If there are extensions, the
137	following processing happens for all scripts. /*
138
139	if (UCD_SCRIPTX_PROP(ucd) != `0` \|\| (script != ucp_Inherited && script != ucp_Common))
140	{
141	BOOL OK;
142
143	/ Set up a full-sized map for this character that can include bits for all*
144	scripts. Copy the scriptx map for this character (which covers those
145	scripts that appear in script extension lists), set the remaining values to
146	zero, and then, except for Common or Inherited, add this script's bit to
147	the map. /*
148
149	memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
150	memset(map + UCD_MAPSIZE, `0`, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
151	if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
152
153	/ Handle the different checking states /
154
155	switch(require_state)
156	{
157	/ First significant character - it might follow Common or Inherited*
158	characters that do not have any script extensions. /*
159
160	case SCRIPT_UNSET:
161	switch(script)
162	{
163	case ucp_Han:
164	require_state = SCRIPT_HANPENDING;
165	break;
166
167	case ucp_Hiragana:
168	case ucp_Katakana:
169	require_state = SCRIPT_HANHIRAKATA;
170	break;
171
172	case ucp_Bopomofo:
173	require_state = SCRIPT_HANBOPOMOFO;
174	break;
175
176	case ucp_Hangul:
177	require_state = SCRIPT_HANHANGUL;
178	break;
179
180	default:
181	memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
182	require_state = SCRIPT_MAP;
183	break;
184	}
185	break;
186
187	/ The first significant character was Han. An inspection of the Unicode*
188	11.0.0 files shows that there are the following types of Script Extension
189	list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
190	scripts:
191
192	. Bopomofo + Han
193	. Han + Hiragana + Katakana
194	. Hiragana + Katakana
195	. Bopopmofo + Hangul + Han + Hiragana + Katakana
196
197	The following code tries to make sense of this. /*
198
199	#define FOUND_BOPOMOFO 1
200	#define FOUND_HIRAGANA 2
201	#define FOUND_KATAKANA 4
202	#define FOUND_HANGUL 8
203
204	case SCRIPT_HANPENDING:
205	if (script != ucp_Han) / Another Han does nothing /
206	{
207	uint32_t chspecial = `0`;
208
209	if (MAPBIT(map, ucp_Bopomofo) != `0`) chspecial \|= FOUND_BOPOMOFO;
210	if (MAPBIT(map, ucp_Hiragana) != `0`) chspecial \|= FOUND_HIRAGANA;
211	if (MAPBIT(map, ucp_Katakana) != `0`) chspecial \|= FOUND_KATAKANA;
212	if (MAPBIT(map, ucp_Hangul) != `0`) chspecial \|= FOUND_HANGUL;
213
214	if (chspecial == `0`) return FALSE; / Not allowed with Han /
215
216	if (chspecial == FOUND_BOPOMOFO)
217	require_state = SCRIPT_HANBOPOMOFO;
218	else if (chspecial == (FOUND_HIRAGANA\|FOUND_KATAKANA))
219	require_state = SCRIPT_HANHIRAKATA;
220
221	/ Otherwise this character must be allowed with all of them, so remain*
222	in the pending state. /*
223	}
224	break;
225
226	/ Previously encountered one of the "with Han" scripts. Check that*
227	this character is appropriate. /*
228
229	case SCRIPT_HANHIRAKATA:
230	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
231	MAPBIT(map, ucp_Katakana) == `0`) return FALSE;
232	break;
233
234	case SCRIPT_HANBOPOMOFO:
235	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == `0`) return FALSE;
236	break;
237
238	case SCRIPT_HANHANGUL:
239	if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == `0`) return FALSE;
240	break;
241
242	/ Previously encountered one or more characters that are allowed with a*
243	list of scripts. /*
244
245	case SCRIPT_MAP:
246	OK = FALSE;
247
248	for (int i = `0`; i < FULL_MAPSIZE; i++)
249	{
250	if ((require_map[i] & map[i]) != `0`)
251	{
252	OK = TRUE;
253	break;
254	}
255	}
256
257	if (!OK) return FALSE;
258
259	/ The rest of the string must be in this script, but we have to*
260	allow for the Han complications. /*
261
262	switch(script)
263	{
264	case ucp_Han:
265	require_state = SCRIPT_HANPENDING;
266	break;
267
268	case ucp_Hiragana:
269	case ucp_Katakana:
270	require_state = SCRIPT_HANHIRAKATA;
271	break;
272
273	case ucp_Bopomofo:
274	require_state = SCRIPT_HANBOPOMOFO;
275	break;
276
277	case ucp_Hangul:
278	require_state = SCRIPT_HANHANGUL;
279	break;
280
281	/ Compute the intersection of the required list of scripts and the*
282	allowed scripts for this character. /*
283
284	default:
285	for (int i = `0`; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
286	break;
287	}
288
289	break;
290	}
291	} / End checking character's script and extensions. /
292
293	/ The character is in an acceptable script. We must now ensure that all*
294	decimal digits in the string come from the same set. Some scripts (e.g.
295	Common, Arabic) have more than one set of decimal digits. This code does
296	not allow mixing sets, even within the same script. The vector called
297	PRIV(ucd_digit_sets)[] contains, in its first element, the number of
298	following elements, and then, in ascending order, the code points of the
299	'9' characters in every set of 10 digits. Each set is identified by the
300	offset in the vector of its '9' character. An initial check of the first
301	value picks up ASCII digits quickly. Otherwise, a binary chop is used. /*
302
303	if (ucd->chartype == ucp_Nd)
304	{
305	uint32_t digitset;
306
307	if (c <= PRIV(ucd_digit_sets)[`1`]) digitset = `1`; else
308	{
309	int mid;
310	int bot = `1`;
311	int top = PRIV(ucd_digit_sets)[`0`];
312	for (;;)
313	{
314	if (top <= bot + `1`) / <= rather than == is paranoia /
315	{
316	digitset = top;
317	break;
318	}
319	mid = (top + bot) / `2`;
320	if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
321	}
322	}
323
324	/ A required value of 0 means "unset". /
325
326	if (require_digitset == `0`) require_digitset = digitset;
327	else if (digitset != require_digitset) return FALSE;
328	} / End digit handling /
329
330	/ If we haven't yet got to the end, pick up the next character. /
331
332	if (ptr >= endptr) return TRUE;
333	GETCHARINCTEST(c, ptr);
334	} / End checking loop /
335
336	#else /* NOT SUPPORT_UNICODE */
337	(void)ptr;
338	(void)endptr;
339	(void)utf;
340	return TRUE;
341	#endif /* SUPPORT_UNICODE */
342	}
343
/* End of pcre2_script_run.c */
345

Browse the source code of Godot/thirdparty/pcre2/src/pcre2_script_run.c