regc_pg_locale.c source code [PostgreSQL/src/backend/regex/regc_pg_locale.c]

1	/*-------------------------------------------------------------------------
2	*
3	* regc_pg_locale.c
4	* ctype functions adapted to work on pg_wchar (a/k/a chr),
5	* and functions to cache the results of wholesale ctype probing.
6	*
7	* This file is #included by regcomp.c; it's not meant to compile standalone.
8	*
9	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
10	* Portions Copyright (c) 1994, Regents of the University of California
11	*
12	* IDENTIFICATION
13	* src/backend/regex/regc_pg_locale.c
14	*
15	*-------------------------------------------------------------------------
16	*/
17
18	#include "catalog/pg_collation.h"
19	#include "utils/pg_locale.h"
20
21	/*
22	* To provide as much functionality as possible on a variety of platforms,
23	* without going so far as to implement everything from scratch, we use
24	* several implementation strategies depending on the situation:
25	*
26	* 1. In C/POSIX collations, we use hard-wired code. We can't depend on
27	* the <ctype.h> functions since those will obey LC_CTYPE. Note that these
28	* collations don't give a fig about multibyte characters.
29	*
30	* 2. In the "default" collation (which is supposed to obey LC_CTYPE):
31	*
32	* 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
33	* This assumes that every platform uses Unicode codepoints directly
34	* as the wchar_t representation of Unicode. On some platforms
35	* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
36	*
37	* 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
38	* values up to 255, and punt for values above that. This is 100% correct
39	* only in single-byte encodings such as LATINn. However, non-Unicode
40	* multibyte encodings are mostly Far Eastern character sets for which the
41	* properties being tested here aren't very relevant for higher code values
42	* anyway. The difficulty with using the <wctype.h> functions with
43	* non-Unicode multibyte encodings is that we can have no certainty that
44	* the platform's wchar_t representation matches what we do in pg_wchar
45	* conversions.
46	*
47	* 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
48	* Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
49	* functions, under exactly the same cases as #2.
50	*
51	* There is one notable difference between cases 2 and 3: in the "default"
52	* collation we force ASCII letters to follow ASCII upcase/downcase rules,
53	* while in a non-default collation we just let the library functions do what
54	* they will. The case where this matters is treatment of I/i in Turkish,
55	* and the behavior is meant to match the upper()/lower() SQL functions.
56	*
57	* We store the active collation setting in static variables. In principle
58	* it could be passed down to here via the regex library's "struct vars" data
59	* structure; but that would require somewhat invasive changes in the regex
60	* library, and right now there's no real benefit to be gained from that.
61	*
62	* NB: the coding here assumes pg_wchar is an unsigned type.
63	*/
64
/*
 * Implementation strategies for the character-classification and
 * case-mapping functions in this file.  One of these is chosen per
 * collation/encoding combination and then drives the switch statements in
 * the pg_wc_* functions.
 */
typedef enum
{
	PG_REGEX_LOCALE_C,			/* C locale (encoding independent) */
	PG_REGEX_LOCALE_WIDE,		/* Use <wctype.h> functions */
	PG_REGEX_LOCALE_1BYTE,		/* Use <ctype.h> functions */
	PG_REGEX_LOCALE_WIDE_L,		/* Use locale_t <wctype.h> functions */
	PG_REGEX_LOCALE_1BYTE_L,	/* Use locale_t <ctype.h> functions */
	PG_REGEX_LOCALE_ICU			/* Use ICU uchar.h functions */
} PG_Locale_Strategy;
74
75	static PG_Locale_Strategy pg_regex_strategy;
76	static pg_locale_t pg_regex_locale;
77	static Oid pg_regex_collation;
78
79	/*
80	* Hard-wired character properties for C locale
81	*/
#define PG_ISDIGIT	0x01		/* decimal digit */
#define PG_ISALPHA	0x02		/* letter */
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)	/* letter or digit */
#define PG_ISUPPER	0x04		/* upper-case letter */
#define PG_ISLOWER	0x08		/* lower-case letter */
#define PG_ISGRAPH	0x10		/* printable, excluding space */
#define PG_ISPRINT	0x20		/* printable, including space */
#define PG_ISPUNCT	0x40		/* punctuation */
#define PG_ISSPACE	0x80		/* whitespace */
91
92	static const unsigned char pg_char_properties[`128`] = {
93	/ NUL / `0`,
94	/ ^A / `0`,
95	/ ^B / `0`,
96	/ ^C / `0`,
97	/ ^D / `0`,
98	/ ^E / `0`,
99	/ ^F / `0`,
100	/ ^G / `0`,
101	/ ^H / `0`,
102	/ ^I / PG_ISSPACE,
103	/ ^J / PG_ISSPACE,
104	/ ^K / PG_ISSPACE,
105	/ ^L / PG_ISSPACE,
106	/ ^M / PG_ISSPACE,
107	/ ^N / `0`,
108	/ ^O / `0`,
109	/ ^P / `0`,
110	/ ^Q / `0`,
111	/ ^R / `0`,
112	/ ^S / `0`,
113	/ ^T / `0`,
114	/ ^U / `0`,
115	/ ^V / `0`,
116	/ ^W / `0`,
117	/ ^X / `0`,
118	/ ^Y / `0`,
119	/ ^Z / `0`,
120	/ ^[ / `0`,
121	/ ^\ / `0`,
122	/ ^] / `0`,
123	/ ^^ / `0`,
124	/ ^_ / `0`,
125	/ / PG_ISPRINT \| PG_ISSPACE,
126	/ ! / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
127	/ " / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
128	/ # / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
129	/ $ / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
130	/ % / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
131	/ & / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
132	/ ' / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
133	/ ( / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
134	/ ) / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
135	/ * / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
136	/ + / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
137	/ , / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
138	/ - / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
139	/ . / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
140	/ / / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
141	/ 0 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
142	/ 1 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
143	/ 2 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
144	/ 3 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
145	/ 4 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
146	/ 5 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
147	/ 6 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
148	/ 7 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
149	/ 8 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
150	/ 9 / PG_ISDIGIT \| PG_ISGRAPH \| PG_ISPRINT,
151	/ : / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
152	/ ; / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
153	/ < / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
154	/ = / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
155	/ > / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
156	/ ? / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
157	/ @ / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
158	/ A / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
159	/ B / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
160	/ C / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
161	/ D / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
162	/ E / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
163	/ F / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
164	/ G / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
165	/ H / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
166	/ I / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
167	/ J / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
168	/ K / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
169	/ L / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
170	/ M / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
171	/ N / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
172	/ O / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
173	/ P / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
174	/ Q / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
175	/ R / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
176	/ S / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
177	/ T / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
178	/ U / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
179	/ V / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
180	/ W / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
181	/ X / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
182	/ Y / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
183	/ Z / PG_ISALPHA \| PG_ISUPPER \| PG_ISGRAPH \| PG_ISPRINT,
184	/ [ / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
185	/ \ / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
186	/ ] / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
187	/ ^ / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
188	/ _ / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
189	/ ` / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
190	/ a / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
191	/ b / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
192	/ c / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
193	/ d / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
194	/ e / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
195	/ f / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
196	/ g / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
197	/ h / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
198	/ i / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
199	/ j / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
200	/ k / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
201	/ l / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
202	/ m / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
203	/ n / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
204	/ o / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
205	/ p / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
206	/ q / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
207	/ r / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
208	/ s / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
209	/ t / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
210	/ u / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
211	/ v / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
212	/ w / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
213	/ x / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
214	/ y / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
215	/ z / PG_ISALPHA \| PG_ISLOWER \| PG_ISGRAPH \| PG_ISPRINT,
216	/ { / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
217	/ \| / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
218	/ } / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
219	/ ~ / PG_ISGRAPH \| PG_ISPRINT \| PG_ISPUNCT,
220	/ DEL / `0`
221	};
222
223
224	/*
225	* pg_set_regex_collation: set collation for these functions to obey
226	*
227	* This is called when beginning compilation or execution of a regexp.
228	* Since there's no need for reentrancy of regexp operations, it's okay
229	* to store the results in static variables.
230	*/
231	void
232	pg_set_regex_collation(Oid collation)
233	{
234	if (lc_ctype_is_c(collation))
235	{
236	/ C/POSIX collations use this path regardless of database encoding /
237	pg_regex_strategy = PG_REGEX_LOCALE_C;
238	pg_regex_locale = `0`;
239	pg_regex_collation = C_COLLATION_OID;
240	}
241	else
242	{
243	if (collation == DEFAULT_COLLATION_OID)
244	pg_regex_locale = `0`;
245	else if (OidIsValid(collation))
246	{
247	/*
248	* NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T;
249	* the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not
250	* have to be considered below.
251	*/
252	pg_regex_locale = pg_newlocale_from_collation(collation);
253	}
254	else
255	{
256	/*
257	* This typically means that the parser could not resolve a
258	* conflict of implicit collations, so report it that way.
259	*/
260	ereport(ERROR,
261	(errcode(ERRCODE_INDETERMINATE_COLLATION),
262	errmsg("could not determine which collation to use for regular expression"),
263	errhint("Use the COLLATE clause to set the collation explicitly.")));
264	}
265
266	if (pg_regex_locale && !pg_regex_locale->deterministic)
267	ereport(ERROR,
268	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
269	errmsg("nondeterministic collations are not supported for regular expressions")));
270
271	#ifdef USE_ICU
272	if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU)
273	pg_regex_strategy = PG_REGEX_LOCALE_ICU;
274	else
275	#endif
276	if (GetDatabaseEncoding() == PG_UTF8)
277	{
278	if (pg_regex_locale)
279	pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
280	else
281	pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
282	}
283	else
284	{
285	if (pg_regex_locale)
286	pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
287	else
288	pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
289	}
290
291	pg_regex_collation = collation;
292	}
293	}
294
295	static int
296	pg_wc_isdigit(pg_wchar c)
297	{
298	switch (pg_regex_strategy)
299	{
300	case PG_REGEX_LOCALE_C:
301	return (c <= (pg_wchar) `127` &&
302	(pg_char_properties[c] & PG_ISDIGIT));
303	case PG_REGEX_LOCALE_WIDE:
304	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
305	return iswdigit((wint_t) c);
306	/ FALL THRU /
307	case PG_REGEX_LOCALE_1BYTE:
308	return (c <= (pg_wchar) UCHAR_MAX &&
309	isdigit((unsigned char) c));
310	case PG_REGEX_LOCALE_WIDE_L:
311	#ifdef HAVE_LOCALE_T
312	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
313	return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
314	#endif
315	/ FALL THRU /
316	case PG_REGEX_LOCALE_1BYTE_L:
317	#ifdef HAVE_LOCALE_T
318	return (c <= (pg_wchar) UCHAR_MAX &&
319	isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
320	#endif
321	break;
322	case PG_REGEX_LOCALE_ICU:
323	#ifdef USE_ICU
324	return u_isdigit(c);
325	#endif
326	break;
327	}
328	return `0`; / can't get here, but keep compiler quiet /
329	}
330
331	static int
332	pg_wc_isalpha(pg_wchar c)
333	{
334	switch (pg_regex_strategy)
335	{
336	case PG_REGEX_LOCALE_C:
337	return (c <= (pg_wchar) `127` &&
338	(pg_char_properties[c] & PG_ISALPHA));
339	case PG_REGEX_LOCALE_WIDE:
340	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
341	return iswalpha((wint_t) c);
342	/ FALL THRU /
343	case PG_REGEX_LOCALE_1BYTE:
344	return (c <= (pg_wchar) UCHAR_MAX &&
345	isalpha((unsigned char) c));
346	case PG_REGEX_LOCALE_WIDE_L:
347	#ifdef HAVE_LOCALE_T
348	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
349	return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
350	#endif
351	/ FALL THRU /
352	case PG_REGEX_LOCALE_1BYTE_L:
353	#ifdef HAVE_LOCALE_T
354	return (c <= (pg_wchar) UCHAR_MAX &&
355	isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
356	#endif
357	break;
358	case PG_REGEX_LOCALE_ICU:
359	#ifdef USE_ICU
360	return u_isalpha(c);
361	#endif
362	break;
363	}
364	return `0`; / can't get here, but keep compiler quiet /
365	}
366
367	static int
368	pg_wc_isalnum(pg_wchar c)
369	{
370	switch (pg_regex_strategy)
371	{
372	case PG_REGEX_LOCALE_C:
373	return (c <= (pg_wchar) `127` &&
374	(pg_char_properties[c] & PG_ISALNUM));
375	case PG_REGEX_LOCALE_WIDE:
376	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
377	return iswalnum((wint_t) c);
378	/ FALL THRU /
379	case PG_REGEX_LOCALE_1BYTE:
380	return (c <= (pg_wchar) UCHAR_MAX &&
381	isalnum((unsigned char) c));
382	case PG_REGEX_LOCALE_WIDE_L:
383	#ifdef HAVE_LOCALE_T
384	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
385	return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
386	#endif
387	/ FALL THRU /
388	case PG_REGEX_LOCALE_1BYTE_L:
389	#ifdef HAVE_LOCALE_T
390	return (c <= (pg_wchar) UCHAR_MAX &&
391	isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
392	#endif
393	break;
394	case PG_REGEX_LOCALE_ICU:
395	#ifdef USE_ICU
396	return u_isalnum(c);
397	#endif
398	break;
399	}
400	return `0`; / can't get here, but keep compiler quiet /
401	}
402
403	static int
404	pg_wc_isupper(pg_wchar c)
405	{
406	switch (pg_regex_strategy)
407	{
408	case PG_REGEX_LOCALE_C:
409	return (c <= (pg_wchar) `127` &&
410	(pg_char_properties[c] & PG_ISUPPER));
411	case PG_REGEX_LOCALE_WIDE:
412	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
413	return iswupper((wint_t) c);
414	/ FALL THRU /
415	case PG_REGEX_LOCALE_1BYTE:
416	return (c <= (pg_wchar) UCHAR_MAX &&
417	isupper((unsigned char) c));
418	case PG_REGEX_LOCALE_WIDE_L:
419	#ifdef HAVE_LOCALE_T
420	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
421	return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
422	#endif
423	/ FALL THRU /
424	case PG_REGEX_LOCALE_1BYTE_L:
425	#ifdef HAVE_LOCALE_T
426	return (c <= (pg_wchar) UCHAR_MAX &&
427	isupper_l((unsigned char) c, pg_regex_locale->info.lt));
428	#endif
429	break;
430	case PG_REGEX_LOCALE_ICU:
431	#ifdef USE_ICU
432	return u_isupper(c);
433	#endif
434	break;
435	}
436	return `0`; / can't get here, but keep compiler quiet /
437	}
438
439	static int
440	pg_wc_islower(pg_wchar c)
441	{
442	switch (pg_regex_strategy)
443	{
444	case PG_REGEX_LOCALE_C:
445	return (c <= (pg_wchar) `127` &&
446	(pg_char_properties[c] & PG_ISLOWER));
447	case PG_REGEX_LOCALE_WIDE:
448	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
449	return iswlower((wint_t) c);
450	/ FALL THRU /
451	case PG_REGEX_LOCALE_1BYTE:
452	return (c <= (pg_wchar) UCHAR_MAX &&
453	islower((unsigned char) c));
454	case PG_REGEX_LOCALE_WIDE_L:
455	#ifdef HAVE_LOCALE_T
456	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
457	return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
458	#endif
459	/ FALL THRU /
460	case PG_REGEX_LOCALE_1BYTE_L:
461	#ifdef HAVE_LOCALE_T
462	return (c <= (pg_wchar) UCHAR_MAX &&
463	islower_l((unsigned char) c, pg_regex_locale->info.lt));
464	#endif
465	break;
466	case PG_REGEX_LOCALE_ICU:
467	#ifdef USE_ICU
468	return u_islower(c);
469	#endif
470	break;
471	}
472	return `0`; / can't get here, but keep compiler quiet /
473	}
474
475	static int
476	pg_wc_isgraph(pg_wchar c)
477	{
478	switch (pg_regex_strategy)
479	{
480	case PG_REGEX_LOCALE_C:
481	return (c <= (pg_wchar) `127` &&
482	(pg_char_properties[c] & PG_ISGRAPH));
483	case PG_REGEX_LOCALE_WIDE:
484	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
485	return iswgraph((wint_t) c);
486	/ FALL THRU /
487	case PG_REGEX_LOCALE_1BYTE:
488	return (c <= (pg_wchar) UCHAR_MAX &&
489	isgraph((unsigned char) c));
490	case PG_REGEX_LOCALE_WIDE_L:
491	#ifdef HAVE_LOCALE_T
492	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
493	return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
494	#endif
495	/ FALL THRU /
496	case PG_REGEX_LOCALE_1BYTE_L:
497	#ifdef HAVE_LOCALE_T
498	return (c <= (pg_wchar) UCHAR_MAX &&
499	isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
500	#endif
501	break;
502	case PG_REGEX_LOCALE_ICU:
503	#ifdef USE_ICU
504	return u_isgraph(c);
505	#endif
506	break;
507	}
508	return `0`; / can't get here, but keep compiler quiet /
509	}
510
511	static int
512	pg_wc_isprint(pg_wchar c)
513	{
514	switch (pg_regex_strategy)
515	{
516	case PG_REGEX_LOCALE_C:
517	return (c <= (pg_wchar) `127` &&
518	(pg_char_properties[c] & PG_ISPRINT));
519	case PG_REGEX_LOCALE_WIDE:
520	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
521	return iswprint((wint_t) c);
522	/ FALL THRU /
523	case PG_REGEX_LOCALE_1BYTE:
524	return (c <= (pg_wchar) UCHAR_MAX &&
525	isprint((unsigned char) c));
526	case PG_REGEX_LOCALE_WIDE_L:
527	#ifdef HAVE_LOCALE_T
528	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
529	return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
530	#endif
531	/ FALL THRU /
532	case PG_REGEX_LOCALE_1BYTE_L:
533	#ifdef HAVE_LOCALE_T
534	return (c <= (pg_wchar) UCHAR_MAX &&
535	isprint_l((unsigned char) c, pg_regex_locale->info.lt));
536	#endif
537	break;
538	case PG_REGEX_LOCALE_ICU:
539	#ifdef USE_ICU
540	return u_isprint(c);
541	#endif
542	break;
543	}
544	return `0`; / can't get here, but keep compiler quiet /
545	}
546
547	static int
548	pg_wc_ispunct(pg_wchar c)
549	{
550	switch (pg_regex_strategy)
551	{
552	case PG_REGEX_LOCALE_C:
553	return (c <= (pg_wchar) `127` &&
554	(pg_char_properties[c] & PG_ISPUNCT));
555	case PG_REGEX_LOCALE_WIDE:
556	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
557	return iswpunct((wint_t) c);
558	/ FALL THRU /
559	case PG_REGEX_LOCALE_1BYTE:
560	return (c <= (pg_wchar) UCHAR_MAX &&
561	ispunct((unsigned char) c));
562	case PG_REGEX_LOCALE_WIDE_L:
563	#ifdef HAVE_LOCALE_T
564	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
565	return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
566	#endif
567	/ FALL THRU /
568	case PG_REGEX_LOCALE_1BYTE_L:
569	#ifdef HAVE_LOCALE_T
570	return (c <= (pg_wchar) UCHAR_MAX &&
571	ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
572	#endif
573	break;
574	case PG_REGEX_LOCALE_ICU:
575	#ifdef USE_ICU
576	return u_ispunct(c);
577	#endif
578	break;
579	}
580	return `0`; / can't get here, but keep compiler quiet /
581	}
582
583	static int
584	pg_wc_isspace(pg_wchar c)
585	{
586	switch (pg_regex_strategy)
587	{
588	case PG_REGEX_LOCALE_C:
589	return (c <= (pg_wchar) `127` &&
590	(pg_char_properties[c] & PG_ISSPACE));
591	case PG_REGEX_LOCALE_WIDE:
592	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
593	return iswspace((wint_t) c);
594	/ FALL THRU /
595	case PG_REGEX_LOCALE_1BYTE:
596	return (c <= (pg_wchar) UCHAR_MAX &&
597	isspace((unsigned char) c));
598	case PG_REGEX_LOCALE_WIDE_L:
599	#ifdef HAVE_LOCALE_T
600	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
601	return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
602	#endif
603	/ FALL THRU /
604	case PG_REGEX_LOCALE_1BYTE_L:
605	#ifdef HAVE_LOCALE_T
606	return (c <= (pg_wchar) UCHAR_MAX &&
607	isspace_l((unsigned char) c, pg_regex_locale->info.lt));
608	#endif
609	break;
610	case PG_REGEX_LOCALE_ICU:
611	#ifdef USE_ICU
612	return u_isspace(c);
613	#endif
614	break;
615	}
616	return `0`; / can't get here, but keep compiler quiet /
617	}
618
619	static pg_wchar
620	pg_wc_toupper(pg_wchar c)
621	{
622	switch (pg_regex_strategy)
623	{
624	case PG_REGEX_LOCALE_C:
625	if (c <= (pg_wchar) `127`)
626	return pg_ascii_toupper((unsigned char) c);
627	return c;
628	case PG_REGEX_LOCALE_WIDE:
629	/ force C behavior for ASCII characters, per comments above /
630	if (c <= (pg_wchar) `127`)
631	return pg_ascii_toupper((unsigned char) c);
632	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
633	return towupper((wint_t) c);
634	/ FALL THRU /
635	case PG_REGEX_LOCALE_1BYTE:
636	/ force C behavior for ASCII characters, per comments above /
637	if (c <= (pg_wchar) `127`)
638	return pg_ascii_toupper((unsigned char) c);
639	if (c <= (pg_wchar) UCHAR_MAX)
640	return toupper((unsigned char) c);
641	return c;
642	case PG_REGEX_LOCALE_WIDE_L:
643	#ifdef HAVE_LOCALE_T
644	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
645	return towupper_l((wint_t) c, pg_regex_locale->info.lt);
646	#endif
647	/ FALL THRU /
648	case PG_REGEX_LOCALE_1BYTE_L:
649	#ifdef HAVE_LOCALE_T
650	if (c <= (pg_wchar) UCHAR_MAX)
651	return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
652	#endif
653	return c;
654	case PG_REGEX_LOCALE_ICU:
655	#ifdef USE_ICU
656	return u_toupper(c);
657	#endif
658	break;
659	}
660	return `0`; / can't get here, but keep compiler quiet /
661	}
662
663	static pg_wchar
664	pg_wc_tolower(pg_wchar c)
665	{
666	switch (pg_regex_strategy)
667	{
668	case PG_REGEX_LOCALE_C:
669	if (c <= (pg_wchar) `127`)
670	return pg_ascii_tolower((unsigned char) c);
671	return c;
672	case PG_REGEX_LOCALE_WIDE:
673	/ force C behavior for ASCII characters, per comments above /
674	if (c <= (pg_wchar) `127`)
675	return pg_ascii_tolower((unsigned char) c);
676	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
677	return towlower((wint_t) c);
678	/ FALL THRU /
679	case PG_REGEX_LOCALE_1BYTE:
680	/ force C behavior for ASCII characters, per comments above /
681	if (c <= (pg_wchar) `127`)
682	return pg_ascii_tolower((unsigned char) c);
683	if (c <= (pg_wchar) UCHAR_MAX)
684	return tolower((unsigned char) c);
685	return c;
686	case PG_REGEX_LOCALE_WIDE_L:
687	#ifdef HAVE_LOCALE_T
688	if (sizeof(wchar_t) >= `4` \|\| c <= (pg_wchar) `0xFFFF`)
689	return towlower_l((wint_t) c, pg_regex_locale->info.lt);
690	#endif
691	/ FALL THRU /
692	case PG_REGEX_LOCALE_1BYTE_L:
693	#ifdef HAVE_LOCALE_T
694	if (c <= (pg_wchar) UCHAR_MAX)
695	return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
696	#endif
697	return c;
698	case PG_REGEX_LOCALE_ICU:
699	#ifdef USE_ICU
700	return u_tolower(c);
701	#endif
702	break;
703	}
704	return `0`; / can't get here, but keep compiler quiet /
705	}
706
707
708	/*
709	* These functions cache the results of probing libc's ctype behavior for
710	* all character codes of interest in a given encoding/collation. The
711	* result is provided as a "struct cvec", but notice that the representation
712	* is a touch different from a cvec created by regc_cvec.c: we allocate the
713	* chrs[] and ranges[] arrays separately from the struct so that we can
714	* realloc them larger at need. This is okay since the cvecs made here
715	* should never be freed by freecvec().
716	*
717	* We use malloc not palloc since we mustn't lose control on out-of-memory;
718	* the main regex code expects us to return a failure indication instead.
719	*/
720
721	typedef int (*pg_wc_probefunc) (pg_wchar c);
722
723	typedef struct pg_ctype_cache
724	{
725	pg_wc_probefunc probefunc; / pg_wc_isalpha or a sibling /
726	Oid collation; / collation this entry is for /
727	struct cvec cv; / cache entry contents /
728	struct pg_ctype_cache next; /* chain link /
729	} pg_ctype_cache;
730
731	static pg_ctype_cache *pg_ctype_cache_list = NULL;
732
733	/*
734	* Add a chr or range to pcc->cv; return false if run out of memory
735	*/
736	static bool
737	store_match(pg_ctype_cache pcc, pg_wchar chr1, int* nchrs)
738	{
739	chr *newchrs;
740
741	if (nchrs > `1`)
742	{
743	if (pcc->cv.nranges >= pcc->cv.rangespace)
744	{
745	pcc->cv.rangespace *= `2`;
746	newchrs = (chr *) realloc(pcc->cv.ranges,
747	pcc->cv.rangespace * sizeof(chr) * `2`);
748	if (newchrs == NULL)
749	return false;
750	pcc->cv.ranges = newchrs;
751	}
752	pcc->cv.ranges[pcc->cv.nranges * `2`] = chr1;
753	pcc->cv.ranges[pcc->cv.nranges * `2` + `1`] = chr1 + nchrs - `1`;
754	pcc->cv.nranges++;
755	}
756	else
757	{
758	assert(nchrs == `1`);
759	if (pcc->cv.nchrs >= pcc->cv.chrspace)
760	{
761	pcc->cv.chrspace *= `2`;
762	newchrs = (chr *) realloc(pcc->cv.chrs,
763	pcc->cv.chrspace * sizeof(chr));
764	if (newchrs == NULL)
765	return false;
766	pcc->cv.chrs = newchrs;
767	}
768	pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
769	}
770	return true;
771	}
772
773	/*
774	* Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
775	* chrs satisfying the probe function. The active collation is the one
776	* previously set by pg_set_regex_collation. Return NULL if out of memory.
777	*
778	* Note that the result must not be freed or modified by caller.
779	*/
780	static struct cvec *
781	pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
782	{
783	pg_ctype_cache *pcc;
784	pg_wchar max_chr;
785	pg_wchar cur_chr;
786	int nmatches;
787	chr *newchrs;
788
789	/*
790	* Do we already have the answer cached?
791	*/
792	for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
793	{
794	if (pcc->probefunc == probefunc &&
795	pcc->collation == pg_regex_collation)
796	return &pcc->cv;
797	}
798
799	/*
800	* Nope, so initialize some workspace ...
801	*/
802	pcc = (pg_ctype_cache ) malloc(sizeof*(pg_ctype_cache));
803	if (pcc == NULL)
804	return NULL;
805	pcc->probefunc = probefunc;
806	pcc->collation = pg_regex_collation;
807	pcc->cv.nchrs = `0`;
808	pcc->cv.chrspace = `128`;
809	pcc->cv.chrs = (chr ) malloc(pcc->cv.chrspace sizeof(chr));
810	pcc->cv.nranges = `0`;
811	pcc->cv.rangespace = `64`;
812	pcc->cv.ranges = (chr ) malloc(pcc->cv.rangespace sizeof(chr) * `2`);
813	if (pcc->cv.chrs == NULL \|\| pcc->cv.ranges == NULL)
814	goto out_of_memory;
815	pcc->cv.cclasscode = cclasscode;
816
817	/*
818	* Decide how many character codes we ought to look through. In general
819	* we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
820	* runtime using the "high colormap" mechanism. However, in C locale
821	* there's no need to go further than 127, and if we only have a 1-byte
822	* <ctype.h> API there's no need to go further than that can handle.
823	*
824	* If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
825	* output cvec as not having any locale-dependent behavior, since there
826	* will be no need to do any run-time locale checks. (The #if's here
827	* would always be true for production values of MAX_SIMPLE_CHR, but it's
828	* useful to allow it to be small for testing purposes.)
829	*/
830	switch (pg_regex_strategy)
831	{
832	case PG_REGEX_LOCALE_C:
833	#if MAX_SIMPLE_CHR >= 127
834	max_chr = (pg_wchar) `127`;
835	pcc->cv.cclasscode = -`1`;
836	#else
837	max_chr = (pg_wchar) MAX_SIMPLE_CHR;
838	#endif
839	break;
840	case PG_REGEX_LOCALE_WIDE:
841	case PG_REGEX_LOCALE_WIDE_L:
842	max_chr = (pg_wchar) MAX_SIMPLE_CHR;
843	break;
844	case PG_REGEX_LOCALE_1BYTE:
845	case PG_REGEX_LOCALE_1BYTE_L:
846	#if MAX_SIMPLE_CHR >= UCHAR_MAX
847	max_chr = (pg_wchar) UCHAR_MAX;
848	pcc->cv.cclasscode = -`1`;
849	#else
850	max_chr = (pg_wchar) MAX_SIMPLE_CHR;
851	#endif
852	break;
853	case PG_REGEX_LOCALE_ICU:
854	max_chr = (pg_wchar) MAX_SIMPLE_CHR;
855	break;
856	default:
857	max_chr = `0`; / can't get here, but keep compiler quiet /
858	break;
859	}
860
861	/*
862	* And scan 'em ...
863	*/
864	nmatches = `0`; / number of consecutive matches /
865
866	for (cur_chr = `0`; cur_chr <= max_chr; cur_chr++)
867	{
868	if ((*probefunc) (cur_chr))
869	nmatches++;
870	else if (nmatches > `0`)
871	{
872	if (!store_match(pcc, cur_chr - nmatches, nmatches))
873	goto out_of_memory;
874	nmatches = `0`;
875	}
876	}
877
878	if (nmatches > `0`)
879	if (!store_match(pcc, cur_chr - nmatches, nmatches))
880	goto out_of_memory;
881
882	/*
883	* We might have allocated more memory than needed, if so free it
884	*/
885	if (pcc->cv.nchrs == `0`)
886	{
887	free(pcc->cv.chrs);
888	pcc->cv.chrs = NULL;
889	pcc->cv.chrspace = `0`;
890	}
891	else if (pcc->cv.nchrs < pcc->cv.chrspace)
892	{
893	newchrs = (chr *) realloc(pcc->cv.chrs,
894	pcc->cv.nchrs * sizeof(chr));
895	if (newchrs == NULL)
896	goto out_of_memory;
897	pcc->cv.chrs = newchrs;
898	pcc->cv.chrspace = pcc->cv.nchrs;
899	}
900	if (pcc->cv.nranges == `0`)
901	{
902	free(pcc->cv.ranges);
903	pcc->cv.ranges = NULL;
904	pcc->cv.rangespace = `0`;
905	}
906	else if (pcc->cv.nranges < pcc->cv.rangespace)
907	{
908	newchrs = (chr *) realloc(pcc->cv.ranges,
909	pcc->cv.nranges * sizeof(chr) * `2`);
910	if (newchrs == NULL)
911	goto out_of_memory;
912	pcc->cv.ranges = newchrs;
913	pcc->cv.rangespace = pcc->cv.nranges;
914	}
915
916	/*
917	* Success, link it into cache chain
918	*/
919	pcc->next = pg_ctype_cache_list;
920	pg_ctype_cache_list = pcc;
921
922	return &pcc->cv;
923
924	/*
925	* Failure, clean up
926	*/
927	out_of_memory:
928	if (pcc->cv.chrs)
929	free(pcc->cv.chrs);
930	if (pcc->cv.ranges)
931	free(pcc->cv.ranges);
932	free(pcc);
933
934	return NULL;
935	}
936

Browse the source code of PostgreSQL/src/backend/regex/regc_pg_locale.c