1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * regc_pg_locale.c |
4 | * ctype functions adapted to work on pg_wchar (a/k/a chr), |
5 | * and functions to cache the results of wholesale ctype probing. |
6 | * |
7 | * This file is #included by regcomp.c; it's not meant to compile standalone. |
8 | * |
9 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
10 | * Portions Copyright (c) 1994, Regents of the University of California |
11 | * |
12 | * IDENTIFICATION |
13 | * src/backend/regex/regc_pg_locale.c |
14 | * |
15 | *------------------------------------------------------------------------- |
16 | */ |
17 | |
18 | #include "catalog/pg_collation.h" |
19 | #include "utils/pg_locale.h" |
20 | |
21 | /* |
22 | * To provide as much functionality as possible on a variety of platforms, |
23 | * without going so far as to implement everything from scratch, we use |
24 | * several implementation strategies depending on the situation: |
25 | * |
26 | * 1. In C/POSIX collations, we use hard-wired code. We can't depend on |
27 | * the <ctype.h> functions since those will obey LC_CTYPE. Note that these |
28 | * collations don't give a fig about multibyte characters. |
29 | * |
30 | * 2. In the "default" collation (which is supposed to obey LC_CTYPE): |
31 | * |
32 | * 2a. When working in UTF8 encoding, we use the <wctype.h> functions. |
33 | * This assumes that every platform uses Unicode codepoints directly |
34 | * as the wchar_t representation of Unicode. On some platforms |
35 | * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF. |
36 | * |
37 | * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar |
38 | * values up to 255, and punt for values above that. This is 100% correct |
39 | * only in single-byte encodings such as LATINn. However, non-Unicode |
40 | * multibyte encodings are mostly Far Eastern character sets for which the |
41 | * properties being tested here aren't very relevant for higher code values |
42 | * anyway. The difficulty with using the <wctype.h> functions with |
43 | * non-Unicode multibyte encodings is that we can have no certainty that |
44 | * the platform's wchar_t representation matches what we do in pg_wchar |
45 | * conversions. |
46 | * |
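 * 3.  Other collations are only supported on platforms that HAVE_LOCALE_T.
 * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
 * functions, under exactly the same cases as #2.
 *
 * 4.  When built with USE_ICU, collations whose provider is ICU use the ICU
 * uchar.h functions (u_isalpha() and friends) instead; this choice is made
 * before the database encoding is examined.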
50 | * |
51 | * There is one notable difference between cases 2 and 3: in the "default" |
52 | * collation we force ASCII letters to follow ASCII upcase/downcase rules, |
53 | * while in a non-default collation we just let the library functions do what |
54 | * they will. The case where this matters is treatment of I/i in Turkish, |
55 | * and the behavior is meant to match the upper()/lower() SQL functions. |
56 | * |
57 | * We store the active collation setting in static variables. In principle |
58 | * it could be passed down to here via the regex library's "struct vars" data |
59 | * structure; but that would require somewhat invasive changes in the regex |
60 | * library, and right now there's no real benefit to be gained from that. |
61 | * |
62 | * NB: the coding here assumes pg_wchar is an unsigned type. |
63 | */ |
64 | |
/*
 * Selects which implementation the pg_wc_* functions dispatch to.  The
 * active value is stored in pg_regex_strategy by pg_set_regex_collation().
 */
typedef enum
{
	PG_REGEX_LOCALE_C,			/* C locale (encoding independent) */
	PG_REGEX_LOCALE_WIDE,		/* Use <wctype.h> functions */
	PG_REGEX_LOCALE_1BYTE,		/* Use <ctype.h> functions */
	PG_REGEX_LOCALE_WIDE_L,		/* Use locale_t <wctype.h> functions */
	PG_REGEX_LOCALE_1BYTE_L,	/* Use locale_t <ctype.h> functions */
	PG_REGEX_LOCALE_ICU			/* Use ICU uchar.h functions */
} PG_Locale_Strategy;
74 | |
/* State written by pg_set_regex_collation() and read by the code below */
static PG_Locale_Strategy pg_regex_strategy;	/* active dispatch strategy */
static pg_locale_t pg_regex_locale; /* 0 for C and default collations */
static Oid	pg_regex_collation; /* collation OID used to key the ctype cache */
78 | |
79 | /* |
80 | * Hard-wired character properties for C locale |
81 | */ |
/* Bit flags combined in each pg_char_properties[] entry (fits unsigned char) */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)	/* mask: either bit set */
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80
91 | |
/*
 * Property flags for the 128 ASCII codes, indexed by character code.
 * Consulted only under PG_REGEX_LOCALE_C, and only after the caller has
 * checked that the code is <= 127.
 */
static const unsigned char pg_char_properties[128] = {
	 /* NUL */ 0,
	 /* ^A */ 0,
	 /* ^B */ 0,
	 /* ^C */ 0,
	 /* ^D */ 0,
	 /* ^E */ 0,
	 /* ^F */ 0,
	 /* ^G */ 0,
	 /* ^H */ 0,
	 /* ^I */ PG_ISSPACE,
	 /* ^J */ PG_ISSPACE,
	 /* ^K */ PG_ISSPACE,
	 /* ^L */ PG_ISSPACE,
	 /* ^M */ PG_ISSPACE,
	 /* ^N */ 0,
	 /* ^O */ 0,
	 /* ^P */ 0,
	 /* ^Q */ 0,
	 /* ^R */ 0,
	 /* ^S */ 0,
	 /* ^T */ 0,
	 /* ^U */ 0,
	 /* ^V */ 0,
	 /* ^W */ 0,
	 /* ^X */ 0,
	 /* ^Y */ 0,
	 /* ^Z */ 0,
	 /* ^[ */ 0,
	 /* ^\ */ 0,
	 /* ^] */ 0,
	 /* ^^ */ 0,
	 /* ^_ */ 0,
	 /* SP */ PG_ISPRINT | PG_ISSPACE,
	 /* !  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* "  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* #  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* $  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* %  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* &  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* '  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* (  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* )  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* *  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* +  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* ,  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* -  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* .  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* /  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* 0  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 1  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 2  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 3  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 4  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 5  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 6  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 7  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 8  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* 9  */ PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT,
	 /* :  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* ;  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* <  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* =  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* >  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* ?  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* @  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* A  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* B  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* C  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* D  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* E  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* F  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* G  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* H  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* I  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* J  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* K  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* L  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* M  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* N  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* O  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* P  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* Q  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* R  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* S  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* T  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* U  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* V  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* W  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* X  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* Y  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* Z  */ PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT,
	 /* [  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* \  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* ]  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* ^  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* _  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* `  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* a  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* b  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* c  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* d  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* e  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* f  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* g  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* h  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* i  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* j  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* k  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* l  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* m  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* n  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* o  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* p  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* q  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* r  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* s  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* t  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* u  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* v  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* w  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* x  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* y  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* z  */ PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT,
	 /* {  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* |  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* }  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* ~  */ PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT,
	 /* DEL */ 0
};
222 | |
223 | |
224 | /* |
225 | * pg_set_regex_collation: set collation for these functions to obey |
226 | * |
227 | * This is called when beginning compilation or execution of a regexp. |
228 | * Since there's no need for reentrancy of regexp operations, it's okay |
229 | * to store the results in static variables. |
230 | */ |
231 | void |
232 | pg_set_regex_collation(Oid collation) |
233 | { |
234 | if (lc_ctype_is_c(collation)) |
235 | { |
236 | /* C/POSIX collations use this path regardless of database encoding */ |
237 | pg_regex_strategy = PG_REGEX_LOCALE_C; |
238 | pg_regex_locale = 0; |
239 | pg_regex_collation = C_COLLATION_OID; |
240 | } |
241 | else |
242 | { |
243 | if (collation == DEFAULT_COLLATION_OID) |
244 | pg_regex_locale = 0; |
245 | else if (OidIsValid(collation)) |
246 | { |
247 | /* |
248 | * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T; |
249 | * the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not |
250 | * have to be considered below. |
251 | */ |
252 | pg_regex_locale = pg_newlocale_from_collation(collation); |
253 | } |
254 | else |
255 | { |
256 | /* |
257 | * This typically means that the parser could not resolve a |
258 | * conflict of implicit collations, so report it that way. |
259 | */ |
260 | ereport(ERROR, |
261 | (errcode(ERRCODE_INDETERMINATE_COLLATION), |
262 | errmsg("could not determine which collation to use for regular expression" ), |
263 | errhint("Use the COLLATE clause to set the collation explicitly." ))); |
264 | } |
265 | |
266 | if (pg_regex_locale && !pg_regex_locale->deterministic) |
267 | ereport(ERROR, |
268 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
269 | errmsg("nondeterministic collations are not supported for regular expressions" ))); |
270 | |
271 | #ifdef USE_ICU |
272 | if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU) |
273 | pg_regex_strategy = PG_REGEX_LOCALE_ICU; |
274 | else |
275 | #endif |
276 | if (GetDatabaseEncoding() == PG_UTF8) |
277 | { |
278 | if (pg_regex_locale) |
279 | pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L; |
280 | else |
281 | pg_regex_strategy = PG_REGEX_LOCALE_WIDE; |
282 | } |
283 | else |
284 | { |
285 | if (pg_regex_locale) |
286 | pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L; |
287 | else |
288 | pg_regex_strategy = PG_REGEX_LOCALE_1BYTE; |
289 | } |
290 | |
291 | pg_regex_collation = collation; |
292 | } |
293 | } |
294 | |
295 | static int |
296 | pg_wc_isdigit(pg_wchar c) |
297 | { |
298 | switch (pg_regex_strategy) |
299 | { |
300 | case PG_REGEX_LOCALE_C: |
301 | return (c <= (pg_wchar) 127 && |
302 | (pg_char_properties[c] & PG_ISDIGIT)); |
303 | case PG_REGEX_LOCALE_WIDE: |
304 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
305 | return iswdigit((wint_t) c); |
306 | /* FALL THRU */ |
307 | case PG_REGEX_LOCALE_1BYTE: |
308 | return (c <= (pg_wchar) UCHAR_MAX && |
309 | isdigit((unsigned char) c)); |
310 | case PG_REGEX_LOCALE_WIDE_L: |
311 | #ifdef HAVE_LOCALE_T |
312 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
313 | return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); |
314 | #endif |
315 | /* FALL THRU */ |
316 | case PG_REGEX_LOCALE_1BYTE_L: |
317 | #ifdef HAVE_LOCALE_T |
318 | return (c <= (pg_wchar) UCHAR_MAX && |
319 | isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); |
320 | #endif |
321 | break; |
322 | case PG_REGEX_LOCALE_ICU: |
323 | #ifdef USE_ICU |
324 | return u_isdigit(c); |
325 | #endif |
326 | break; |
327 | } |
328 | return 0; /* can't get here, but keep compiler quiet */ |
329 | } |
330 | |
331 | static int |
332 | pg_wc_isalpha(pg_wchar c) |
333 | { |
334 | switch (pg_regex_strategy) |
335 | { |
336 | case PG_REGEX_LOCALE_C: |
337 | return (c <= (pg_wchar) 127 && |
338 | (pg_char_properties[c] & PG_ISALPHA)); |
339 | case PG_REGEX_LOCALE_WIDE: |
340 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
341 | return iswalpha((wint_t) c); |
342 | /* FALL THRU */ |
343 | case PG_REGEX_LOCALE_1BYTE: |
344 | return (c <= (pg_wchar) UCHAR_MAX && |
345 | isalpha((unsigned char) c)); |
346 | case PG_REGEX_LOCALE_WIDE_L: |
347 | #ifdef HAVE_LOCALE_T |
348 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
349 | return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); |
350 | #endif |
351 | /* FALL THRU */ |
352 | case PG_REGEX_LOCALE_1BYTE_L: |
353 | #ifdef HAVE_LOCALE_T |
354 | return (c <= (pg_wchar) UCHAR_MAX && |
355 | isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); |
356 | #endif |
357 | break; |
358 | case PG_REGEX_LOCALE_ICU: |
359 | #ifdef USE_ICU |
360 | return u_isalpha(c); |
361 | #endif |
362 | break; |
363 | } |
364 | return 0; /* can't get here, but keep compiler quiet */ |
365 | } |
366 | |
367 | static int |
368 | pg_wc_isalnum(pg_wchar c) |
369 | { |
370 | switch (pg_regex_strategy) |
371 | { |
372 | case PG_REGEX_LOCALE_C: |
373 | return (c <= (pg_wchar) 127 && |
374 | (pg_char_properties[c] & PG_ISALNUM)); |
375 | case PG_REGEX_LOCALE_WIDE: |
376 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
377 | return iswalnum((wint_t) c); |
378 | /* FALL THRU */ |
379 | case PG_REGEX_LOCALE_1BYTE: |
380 | return (c <= (pg_wchar) UCHAR_MAX && |
381 | isalnum((unsigned char) c)); |
382 | case PG_REGEX_LOCALE_WIDE_L: |
383 | #ifdef HAVE_LOCALE_T |
384 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
385 | return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); |
386 | #endif |
387 | /* FALL THRU */ |
388 | case PG_REGEX_LOCALE_1BYTE_L: |
389 | #ifdef HAVE_LOCALE_T |
390 | return (c <= (pg_wchar) UCHAR_MAX && |
391 | isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); |
392 | #endif |
393 | break; |
394 | case PG_REGEX_LOCALE_ICU: |
395 | #ifdef USE_ICU |
396 | return u_isalnum(c); |
397 | #endif |
398 | break; |
399 | } |
400 | return 0; /* can't get here, but keep compiler quiet */ |
401 | } |
402 | |
403 | static int |
404 | pg_wc_isupper(pg_wchar c) |
405 | { |
406 | switch (pg_regex_strategy) |
407 | { |
408 | case PG_REGEX_LOCALE_C: |
409 | return (c <= (pg_wchar) 127 && |
410 | (pg_char_properties[c] & PG_ISUPPER)); |
411 | case PG_REGEX_LOCALE_WIDE: |
412 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
413 | return iswupper((wint_t) c); |
414 | /* FALL THRU */ |
415 | case PG_REGEX_LOCALE_1BYTE: |
416 | return (c <= (pg_wchar) UCHAR_MAX && |
417 | isupper((unsigned char) c)); |
418 | case PG_REGEX_LOCALE_WIDE_L: |
419 | #ifdef HAVE_LOCALE_T |
420 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
421 | return iswupper_l((wint_t) c, pg_regex_locale->info.lt); |
422 | #endif |
423 | /* FALL THRU */ |
424 | case PG_REGEX_LOCALE_1BYTE_L: |
425 | #ifdef HAVE_LOCALE_T |
426 | return (c <= (pg_wchar) UCHAR_MAX && |
427 | isupper_l((unsigned char) c, pg_regex_locale->info.lt)); |
428 | #endif |
429 | break; |
430 | case PG_REGEX_LOCALE_ICU: |
431 | #ifdef USE_ICU |
432 | return u_isupper(c); |
433 | #endif |
434 | break; |
435 | } |
436 | return 0; /* can't get here, but keep compiler quiet */ |
437 | } |
438 | |
439 | static int |
440 | pg_wc_islower(pg_wchar c) |
441 | { |
442 | switch (pg_regex_strategy) |
443 | { |
444 | case PG_REGEX_LOCALE_C: |
445 | return (c <= (pg_wchar) 127 && |
446 | (pg_char_properties[c] & PG_ISLOWER)); |
447 | case PG_REGEX_LOCALE_WIDE: |
448 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
449 | return iswlower((wint_t) c); |
450 | /* FALL THRU */ |
451 | case PG_REGEX_LOCALE_1BYTE: |
452 | return (c <= (pg_wchar) UCHAR_MAX && |
453 | islower((unsigned char) c)); |
454 | case PG_REGEX_LOCALE_WIDE_L: |
455 | #ifdef HAVE_LOCALE_T |
456 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
457 | return iswlower_l((wint_t) c, pg_regex_locale->info.lt); |
458 | #endif |
459 | /* FALL THRU */ |
460 | case PG_REGEX_LOCALE_1BYTE_L: |
461 | #ifdef HAVE_LOCALE_T |
462 | return (c <= (pg_wchar) UCHAR_MAX && |
463 | islower_l((unsigned char) c, pg_regex_locale->info.lt)); |
464 | #endif |
465 | break; |
466 | case PG_REGEX_LOCALE_ICU: |
467 | #ifdef USE_ICU |
468 | return u_islower(c); |
469 | #endif |
470 | break; |
471 | } |
472 | return 0; /* can't get here, but keep compiler quiet */ |
473 | } |
474 | |
475 | static int |
476 | pg_wc_isgraph(pg_wchar c) |
477 | { |
478 | switch (pg_regex_strategy) |
479 | { |
480 | case PG_REGEX_LOCALE_C: |
481 | return (c <= (pg_wchar) 127 && |
482 | (pg_char_properties[c] & PG_ISGRAPH)); |
483 | case PG_REGEX_LOCALE_WIDE: |
484 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
485 | return iswgraph((wint_t) c); |
486 | /* FALL THRU */ |
487 | case PG_REGEX_LOCALE_1BYTE: |
488 | return (c <= (pg_wchar) UCHAR_MAX && |
489 | isgraph((unsigned char) c)); |
490 | case PG_REGEX_LOCALE_WIDE_L: |
491 | #ifdef HAVE_LOCALE_T |
492 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
493 | return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); |
494 | #endif |
495 | /* FALL THRU */ |
496 | case PG_REGEX_LOCALE_1BYTE_L: |
497 | #ifdef HAVE_LOCALE_T |
498 | return (c <= (pg_wchar) UCHAR_MAX && |
499 | isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); |
500 | #endif |
501 | break; |
502 | case PG_REGEX_LOCALE_ICU: |
503 | #ifdef USE_ICU |
504 | return u_isgraph(c); |
505 | #endif |
506 | break; |
507 | } |
508 | return 0; /* can't get here, but keep compiler quiet */ |
509 | } |
510 | |
511 | static int |
512 | pg_wc_isprint(pg_wchar c) |
513 | { |
514 | switch (pg_regex_strategy) |
515 | { |
516 | case PG_REGEX_LOCALE_C: |
517 | return (c <= (pg_wchar) 127 && |
518 | (pg_char_properties[c] & PG_ISPRINT)); |
519 | case PG_REGEX_LOCALE_WIDE: |
520 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
521 | return iswprint((wint_t) c); |
522 | /* FALL THRU */ |
523 | case PG_REGEX_LOCALE_1BYTE: |
524 | return (c <= (pg_wchar) UCHAR_MAX && |
525 | isprint((unsigned char) c)); |
526 | case PG_REGEX_LOCALE_WIDE_L: |
527 | #ifdef HAVE_LOCALE_T |
528 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
529 | return iswprint_l((wint_t) c, pg_regex_locale->info.lt); |
530 | #endif |
531 | /* FALL THRU */ |
532 | case PG_REGEX_LOCALE_1BYTE_L: |
533 | #ifdef HAVE_LOCALE_T |
534 | return (c <= (pg_wchar) UCHAR_MAX && |
535 | isprint_l((unsigned char) c, pg_regex_locale->info.lt)); |
536 | #endif |
537 | break; |
538 | case PG_REGEX_LOCALE_ICU: |
539 | #ifdef USE_ICU |
540 | return u_isprint(c); |
541 | #endif |
542 | break; |
543 | } |
544 | return 0; /* can't get here, but keep compiler quiet */ |
545 | } |
546 | |
547 | static int |
548 | pg_wc_ispunct(pg_wchar c) |
549 | { |
550 | switch (pg_regex_strategy) |
551 | { |
552 | case PG_REGEX_LOCALE_C: |
553 | return (c <= (pg_wchar) 127 && |
554 | (pg_char_properties[c] & PG_ISPUNCT)); |
555 | case PG_REGEX_LOCALE_WIDE: |
556 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
557 | return iswpunct((wint_t) c); |
558 | /* FALL THRU */ |
559 | case PG_REGEX_LOCALE_1BYTE: |
560 | return (c <= (pg_wchar) UCHAR_MAX && |
561 | ispunct((unsigned char) c)); |
562 | case PG_REGEX_LOCALE_WIDE_L: |
563 | #ifdef HAVE_LOCALE_T |
564 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
565 | return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); |
566 | #endif |
567 | /* FALL THRU */ |
568 | case PG_REGEX_LOCALE_1BYTE_L: |
569 | #ifdef HAVE_LOCALE_T |
570 | return (c <= (pg_wchar) UCHAR_MAX && |
571 | ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); |
572 | #endif |
573 | break; |
574 | case PG_REGEX_LOCALE_ICU: |
575 | #ifdef USE_ICU |
576 | return u_ispunct(c); |
577 | #endif |
578 | break; |
579 | } |
580 | return 0; /* can't get here, but keep compiler quiet */ |
581 | } |
582 | |
583 | static int |
584 | pg_wc_isspace(pg_wchar c) |
585 | { |
586 | switch (pg_regex_strategy) |
587 | { |
588 | case PG_REGEX_LOCALE_C: |
589 | return (c <= (pg_wchar) 127 && |
590 | (pg_char_properties[c] & PG_ISSPACE)); |
591 | case PG_REGEX_LOCALE_WIDE: |
592 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
593 | return iswspace((wint_t) c); |
594 | /* FALL THRU */ |
595 | case PG_REGEX_LOCALE_1BYTE: |
596 | return (c <= (pg_wchar) UCHAR_MAX && |
597 | isspace((unsigned char) c)); |
598 | case PG_REGEX_LOCALE_WIDE_L: |
599 | #ifdef HAVE_LOCALE_T |
600 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
601 | return iswspace_l((wint_t) c, pg_regex_locale->info.lt); |
602 | #endif |
603 | /* FALL THRU */ |
604 | case PG_REGEX_LOCALE_1BYTE_L: |
605 | #ifdef HAVE_LOCALE_T |
606 | return (c <= (pg_wchar) UCHAR_MAX && |
607 | isspace_l((unsigned char) c, pg_regex_locale->info.lt)); |
608 | #endif |
609 | break; |
610 | case PG_REGEX_LOCALE_ICU: |
611 | #ifdef USE_ICU |
612 | return u_isspace(c); |
613 | #endif |
614 | break; |
615 | } |
616 | return 0; /* can't get here, but keep compiler quiet */ |
617 | } |
618 | |
619 | static pg_wchar |
620 | pg_wc_toupper(pg_wchar c) |
621 | { |
622 | switch (pg_regex_strategy) |
623 | { |
624 | case PG_REGEX_LOCALE_C: |
625 | if (c <= (pg_wchar) 127) |
626 | return pg_ascii_toupper((unsigned char) c); |
627 | return c; |
628 | case PG_REGEX_LOCALE_WIDE: |
629 | /* force C behavior for ASCII characters, per comments above */ |
630 | if (c <= (pg_wchar) 127) |
631 | return pg_ascii_toupper((unsigned char) c); |
632 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
633 | return towupper((wint_t) c); |
634 | /* FALL THRU */ |
635 | case PG_REGEX_LOCALE_1BYTE: |
636 | /* force C behavior for ASCII characters, per comments above */ |
637 | if (c <= (pg_wchar) 127) |
638 | return pg_ascii_toupper((unsigned char) c); |
639 | if (c <= (pg_wchar) UCHAR_MAX) |
640 | return toupper((unsigned char) c); |
641 | return c; |
642 | case PG_REGEX_LOCALE_WIDE_L: |
643 | #ifdef HAVE_LOCALE_T |
644 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
645 | return towupper_l((wint_t) c, pg_regex_locale->info.lt); |
646 | #endif |
647 | /* FALL THRU */ |
648 | case PG_REGEX_LOCALE_1BYTE_L: |
649 | #ifdef HAVE_LOCALE_T |
650 | if (c <= (pg_wchar) UCHAR_MAX) |
651 | return toupper_l((unsigned char) c, pg_regex_locale->info.lt); |
652 | #endif |
653 | return c; |
654 | case PG_REGEX_LOCALE_ICU: |
655 | #ifdef USE_ICU |
656 | return u_toupper(c); |
657 | #endif |
658 | break; |
659 | } |
660 | return 0; /* can't get here, but keep compiler quiet */ |
661 | } |
662 | |
663 | static pg_wchar |
664 | pg_wc_tolower(pg_wchar c) |
665 | { |
666 | switch (pg_regex_strategy) |
667 | { |
668 | case PG_REGEX_LOCALE_C: |
669 | if (c <= (pg_wchar) 127) |
670 | return pg_ascii_tolower((unsigned char) c); |
671 | return c; |
672 | case PG_REGEX_LOCALE_WIDE: |
673 | /* force C behavior for ASCII characters, per comments above */ |
674 | if (c <= (pg_wchar) 127) |
675 | return pg_ascii_tolower((unsigned char) c); |
676 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
677 | return towlower((wint_t) c); |
678 | /* FALL THRU */ |
679 | case PG_REGEX_LOCALE_1BYTE: |
680 | /* force C behavior for ASCII characters, per comments above */ |
681 | if (c <= (pg_wchar) 127) |
682 | return pg_ascii_tolower((unsigned char) c); |
683 | if (c <= (pg_wchar) UCHAR_MAX) |
684 | return tolower((unsigned char) c); |
685 | return c; |
686 | case PG_REGEX_LOCALE_WIDE_L: |
687 | #ifdef HAVE_LOCALE_T |
688 | if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) |
689 | return towlower_l((wint_t) c, pg_regex_locale->info.lt); |
690 | #endif |
691 | /* FALL THRU */ |
692 | case PG_REGEX_LOCALE_1BYTE_L: |
693 | #ifdef HAVE_LOCALE_T |
694 | if (c <= (pg_wchar) UCHAR_MAX) |
695 | return tolower_l((unsigned char) c, pg_regex_locale->info.lt); |
696 | #endif |
697 | return c; |
698 | case PG_REGEX_LOCALE_ICU: |
699 | #ifdef USE_ICU |
700 | return u_tolower(c); |
701 | #endif |
702 | break; |
703 | } |
704 | return 0; /* can't get here, but keep compiler quiet */ |
705 | } |
706 | |
707 | |
708 | /* |
709 | * These functions cache the results of probing libc's ctype behavior for |
710 | * all character codes of interest in a given encoding/collation. The |
711 | * result is provided as a "struct cvec", but notice that the representation |
712 | * is a touch different from a cvec created by regc_cvec.c: we allocate the |
713 | * chrs[] and ranges[] arrays separately from the struct so that we can |
714 | * realloc them larger at need. This is okay since the cvecs made here |
715 | * should never be freed by freecvec(). |
716 | * |
717 | * We use malloc not palloc since we mustn't lose control on out-of-memory; |
718 | * the main regex code expects us to return a failure indication instead. |
719 | */ |
720 | |
/* Signature shared by the pg_wc_isXXX classifiers: nonzero means "c matches" */
typedef int (*pg_wc_probefunc) (pg_wchar c);
722 | |
/*
 * One cached probe result.  pg_ctype_get_cache() looks entries up by the
 * (probefunc, collation) pair.
 */
typedef struct pg_ctype_cache
{
	pg_wc_probefunc probefunc;	/* pg_wc_isalpha or a sibling */
	Oid			collation;		/* collation this entry is for */
	struct cvec cv;				/* cache entry contents */
	struct pg_ctype_cache *next;	/* chain link */
} pg_ctype_cache;
730 | |
/* Head of the cache chain; pg_ctype_get_cache() pushes new entries in front */
static pg_ctype_cache *pg_ctype_cache_list = NULL;
732 | |
733 | /* |
734 | * Add a chr or range to pcc->cv; return false if run out of memory |
735 | */ |
736 | static bool |
737 | store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs) |
738 | { |
739 | chr *newchrs; |
740 | |
741 | if (nchrs > 1) |
742 | { |
743 | if (pcc->cv.nranges >= pcc->cv.rangespace) |
744 | { |
745 | pcc->cv.rangespace *= 2; |
746 | newchrs = (chr *) realloc(pcc->cv.ranges, |
747 | pcc->cv.rangespace * sizeof(chr) * 2); |
748 | if (newchrs == NULL) |
749 | return false; |
750 | pcc->cv.ranges = newchrs; |
751 | } |
752 | pcc->cv.ranges[pcc->cv.nranges * 2] = chr1; |
753 | pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1; |
754 | pcc->cv.nranges++; |
755 | } |
756 | else |
757 | { |
758 | assert(nchrs == 1); |
759 | if (pcc->cv.nchrs >= pcc->cv.chrspace) |
760 | { |
761 | pcc->cv.chrspace *= 2; |
762 | newchrs = (chr *) realloc(pcc->cv.chrs, |
763 | pcc->cv.chrspace * sizeof(chr)); |
764 | if (newchrs == NULL) |
765 | return false; |
766 | pcc->cv.chrs = newchrs; |
767 | } |
768 | pcc->cv.chrs[pcc->cv.nchrs++] = chr1; |
769 | } |
770 | return true; |
771 | } |
772 | |
773 | /* |
774 | * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all |
775 | * chrs satisfying the probe function. The active collation is the one |
776 | * previously set by pg_set_regex_collation. Return NULL if out of memory. |
777 | * |
778 | * Note that the result must not be freed or modified by caller. |
779 | */ |
780 | static struct cvec * |
781 | pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) |
782 | { |
783 | pg_ctype_cache *pcc; |
784 | pg_wchar max_chr; |
785 | pg_wchar cur_chr; |
786 | int nmatches; |
787 | chr *newchrs; |
788 | |
789 | /* |
790 | * Do we already have the answer cached? |
791 | */ |
792 | for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next) |
793 | { |
794 | if (pcc->probefunc == probefunc && |
795 | pcc->collation == pg_regex_collation) |
796 | return &pcc->cv; |
797 | } |
798 | |
799 | /* |
800 | * Nope, so initialize some workspace ... |
801 | */ |
802 | pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache)); |
803 | if (pcc == NULL) |
804 | return NULL; |
805 | pcc->probefunc = probefunc; |
806 | pcc->collation = pg_regex_collation; |
807 | pcc->cv.nchrs = 0; |
808 | pcc->cv.chrspace = 128; |
809 | pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr)); |
810 | pcc->cv.nranges = 0; |
811 | pcc->cv.rangespace = 64; |
812 | pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2); |
813 | if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL) |
814 | goto out_of_memory; |
815 | pcc->cv.cclasscode = cclasscode; |
816 | |
817 | /* |
818 | * Decide how many character codes we ought to look through. In general |
819 | * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at |
820 | * runtime using the "high colormap" mechanism. However, in C locale |
821 | * there's no need to go further than 127, and if we only have a 1-byte |
822 | * <ctype.h> API there's no need to go further than that can handle. |
823 | * |
824 | * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the |
825 | * output cvec as not having any locale-dependent behavior, since there |
826 | * will be no need to do any run-time locale checks. (The #if's here |
827 | * would always be true for production values of MAX_SIMPLE_CHR, but it's |
828 | * useful to allow it to be small for testing purposes.) |
829 | */ |
830 | switch (pg_regex_strategy) |
831 | { |
832 | case PG_REGEX_LOCALE_C: |
833 | #if MAX_SIMPLE_CHR >= 127 |
834 | max_chr = (pg_wchar) 127; |
835 | pcc->cv.cclasscode = -1; |
836 | #else |
837 | max_chr = (pg_wchar) MAX_SIMPLE_CHR; |
838 | #endif |
839 | break; |
840 | case PG_REGEX_LOCALE_WIDE: |
841 | case PG_REGEX_LOCALE_WIDE_L: |
842 | max_chr = (pg_wchar) MAX_SIMPLE_CHR; |
843 | break; |
844 | case PG_REGEX_LOCALE_1BYTE: |
845 | case PG_REGEX_LOCALE_1BYTE_L: |
846 | #if MAX_SIMPLE_CHR >= UCHAR_MAX |
847 | max_chr = (pg_wchar) UCHAR_MAX; |
848 | pcc->cv.cclasscode = -1; |
849 | #else |
850 | max_chr = (pg_wchar) MAX_SIMPLE_CHR; |
851 | #endif |
852 | break; |
853 | case PG_REGEX_LOCALE_ICU: |
854 | max_chr = (pg_wchar) MAX_SIMPLE_CHR; |
855 | break; |
856 | default: |
857 | max_chr = 0; /* can't get here, but keep compiler quiet */ |
858 | break; |
859 | } |
860 | |
861 | /* |
862 | * And scan 'em ... |
863 | */ |
864 | nmatches = 0; /* number of consecutive matches */ |
865 | |
866 | for (cur_chr = 0; cur_chr <= max_chr; cur_chr++) |
867 | { |
868 | if ((*probefunc) (cur_chr)) |
869 | nmatches++; |
870 | else if (nmatches > 0) |
871 | { |
872 | if (!store_match(pcc, cur_chr - nmatches, nmatches)) |
873 | goto out_of_memory; |
874 | nmatches = 0; |
875 | } |
876 | } |
877 | |
878 | if (nmatches > 0) |
879 | if (!store_match(pcc, cur_chr - nmatches, nmatches)) |
880 | goto out_of_memory; |
881 | |
882 | /* |
883 | * We might have allocated more memory than needed, if so free it |
884 | */ |
885 | if (pcc->cv.nchrs == 0) |
886 | { |
887 | free(pcc->cv.chrs); |
888 | pcc->cv.chrs = NULL; |
889 | pcc->cv.chrspace = 0; |
890 | } |
891 | else if (pcc->cv.nchrs < pcc->cv.chrspace) |
892 | { |
893 | newchrs = (chr *) realloc(pcc->cv.chrs, |
894 | pcc->cv.nchrs * sizeof(chr)); |
895 | if (newchrs == NULL) |
896 | goto out_of_memory; |
897 | pcc->cv.chrs = newchrs; |
898 | pcc->cv.chrspace = pcc->cv.nchrs; |
899 | } |
900 | if (pcc->cv.nranges == 0) |
901 | { |
902 | free(pcc->cv.ranges); |
903 | pcc->cv.ranges = NULL; |
904 | pcc->cv.rangespace = 0; |
905 | } |
906 | else if (pcc->cv.nranges < pcc->cv.rangespace) |
907 | { |
908 | newchrs = (chr *) realloc(pcc->cv.ranges, |
909 | pcc->cv.nranges * sizeof(chr) * 2); |
910 | if (newchrs == NULL) |
911 | goto out_of_memory; |
912 | pcc->cv.ranges = newchrs; |
913 | pcc->cv.rangespace = pcc->cv.nranges; |
914 | } |
915 | |
916 | /* |
917 | * Success, link it into cache chain |
918 | */ |
919 | pcc->next = pg_ctype_cache_list; |
920 | pg_ctype_cache_list = pcc; |
921 | |
922 | return &pcc->cv; |
923 | |
924 | /* |
925 | * Failure, clean up |
926 | */ |
927 | out_of_memory: |
928 | if (pcc->cv.chrs) |
929 | free(pcc->cv.chrs); |
930 | if (pcc->cv.ranges) |
931 | free(pcc->cv.ranges); |
932 | free(pcc); |
933 | |
934 | return NULL; |
935 | } |
936 | |