regc_locale.c source code [PostgreSQL/src/backend/regex/regc_locale.c]

1	/*
2	* regc_locale.c --
3	*
4	* This file contains locale-specific regexp routines.
5	* This file is #included by regcomp.c.
6	*
7	* Copyright (c) 1998 by Scriptics Corporation.
8	*
9	* This software is copyrighted by the Regents of the University of
10	* California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11	* Corporation and other parties. The following terms apply to all files
12	* associated with the software unless explicitly disclaimed in
13	* individual files.
14	*
15	* The authors hereby grant permission to use, copy, modify, distribute,
16	* and license this software and its documentation for any purpose, provided
17	* that existing copyright notices are retained in all copies and that this
18	* notice is included verbatim in any distributions. No written agreement,
19	* license, or royalty fee is required for any of the authorized uses.
20	* Modifications to this software may be copyrighted by their authors
21	* and need not follow the licensing terms described here, provided that
22	* the new terms are clearly indicated on the first page of each file where
23	* they apply.
24	*
25	* IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26	* FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27	* ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28	* DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29	* POSSIBILITY OF SUCH DAMAGE.
30	*
31	* THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33	* FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34	* IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35	* NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36	* MODIFICATIONS.
37	*
38	* GOVERNMENT USE: If you are acquiring this software on behalf of the
39	* U.S. government, the Government shall have only "Restricted Rights"
40	* in the software and related documentation as defined in the Federal
41	* Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42	* are acquiring the software on behalf of the Department of Defense, the
43	* software shall be classified as "Commercial Computer Software" and the
44	* Government shall have only "Restricted Rights" as defined in Clause
45	* 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46	* authors grant the U.S. Government and others acting in its behalf
47	* permission to use and distribute the software in accordance with the
48	* terms specified in this license.
49	*
50	* src/backend/regex/regc_locale.c
51	*/
52
/*
 * ASCII character-name table
 *
 * Maps collating-element names to the single ASCII character each one
 * denotes.  Several characters are listed under more than one name.  The
 * table ends with an entry whose name is NULL; element() stops scanning
 * when it reaches that sentinel.
 */
static const struct cname
{
	const char *name;			/* collating-element name; NULL ends table */
	const char	code;			/* the character that name stands for */
}			cnames[] =
{
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},
	{NULL, 0}					/* end-of-table sentinel */
};
351
352	/*
353	* The following arrays define the valid character class names.
354	*/
355	static const char *const classNames[NUM_CCLASSES + `1`] = {
356	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
357	"lower", "print", "punct", "space", "upper", "xdigit", NULL
358	};
359
/*
 * Character class identifiers.
 *
 * cclass() obtains these values by position in classNames[], so the
 * enumerators must stay in the same order as the names in that array.
 */
enum classes
{
	CC_ALNUM = 0,				/* "alnum" */
	CC_ALPHA,					/* "alpha" */
	CC_ASCII,					/* "ascii" */
	CC_BLANK,					/* "blank" */
	CC_CNTRL,					/* "cntrl" */
	CC_DIGIT,					/* "digit" */
	CC_GRAPH,					/* "graph" */
	CC_LOWER,					/* "lower" */
	CC_PRINT,					/* "print" */
	CC_PUNCT,					/* "punct" */
	CC_SPACE,					/* "space" */
	CC_UPPER,					/* "upper" */
	CC_XDIGIT					/* "xdigit" */
};
365
366	/*
367	* We do not use the hard-wired Unicode classification tables that Tcl does.
368	* This is because (a) we need to deal with other encodings besides Unicode,
369	* and (b) we want to track the behavior of the libc locale routines as
370	* closely as possible. For example, it wouldn't be unreasonable for a
371	* locale to not consider every Unicode letter as a letter. So we build
372	* character classification cvecs by asking libc, even for Unicode.
373	*/
374
375
376	/*
377	* element - map collating-element name to chr
378	*/
379	static chr
380	element(struct vars v, /* context /
381	const chr startp, /* points to start of name /
382	const chr endp) /* points just past end of name /
383	{
384	const struct cname *cn;
385	size_t len;
386
387	/ generic: one-chr names stand for themselves /
388	assert(startp < endp);
389	len = endp - startp;
390	if (len == `1`)
391	return *startp;
392
393	NOTE(REG_ULOCALE);
394
395	/ search table /
396	for (cn = cnames; cn->name != NULL; cn++)
397	{
398	if (strlen(cn->name) == len &&
399	pg_char_and_wchar_strncmp(cn->name, startp, len) == `0`)
400	{
401	break; / NOTE BREAK OUT /
402	}
403	}
404	if (cn->name != NULL)
405	return CHR(cn->code);
406
407	/ couldn't find it /
408	ERR(REG_ECOLLATE);
409	return `0`;
410	}
411
412	/*
413	* range - supply cvec for a range, including legality check
414	*/
415	static struct cvec *
416	range(struct vars v, /* context /
417	chr a, / range start /
418	chr b, / range end, might equal a /
419	int cases) / case-independent? /
420	{
421	int nchrs;
422	struct cvec *cv;
423	chr c,
424	cc;
425
426	if (a != b && !before(a, b))
427	{
428	ERR(REG_ERANGE);
429	return NULL;
430	}
431
432	if (!cases)
433	{ / easy version /
434	cv = getcvec(v, `0`, `1`);
435	NOERRN();
436	addrange(cv, a, b);
437	return cv;
438	}
439
440	/*
441	* When case-independent, it's hard to decide when cvec ranges are usable,
442	* so for now at least, we won't try. We use a range for the originally
443	* specified chrs and then add on any case-equivalents that are outside
444	* that range as individual chrs.
445	*
446	* To ensure sane behavior if someone specifies a very large range, limit
447	* the allocation size to 100000 chrs (arbitrary) and check for overrun
448	* inside the loop below.
449	*/
450	nchrs = b - a + `1`;
451	if (nchrs <= `0` \|\| nchrs > `100000`)
452	nchrs = `100000`;
453
454	cv = getcvec(v, nchrs, `1`);
455	NOERRN();
456	addrange(cv, a, b);
457
458	for (c = a; c <= b; c++)
459	{
460	cc = pg_wc_tolower(c);
461	if (cc != c &&
462	(before(cc, a) \|\| before(b, cc)))
463	{
464	if (cv->nchrs >= cv->chrspace)
465	{
466	ERR(REG_ETOOBIG);
467	return NULL;
468	}
469	addchr(cv, cc);
470	}
471	cc = pg_wc_toupper(c);
472	if (cc != c &&
473	(before(cc, a) \|\| before(b, cc)))
474	{
475	if (cv->nchrs >= cv->chrspace)
476	{
477	ERR(REG_ETOOBIG);
478	return NULL;
479	}
480	addchr(cv, cc);
481	}
482	if (CANCEL_REQUESTED(v->re))
483	{
484	ERR(REG_CANCEL);
485	return NULL;
486	}
487	}
488
489	return cv;
490	}
491
492	/*
493	* before - is chr x before chr y, for purposes of range legality?
494	*/
495	static int / predicate /
496	before(chr x, chr y)
497	{
498	if (x < y)
499	return `1`;
500	return `0`;
501	}
502
503	/*
504	* eclass - supply cvec for an equivalence class
505	* Must include case counterparts on request.
506	*/
507	static struct cvec *
508	eclass(struct vars v, /* context /
509	chr c, / Collating element representing the*
510	* equivalence class. */
511	int cases) / all cases? /
512	{
513	struct cvec *cv;
514
515	/ crude fake equivalence class for testing /
516	if ((v->cflags & REG_FAKE) && c == `'x'`)
517	{
518	cv = getcvec(v, `4`, `0`);
519	addchr(cv, CHR(`'x'`));
520	addchr(cv, CHR(`'y'`));
521	if (cases)
522	{
523	addchr(cv, CHR(`'X'`));
524	addchr(cv, CHR(`'Y'`));
525	}
526	return cv;
527	}
528
529	/ otherwise, none /
530	if (cases)
531	return allcases(v, c);
532	cv = getcvec(v, `1`, `0`);
533	assert(cv != NULL);
534	addchr(cv, c);
535	return cv;
536	}
537
538	/*
539	* cclass - supply cvec for a character class
540	*
541	* Must include case counterparts if "cases" is true.
542	*
543	* The returned cvec might be either a transient cvec gotten from getcvec(),
544	* or a permanently cached one from pg_ctype_get_cache(). This is okay
545	* because callers are not supposed to explicitly free the result either way.
546	*/
547	static struct cvec *
548	cclass(struct vars v, /* context /
549	const chr startp, /* where the name starts /
550	const chr endp, /* just past the end of the name /
551	int cases) / case-independent? /
552	{
553	size_t len;
554	struct cvec *cv = NULL;
555	const char *const *namePtr;
556	int i,
557	index;
558
559	/*
560	* Map the name to the corresponding enumerated value.
561	*/
562	len = endp - startp;
563	index = -`1`;
564	for (namePtr = classNames, i = `0`; *namePtr != NULL; namePtr++, i++)
565	{
566	if (strlen(*namePtr) == len &&
567	pg_char_and_wchar_strncmp(*namePtr, startp, len) == `0`)
568	{
569	index = i;
570	break;
571	}
572	}
573	if (index == -`1`)
574	{
575	ERR(REG_ECTYPE);
576	return NULL;
577	}
578
579	/*
580	* Remap lower and upper to alpha if the match is case insensitive.
581	*/
582
583	if (cases &&
584	((enum classes) index == CC_LOWER \|\|
585	(enum classes) index == CC_UPPER))
586	index = (int) CC_ALPHA;
587
588	/*
589	* Now compute the character class contents. For classes that are based
590	* on the behavior of a <wctype.h> or <ctype.h> function, we use
591	* pg_ctype_get_cache so that we can cache the results. Other classes
592	* have definitions that are hard-wired here, and for those we just
593	* construct a transient cvec on the fly.
594	*
595	* NB: keep this code in sync with cclass_column_index(), below.
596	*/
597
598	switch ((enum classes) index)
599	{
600	case CC_PRINT:
601	cv = pg_ctype_get_cache(pg_wc_isprint, index);
602	break;
603	case CC_ALNUM:
604	cv = pg_ctype_get_cache(pg_wc_isalnum, index);
605	break;
606	case CC_ALPHA:
607	cv = pg_ctype_get_cache(pg_wc_isalpha, index);
608	break;
609	case CC_ASCII:
610	/ hard-wired meaning /
611	cv = getcvec(v, `0`, `1`);
612	if (cv)
613	addrange(cv, `0`, `0x7f`);
614	break;
615	case CC_BLANK:
616	/ hard-wired meaning /
617	cv = getcvec(v, `2`, `0`);
618	addchr(cv, `'\t'`);
619	addchr(cv, `' '`);
620	break;
621	case CC_CNTRL:
622	/ hard-wired meaning /
623	cv = getcvec(v, `0`, `2`);
624	addrange(cv, `0x0`, `0x1f`);
625	addrange(cv, `0x7f`, `0x9f`);
626	break;
627	case CC_DIGIT:
628	cv = pg_ctype_get_cache(pg_wc_isdigit, index);
629	break;
630	case CC_PUNCT:
631	cv = pg_ctype_get_cache(pg_wc_ispunct, index);
632	break;
633	case CC_XDIGIT:
634
635	/*
636	* It's not clear how to define this in non-western locales, and
637	* even less clear that there's any particular use in trying. So
638	* just hard-wire the meaning.
639	*/
640	cv = getcvec(v, `0`, `3`);
641	if (cv)
642	{
643	addrange(cv, `'0'`, `'9'`);
644	addrange(cv, `'a'`, `'f'`);
645	addrange(cv, `'A'`, `'F'`);
646	}
647	break;
648	case CC_SPACE:
649	cv = pg_ctype_get_cache(pg_wc_isspace, index);
650	break;
651	case CC_LOWER:
652	cv = pg_ctype_get_cache(pg_wc_islower, index);
653	break;
654	case CC_UPPER:
655	cv = pg_ctype_get_cache(pg_wc_isupper, index);
656	break;
657	case CC_GRAPH:
658	cv = pg_ctype_get_cache(pg_wc_isgraph, index);
659	break;
660	}
661
662	/ If cv is NULL now, the reason must be "out of memory" /
663	if (cv == NULL)
664	ERR(REG_ESPACE);
665	return cv;
666	}
667
668	/*
669	* cclass_column_index - get appropriate high colormap column index for chr
670	*/
671	static int
672	cclass_column_index(struct colormap *cm, chr c)
673	{
674	int colnum = `0`;
675
676	/ Shouldn't go through all these pushups for simple chrs /
677	assert(c > MAX_SIMPLE_CHR);
678
679	/*
680	* Note: we should not see requests to consider cclasses that are not
681	* treated as locale-specific by cclass(), above.
682	*/
683	if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
684	colnum \|= cm->classbits[CC_PRINT];
685	if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
686	colnum \|= cm->classbits[CC_ALNUM];
687	if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
688	colnum \|= cm->classbits[CC_ALPHA];
689	assert(cm->classbits[CC_ASCII] == `0`);
690	assert(cm->classbits[CC_BLANK] == `0`);
691	assert(cm->classbits[CC_CNTRL] == `0`);
692	if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
693	colnum \|= cm->classbits[CC_DIGIT];
694	if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
695	colnum \|= cm->classbits[CC_PUNCT];
696	assert(cm->classbits[CC_XDIGIT] == `0`);
697	if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
698	colnum \|= cm->classbits[CC_SPACE];
699	if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
700	colnum \|= cm->classbits[CC_LOWER];
701	if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
702	colnum \|= cm->classbits[CC_UPPER];
703	if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
704	colnum \|= cm->classbits[CC_GRAPH];
705
706	return colnum;
707	}
708
709	/*
710	* allcases - supply cvec for all case counterparts of a chr (including itself)
711	*
712	* This is a shortcut, preferably an efficient one, for simple characters;
713	* messy cases are done via range().
714	*/
715	static struct cvec *
716	allcases(struct vars v, /* context /
717	chr c) / character to get case equivs of /
718	{
719	struct cvec *cv;
720	chr lc,
721	uc;
722
723	lc = pg_wc_tolower(c);
724	uc = pg_wc_toupper(c);
725
726	cv = getcvec(v, `2`, `0`);
727	addchr(cv, lc);
728	if (lc != uc)
729	addchr(cv, uc);
730	return cv;
731	}
732
733	/*
734	* cmp - chr-substring compare
735	*
736	* Backrefs need this. It should preferably be efficient.
737	* Note that it does not need to report anything except equal/unequal.
738	* Note also that the length is exact, and the comparison should not
739	* stop at embedded NULs!
740	*/
741	static int / 0 for equal, nonzero for unequal /
742	cmp(const chr x, const* chr y, /* strings to compare /
743	size_t len) / exact length of comparison /
744	{
745	return memcmp(VS(x), VS(y), len * sizeof(chr));
746	}
747
748	/*
749	* casecmp - case-independent chr-substring compare
750	*
751	* REG_ICASE backrefs need this. It should preferably be efficient.
752	* Note that it does not need to report anything except equal/unequal.
753	* Note also that the length is exact, and the comparison should not
754	* stop at embedded NULs!
755	*/
756	static int / 0 for equal, nonzero for unequal /
757	casecmp(const chr x, const* chr y, /* strings to compare /
758	size_t len) / exact length of comparison /
759	{
760	for (; len > `0`; len--, x++, y++)
761	{
762	if ((x != y) && (pg_wc_tolower(x) != pg_wc_tolower(y)))
763	return `1`;
764	}
765	return `0`;
766	}
767

Browse the source code of PostgreSQL/src/backend/regex/regc_locale.c