regexp.c source code [PostgreSQL/src/backend/utils/adt/regexp.c]

/*-------------------------------------------------------------------------
 *
 * regexp.c
 *	  Postgres' interface to the regular expression package.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/regexp.c
 *
 *		Alistair Crooks added the code for the regex caching
 *		agc - cached the regular expressions used - there's a good chance
 *		that we'll get a hit, so this saves a compile step for every
 *		attempted match. I haven't actually measured the speed improvement,
 *		but it `looks' a lot quicker visually when watching regression
 *		test output.
 *
 *		agc - incorporated Keith Bostic's Berkeley regex code into
 *		the tree for all ports. To distinguish this regex code from any that
 *		is existent on a platform, I've prepended the string "pg_" to
 *		the functions regcomp, regerror, regexec and regfree.
 *		Fixed a bug that was originally a typo by me, where `i' was used
 *		instead of `oldest' when compiling regular expressions - benign
 *		results mostly, although occasionally it bit you...
 *
 *-------------------------------------------------------------------------
 */
30	#include "postgres.h"
31
32	#include "catalog/pg_type.h"
33	#include "funcapi.h"
34	#include "miscadmin.h"
35	#include "regex/regex.h"
36	#include "utils/array.h"
37	#include "utils/builtins.h"
38	#include "utils/memutils.h"
39	#include "utils/varlena.h"
40
/*
 * Yield argument number _n (fetched with PG_GETARG_TEXT_PP) when the call
 * was made with more than _n arguments, and NULL otherwise.  This lets one
 * C function serve SQL signatures with and without a trailing text argument.
 */
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
	(PG_NARGS() <= (_n) ? NULL : PG_GETARG_TEXT_PP(_n))
43
44
/* all the options of interest for regex functions */
typedef struct pg_re_flags
{
	int			cflags;			/* compile flags for Spencer's regex code */
	bool		glob;			/* do it globally (for each occurrence) */
} pg_re_flags;
51
52	/ cross-call state for regexp_match and regexp_split functions /
53	typedef struct regexp_matches_ctx
54	{
55	text orig_str; /* data string in original TEXT form /
56	int nmatches; / number of places where pattern matched /
57	int npatterns; / number of capturing subpatterns /
58	/ We store start char index and end+1 char index for each match /
59	/ so the number of entries in match_locs is nmatches * npatterns * 2 /
60	int match_locs; /* 0-based character indexes /
61	int next_match; / 0-based index of next match to process /
62	/ workspace for build_regexp_match_result() /
63	Datum elems; /* has npatterns elements /
64	bool nulls; /* has npatterns elements /
65	pg_wchar wide_str; /* wide-char version of original string /
66	char conv_buf; /* conversion buffer /
67	int conv_bufsiz; / size thereof /
68	} regexp_matches_ctx;
69
70	/*
71	* We cache precompiled regular expressions using a "self organizing list"
72	* structure, in which recently-used items tend to be near the front.
73	* Whenever we use an entry, it's moved up to the front of the list.
74	* Over time, an item's average position corresponds to its frequency of use.
75	*
76	* When we first create an entry, it's inserted at the front of
77	* the array, dropping the entry at the end of the array if necessary to
78	* make room. (This might seem to be weighting the new entry too heavily,
79	* but if we insert new entries further back, we'll be unable to adjust to
80	* a sudden shift in the query mix where we are presented with MAX_CACHED_RES
81	* never-before-seen items used circularly. We ought to be able to handle
82	* that case, so we have to insert at the front.)
83	*
84	* Knuth mentions a variant strategy in which a used item is moved up just
85	* one place in the list. Although he says this uses fewer comparisons on
86	* average, it seems not to adapt very well to the situation where you have
87	* both some reusable patterns and a steady stream of non-reusable patterns.
88	* A reusable pattern that isn't used at least as often as non-reusable
89	* patterns are seen will "fail to keep up" and will drop off the end of the
90	* cache. With move-to-front, a reusable pattern is guaranteed to stay in
91	* the cache as long as it's used at least once in every MAX_CACHED_RES uses.
92	*/
93
/*
 * this is the maximum number of cached regular expressions
 *
 * The #ifndef guard allows the value to be overridden at build time.
 */
#ifndef MAX_CACHED_RES
#define MAX_CACHED_RES	32
#endif
98
99	/ this structure describes one cached regular expression /
100	typedef struct cached_re_str
101	{
102	char cre_pat; /* original RE (not null terminated!) /
103	int cre_pat_len; / length of original RE, in bytes /
104	int cre_flags; / compile flags: extended,icase etc /
105	Oid cre_collation; / collation to use /
106	regex_t cre_re; / the compiled regular expression /
107	} cached_re_str;
108
109	static int num_res = `0`; / # of cached re's /
110	static cached_re_str re_array[MAX_CACHED_RES]; / cached re's /
111
112
113	/ Local functions /
114	static regexp_matches_ctx setup_regexp_matches(text orig_str, text *pattern,
115	pg_re_flags *flags,
116	Oid collation,
117	bool use_subpatterns,
118	bool ignore_degenerate,
119	bool fetching_unmatched);
120	static ArrayType build_regexp_match_result(regexp_matches_ctx matchctx);
121	static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
122
123
124	/*
125	* RE_compile_and_cache - compile a RE, caching if possible
126	*
127	* Returns regex_t *
128	*
129	* text_re --- the pattern, expressed as a TEXT object
130	* cflags --- compile options for the pattern
131	* collation --- collation to use for LC_CTYPE-dependent behavior
132	*
133	* Pattern is given in the database encoding. We internally convert to
134	* an array of pg_wchar, which is what Spencer's regex package wants.
135	*/
136	regex_t *
137	RE_compile_and_cache(text text_re, int* cflags, Oid collation)
138	{
139	int text_re_len = VARSIZE_ANY_EXHDR(text_re);
140	char *text_re_val = VARDATA_ANY(text_re);
141	pg_wchar *pattern;
142	int pattern_len;
143	int i;
144	int regcomp_result;
145	cached_re_str re_temp;
146	char errMsg[`100`];
147
148	/*
149	* Look for a match among previously compiled REs. Since the data
150	* structure is self-organizing with most-used entries at the front, our
151	* search strategy can just be to scan from the front.
152	*/
153	for (i = `0`; i < num_res; i++)
154	{
155	if (re_array[i].cre_pat_len == text_re_len &&
156	re_array[i].cre_flags == cflags &&
157	re_array[i].cre_collation == collation &&
158	memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == `0`)
159	{
160	/*
161	* Found a match; move it to front if not there already.
162	*/
163	if (i > `0`)
164	{
165	re_temp = re_array[i];
166	memmove(&re_array[`1`], &re_array[`0`], i * sizeof(cached_re_str));
167	re_array[`0`] = re_temp;
168	}
169
170	return &re_array[`0`].cre_re;
171	}
172	}
173
174	/*
175	* Couldn't find it, so try to compile the new RE. To avoid leaking
176	* resources on failure, we build into the re_temp local.
177	*/
178
179	/ Convert pattern string to wide characters /
180	pattern = (pg_wchar ) palloc((text_re_len + `1`) sizeof(pg_wchar));
181	pattern_len = pg_mb2wchar_with_len(text_re_val,
182	pattern,
183	text_re_len);
184
185	regcomp_result = pg_regcomp(&re_temp.cre_re,
186	pattern,
187	pattern_len,
188	cflags,
189	collation);
190
191	pfree(pattern);
192
193	if (regcomp_result != REG_OKAY)
194	{
195	/ re didn't compile (no need for pg_regfree, if so) /
196
197	/*
198	* Here and in other places in this file, do CHECK_FOR_INTERRUPTS
199	* before reporting a regex error. This is so that if the regex
200	* library aborts and returns REG_CANCEL, we don't print an error
201	* message that implies the regex was invalid.
202	*/
203	CHECK_FOR_INTERRUPTS();
204
205	pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
206	ereport(ERROR,
207	(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
208	errmsg("invalid regular expression: %s", errMsg)));
209	}
210
211	/*
212	* We use malloc/free for the cre_pat field because the storage has to
213	* persist across transactions, and because we want to get control back on
214	* out-of-memory. The Max() is because some malloc implementations return
215	* NULL for malloc(0).
216	*/
217	re_temp.cre_pat = malloc(Max(text_re_len, `1`));
218	if (re_temp.cre_pat == NULL)
219	{
220	pg_regfree(&re_temp.cre_re);
221	ereport(ERROR,
222	(errcode(ERRCODE_OUT_OF_MEMORY),
223	errmsg("out of memory")));
224	}
225	memcpy(re_temp.cre_pat, text_re_val, text_re_len);
226	re_temp.cre_pat_len = text_re_len;
227	re_temp.cre_flags = cflags;
228	re_temp.cre_collation = collation;
229
230	/*
231	* Okay, we have a valid new item in re_temp; insert it into the storage
232	* array. Discard last entry if needed.
233	*/
234	if (num_res >= MAX_CACHED_RES)
235	{
236	--num_res;
237	Assert(num_res < MAX_CACHED_RES);
238	pg_regfree(&re_array[num_res].cre_re);
239	free(re_array[num_res].cre_pat);
240	}
241
242	if (num_res > `0`)
243	memmove(&re_array[`1`], &re_array[`0`], num_res * sizeof(cached_re_str));
244
245	re_array[`0`] = re_temp;
246	num_res++;
247
248	return &re_array[`0`].cre_re;
249	}
250
251	/*
252	* RE_wchar_execute - execute a RE on pg_wchar data
253	*
254	* Returns true on match, false on no match
255	*
256	* re --- the compiled pattern as returned by RE_compile_and_cache
257	* data --- the data to match against (need not be null-terminated)
258	* data_len --- the length of the data string
259	* start_search -- the offset in the data to start searching
260	* nmatch, pmatch --- optional return area for match details
261	*
262	* Data is given as array of pg_wchar which is what Spencer's regex package
263	* wants.
264	*/
265	static bool
266	RE_wchar_execute(regex_t re, pg_wchar data, int data_len,
267	int start_search, int nmatch, regmatch_t *pmatch)
268	{
269	int regexec_result;
270	char errMsg[`100`];
271
272	/ Perform RE match and return result /
273	regexec_result = pg_regexec(re,
274	data,
275	data_len,
276	start_search,
277	NULL, / no details /
278	nmatch,
279	pmatch,
280	`0`);
281
282	if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
283	{
284	/ re failed??? /
285	CHECK_FOR_INTERRUPTS();
286	pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
287	ereport(ERROR,
288	(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
289	errmsg("regular expression failed: %s", errMsg)));
290	}
291
292	return (regexec_result == REG_OKAY);
293	}
294
295	/*
296	* RE_execute - execute a RE
297	*
298	* Returns true on match, false on no match
299	*
300	* re --- the compiled pattern as returned by RE_compile_and_cache
301	* dat --- the data to match against (need not be null-terminated)
302	* dat_len --- the length of the data string
303	* nmatch, pmatch --- optional return area for match details
304	*
305	* Data is given in the database encoding. We internally
306	* convert to array of pg_wchar which is what Spencer's regex package wants.
307	*/
308	static bool
309	RE_execute(regex_t re, char* dat, int* dat_len,
310	int nmatch, regmatch_t *pmatch)
311	{
312	pg_wchar *data;
313	int data_len;
314	bool match;
315
316	/ Convert data string to wide characters /
317	data = (pg_wchar ) palloc((dat_len + `1`) sizeof(pg_wchar));
318	data_len = pg_mb2wchar_with_len(dat, data, dat_len);
319
320	/ Perform RE match and return result /
321	match = RE_wchar_execute(re, data, data_len, `0`, nmatch, pmatch);
322
323	pfree(data);
324	return match;
325	}
326
327	/*
328	* RE_compile_and_execute - compile and execute a RE
329	*
330	* Returns true on match, false on no match
331	*
332	* text_re --- the pattern, expressed as a TEXT object
333	* dat --- the data to match against (need not be null-terminated)
334	* dat_len --- the length of the data string
335	* cflags --- compile options for the pattern
336	* collation --- collation to use for LC_CTYPE-dependent behavior
337	* nmatch, pmatch --- optional return area for match details
338	*
339	* Both pattern and data are given in the database encoding. We internally
340	* convert to array of pg_wchar which is what Spencer's regex package wants.
341	*/
342	bool
343	RE_compile_and_execute(text text_re, char* dat, int* dat_len,
344	int cflags, Oid collation,
345	int nmatch, regmatch_t *pmatch)
346	{
347	regex_t *re;
348
349	/ Compile RE /
350	re = RE_compile_and_cache(text_re, cflags, collation);
351
352	return RE_execute(re, dat, dat_len, nmatch, pmatch);
353	}
354
355
356	/*
357	* parse_re_flags - parse the options argument of regexp_match and friends
358	*
359	* flags --- output argument, filled with desired options
360	* opts --- TEXT object, or NULL for defaults
361	*
362	* This accepts all the options allowed by any of the callers; callers that
363	* don't want some have to reject them after the fact.
364	*/
365	static void
366	parse_re_flags(pg_re_flags flags, text opts)
367	{
368	/ regex flavor is always folded into the compile flags /
369	flags->cflags = REG_ADVANCED;
370	flags->glob = false;
371
372	if (opts)
373	{
374	char *opt_p = VARDATA_ANY(opts);
375	int opt_len = VARSIZE_ANY_EXHDR(opts);
376	int i;
377
378	for (i = `0`; i < opt_len; i++)
379	{
380	switch (opt_p[i])
381	{
382	case `'g'`:
383	flags->glob = true;
384	break;
385	case `'b'`: / BREs (but why???) /
386	flags->cflags &= ~(REG_ADVANCED \| REG_EXTENDED \| REG_QUOTE);
387	break;
388	case `'c'`: / case sensitive /
389	flags->cflags &= ~REG_ICASE;
390	break;
391	case `'e'`: / plain EREs /
392	flags->cflags \|= REG_EXTENDED;
393	flags->cflags &= ~(REG_ADVANCED \| REG_QUOTE);
394	break;
395	case `'i'`: / case insensitive /
396	flags->cflags \|= REG_ICASE;
397	break;
398	case `'m'`: / Perloid synonym for n /
399	case `'n'`: / \n affects ^ $ . [^ /
400	flags->cflags \|= REG_NEWLINE;
401	break;
402	case `'p'`: / ~Perl, \n affects . [^ /
403	flags->cflags \|= REG_NLSTOP;
404	flags->cflags &= ~REG_NLANCH;
405	break;
406	case `'q'`: / literal string /
407	flags->cflags \|= REG_QUOTE;
408	flags->cflags &= ~(REG_ADVANCED \| REG_EXTENDED);
409	break;
410	case `'s'`: / single line, \n ordinary /
411	flags->cflags &= ~REG_NEWLINE;
412	break;
413	case `'t'`: / tight syntax /
414	flags->cflags &= ~REG_EXPANDED;
415	break;
416	case `'w'`: / weird, \n affects ^ $ only /
417	flags->cflags &= ~REG_NLSTOP;
418	flags->cflags \|= REG_NLANCH;
419	break;
420	case `'x'`: / expanded syntax /
421	flags->cflags \|= REG_EXPANDED;
422	break;
423	default:
424	ereport(ERROR,
425	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
426	errmsg("invalid regular expression option: \"%c\"",
427	opt_p[i])));
428	break;
429	}
430	}
431	}
432	}
433
434
435	/*
436	* interface routines called by the function manager
437	*/
438
439	Datum
440	nameregexeq(PG_FUNCTION_ARGS)
441	{
442	Name n = PG_GETARG_NAME(`0`);
443	text *p = PG_GETARG_TEXT_PP(`1`);
444
445	PG_RETURN_BOOL(RE_compile_and_execute(p,
446	NameStr(*n),
447	strlen(NameStr(*n)),
448	REG_ADVANCED,
449	PG_GET_COLLATION(),
450	`0`, NULL));
451	}
452
453	Datum
454	nameregexne(PG_FUNCTION_ARGS)
455	{
456	Name n = PG_GETARG_NAME(`0`);
457	text *p = PG_GETARG_TEXT_PP(`1`);
458
459	PG_RETURN_BOOL(!RE_compile_and_execute(p,
460	NameStr(*n),
461	strlen(NameStr(*n)),
462	REG_ADVANCED,
463	PG_GET_COLLATION(),
464	`0`, NULL));
465	}
466
467	Datum
468	textregexeq(PG_FUNCTION_ARGS)
469	{
470	text *s = PG_GETARG_TEXT_PP(`0`);
471	text *p = PG_GETARG_TEXT_PP(`1`);
472
473	PG_RETURN_BOOL(RE_compile_and_execute(p,
474	VARDATA_ANY(s),
475	VARSIZE_ANY_EXHDR(s),
476	REG_ADVANCED,
477	PG_GET_COLLATION(),
478	`0`, NULL));
479	}
480
481	Datum
482	textregexne(PG_FUNCTION_ARGS)
483	{
484	text *s = PG_GETARG_TEXT_PP(`0`);
485	text *p = PG_GETARG_TEXT_PP(`1`);
486
487	PG_RETURN_BOOL(!RE_compile_and_execute(p,
488	VARDATA_ANY(s),
489	VARSIZE_ANY_EXHDR(s),
490	REG_ADVANCED,
491	PG_GET_COLLATION(),
492	`0`, NULL));
493	}
494
495
496	/*
497	* routines that use the regexp stuff, but ignore the case.
498	* for this, we use the REG_ICASE flag to pg_regcomp
499	*/
500
501
502	Datum
503	nameicregexeq(PG_FUNCTION_ARGS)
504	{
505	Name n = PG_GETARG_NAME(`0`);
506	text *p = PG_GETARG_TEXT_PP(`1`);
507
508	PG_RETURN_BOOL(RE_compile_and_execute(p,
509	NameStr(*n),
510	strlen(NameStr(*n)),
511	REG_ADVANCED \| REG_ICASE,
512	PG_GET_COLLATION(),
513	`0`, NULL));
514	}
515
516	Datum
517	nameicregexne(PG_FUNCTION_ARGS)
518	{
519	Name n = PG_GETARG_NAME(`0`);
520	text *p = PG_GETARG_TEXT_PP(`1`);
521
522	PG_RETURN_BOOL(!RE_compile_and_execute(p,
523	NameStr(*n),
524	strlen(NameStr(*n)),
525	REG_ADVANCED \| REG_ICASE,
526	PG_GET_COLLATION(),
527	`0`, NULL));
528	}
529
530	Datum
531	texticregexeq(PG_FUNCTION_ARGS)
532	{
533	text *s = PG_GETARG_TEXT_PP(`0`);
534	text *p = PG_GETARG_TEXT_PP(`1`);
535
536	PG_RETURN_BOOL(RE_compile_and_execute(p,
537	VARDATA_ANY(s),
538	VARSIZE_ANY_EXHDR(s),
539	REG_ADVANCED \| REG_ICASE,
540	PG_GET_COLLATION(),
541	`0`, NULL));
542	}
543
544	Datum
545	texticregexne(PG_FUNCTION_ARGS)
546	{
547	text *s = PG_GETARG_TEXT_PP(`0`);
548	text *p = PG_GETARG_TEXT_PP(`1`);
549
550	PG_RETURN_BOOL(!RE_compile_and_execute(p,
551	VARDATA_ANY(s),
552	VARSIZE_ANY_EXHDR(s),
553	REG_ADVANCED \| REG_ICASE,
554	PG_GET_COLLATION(),
555	`0`, NULL));
556	}
557
558
559	/*
560	* textregexsubstr()
561	* Return a substring matched by a regular expression.
562	*/
563	Datum
564	textregexsubstr(PG_FUNCTION_ARGS)
565	{
566	text *s = PG_GETARG_TEXT_PP(`0`);
567	text *p = PG_GETARG_TEXT_PP(`1`);
568	regex_t *re;
569	regmatch_t pmatch[`2`];
570	int so,
571	eo;
572
573	/ Compile RE /
574	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
575
576	/*
577	* We pass two regmatch_t structs to get info about the overall match and
578	* the match for the first parenthesized subexpression (if any). If there
579	* is a parenthesized subexpression, we return what it matched; else
580	* return what the whole regexp matched.
581	*/
582	if (!RE_execute(re,
583	VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
584	`2`, pmatch))
585	PG_RETURN_NULL(); / definitely no match /
586
587	if (re->re_nsub > `0`)
588	{
589	/ has parenthesized subexpressions, use the first one /
590	so = pmatch[`1`].rm_so;
591	eo = pmatch[`1`].rm_eo;
592	}
593	else
594	{
595	/ no parenthesized subexpression, use whole match /
596	so = pmatch[`0`].rm_so;
597	eo = pmatch[`0`].rm_eo;
598	}
599
600	/*
601	* It is possible to have a match to the whole pattern but no match for a
602	* subexpression; for example 'foo(bar)?' is considered to match 'foo' but
603	* there is no subexpression match. So this extra test for match failure
604	* is not redundant.
605	*/
606	if (so < `0` \|\| eo < `0`)
607	PG_RETURN_NULL();
608
609	return DirectFunctionCall3(text_substr,
610	PointerGetDatum(s),
611	Int32GetDatum(so + `1`),
612	Int32GetDatum(eo - so));
613	}
614
615	/*
616	* textregexreplace_noopt()
617	* Return a string matched by a regular expression, with replacement.
618	*
619	* This version doesn't have an option argument: we default to case
620	* sensitive match, replace the first instance only.
621	*/
622	Datum
623	textregexreplace_noopt(PG_FUNCTION_ARGS)
624	{
625	text *s = PG_GETARG_TEXT_PP(`0`);
626	text *p = PG_GETARG_TEXT_PP(`1`);
627	text *r = PG_GETARG_TEXT_PP(`2`);
628	regex_t *re;
629
630	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
631
632	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
633	}
634
635	/*
636	* textregexreplace()
637	* Return a string matched by a regular expression, with replacement.
638	*/
639	Datum
640	textregexreplace(PG_FUNCTION_ARGS)
641	{
642	text *s = PG_GETARG_TEXT_PP(`0`);
643	text *p = PG_GETARG_TEXT_PP(`1`);
644	text *r = PG_GETARG_TEXT_PP(`2`);
645	text *opt = PG_GETARG_TEXT_PP(`3`);
646	regex_t *re;
647	pg_re_flags flags;
648
649	parse_re_flags(&flags, opt);
650
651	re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
652
653	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
654	}
655
656	/*
657	* similar_escape()
658	* Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by
659	* our regexp engine.
660	*/
661	Datum
662	similar_escape(PG_FUNCTION_ARGS)
663	{
664	text *pat_text;
665	text *esc_text;
666	text *result;
667	char *p,
668	*e,
669	*r;
670	int plen,
671	elen;
672	bool afterescape = false;
673	bool incharclass = false;
674	int nquotes = `0`;
675
676	/ This function is not strict, so must test explicitly /
677	if (PG_ARGISNULL(`0`))
678	PG_RETURN_NULL();
679	pat_text = PG_GETARG_TEXT_PP(`0`);
680	p = VARDATA_ANY(pat_text);
681	plen = VARSIZE_ANY_EXHDR(pat_text);
682	if (PG_ARGISNULL(`1`))
683	{
684	/ No ESCAPE clause provided; default to backslash as escape /
685	e = "\\";
686	elen = `1`;
687	}
688	else
689	{
690	esc_text = PG_GETARG_TEXT_PP(`1`);
691	e = VARDATA_ANY(esc_text);
692	elen = VARSIZE_ANY_EXHDR(esc_text);
693	if (elen == `0`)
694	e = NULL; / no escape character /
695	else
696	{
697	int escape_mblen = pg_mbstrlen_with_len(e, elen);
698
699	if (escape_mblen > `1`)
700	ereport(ERROR,
701	(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
702	errmsg("invalid escape string"),
703	errhint("Escape string must be empty or one character.")));
704	}
705	}
706
707	/----------*
708	* We surround the transformed input string with
709	* ^(?: ... )$
710	* which requires some explanation. We need "^" and "$" to force
711	* the pattern to match the entire input string as per the SQL spec.
712	* The "(?:" and ")" are a non-capturing set of parens; we have to have
713	* parens in case the string contains "\|", else the "^" and "$" will
714	* be bound into the first and last alternatives which is not what we
715	* want, and the parens must be non capturing because we don't want them
716	* to count when selecting output for SUBSTRING.
717	*
718	* When the pattern is divided into three parts by escape-double-quotes,
719	* what we emit is
720	* ^(?:part1){1,1}?(part2){1,1}(?:part3)$
721	* which requires even more explanation. The "{1,1}?" on part1 makes it
722	* non-greedy so that it will match the smallest possible amount of text
723	* not the largest, as required by SQL. The plain parens around part2
724	* are capturing parens so that that part is what controls the result of
725	* SUBSTRING. The "{1,1}" forces part2 to be greedy, so that it matches
726	* the largest possible amount of text; hence part3 must match the
727	* smallest amount of text, as required by SQL. We don't need an explicit
728	* greediness marker on part3. Note that this also confines the effects
729	* of any "\|" characters to the respective part, which is what we want.
730	*
731	* The SQL spec says that SUBSTRING's pattern must contain exactly two
732	* escape-double-quotes, but we only complain if there's more than two.
733	* With none, we act as though part1 and part3 are empty; with one, we
734	* act as though part3 is empty. Both behaviors fall out of omitting
735	* the relevant part separators in the above expansion. If the result
736	* of this function is used in a plain regexp match (SIMILAR TO), the
737	* escape-double-quotes have no effect on the match behavior.
738	*----------
739	*/
740
741	/*
742	* We need room for the prefix/postfix and part separators, plus as many
743	* as 3 output bytes per input byte; since the input is at most 1GB this
744	* can't overflow size_t.
745	*/
746	result = (text ) palloc(VARHDRSZ + `23` + `3` (size_t) plen);
747	r = VARDATA(result);
748
749	*r++ = `'^'`;
750	*r++ = `'('`;
751	*r++ = `'?'`;
752	*r++ = `':'`;
753
754	while (plen > `0`)
755	{
756	char pchar = *p;
757
758	/*
759	* If both the escape character and the current character from the
760	* pattern are multi-byte, we need to take the slow path.
761	*
762	* But if one of them is single-byte, we can process the pattern one
763	* byte at a time, ignoring multi-byte characters. (This works
764	* because all server-encodings have the property that a valid
765	* multi-byte character representation cannot contain the
766	* representation of a valid single-byte character.)
767	*/
768
769	if (elen > `1`)
770	{
771	int mblen = pg_mblen(p);
772
773	if (mblen > `1`)
774	{
775	/ slow, multi-byte path /
776	if (afterescape)
777	{
778	*r++ = `'\\'`;
779	memcpy(r, p, mblen);
780	r += mblen;
781	afterescape = false;
782	}
783	else if (e && elen == mblen && memcmp(e, p, mblen) == `0`)
784	{
785	/ SQL escape character; do not send to output /
786	afterescape = true;
787	}
788	else
789	{
790	/*
791	* We know it's a multi-byte character, so we don't need
792	* to do all the comparisons to single-byte characters
793	* that we do below.
794	*/
795	memcpy(r, p, mblen);
796	r += mblen;
797	}
798
799	p += mblen;
800	plen -= mblen;
801
802	continue;
803	}
804	}
805
806	/ fast path /
807	if (afterescape)
808	{
809	if (pchar == `'"'` && !incharclass) / escape-double-quote? /
810	{
811	/ emit appropriate part separator, per notes above /
812	if (nquotes == `0`)
813	{
814	*r++ = `')'`;
815	*r++ = `'{'`;
816	*r++ = `'1'`;
817	*r++ = `','`;
818	*r++ = `'1'`;
819	*r++ = `'}'`;
820	*r++ = `'?'`;
821	*r++ = `'('`;
822	}
823	else if (nquotes == `1`)
824	{
825	*r++ = `')'`;
826	*r++ = `'{'`;
827	*r++ = `'1'`;
828	*r++ = `','`;
829	*r++ = `'1'`;
830	*r++ = `'}'`;
831	*r++ = `'('`;
832	*r++ = `'?'`;
833	*r++ = `':'`;
834	}
835	else
836	ereport(ERROR,
837	(errcode(ERRCODE_INVALID_USE_OF_ESCAPE_CHARACTER),
838	errmsg("SQL regular expression may not contain more than two escape-double-quote separators")));
839	nquotes++;
840	}
841	else
842	{
843	/*
844	* We allow any character at all to be escaped; notably, this
845	* allows access to POSIX character-class escapes such as
846	* "\d". The SQL spec is considerably more restrictive.
847	*/
848	*r++ = `'\\'`;
849	*r++ = pchar;
850	}
851	afterescape = false;
852	}
853	else if (e && pchar == *e)
854	{
855	/ SQL escape character; do not send to output /
856	afterescape = true;
857	}
858	else if (incharclass)
859	{
860	if (pchar == `'\\'`)
861	*r++ = `'\\'`;
862	*r++ = pchar;
863	if (pchar == `']'`)
864	incharclass = false;
865	}
866	else if (pchar == `'['`)
867	{
868	*r++ = pchar;
869	incharclass = true;
870	}
871	else if (pchar == `'%'`)
872	{
873	*r++ = `'.'`;
874	r++ = `''`;
875	}
876	else if (pchar == `'_'`)
877	*r++ = `'.'`;
878	else if (pchar == `'('`)
879	{
880	/ convert to non-capturing parenthesis /
881	*r++ = `'('`;
882	*r++ = `'?'`;
883	*r++ = `':'`;
884	}
885	else if (pchar == `'\\'` \|\| pchar == `'.'` \|\|
886	pchar == `'^'` \|\| pchar == `'$'`)
887	{
888	*r++ = `'\\'`;
889	*r++ = pchar;
890	}
891	else
892	*r++ = pchar;
893	p++, plen--;
894	}
895
896	*r++ = `')'`;
897	*r++ = `'$'`;
898
899	SET_VARSIZE(result, r - ((char *) result));
900
901	PG_RETURN_TEXT_P(result);
902	}
903
904	/*
905	* regexp_match()
906	* Return the first substring(s) matching a pattern within a string.
907	*/
908	Datum
909	regexp_match(PG_FUNCTION_ARGS)
910	{
911	text *orig_str = PG_GETARG_TEXT_PP(`0`);
912	text *pattern = PG_GETARG_TEXT_PP(`1`);
913	text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(`2`);
914	pg_re_flags re_flags;
915	regexp_matches_ctx *matchctx;
916
917	/ Determine options /
918	parse_re_flags(&re_flags, flags);
919	/ User mustn't specify 'g' /
920	if (re_flags.glob)
921	ereport(ERROR,
922	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
923	/ translator: %s is a SQL function name /
924	errmsg("%s does not support the \"global\" option",
925	"regexp_match()"),
926	errhint("Use the regexp_matches function instead.")));
927
928	matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
929	PG_GET_COLLATION(), true, false, false);
930
931	if (matchctx->nmatches == `0`)
932	PG_RETURN_NULL();
933
934	Assert(matchctx->nmatches == `1`);
935
936	/ Create workspace that build_regexp_match_result needs /
937	matchctx->elems = (Datum ) palloc(sizeof(Datum) matchctx->npatterns);
938	matchctx->nulls = (bool ) palloc(sizeof(bool) matchctx->npatterns);
939
940	PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
941	}
942
943	/ This is separate to keep the opr_sanity regression test from complaining /
944	Datum
945	regexp_match_no_flags(PG_FUNCTION_ARGS)
946	{
947	return regexp_match(fcinfo);
948	}
949
950	/*
951	* regexp_matches()
952	* Return a table of all matches of a pattern within a string.
953	*/
954	Datum
955	regexp_matches(PG_FUNCTION_ARGS)
956	{
957	FuncCallContext *funcctx;
958	regexp_matches_ctx *matchctx;
959
960	if (SRF_IS_FIRSTCALL())
961	{
962	text *pattern = PG_GETARG_TEXT_PP(`1`);
963	text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(`2`);
964	pg_re_flags re_flags;
965	MemoryContext oldcontext;
966
967	funcctx = SRF_FIRSTCALL_INIT();
968	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
969
970	/ Determine options /
971	parse_re_flags(&re_flags, flags);
972
973	/ be sure to copy the input string into the multi-call ctx /
974	matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(`0`), pattern,
975	&re_flags,
976	PG_GET_COLLATION(),
977	true, false, false);
978
979	/ Pre-create workspace that build_regexp_match_result needs /
980	matchctx->elems = (Datum ) palloc(sizeof(Datum) matchctx->npatterns);
981	matchctx->nulls = (bool ) palloc(sizeof(bool) matchctx->npatterns);
982
983	MemoryContextSwitchTo(oldcontext);
984	funcctx->user_fctx = (void *) matchctx;
985	}
986
987	funcctx = SRF_PERCALL_SETUP();
988	matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
989
990	if (matchctx->next_match < matchctx->nmatches)
991	{
992	ArrayType *result_ary;
993
994	result_ary = build_regexp_match_result(matchctx);
995	matchctx->next_match++;
996	SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
997	}
998
999	SRF_RETURN_DONE(funcctx);
1000	}
1001
1002	/ This is separate to keep the opr_sanity regression test from complaining /
1003	Datum
1004	regexp_matches_no_flags(PG_FUNCTION_ARGS)
1005	{
1006	return regexp_matches(fcinfo);
1007	}
1008
1009	/*
1010	* setup_regexp_matches --- do the initial matching for regexp_match
1011	* and regexp_split functions
1012	*
1013	* To avoid having to re-find the compiled pattern on each call, we do
1014	* all the matching in one swoop. The returned regexp_matches_ctx contains
1015	* the locations of all the substrings matching the pattern.
1016	*
1017	* The three bool parameters have only two patterns (one for matching, one for
1018	* splitting) but it seems clearer to distinguish the functionality this way
1019	* than to key it all off one "is_split" flag. We don't currently assume that
1020	* fetching_unmatched is exclusive of fetching the matched text too; if it's
1021	* set, the conversion buffer is large enough to fetch any single matched or
1022	* unmatched string, but not any larger substring. (In practice, when splitting
1023	* the matches are usually small anyway, and it didn't seem worth complicating
1024	* the code further.)
1025	*/
1026	static regexp_matches_ctx *
1027	setup_regexp_matches(text orig_str, text pattern, pg_re_flags *re_flags,
1028	Oid collation,
1029	bool use_subpatterns,
1030	bool ignore_degenerate,
1031	bool fetching_unmatched)
1032	{
1033	regexp_matches_ctx matchctx = palloc0(sizeof*(regexp_matches_ctx));
1034	int eml = pg_database_encoding_max_length();
1035	int orig_len;
1036	pg_wchar *wide_str;
1037	int wide_len;
1038	regex_t *cpattern;
1039	regmatch_t *pmatch;
1040	int pmatch_len;
1041	int array_len;
1042	int array_idx;
1043	int prev_match_end;
1044	int prev_valid_match_end;
1045	int start_search;
1046	int maxlen = `0`; / largest fetch length in characters /
1047
1048	/ save original string --- we'll extract result substrings from it /
1049	matchctx->orig_str = orig_str;
1050
1051	/ convert string to pg_wchar form for matching /
1052	orig_len = VARSIZE_ANY_EXHDR(orig_str);
1053	wide_str = (pg_wchar ) palloc(sizeof(pg_wchar) (orig_len + `1`));
1054	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
1055
1056	/ set up the compiled pattern /
1057	cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
1058
1059	/ do we want to remember subpatterns? /
1060	if (use_subpatterns && cpattern->re_nsub > `0`)
1061	{
1062	matchctx->npatterns = cpattern->re_nsub;
1063	pmatch_len = cpattern->re_nsub + `1`;
1064	}
1065	else
1066	{
1067	use_subpatterns = false;
1068	matchctx->npatterns = `1`;
1069	pmatch_len = `1`;
1070	}
1071
1072	/ temporary output space for RE package /
1073	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
1074
1075	/*
1076	* the real output space (grown dynamically if needed)
1077	*
1078	* use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
1079	* than at 2^27
1080	*/
1081	array_len = re_flags->glob ? `255` : `31`;
1082	matchctx->match_locs = (int ) palloc(sizeof(int) array_len);
1083	array_idx = `0`;
1084
1085	/ search for the pattern, perhaps repeatedly /
1086	prev_match_end = `0`;
1087	prev_valid_match_end = `0`;
1088	start_search = `0`;
1089	while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
1090	pmatch_len, pmatch))
1091	{
1092	/*
1093	* If requested, ignore degenerate matches, which are zero-length
1094	* matches occurring at the start or end of a string or just after a
1095	* previous match.
1096	*/
1097	if (!ignore_degenerate \|\|
1098	(pmatch[`0`].rm_so < wide_len &&
1099	pmatch[`0`].rm_eo > prev_match_end))
1100	{
1101	/ enlarge output space if needed /
1102	while (array_idx + matchctx->npatterns * `2` + `1` > array_len)
1103	{
1104	array_len += array_len + `1`; / 2^n-1 => 2^(n+1)-1 /
1105	if (array_len > MaxAllocSize / sizeof(int))
1106	ereport(ERROR,
1107	(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1108	errmsg("too many regular expression matches")));
1109	matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
1110	sizeof(int) * array_len);
1111	}
1112
1113	/ save this match's locations /
1114	if (use_subpatterns)
1115	{
1116	int i;
1117
1118	for (i = `1`; i <= matchctx->npatterns; i++)
1119	{
1120	int so = pmatch[i].rm_so;
1121	int eo = pmatch[i].rm_eo;
1122
1123	matchctx->match_locs[array_idx++] = so;
1124	matchctx->match_locs[array_idx++] = eo;
1125	if (so >= `0` && eo >= `0` && (eo - so) > maxlen)
1126	maxlen = (eo - so);
1127	}
1128	}
1129	else
1130	{
1131	int so = pmatch[`0`].rm_so;
1132	int eo = pmatch[`0`].rm_eo;
1133
1134	matchctx->match_locs[array_idx++] = so;
1135	matchctx->match_locs[array_idx++] = eo;
1136	if (so >= `0` && eo >= `0` && (eo - so) > maxlen)
1137	maxlen = (eo - so);
1138	}
1139	matchctx->nmatches++;
1140
1141	/*
1142	* check length of unmatched portion between end of previous valid
1143	* (nondegenerate, or degenerate but not ignored) match and start
1144	* of current one
1145	*/
1146	if (fetching_unmatched &&
1147	pmatch[`0`].rm_so >= `0` &&
1148	(pmatch[`0`].rm_so - prev_valid_match_end) > maxlen)
1149	maxlen = (pmatch[`0`].rm_so - prev_valid_match_end);
1150	prev_valid_match_end = pmatch[`0`].rm_eo;
1151	}
1152	prev_match_end = pmatch[`0`].rm_eo;
1153
1154	/ if not glob, stop after one match /
1155	if (!re_flags->glob)
1156	break;
1157
1158	/*
1159	* Advance search position. Normally we start the next search at the
1160	* end of the previous match; but if the match was of zero length, we
1161	* have to advance by one character, or we'd just find the same match
1162	* again.
1163	*/
1164	start_search = prev_match_end;
1165	if (pmatch[`0`].rm_so == pmatch[`0`].rm_eo)
1166	start_search++;
1167	if (start_search > wide_len)
1168	break;
1169	}
1170
1171	/*
1172	* check length of unmatched portion between end of last match and end of
1173	* input string
1174	*/
1175	if (fetching_unmatched &&
1176	(wide_len - prev_valid_match_end) > maxlen)
1177	maxlen = (wide_len - prev_valid_match_end);
1178
1179	/*
1180	* Keep a note of the end position of the string for the benefit of
1181	* splitting code.
1182	*/
1183	matchctx->match_locs[array_idx] = wide_len;
1184
1185	if (eml > `1`)
1186	{
1187	int64 maxsiz = eml * (int64) maxlen;
1188	int conv_bufsiz;
1189
1190	/*
1191	* Make the conversion buffer large enough for any substring of
1192	* interest.
1193	*
1194	* Worst case: assume we need the maximum size (maxlen*eml), but take
1195	* advantage of the fact that the original string length in bytes is
1196	* an upper bound on the byte length of any fetched substring (and we
1197	* know that len+1 is safe to allocate because the varlena header is
1198	* longer than 1 byte).
1199	*/
1200	if (maxsiz > orig_len)
1201	conv_bufsiz = orig_len + `1`;
1202	else
1203	conv_bufsiz = maxsiz + `1`; / safe since maxsiz < 2^30 /
1204
1205	matchctx->conv_buf = palloc(conv_bufsiz);
1206	matchctx->conv_bufsiz = conv_bufsiz;
1207	matchctx->wide_str = wide_str;
1208	}
1209	else
1210	{
1211	/ No need to keep the wide string if we're in a single-byte charset. /
1212	pfree(wide_str);
1213	matchctx->wide_str = NULL;
1214	matchctx->conv_buf = NULL;
1215	matchctx->conv_bufsiz = `0`;
1216	}
1217
1218	/ Clean up temp storage /
1219	pfree(pmatch);
1220
1221	return matchctx;
1222	}
1223
1224	/*
1225	* build_regexp_match_result - build output array for current match
1226	*/
1227	static ArrayType *
1228	build_regexp_match_result(regexp_matches_ctx *matchctx)
1229	{
1230	char *buf = matchctx->conv_buf;
1231	int bufsiz PG_USED_FOR_ASSERTS_ONLY = matchctx->conv_bufsiz;
1232	Datum *elems = matchctx->elems;
1233	bool *nulls = matchctx->nulls;
1234	int dims[`1`];
1235	int lbs[`1`];
1236	int loc;
1237	int i;
1238
1239	/ Extract matching substrings from the original string /
1240	loc = matchctx->next_match * matchctx->npatterns * `2`;
1241	for (i = `0`; i < matchctx->npatterns; i++)
1242	{
1243	int so = matchctx->match_locs[loc++];
1244	int eo = matchctx->match_locs[loc++];
1245
1246	if (so < `0` \|\| eo < `0`)
1247	{
1248	elems[i] = (Datum) `0`;
1249	nulls[i] = true;
1250	}
1251	else if (buf)
1252	{
1253	int len = pg_wchar2mb_with_len(matchctx->wide_str + so,
1254	buf,
1255	eo - so);
1256
1257	Assert(len < bufsiz);
1258	elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
1259	nulls[i] = false;
1260	}
1261	else
1262	{
1263	elems[i] = DirectFunctionCall3(text_substr,
1264	PointerGetDatum(matchctx->orig_str),
1265	Int32GetDatum(so + `1`),
1266	Int32GetDatum(eo - so));
1267	nulls[i] = false;
1268	}
1269	}
1270
1271	/ And form an array /
1272	dims[`0`] = matchctx->npatterns;
1273	lbs[`0`] = `1`;
1274	/ XXX: this hardcodes assumptions about the text type /
1275	return construct_md_array(elems, nulls, `1`, dims, lbs,
1276	TEXTOID, -`1`, false, `'i'`);
1277	}
1278
1279	/*
1280	* regexp_split_to_table()
1281	* Split the string at matches of the pattern, returning the
1282	* split-out substrings as a table.
1283	*/
1284	Datum
1285	regexp_split_to_table(PG_FUNCTION_ARGS)
1286	{
1287	FuncCallContext *funcctx;
1288	regexp_matches_ctx *splitctx;
1289
1290	if (SRF_IS_FIRSTCALL())
1291	{
1292	text *pattern = PG_GETARG_TEXT_PP(`1`);
1293	text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(`2`);
1294	pg_re_flags re_flags;
1295	MemoryContext oldcontext;
1296
1297	funcctx = SRF_FIRSTCALL_INIT();
1298	oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1299
1300	/ Determine options /
1301	parse_re_flags(&re_flags, flags);
1302	/ User mustn't specify 'g' /
1303	if (re_flags.glob)
1304	ereport(ERROR,
1305	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1306	/ translator: %s is a SQL function name /
1307	errmsg("%s does not support the \"global\" option",
1308	"regexp_split_to_table()")));
1309	/ But we find all the matches anyway /
1310	re_flags.glob = true;
1311
1312	/ be sure to copy the input string into the multi-call ctx /
1313	splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(`0`), pattern,
1314	&re_flags,
1315	PG_GET_COLLATION(),
1316	false, true, true);
1317
1318	MemoryContextSwitchTo(oldcontext);
1319	funcctx->user_fctx = (void *) splitctx;
1320	}
1321
1322	funcctx = SRF_PERCALL_SETUP();
1323	splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
1324
1325	if (splitctx->next_match <= splitctx->nmatches)
1326	{
1327	Datum result = build_regexp_split_result(splitctx);
1328
1329	splitctx->next_match++;
1330	SRF_RETURN_NEXT(funcctx, result);
1331	}
1332
1333	SRF_RETURN_DONE(funcctx);
1334	}
1335
1336	/ This is separate to keep the opr_sanity regression test from complaining /
1337	Datum
1338	regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
1339	{
1340	return regexp_split_to_table(fcinfo);
1341	}
1342
1343	/*
1344	* regexp_split_to_array()
1345	* Split the string at matches of the pattern, returning the
1346	* split-out substrings as an array.
1347	*/
1348	Datum
1349	regexp_split_to_array(PG_FUNCTION_ARGS)
1350	{
1351	ArrayBuildState *astate = NULL;
1352	pg_re_flags re_flags;
1353	regexp_matches_ctx *splitctx;
1354
1355	/ Determine options /
1356	parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(`2`));
1357	/ User mustn't specify 'g' /
1358	if (re_flags.glob)
1359	ereport(ERROR,
1360	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1361	/ translator: %s is a SQL function name /
1362	errmsg("%s does not support the \"global\" option",
1363	"regexp_split_to_array()")));
1364	/ But we find all the matches anyway /
1365	re_flags.glob = true;
1366
1367	splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(`0`),
1368	PG_GETARG_TEXT_PP(`1`),
1369	&re_flags,
1370	PG_GET_COLLATION(),
1371	false, true, true);
1372
1373	while (splitctx->next_match <= splitctx->nmatches)
1374	{
1375	astate = accumArrayResult(astate,
1376	build_regexp_split_result(splitctx),
1377	false,
1378	TEXTOID,
1379	CurrentMemoryContext);
1380	splitctx->next_match++;
1381	}
1382
1383	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
1384	}
1385
1386	/ This is separate to keep the opr_sanity regression test from complaining /
1387	Datum
1388	regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1389	{
1390	return regexp_split_to_array(fcinfo);
1391	}
1392
1393	/*
1394	* build_regexp_split_result - build output string for current match
1395	*
1396	* We return the string between the current match and the previous one,
1397	* or the string after the last match when next_match == nmatches.
1398	*/
1399	static Datum
1400	build_regexp_split_result(regexp_matches_ctx *splitctx)
1401	{
1402	char *buf = splitctx->conv_buf;
1403	int startpos;
1404	int endpos;
1405
1406	if (splitctx->next_match > `0`)
1407	startpos = splitctx->match_locs[splitctx->next_match * `2` - `1`];
1408	else
1409	startpos = `0`;
1410	if (startpos < `0`)
1411	elog(ERROR, "invalid match ending position");
1412
1413	if (buf)
1414	{
1415	int bufsiz PG_USED_FOR_ASSERTS_ONLY = splitctx->conv_bufsiz;
1416	int len;
1417
1418	endpos = splitctx->match_locs[splitctx->next_match * `2`];
1419	if (endpos < startpos)
1420	elog(ERROR, "invalid match starting position");
1421	len = pg_wchar2mb_with_len(splitctx->wide_str + startpos,
1422	buf,
1423	endpos - startpos);
1424	Assert(len < bufsiz);
1425	return PointerGetDatum(cstring_to_text_with_len(buf, len));
1426	}
1427	else
1428	{
1429	endpos = splitctx->match_locs[splitctx->next_match * `2`];
1430	if (endpos < startpos)
1431	elog(ERROR, "invalid match starting position");
1432	return DirectFunctionCall3(text_substr,
1433	PointerGetDatum(splitctx->orig_str),
1434	Int32GetDatum(startpos + `1`),
1435	Int32GetDatum(endpos - startpos));
1436	}
1437	}
1438
1439	/*
1440	* regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
1441	*
1442	* The result is NULL if there is no fixed prefix, else a palloc'd string.
1443	* If it is an exact match, not just a prefix, *exact is returned as true.
1444	*/
1445	char *
1446	regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
1447	bool *exact)
1448	{
1449	char *result;
1450	regex_t *re;
1451	int cflags;
1452	int re_result;
1453	pg_wchar *str;
1454	size_t slen;
1455	size_t maxlen;
1456	char errMsg[`100`];
1457
1458	exact = false; /* default result /
1459
1460	/ Compile RE /
1461	cflags = REG_ADVANCED;
1462	if (case_insensitive)
1463	cflags \|= REG_ICASE;
1464
1465	re = RE_compile_and_cache(text_re, cflags, collation);
1466
1467	/ Examine it to see if there's a fixed prefix /
1468	re_result = pg_regprefix(re, &str, &slen);
1469
1470	switch (re_result)
1471	{
1472	case REG_NOMATCH:
1473	return NULL;
1474
1475	case REG_PREFIX:
1476	/ continue with wchar conversion /
1477	break;
1478
1479	case REG_EXACT:
1480	*exact = true;
1481	/ continue with wchar conversion /
1482	break;
1483
1484	default:
1485	/ re failed??? /
1486	CHECK_FOR_INTERRUPTS();
1487	pg_regerror(re_result, re, errMsg, sizeof(errMsg));
1488	ereport(ERROR,
1489	(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
1490	errmsg("regular expression failed: %s", errMsg)));
1491	break;
1492	}
1493
1494	/ Convert pg_wchar result back to database encoding /
1495	maxlen = pg_database_encoding_max_length() * slen + `1`;
1496	result = (char *) palloc(maxlen);
1497	slen = pg_wchar2mb_with_len(str, result, slen);
1498	Assert(slen < maxlen);
1499
1500	free(str);
1501
1502	return result;
1503	}
1504

Browse the source code of PostgreSQL/src/backend/utils/adt/regexp.c