tsvector_parser.c source code [PostgreSQL/src/backend/utils/adt/tsvector_parser.c]

/*-------------------------------------------------------------------------
 *
 * tsvector_parser.c
 *	  Parser for tsvector
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/tsvector_parser.c
 *
 *-------------------------------------------------------------------------
 */
14
15	#include "postgres.h"
16
17	#include "tsearch/ts_locale.h"
18	#include "tsearch/ts_utils.h"
19
20
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects only the content of error messages.
 *
 * 'word' is a growable scratch buffer owned by the parser: it is allocated
 * by init_tsvector_parser(), enlarged on demand while a token is collected,
 * and released by close_tsvector_parser().  'prsbuf' and 'bufstart' point
 * into the caller's input string and are never freed here.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat operator characters as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
	bool		is_web;			/* we're in websearch_to_tsquery() */
};
38
39
40	/*
41	* Initializes parser for the input string. If oprisdelim is set, the
42	* following characters are treated as delimiters in addition to whitespace:
43	* ! \| & ( )
44	*/
45	TSVectorParseState
46	init_tsvector_parser(char input, int* flags)
47	{
48	TSVectorParseState state;
49
50	state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
51	state->prsbuf = input;
52	state->bufstart = input;
53	state->len = `32`;
54	state->word = (char *) palloc(state->len);
55	state->eml = pg_database_encoding_max_length();
56	state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != `0`;
57	state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != `0`;
58	state->is_web = (flags & P_TSV_IS_WEB) != `0`;
59
60	return state;
61	}
62
63	/*
64	* Reinitializes parser to parse 'input', instead of previous input.
65	*/
66	void
67	reset_tsvector_parser(TSVectorParseState state, char *input)
68	{
69	state->prsbuf = input;
70	}
71
72	/*
73	* Shuts down a tsvector parser.
74	*/
75	void
76	close_tsvector_parser(TSVectorParseState state)
77	{
78	pfree(state->word);
79	pfree(state);
80	}
81
/*
 * Increase the size of 'word' if needed to hold one more character.
 *
 * Expects 'state' and 'curpos' (the write position inside state->word) to be
 * in scope.  The buffer is doubled when fewer than eml + 1 bytes remain, so
 * that after copying one character of up to eml bytes there is still room
 * for a terminating NUL.  repalloc may move the buffer, hence curpos is
 * recomputed from its saved offset.
 */
#define RESIZEPRSBUF \
do { \
	int			clen = curpos - state->word; \
	if ( clen + state->eml >= state->len ) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + clen; \
	} \
} while (0)
93
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Relies on gettoken_tsvector's parameters and locals (state, curpos, pos,
 * npos) being in scope.  When the caller did not ask for positions
 * (pos_ptr == NULL), any position array collected so far is freed here.
 * Note that *poslen is written whenever pos_ptr is non-NULL, so a caller
 * that passes pos_ptr must pass poslen as well.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
113
114
/* State codes used in gettoken_tsvector */
#define WAITWORD		1		/* skipping whitespace before a token */
#define WAITENDWORD		2		/* inside an unquoted word */
#define WAITNEXTCHAR	3		/* just saw a backslash; next char is literal */
#define WAITENDCMPLX	4		/* inside a single-quoted word */
#define WAITPOSINFO		5		/* after a quoted word; ':' may follow */
#define INPOSINFO		6		/* expecting a position number */
#define WAITPOSDELIM	7		/* after a position: ',', weight, or end */
#define WAITCHARCMPLX	8		/* saw a quote inside a quoted word */
124
125	#define PRSSYNTAXERROR prssyntaxerror(state)
126
127	static void
128	prssyntaxerror(TSVectorParseState state)
129	{
130	ereport(ERROR,
131	(errcode(ERRCODE_SYNTAX_ERROR),
132	state->is_tsquery ?
133	errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
134	errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
135	}
136
137
138	/*
139	* Get next token from string being parsed. Returns true if successful,
140	* false if end of input string is reached. On success, these output
141	* parameters are filled in:
142	*
143	* *strval pointer to token
144	* lenval length of strval
145	* *pos_ptr pointer to a palloc'd array of positions and weights
146	* associated with the token. If the caller is not interested
147	* in the information, NULL can be supplied. Otherwise
148	* the caller is responsible for pfreeing the array.
149	* poslen number of elements in pos_ptr
150	* *endptr scan resumption point
151	*
152	* Pass NULL for unwanted output parameters.
153	*/
154	bool
155	gettoken_tsvector(TSVectorParseState state,
156	char *strval, int* *lenval,
157	WordEntryPos *pos_ptr, int* *poslen,
158	char **endptr)
159	{
160	int oldstate = `0`;
161	char *curpos = state->word;
162	int statecode = WAITWORD;
163
164	/*
165	* pos is for collecting the comma delimited list of positions followed by
166	* the actual token.
167	*/
168	WordEntryPos *pos = NULL;
169	int npos = `0`; / elements of pos used /
170	int posalen = `0`; / allocated size of pos /
171
172	while (`1`)
173	{
174	if (statecode == WAITWORD)
175	{
176	if (*(state->prsbuf) == `'\0'`)
177	return false;
178	else if (!state->is_web && t_iseq(state->prsbuf, `'\''`))
179	statecode = WAITENDCMPLX;
180	else if (!state->is_web && t_iseq(state->prsbuf, `'\\'`))
181	{
182	statecode = WAITNEXTCHAR;
183	oldstate = WAITENDWORD;
184	}
185	else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) \|\|
186	(state->is_web && t_iseq(state->prsbuf, `'"'`)))
187	PRSSYNTAXERROR;
188	else if (!t_isspace(state->prsbuf))
189	{
190	COPYCHAR(curpos, state->prsbuf);
191	curpos += pg_mblen(state->prsbuf);
192	statecode = WAITENDWORD;
193	}
194	}
195	else if (statecode == WAITNEXTCHAR)
196	{
197	if (*(state->prsbuf) == `'\0'`)
198	ereport(ERROR,
199	(errcode(ERRCODE_SYNTAX_ERROR),
200	errmsg("there is no escaped character: \"%s\"",
201	state->bufstart)));
202	else
203	{
204	RESIZEPRSBUF;
205	COPYCHAR(curpos, state->prsbuf);
206	curpos += pg_mblen(state->prsbuf);
207	Assert(oldstate != `0`);
208	statecode = oldstate;
209	}
210	}
211	else if (statecode == WAITENDWORD)
212	{
213	if (!state->is_web && t_iseq(state->prsbuf, `'\\'`))
214	{
215	statecode = WAITNEXTCHAR;
216	oldstate = WAITENDWORD;
217	}
218	else if (t_isspace(state->prsbuf) \|\| *(state->prsbuf) == `'\0'` \|\|
219	(state->oprisdelim && ISOPERATOR(state->prsbuf)) \|\|
220	(state->is_web && t_iseq(state->prsbuf, `'"'`)))
221	{
222	RESIZEPRSBUF;
223	if (curpos == state->word)
224	PRSSYNTAXERROR;
225	*(curpos) = `'\0'`;
226	RETURN_TOKEN;
227	}
228	else if (t_iseq(state->prsbuf, `':'`))
229	{
230	if (curpos == state->word)
231	PRSSYNTAXERROR;
232	*(curpos) = `'\0'`;
233	if (state->oprisdelim)
234	RETURN_TOKEN;
235	else
236	statecode = INPOSINFO;
237	}
238	else
239	{
240	RESIZEPRSBUF;
241	COPYCHAR(curpos, state->prsbuf);
242	curpos += pg_mblen(state->prsbuf);
243	}
244	}
245	else if (statecode == WAITENDCMPLX)
246	{
247	if (!state->is_web && t_iseq(state->prsbuf, `'\''`))
248	{
249	statecode = WAITCHARCMPLX;
250	}
251	else if (!state->is_web && t_iseq(state->prsbuf, `'\\'`))
252	{
253	statecode = WAITNEXTCHAR;
254	oldstate = WAITENDCMPLX;
255	}
256	else if (*(state->prsbuf) == `'\0'`)
257	PRSSYNTAXERROR;
258	else
259	{
260	RESIZEPRSBUF;
261	COPYCHAR(curpos, state->prsbuf);
262	curpos += pg_mblen(state->prsbuf);
263	}
264	}
265	else if (statecode == WAITCHARCMPLX)
266	{
267	if (!state->is_web && t_iseq(state->prsbuf, `'\''`))
268	{
269	RESIZEPRSBUF;
270	COPYCHAR(curpos, state->prsbuf);
271	curpos += pg_mblen(state->prsbuf);
272	statecode = WAITENDCMPLX;
273	}
274	else
275	{
276	RESIZEPRSBUF;
277	*(curpos) = `'\0'`;
278	if (curpos == state->word)
279	PRSSYNTAXERROR;
280	if (state->oprisdelim)
281	{
282	/ state->prsbuf+=pg_mblen(state->prsbuf); /
283	RETURN_TOKEN;
284	}
285	else
286	statecode = WAITPOSINFO;
287	continue; / recheck current character /
288	}
289	}
290	else if (statecode == WAITPOSINFO)
291	{
292	if (t_iseq(state->prsbuf, `':'`))
293	statecode = INPOSINFO;
294	else
295	RETURN_TOKEN;
296	}
297	else if (statecode == INPOSINFO)
298	{
299	if (t_isdigit(state->prsbuf))
300	{
301	if (posalen == `0`)
302	{
303	posalen = `4`;
304	pos = (WordEntryPos ) palloc(sizeof(WordEntryPos) posalen);
305	npos = `0`;
306	}
307	else if (npos + `1` >= posalen)
308	{
309	posalen *= `2`;
310	pos = (WordEntryPos ) repalloc(pos, sizeof(WordEntryPos) posalen);
311	}
312	npos++;
313	WEP_SETPOS(pos[npos - `1`], LIMITPOS(atoi(state->prsbuf)));
314	/ we cannot get here in tsquery, so no need for 2 errmsgs /
315	if (WEP_GETPOS(pos[npos - `1`]) == `0`)
316	ereport(ERROR,
317	(errcode(ERRCODE_SYNTAX_ERROR),
318	errmsg("wrong position info in tsvector: \"%s\"",
319	state->bufstart)));
320	WEP_SETWEIGHT(pos[npos - `1`], `0`);
321	statecode = WAITPOSDELIM;
322	}
323	else
324	PRSSYNTAXERROR;
325	}
326	else if (statecode == WAITPOSDELIM)
327	{
328	if (t_iseq(state->prsbuf, `','`))
329	statecode = INPOSINFO;
330	else if (t_iseq(state->prsbuf, `'a'`) \|\| t_iseq(state->prsbuf, `'A'`) \|\| t_iseq(state->prsbuf, `'*'`))
331	{
332	if (WEP_GETWEIGHT(pos[npos - `1`]))
333	PRSSYNTAXERROR;
334	WEP_SETWEIGHT(pos[npos - `1`], `3`);
335	}
336	else if (t_iseq(state->prsbuf, `'b'`) \|\| t_iseq(state->prsbuf, `'B'`))
337	{
338	if (WEP_GETWEIGHT(pos[npos - `1`]))
339	PRSSYNTAXERROR;
340	WEP_SETWEIGHT(pos[npos - `1`], `2`);
341	}
342	else if (t_iseq(state->prsbuf, `'c'`) \|\| t_iseq(state->prsbuf, `'C'`))
343	{
344	if (WEP_GETWEIGHT(pos[npos - `1`]))
345	PRSSYNTAXERROR;
346	WEP_SETWEIGHT(pos[npos - `1`], `1`);
347	}
348	else if (t_iseq(state->prsbuf, `'d'`) \|\| t_iseq(state->prsbuf, `'D'`))
349	{
350	if (WEP_GETWEIGHT(pos[npos - `1`]))
351	PRSSYNTAXERROR;
352	WEP_SETWEIGHT(pos[npos - `1`], `0`);
353	}
354	else if (t_isspace(state->prsbuf) \|\|
355	*(state->prsbuf) == `'\0'`)
356	RETURN_TOKEN;
357	else if (!t_isdigit(state->prsbuf))
358	PRSSYNTAXERROR;
359	}
360	else / internal error /
361	elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
362	statecode);
363
364	/ get next char /
365	state->prsbuf += pg_mblen(state->prsbuf);
366	}
367	}
368

Browse the source code of PostgreSQL/src/backend/utils/adt/tsvector_parser.c