1/*-------------------------------------------------------------------------
2 *
3 * tsvector_parser.c
4 * Parser for tsvector
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector_parser.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
#include "postgres.h"

#include <limits.h>

#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
19
20
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * NOTE(review): "the two flags" presumably means oprisdelim and is_tsquery;
 * is_web is a third, independent flag.  When is_web is set,
 * gettoken_tsvector() does not treat single quotes or backslashes specially
 * and treats a double quote as a word delimiter.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character; advanced as
								 * gettoken_tsvector() scans */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word; enlarged
								 * on demand by RESIZEPRSBUF */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character, as reported by
								 * pg_database_encoding_max_length() */
	bool		oprisdelim;		/* treat operator characters (whatever
								 * ISOPERATOR accepts) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
	bool		is_web;			/* we're in websearch_to_tsquery() */
};
38
39
40/*
41 * Initializes parser for the input string. If oprisdelim is set, the
42 * following characters are treated as delimiters in addition to whitespace:
43 * ! | & ( )
44 */
45TSVectorParseState
46init_tsvector_parser(char *input, int flags)
47{
48 TSVectorParseState state;
49
50 state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
51 state->prsbuf = input;
52 state->bufstart = input;
53 state->len = 32;
54 state->word = (char *) palloc(state->len);
55 state->eml = pg_database_encoding_max_length();
56 state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
57 state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
58 state->is_web = (flags & P_TSV_IS_WEB) != 0;
59
60 return state;
61}
62
/*
 * Reinitializes parser to parse 'input', instead of previous input.
 *
 * Only the scan position (prsbuf) is changed.  bufstart, which is the string
 * quoted in error messages, keeps the value given to init_tsvector_parser(),
 * and the word buffer and the flags are left untouched.
 */
void
reset_tsvector_parser(TSVectorParseState state, char *input)
{
	state->prsbuf = input;
}
71
/*
 * Shuts down a tsvector parser.
 *
 * Frees the word buffer and then the state struct itself, so 'state' must
 * not be used afterwards.  Token pointers handed out by gettoken_tsvector()
 * point into the word buffer and therefore become invalid as well.  The
 * input string is not freed here.
 */
void
close_tsvector_parser(TSVectorParseState state)
{
	/* free the member first; it is unreachable once 'state' is gone */
	pfree(state->word);
	pfree(state);
}
81
/*
 * Make sure 'word' has room for one more character.
 *
 * Expects 'state' and 'curpos' to be in scope at the point of use.  If fewer
 * than eml + 1 bytes remain after curpos, the buffer is doubled and curpos is
 * re-pointed at the same offset within the (possibly moved) buffer.
 */
#define RESIZEPRSBUF \
do { \
	const int	used_bytes = (int) (curpos - state->word); \
	if (state->len - used_bytes <= state->eml) \
	{ \
		state->len += state->len; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used_bytes; \
	} \
} while (0)
93
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Usable only inside gettoken_tsvector: it refers to that function's
 * parameters and locals by name and expands to a 'return' statement.
 *
 * - If pos_ptr is not NULL, the position array and its element count are
 *   handed to the caller.  Note that poslen is dereferenced without a NULL
 *   check in that case, so it must be supplied together with pos_ptr.
 * - If pos_ptr is NULL, any position array that was collected is freed.
 * - strval, lenval and endptr are each filled only if not NULL.  *strval
 *   points at the parser's own word buffer (not a copy), *lenval is the
 *   number of bytes stored in it, and *endptr is the current scan position.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
113
114
/*
 * State codes used in gettoken_tsvector.  The numeric values are shown in
 * the "unrecognized state" error message, so they are spelled out.
 */
enum
{
	WAITWORD = 1,				/* looking for the start of the next token */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* after a backslash: take next char as-is */
	WAITENDCMPLX = 4,			/* inside a single-quoted word */
	WAITPOSINFO = 5,			/* quoted word done: ':' or end of token */
	INPOSINFO = 6,				/* expecting a position number */
	WAITPOSDELIM = 7,			/* after a position: ',', weight, or end */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};
124
125#define PRSSYNTAXERROR prssyntaxerror(state)
126
127static void
128prssyntaxerror(TSVectorParseState state)
129{
130 ereport(ERROR,
131 (errcode(ERRCODE_SYNTAX_ERROR),
132 state->is_tsquery ?
133 errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
134 errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
135}
136
137
138/*
139 * Get next token from string being parsed. Returns true if successful,
140 * false if end of input string is reached. On success, these output
141 * parameters are filled in:
142 *
143 * *strval pointer to token
144 * *lenval length of *strval
145 * *pos_ptr pointer to a palloc'd array of positions and weights
146 * associated with the token. If the caller is not interested
147 * in the information, NULL can be supplied. Otherwise
148 * the caller is responsible for pfreeing the array.
149 * *poslen number of elements in *pos_ptr
150 * *endptr scan resumption point
151 *
152 * Pass NULL for unwanted output parameters.
153 */
154bool
155gettoken_tsvector(TSVectorParseState state,
156 char **strval, int *lenval,
157 WordEntryPos **pos_ptr, int *poslen,
158 char **endptr)
159{
160 int oldstate = 0;
161 char *curpos = state->word;
162 int statecode = WAITWORD;
163
164 /*
165 * pos is for collecting the comma delimited list of positions followed by
166 * the actual token.
167 */
168 WordEntryPos *pos = NULL;
169 int npos = 0; /* elements of pos used */
170 int posalen = 0; /* allocated size of pos */
171
172 while (1)
173 {
174 if (statecode == WAITWORD)
175 {
176 if (*(state->prsbuf) == '\0')
177 return false;
178 else if (!state->is_web && t_iseq(state->prsbuf, '\''))
179 statecode = WAITENDCMPLX;
180 else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
181 {
182 statecode = WAITNEXTCHAR;
183 oldstate = WAITENDWORD;
184 }
185 else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
186 (state->is_web && t_iseq(state->prsbuf, '"')))
187 PRSSYNTAXERROR;
188 else if (!t_isspace(state->prsbuf))
189 {
190 COPYCHAR(curpos, state->prsbuf);
191 curpos += pg_mblen(state->prsbuf);
192 statecode = WAITENDWORD;
193 }
194 }
195 else if (statecode == WAITNEXTCHAR)
196 {
197 if (*(state->prsbuf) == '\0')
198 ereport(ERROR,
199 (errcode(ERRCODE_SYNTAX_ERROR),
200 errmsg("there is no escaped character: \"%s\"",
201 state->bufstart)));
202 else
203 {
204 RESIZEPRSBUF;
205 COPYCHAR(curpos, state->prsbuf);
206 curpos += pg_mblen(state->prsbuf);
207 Assert(oldstate != 0);
208 statecode = oldstate;
209 }
210 }
211 else if (statecode == WAITENDWORD)
212 {
213 if (!state->is_web && t_iseq(state->prsbuf, '\\'))
214 {
215 statecode = WAITNEXTCHAR;
216 oldstate = WAITENDWORD;
217 }
218 else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
219 (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
220 (state->is_web && t_iseq(state->prsbuf, '"')))
221 {
222 RESIZEPRSBUF;
223 if (curpos == state->word)
224 PRSSYNTAXERROR;
225 *(curpos) = '\0';
226 RETURN_TOKEN;
227 }
228 else if (t_iseq(state->prsbuf, ':'))
229 {
230 if (curpos == state->word)
231 PRSSYNTAXERROR;
232 *(curpos) = '\0';
233 if (state->oprisdelim)
234 RETURN_TOKEN;
235 else
236 statecode = INPOSINFO;
237 }
238 else
239 {
240 RESIZEPRSBUF;
241 COPYCHAR(curpos, state->prsbuf);
242 curpos += pg_mblen(state->prsbuf);
243 }
244 }
245 else if (statecode == WAITENDCMPLX)
246 {
247 if (!state->is_web && t_iseq(state->prsbuf, '\''))
248 {
249 statecode = WAITCHARCMPLX;
250 }
251 else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
252 {
253 statecode = WAITNEXTCHAR;
254 oldstate = WAITENDCMPLX;
255 }
256 else if (*(state->prsbuf) == '\0')
257 PRSSYNTAXERROR;
258 else
259 {
260 RESIZEPRSBUF;
261 COPYCHAR(curpos, state->prsbuf);
262 curpos += pg_mblen(state->prsbuf);
263 }
264 }
265 else if (statecode == WAITCHARCMPLX)
266 {
267 if (!state->is_web && t_iseq(state->prsbuf, '\''))
268 {
269 RESIZEPRSBUF;
270 COPYCHAR(curpos, state->prsbuf);
271 curpos += pg_mblen(state->prsbuf);
272 statecode = WAITENDCMPLX;
273 }
274 else
275 {
276 RESIZEPRSBUF;
277 *(curpos) = '\0';
278 if (curpos == state->word)
279 PRSSYNTAXERROR;
280 if (state->oprisdelim)
281 {
282 /* state->prsbuf+=pg_mblen(state->prsbuf); */
283 RETURN_TOKEN;
284 }
285 else
286 statecode = WAITPOSINFO;
287 continue; /* recheck current character */
288 }
289 }
290 else if (statecode == WAITPOSINFO)
291 {
292 if (t_iseq(state->prsbuf, ':'))
293 statecode = INPOSINFO;
294 else
295 RETURN_TOKEN;
296 }
297 else if (statecode == INPOSINFO)
298 {
299 if (t_isdigit(state->prsbuf))
300 {
301 if (posalen == 0)
302 {
303 posalen = 4;
304 pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
305 npos = 0;
306 }
307 else if (npos + 1 >= posalen)
308 {
309 posalen *= 2;
310 pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
311 }
312 npos++;
313 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
314 /* we cannot get here in tsquery, so no need for 2 errmsgs */
315 if (WEP_GETPOS(pos[npos - 1]) == 0)
316 ereport(ERROR,
317 (errcode(ERRCODE_SYNTAX_ERROR),
318 errmsg("wrong position info in tsvector: \"%s\"",
319 state->bufstart)));
320 WEP_SETWEIGHT(pos[npos - 1], 0);
321 statecode = WAITPOSDELIM;
322 }
323 else
324 PRSSYNTAXERROR;
325 }
326 else if (statecode == WAITPOSDELIM)
327 {
328 if (t_iseq(state->prsbuf, ','))
329 statecode = INPOSINFO;
330 else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
331 {
332 if (WEP_GETWEIGHT(pos[npos - 1]))
333 PRSSYNTAXERROR;
334 WEP_SETWEIGHT(pos[npos - 1], 3);
335 }
336 else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
337 {
338 if (WEP_GETWEIGHT(pos[npos - 1]))
339 PRSSYNTAXERROR;
340 WEP_SETWEIGHT(pos[npos - 1], 2);
341 }
342 else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
343 {
344 if (WEP_GETWEIGHT(pos[npos - 1]))
345 PRSSYNTAXERROR;
346 WEP_SETWEIGHT(pos[npos - 1], 1);
347 }
348 else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
349 {
350 if (WEP_GETWEIGHT(pos[npos - 1]))
351 PRSSYNTAXERROR;
352 WEP_SETWEIGHT(pos[npos - 1], 0);
353 }
354 else if (t_isspace(state->prsbuf) ||
355 *(state->prsbuf) == '\0')
356 RETURN_TOKEN;
357 else if (!t_isdigit(state->prsbuf))
358 PRSSYNTAXERROR;
359 }
360 else /* internal error */
361 elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
362 statecode);
363
364 /* get next char */
365 state->prsbuf += pg_mblen(state->prsbuf);
366 }
367}
368