read.c source code [PostgreSQL/src/backend/nodes/read.c]

/*-------------------------------------------------------------------------
 *
 * read.c
 *	  routines to convert a string (legal ascii representation of node) back
 *	  to nodes
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/nodes/read.c
 *
 * HISTORY
 *	  AUTHOR			DATE			MAJOR EVENT
 *	  Andrew Yu			Nov 2, 1994		file creation
 *
 *-------------------------------------------------------------------------
 */
20	#include "postgres.h"
21
22	#include <ctype.h>
23
24	#include "common/string.h"
25	#include "nodes/pg_list.h"
26	#include "nodes/readfuncs.h"
27	#include "nodes/value.h"
28
29
/*
 * Static state for pg_strtok: the position in the input string at which the
 * next token scan will begin.  It is NULL until a string has been installed.
 */
static const char *pg_strtok_ptr = NULL;

/*
 * State flag that determines how readfuncs.c should treat location fields.
 * Only compiled in when WRITE_READ_PARSE_PLAN_TREES is defined.
 */
#ifdef WRITE_READ_PARSE_PLAN_TREES
bool		restore_location_fields = false;
#endif
37
38
39	/*
40	* stringToNode -
41	* builds a Node tree from its string representation (assumed valid)
42	*
43	* restore_loc_fields instructs readfuncs.c whether to restore location
44	* fields rather than set them to -1. This is currently only supported
45	* in builds with the WRITE_READ_PARSE_PLAN_TREES debugging flag set.
46	*/
47	static void *
48	stringToNodeInternal(const char *str, bool restore_loc_fields)
49	{
50	void *retval;
51	const char *save_strtok;
52	#ifdef WRITE_READ_PARSE_PLAN_TREES
53	bool save_restore_location_fields;
54	#endif
55
56	/*
57	* We save and restore the pre-existing state of pg_strtok. This makes the
58	* world safe for re-entrant invocation of stringToNode, without incurring
59	* a lot of notational overhead by having to pass the next-character
60	* pointer around through all the readfuncs.c code.
61	*/
62	save_strtok = pg_strtok_ptr;
63
64	pg_strtok_ptr = str; / point pg_strtok at the string to read /
65
66	/*
67	* If enabled, likewise save/restore the location field handling flag.
68	*/
69	#ifdef WRITE_READ_PARSE_PLAN_TREES
70	save_restore_location_fields = restore_location_fields;
71	restore_location_fields = restore_loc_fields;
72	#endif
73
74	retval = nodeRead(NULL, `0`); / do the reading /
75
76	pg_strtok_ptr = save_strtok;
77
78	#ifdef WRITE_READ_PARSE_PLAN_TREES
79	restore_location_fields = save_restore_location_fields;
80	#endif
81
82	return retval;
83	}
84
/*
 * Externally visible entry points
 */

/*
 * stringToNode -
 *	  parse str into a Node tree, asking the internal worker not to
 *	  restore location fields.
 */
void *
stringToNode(const char *str)
{
	void	   *tree;

	tree = stringToNodeInternal(str, false);

	return tree;
}
93
#ifdef WRITE_READ_PARSE_PLAN_TREES

/*
 * stringToNodeWithLocations -
 *	  same as stringToNode, except that the internal worker is asked to
 *	  restore location fields.  Only available in builds that define
 *	  WRITE_READ_PARSE_PLAN_TREES.
 */
void *
stringToNodeWithLocations(const char *str)
{
	void	   *tree;

	tree = stringToNodeInternal(str, true);

	return tree;
}

#endif
103
104
105	/*****************************************************************************
106	*
107	* the lisp token parser
108	*
109	*****************************************************************************/
110
111	/*
112	* pg_strtok --- retrieve next "token" from a string.
113	*
114	* Works kinda like strtok, except it never modifies the source string.
115	* (Instead of storing nulls into the string, the length of the token
116	* is returned to the caller.)
117	* Also, the rules about what is a token are hard-wired rather than being
118	* configured by passing a set of terminating characters.
119	*
120	* The string is assumed to have been initialized already by stringToNode.
121	*
122	* The rules for tokens are:
123	* * Whitespace (space, tab, newline) always separates tokens.
124	* * The characters '(', ')', '{', '}' form individual tokens even
125	* without any whitespace around them.
126	* * Otherwise, a token is all the characters up to the next whitespace
127	* or occurrence of one of the four special characters.
128	* * A backslash '\' can be used to quote whitespace or one of the four
129	* special characters, so that it is treated as a plain token character.
130	* Backslashes themselves must also be backslashed for consistency.
131	* Any other character can be, but need not be, backslashed as well.
132	* * If the resulting token is '<>' (with no backslash), it is returned
133	* as a non-NULL pointer to the token but with length == 0. Note that
134	* there is no other way to get a zero-length token.
135	*
136	* Returns a pointer to the start of the next token, and the length of the
137	* token (including any embedded backslashes!) in *length. If there are
138	* no more tokens, NULL and 0 are returned.
139	*
140	* NOTE: this routine doesn't remove backslashes; the caller must do so
141	* if necessary (see "debackslash").
142	*
143	* NOTE: prior to release 7.0, this routine also had a special case to treat
144	* a token starting with '"' as extending to the next '"'. This code was
145	* broken, however, since it would fail to cope with a string containing an
146	* embedded '"'. I have therefore removed this special case, and instead
147	* introduced rules for using backslashes to quote characters. Higher-level
148	* code should add backslashes to a string constant to ensure it is treated
149	* as a single token.
150	*/
151	const char *
152	pg_strtok(int *length)
153	{
154	const char local_str; /* working pointer to string /
155	const char ret_str; /* start of token to return /
156
157	local_str = pg_strtok_ptr;
158
159	while (local_str == `' '` \|\| local_str == `'\n'` \|\| *local_str == `'\t'`)
160	local_str++;
161
162	if (*local_str == `'\0'`)
163	{
164	*length = `0`;
165	pg_strtok_ptr = local_str;
166	return NULL; / no more tokens /
167	}
168
169	/*
170	* Now pointing at start of next token.
171	*/
172	ret_str = local_str;
173
174	if (local_str == `'('` \|\| local_str == `')'` \|\|
175	local_str == `'{'` \|\| local_str == `'}'`)
176	{
177	/ special 1-character token /
178	local_str++;
179	}
180	else
181	{
182	/ Normal token, possibly containing backslashes /
183	while (*local_str != `'\0'` &&
184	local_str != `' '` && local_str != `'\n'` &&
185	*local_str != `'\t'` &&
186	local_str != `'('` && local_str != `')'` &&
187	local_str != `'{'` && local_str != `'}'`)
188	{
189	if (*local_str == `'\\'` && local_str[`1`] != `'\0'`)
190	local_str += `2`;
191	else
192	local_str++;
193	}
194	}
195
196	*length = local_str - ret_str;
197
198	/ Recognize special case for "empty" token /
199	if (*length == `2` && ret_str[`0`] == `'<'` && ret_str[`1`] == `'>'`)
200	*length = `0`;
201
202	pg_strtok_ptr = local_str;
203
204	return ret_str;
205	}
206
/*
 * debackslash -
 *	  create a palloc'd string holding the given token.
 *	  any protective backslashes in the token are removed.
 *
 * token points at the first character of the token and length is the number
 * of bytes to examine; the token need not be null-terminated.
 *
 * Returns a null-terminated copy allocated with palloc.
 */
char *
debackslash(const char *token, int length)
{
	/* The result can never be longer than the input, plus the terminator */
	char	   *result = palloc(length + 1);
	char	   *ptr = result;

	while (length > 0)
	{
		/*
		 * Skip a backslash and copy the character after it verbatim.  A
		 * backslash that is the very last byte is copied as-is.
		 */
		if (*token == '\\' && length > 1)
			token++, length--;
		*ptr++ = *token++;
		length--;
	}
	*ptr = '\0';
	return result;
}
228
229	#define RIGHT_PAREN (1000000 + 1)
230	#define LEFT_PAREN (1000000 + 2)
231	#define LEFT_BRACE (1000000 + 3)
232	#define OTHER_TOKEN (1000000 + 4)
233
234	/*
235	* nodeTokenType -
236	* returns the type of the node token contained in token.
237	* It returns one of the following valid NodeTags:
238	* T_Integer, T_Float, T_String, T_BitString
239	* and some of its own:
240	* RIGHT_PAREN, LEFT_PAREN, LEFT_BRACE, OTHER_TOKEN
241	*
242	* Assumption: the ascii representation is legal
243	*/
244	static NodeTag
245	nodeTokenType(const char token, int* length)
246	{
247	NodeTag retval;
248	const char *numptr;
249	int numlen;
250
251	/*
252	* Check if the token is a number
253	*/
254	numptr = token;
255	numlen = length;
256	if (numptr == `'+'` \|\| numptr == `'-'`)
257	numptr++, numlen--;
258	if ((numlen > `0` && isdigit((unsigned char) *numptr)) \|\|
259	(numlen > `1` && numptr == `'.'` && isdigit((unsigned* char) numptr[`1`])))
260	{
261	/*
262	* Yes. Figure out whether it is integral or float; this requires
263	* both a syntax check and a range check. strtoint() can do both for
264	* us. We know the token will end at a character that strtoint will
265	* stop at, so we do not need to modify the string.
266	*/
267	char *endptr;
268
269	errno = `0`;
270	(void) strtoint(token, &endptr, `10`);
271	if (endptr != token + length \|\| errno == ERANGE)
272	return T_Float;
273	return T_Integer;
274	}
275
276	/*
277	* these three cases do not need length checks, since pg_strtok() will
278	* always treat them as single-byte tokens
279	*/
280	else if (*token == `'('`)
281	retval = LEFT_PAREN;
282	else if (*token == `')'`)
283	retval = RIGHT_PAREN;
284	else if (*token == `'{'`)
285	retval = LEFT_BRACE;
286	else if (*token == `'"'` && length > `1` && token[length - `1`] == `'"'`)
287	retval = T_String;
288	else if (*token == `'b'`)
289	retval = T_BitString;
290	else
291	retval = OTHER_TOKEN;
292	return retval;
293	}
294
295	/*
296	* nodeRead -
297	* Slightly higher-level reader.
298	*
299	* This routine applies some semantic knowledge on top of the purely
300	* lexical tokenizer pg_strtok(). It can read
301	* * Value token nodes (integers, floats, or strings);
302	* * General nodes (via parseNodeString() from readfuncs.c);
303	* * Lists of the above;
304	* * Lists of integers or OIDs.
305	* The return value is declared void , not Node , to avoid having to
306	* cast it explicitly in callers that assign to fields of different types.
307	*
308	* External callers should always pass NULL/0 for the arguments. Internally
309	* a non-NULL token may be passed when the upper recursion level has already
310	* scanned the first token of a node's representation.
311	*
312	* We assume pg_strtok is already initialized with a string to read (hence
313	* this should only be invoked from within a stringToNode operation).
314	*/
315	void *
316	nodeRead(const char token, int* tok_len)
317	{
318	Node *result;
319	NodeTag type;
320
321	if (token == NULL) / need to read a token? /
322	{
323	token = pg_strtok(&tok_len);
324
325	if (token == NULL) / end of input /
326	return NULL;
327	}
328
329	type = nodeTokenType(token, tok_len);
330
331	switch ((int) type)
332	{
333	case LEFT_BRACE:
334	result = parseNodeString();
335	token = pg_strtok(&tok_len);
336	if (token == NULL \|\| token[`0`] != `'}'`)
337	elog(ERROR, "did not find '}' at end of input node");
338	break;
339	case LEFT_PAREN:
340	{
341	List *l = NIL;
342
343	/----------*
344	* Could be an integer list: (i int int ...)
345	* or an OID list: (o int int ...)
346	* or a list of nodes/values: (node node ...)
347	*----------
348	*/
349	token = pg_strtok(&tok_len);
350	if (token == NULL)
351	elog(ERROR, "unterminated List structure");
352	if (tok_len == `1` && token[`0`] == `'i'`)
353	{
354	/ List of integers /
355	for (;;)
356	{
357	int val;
358	char *endptr;
359
360	token = pg_strtok(&tok_len);
361	if (token == NULL)
362	elog(ERROR, "unterminated List structure");
363	if (token[`0`] == `')'`)
364	break;
365	val = (int) strtol(token, &endptr, `10`);
366	if (endptr != token + tok_len)
367	elog(ERROR, "unrecognized integer: \"%.*s\"",
368	tok_len, token);
369	l = lappend_int(l, val);
370	}
371	}
372	else if (tok_len == `1` && token[`0`] == `'o'`)
373	{
374	/ List of OIDs /
375	for (;;)
376	{
377	Oid val;
378	char *endptr;
379
380	token = pg_strtok(&tok_len);
381	if (token == NULL)
382	elog(ERROR, "unterminated List structure");
383	if (token[`0`] == `')'`)
384	break;
385	val = (Oid) strtoul(token, &endptr, `10`);
386	if (endptr != token + tok_len)
387	elog(ERROR, "unrecognized OID: \"%.*s\"",
388	tok_len, token);
389	l = lappend_oid(l, val);
390	}
391	}
392	else
393	{
394	/ List of other node types /
395	for (;;)
396	{
397	/ We have already scanned next token... /
398	if (token[`0`] == `')'`)
399	break;
400	l = lappend(l, nodeRead(token, tok_len));
401	token = pg_strtok(&tok_len);
402	if (token == NULL)
403	elog(ERROR, "unterminated List structure");
404	}
405	}
406	result = (Node *) l;
407	break;
408	}
409	case RIGHT_PAREN:
410	elog(ERROR, "unexpected right parenthesis");
411	result = NULL; / keep compiler happy /
412	break;
413	case OTHER_TOKEN:
414	if (tok_len == `0`)
415	{
416	/ must be "<>" --- represents a null pointer /
417	result = NULL;
418	}
419	else
420	{
421	elog(ERROR, "unrecognized token: \"%.*s\"", tok_len, token);
422	result = NULL; / keep compiler happy /
423	}
424	break;
425	case T_Integer:
426
427	/*
428	* we know that the token terminates on a char atoi will stop at
429	*/
430	result = (Node *) makeInteger(atoi(token));
431	break;
432	case T_Float:
433	{
434	char fval = (char* *) palloc(tok_len + `1`);
435
436	memcpy(fval, token, tok_len);
437	fval[tok_len] = `'\0'`;
438	result = (Node *) makeFloat(fval);
439	}
440	break;
441	case T_String:
442	/ need to remove leading and trailing quotes, and backslashes /
443	result = (Node *) makeString(debackslash(token + `1`, tok_len - `2`));
444	break;
445	case T_BitString:
446	{
447	char *val = palloc(tok_len);
448
449	/ skip leading 'b' /
450	memcpy(val, token + `1`, tok_len - `1`);
451	val[tok_len - `1`] = `'\0'`;
452	result = (Node *) makeBitString(val);
453	break;
454	}
455	default:
456	elog(ERROR, "unrecognized node type: %d", (int) type);
457	result = NULL; / keep compiler happy /
458	break;
459	}
460
461	return (void *) result;
462	}
463

Browse the source code of PostgreSQL/src/backend/nodes/read.c

Definitions