encode.c source code [PostgreSQL/src/backend/utils/adt/encode.c]

/*-------------------------------------------------------------------------
 *
 * encode.c
 *	  Various data encoding/decoding things.
 *
 * Copyright (c) 2001-2019, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/backend/utils/adt/encode.c
 *
 *-------------------------------------------------------------------------
 */
14	#include "postgres.h"
15
16	#include <ctype.h>
17
18	#include "utils/builtins.h"
19
20
/*
 * Dispatch table for one encoding.
 *
 * The *_len members return an estimate of the output size for dlen input
 * bytes; callers allocate that many bytes and treat a larger actual result
 * as a fatal overflow, so the estimate must never be too small.  The
 * encode/decode members write their output to res and return the number of
 * bytes actually written.
 */
struct pg_encoding
{
	unsigned	(*encode_len) (const char *data, unsigned dlen);
	unsigned	(*decode_len) (const char *data, unsigned dlen);
	unsigned	(*encode) (const char *data, unsigned dlen, char *res);
	unsigned	(*decode) (const char *data, unsigned dlen, char *res);
};
28
/* Look up an encoding by name; returns NULL if the name is not recognized. */
static const struct pg_encoding *pg_find_encoding(const char *name);
30
31	/*
32	* SQL functions.
33	*/
34
35	Datum
36	binary_encode(PG_FUNCTION_ARGS)
37	{
38	bytea *data = PG_GETARG_BYTEA_PP(`0`);
39	Datum name = PG_GETARG_DATUM(`1`);
40	text *result;
41	char *namebuf;
42	int datalen,
43	resultlen,
44	res;
45	const struct pg_encoding *enc;
46
47	datalen = VARSIZE_ANY_EXHDR(data);
48
49	namebuf = TextDatumGetCString(name);
50
51	enc = pg_find_encoding(namebuf);
52	if (enc == NULL)
53	ereport(ERROR,
54	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
55	errmsg("unrecognized encoding: \"%s\"", namebuf)));
56
57	resultlen = enc->encode_len(VARDATA_ANY(data), datalen);
58	result = palloc(VARHDRSZ + resultlen);
59
60	res = enc->encode(VARDATA_ANY(data), datalen, VARDATA(result));
61
62	/ Make this FATAL 'cause we've trodden on memory ... /
63	if (res > resultlen)
64	elog(FATAL, "overflow - encode estimate too small");
65
66	SET_VARSIZE(result, VARHDRSZ + res);
67
68	PG_RETURN_TEXT_P(result);
69	}
70
71	Datum
72	binary_decode(PG_FUNCTION_ARGS)
73	{
74	text *data = PG_GETARG_TEXT_PP(`0`);
75	Datum name = PG_GETARG_DATUM(`1`);
76	bytea *result;
77	char *namebuf;
78	int datalen,
79	resultlen,
80	res;
81	const struct pg_encoding *enc;
82
83	datalen = VARSIZE_ANY_EXHDR(data);
84
85	namebuf = TextDatumGetCString(name);
86
87	enc = pg_find_encoding(namebuf);
88	if (enc == NULL)
89	ereport(ERROR,
90	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
91	errmsg("unrecognized encoding: \"%s\"", namebuf)));
92
93	resultlen = enc->decode_len(VARDATA_ANY(data), datalen);
94	result = palloc(VARHDRSZ + resultlen);
95
96	res = enc->decode(VARDATA_ANY(data), datalen, VARDATA(result));
97
98	/ Make this FATAL 'cause we've trodden on memory ... /
99	if (res > resultlen)
100	elog(FATAL, "overflow - decode estimate too small");
101
102	SET_VARSIZE(result, VARHDRSZ + res);
103
104	PG_RETURN_BYTEA_P(result);
105	}
106
107
108	/*
109	* HEX
110	*/
111
112	static const char hextbl[] = "0123456789abcdef";
113
114	static const int8 hexlookup[`128`] = {
115	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
116	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
117	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
118	`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
119	-`1`, `10`, `11`, `12`, `13`, `14`, `15`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
120	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
121	-`1`, `10`, `11`, `12`, `13`, `14`, `15`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
122	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
123	};
124
125	unsigned
126	hex_encode(const char src, unsigned* len, char *dst)
127	{
128	const char *end = src + len;
129
130	while (src < end)
131	{
132	dst++ = hextbl[(src >> `4`) & `0xF`];
133	dst++ = hextbl[src & `0xF`];
134	src++;
135	}
136	return len * `2`;
137	}
138
139	static inline char
140	get_hex(char c)
141	{
142	int res = -`1`;
143
144	if (c > `0` && c < `127`)
145	res = hexlookup[(unsigned char) c];
146
147	if (res < `0`)
148	ereport(ERROR,
149	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
150	errmsg("invalid hexadecimal digit: \"%c\"", c)));
151
152	return (char) res;
153	}
154
155	unsigned
156	hex_decode(const char src, unsigned* len, char *dst)
157	{
158	const char *s,
159	*srcend;
160	char v1,
161	v2,
162	*p;
163
164	srcend = src + len;
165	s = src;
166	p = dst;
167	while (s < srcend)
168	{
169	if (s == `' '` \|\| s == `'\n'` \|\| s == `'\t'` \|\| s == `'\r'`)
170	{
171	s++;
172	continue;
173	}
174	v1 = get_hex(*s++) << `4`;
175	if (s >= srcend)
176	ereport(ERROR,
177	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
178	errmsg("invalid hexadecimal data: odd number of digits")));
179
180	v2 = get_hex(*s++);
181	*p++ = v1 \| v2;
182	}
183
184	return p - dst;
185	}
186
/*
 * hex_enc_len
 *		Output size of hex_encode() for srclen input bytes: two digits per
 *		byte.  The src argument is unused.
 */
static unsigned
hex_enc_len(const char *src, unsigned srclen)
{
	(void) src;
	return srclen << 1;
}
192
/*
 * hex_dec_len
 *		Output size estimate for hex_decode() given srclen input characters:
 *		one byte per two characters.  The src argument is unused.
 */
static unsigned
hex_dec_len(const char *src, unsigned srclen)
{
	(void) src;
	return srclen >> 1;
}
198
199	/*
200	* BASE64
201	*/
202
203	static const char _base64[] =
204	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
205
206	static const int8 b64lookup[`128`] = {
207	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
208	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
209	-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `62`, -`1`, -`1`, -`1`, `63`,
210	`52`, `53`, `54`, `55`, `56`, `57`, `58`, `59`, `60`, `61`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`,
211	-`1`, `0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`,
212	`15`, `16`, `17`, `18`, `19`, `20`, `21`, `22`, `23`, `24`, `25`, -`1`, -`1`, -`1`, -`1`, -`1`,
213	-`1`, `26`, `27`, `28`, `29`, `30`, `31`, `32`, `33`, `34`, `35`, `36`, `37`, `38`, `39`, `40`,
214	`41`, `42`, `43`, `44`, `45`, `46`, `47`, `48`, `49`, `50`, `51`, -`1`, -`1`, -`1`, -`1`, -`1`,
215	};
216
217	static unsigned
218	pg_base64_encode(const char src, unsigned* len, char *dst)
219	{
220	char *p,
221	*lend = dst + `76`;
222	const char *s,
223	*end = src + len;
224	int pos = `2`;
225	uint32 buf = `0`;
226
227	s = src;
228	p = dst;
229
230	while (s < end)
231	{
232	buf \|= (unsigned char) *s << (pos << `3`);
233	pos--;
234	s++;
235
236	/ write it out /
237	if (pos < `0`)
238	{
239	*p++ = _base64[(buf >> `18`) & `0x3f`];
240	*p++ = _base64[(buf >> `12`) & `0x3f`];
241	*p++ = _base64[(buf >> `6`) & `0x3f`];
242	*p++ = _base64[buf & `0x3f`];
243
244	pos = `2`;
245	buf = `0`;
246	}
247	if (p >= lend)
248	{
249	*p++ = `'\n'`;
250	lend = p + `76`;
251	}
252	}
253	if (pos != `2`)
254	{
255	*p++ = _base64[(buf >> `18`) & `0x3f`];
256	*p++ = _base64[(buf >> `12`) & `0x3f`];
257	*p++ = (pos == `0`) ? _base64[(buf >> `6`) & `0x3f`] : `'='`;
258	*p++ = `'='`;
259	}
260
261	return p - dst;
262	}
263
264	static unsigned
265	pg_base64_decode(const char src, unsigned* len, char *dst)
266	{
267	const char *srcend = src + len,
268	*s = src;
269	char *p = dst;
270	char c;
271	int b = `0`;
272	uint32 buf = `0`;
273	int pos = `0`,
274	end = `0`;
275
276	while (s < srcend)
277	{
278	c = *s++;
279
280	if (c == `' '` \|\| c == `'\t'` \|\| c == `'\n'` \|\| c == `'\r'`)
281	continue;
282
283	if (c == `'='`)
284	{
285	/ end sequence /
286	if (!end)
287	{
288	if (pos == `2`)
289	end = `1`;
290	else if (pos == `3`)
291	end = `2`;
292	else
293	ereport(ERROR,
294	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
295	errmsg("unexpected \"=\" while decoding base64 sequence")));
296	}
297	b = `0`;
298	}
299	else
300	{
301	b = -`1`;
302	if (c > `0` && c < `127`)
303	b = b64lookup[(unsigned char) c];
304	if (b < `0`)
305	ereport(ERROR,
306	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
307	errmsg("invalid symbol \"%c\" while decoding base64 sequence", (int) c)));
308	}
309	/ add it to buffer /
310	buf = (buf << `6`) + b;
311	pos++;
312	if (pos == `4`)
313	{
314	*p++ = (buf >> `16`) & `255`;
315	if (end == `0` \|\| end > `1`)
316	*p++ = (buf >> `8`) & `255`;
317	if (end == `0` \|\| end > `2`)
318	*p++ = buf & `255`;
319	buf = `0`;
320	pos = `0`;
321	}
322	}
323
324	if (pos != `0`)
325	ereport(ERROR,
326	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
327	errmsg("invalid base64 end sequence"),
328	errhint("Input data is missing padding, is truncated, or is otherwise corrupted.")));
329
330	return p - dst;
331	}
332
333
/*
 * pg_base64_enc_len
 *		Output size estimate for pg_base64_encode() given srclen input bytes.
 *		The src argument is unused.
 */
static unsigned
pg_base64_enc_len(const char *src, unsigned srclen)
{
	(void) src;
	/* 3 bytes will be converted to 4, linefeed after 76 chars */
	return (srclen + 2) * 4 / 3 + srclen / (76 * 3 / 4);
}
340
/*
 * pg_base64_dec_len
 *		Output size estimate for pg_base64_decode() given srclen input
 *		characters: three bytes per four characters.  The src argument is
 *		unused.
 */
static unsigned
pg_base64_dec_len(const char *src, unsigned srclen)
{
	(void) src;
	return (srclen * 3) >> 2;
}
346
347	/*
348	* Escape
349	* Minimally escape bytea to text.
350	* De-escape text to bytea.
351	*
352	* We must escape zero bytes and high-bit-set bytes to avoid generating
353	* text that might be invalid in the current encoding, or that might
354	* change to something else if passed through an encoding conversion
355	* (leading to failing to de-escape to the original bytea value).
356	* Also of course backslash itself has to be escaped.
357	*
358	* De-escaping processes \\ and any \### octal
359	*/
360
/* VAL: numeric value of a digit character, e.g. '7' -> 7. */
#define VAL(CH)			((CH) - '0')
/* DIG: digit character for a small number, e.g. 7 -> '7'. */
#define DIG(N)			((N) + '0')
363
/*
 * esc_encode
 *		Escape srclen bytes at src into text at dst.
 *
 * Zero bytes and bytes with the high bit set become a backslash followed by
 * three octal digits; a backslash becomes two backslashes; every other byte
 * is copied unchanged.
 *
 * Returns the number of bytes written to dst.
 */
static unsigned
esc_encode(const char *src, unsigned srclen, char *dst)
{
	const char *end = src + srclen;
	char	   *rp = dst;
	int			len = 0;

	while (src < end)
	{
		unsigned char c = (unsigned char) *src;

		if (c == '\0' || IS_HIGHBIT_SET(c))
		{
			rp[0] = '\\';
			rp[1] = DIG(c >> 6);
			rp[2] = DIG((c >> 3) & 7);
			rp[3] = DIG(c & 7);
			rp += 4;
			len += 4;
		}
		else if (c == '\\')
		{
			rp[0] = '\\';
			rp[1] = '\\';
			rp += 2;
			len += 2;
		}
		else
		{
			*rp++ = c;
			len++;
		}

		src++;
	}

	return len;
}
402
403	static unsigned
404	esc_decode(const char src, unsigned* srclen, char *dst)
405	{
406	const char *end = src + srclen;
407	char *rp = dst;
408	int len = `0`;
409
410	while (src < end)
411	{
412	if (src[`0`] != `'\\'`)
413	rp++ = src++;
414	else if (src + `3` < end &&
415	(src[`1`] >= `'0'` && src[`1`] <= `'3'`) &&
416	(src[`2`] >= `'0'` && src[`2`] <= `'7'`) &&
417	(src[`3`] >= `'0'` && src[`3`] <= `'7'`))
418	{
419	int val;
420
421	val = VAL(src[`1`]);
422	val <<= `3`;
423	val += VAL(src[`2`]);
424	val <<= `3`;
425	*rp++ = val + VAL(src[`3`]);
426	src += `4`;
427	}
428	else if (src + `1` < end &&
429	(src[`1`] == `'\\'`))
430	{
431	*rp++ = `'\\'`;
432	src += `2`;
433	}
434	else
435	{
436	/*
437	* One backslash, not followed by ### valid octal. Should never
438	* get here, since esc_dec_len does same check.
439	*/
440	ereport(ERROR,
441	(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
442	errmsg("invalid input syntax for type %s", "bytea")));
443	}
444
445	len++;
446	}
447
448	return len;
449	}
450
/*
 * esc_enc_len
 *		Number of bytes esc_encode() will produce for the srclen bytes at src:
 *		4 for a zero or high-bit-set byte, 2 for a backslash, 1 otherwise.
 */
static unsigned
esc_enc_len(const char *src, unsigned srclen)
{
	const char *end = src + srclen;
	int			len = 0;

	while (src < end)
	{
		if (*src == '\0' || IS_HIGHBIT_SET(*src))
			len += 4;
		else if (*src == '\\')
			len += 2;
		else
			len++;

		src++;
	}

	return len;
}
471
472	static unsigned
473	esc_dec_len(const char src, unsigned* srclen)
474	{
475	const char *end = src + srclen;
476	int len = `0`;
477
478	while (src < end)
479	{
480	if (src[`0`] != `'\\'`)
481	src++;
482	else if (src + `3` < end &&
483	(src[`1`] >= `'0'` && src[`1`] <= `'3'`) &&
484	(src[`2`] >= `'0'` && src[`2`] <= `'7'`) &&
485	(src[`3`] >= `'0'` && src[`3`] <= `'7'`))
486	{
487	/*
488	* backslash + valid octal
489	*/
490	src += `4`;
491	}
492	else if (src + `1` < end &&
493	(src[`1`] == `'\\'`))
494	{
495	/*
496	* two backslashes = backslash
497	*/
498	src += `2`;
499	}
500	else
501	{
502	/*
503	* one backslash, not followed by ### valid octal
504	*/
505	ereport(ERROR,
506	(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
507	errmsg("invalid input syntax for type %s", "bytea")));
508	}
509
510	len++;
511	}
512	return len;
513	}
514
515	/*
516	* Common
517	*/
518
519	static const struct
520	{
521	const char *name;
522	struct pg_encoding enc;
523	} enclist[] =
524
525	{
526	{
527	"hex",
528	{
529	hex_enc_len, hex_dec_len, hex_encode, hex_decode
530	}
531	},
532	{
533	"base64",
534	{
535	pg_base64_enc_len, pg_base64_dec_len, pg_base64_encode, pg_base64_decode
536	}
537	},
538	{
539	"escape",
540	{
541	esc_enc_len, esc_dec_len, esc_encode, esc_decode
542	}
543	},
544	{
545	NULL,
546	{
547	NULL, NULL, NULL, NULL
548	}
549	}
550	};
551
552	static const struct pg_encoding *
553	pg_find_encoding(const char *name)
554	{
555	int i;
556
557	for (i = `0`; enclist[i].name; i++)
558	if (pg_strcasecmp(enclist[i].name, name) == `0`)
559	return &enclist[i].enc;
560
561	return NULL;
562	}
563

Browse the source code of PostgreSQL/src/backend/utils/adt/encode.c