mbprint.c source code [PostgreSQL/src/fe_utils/mbprint.c]

/*-------------------------------------------------------------------------
 *
 * Multibyte character printing support for frontend code
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/fe_utils/mbprint.c
 *
 *-------------------------------------------------------------------------
 */
13	#include "postgres_fe.h"
14
15	#include "fe_utils/mbprint.h"
16
17	#include "libpq-fe.h"
18
19
/*
 * To avoid version-skew problems, this file must not use declarations
 * from pg_wchar.h: the encoding IDs we are dealing with are determined
 * by the libpq.so we are linked with, and that might not match the
 * numbers we see at compile time.  (If this file were inside libpq,
 * the problem would go away...)
 *
 * Hence, we have our own definition of pg_wchar, and we get the values
 * of any needed encoding IDs on-the-fly.
 */

typedef unsigned int pg_wchar;

/*
 * Return the encoding ID that stands for UTF-8.
 *
 * The ID is obtained from pg_char_to_encoding("utf8") on first use and
 * cached in a function-local static.  The lookup is repeated on later
 * calls only for as long as the cached value is still negative.
 */
static int
pg_get_utf8_id(void)
{
	static int	utf8_id = -1;

	if (utf8_id < 0)
		utf8_id = pg_char_to_encoding("utf8");
	return utf8_id;
}

/* Evaluates to a function call, so it is not a compile-time constant. */
#define PG_UTF8		pg_get_utf8_id()
44
45
46	/*
47	* Convert a UTF-8 character to a Unicode code point.
48	* This is a one-character version of pg_utf2wchar_with_len.
49	*
50	* No error checks here, c must point to a long-enough string.
51	*/
52	static pg_wchar
53	utf8_to_unicode(const unsigned char *c)
54	{
55	if ((*c & `0x80`) == `0`)
56	return (pg_wchar) c[`0`];
57	else if ((*c & `0xe0`) == `0xc0`)
58	return (pg_wchar) (((c[`0`] & `0x1f`) << `6`) \|
59	(c[`1`] & `0x3f`));
60	else if ((*c & `0xf0`) == `0xe0`)
61	return (pg_wchar) (((c[`0`] & `0x0f`) << `12`) \|
62	((c[`1`] & `0x3f`) << `6`) \|
63	(c[`2`] & `0x3f`));
64	else if ((*c & `0xf8`) == `0xf0`)
65	return (pg_wchar) (((c[`0`] & `0x07`) << `18`) \|
66	((c[`1`] & `0x3f`) << `12`) \|
67	((c[`2`] & `0x3f`) << `6`) \|
68	(c[`3`] & `0x3f`));
69	else
70	/ that is an invalid code on purpose /
71	return `0xffffffff`;
72	}
73
74
/*
 * Validate the UTF-8 sequence that starts at c.
 *
 * Returns the length of the sequence in bytes (1..4) if it is acceptable,
 * or -1 if it is not.  A sequence is rejected when:
 *	- the lead byte is not a valid 1- to 4-byte lead byte,
 *	- a continuation byte does not have the form 10xxxxxx,
 *	- the encoding is overlong (a shorter form exists for the value),
 *	- the value is above 0x10ffff,
 *	- the value is a surrogate (0xd800..0xdfff),
 *	- the value is a noncharacter: 0xfdd0..0xfdef, or any value whose
 *	  low 16 bits are 0xfffe or 0xffff.
 *
 * c must point to a NUL-terminated string.  Bytes are examined strictly
 * left to right and checking stops at the first bad one, so the
 * terminating NUL is never read past.
 */
static int
utf_charcheck(const unsigned char *c)
{
	if ((*c & 0x80) == 0)
		return 1;

	if ((*c & 0xe0) == 0xc0)
	{
		/* two-byte char; lead bytes 0xc0 and 0xc1 would be overlong */
		if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
			return 2;
		return -1;
	}

	if ((*c & 0xf0) == 0xe0)
	{
		unsigned int ucs;

		/* three-byte char: continuation bytes and overlong forms */
		if ((c[1] & 0xc0) != 0x80 ||
			((c[0] & 0x0f) == 0x00 && (c[1] & 0x20) != 0x20) ||
			(c[2] & 0xc0) != 0x80)
			return -1;

		/*
		 * Decode the value and test it directly.  The low six bits come
		 * from the third byte, c[2].
		 */
		ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
			((unsigned int) (c[1] & 0x3f) << 6) |
			(unsigned int) (c[2] & 0x3f);

		if ((ucs & 0xfffe) == 0xfffe)	/* 0xfffe and 0xffff */
			return -1;
		if (ucs >= 0xfdd0 && ucs <= 0xfdef) /* noncharacter block */
			return -1;
		if ((ucs & 0xf800) == 0xd800)	/* surrogates 0xd800..0xdfff */
			return -1;
		return 3;
	}

	if ((*c & 0xf8) == 0xf0)
	{
		int			u;

		/* four-byte char */
		if ((c[1] & 0xc0) != 0x80)
			return -1;

		/*
		 * u holds bits 16..20 of the value: zero means the encoding is
		 * overlong, more than 0x10 means the value exceeds 0x10ffff.
		 */
		u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
		if (u == 0x00 || u > 0x10)
			return -1;

		if ((c[2] & 0xc0) != 0x80 || (c[3] & 0xc0) != 0x80)
			return -1;

		/* test for low 16 bits equal to 0xfffe or 0xffff */
		if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
			((c[3] & 0x3e) == 0x3e))
			return -1;
		return 4;
	}

	return -1;
}
133
134
/*
 * Remove, in place, every byte of the NUL-terminated string pwcs that does
 * not begin a sequence accepted by utf_charcheck().
 *
 * Accepted sequences are kept in their original order and moved down over
 * any gap left by dropped bytes.  The string is re-terminated only if at
 * least one byte was dropped.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
	unsigned char *dst = pwcs;	/* where the next kept byte is stored */

	while (*pwcs)
	{
		int			len = utf_charcheck(pwcs);

		if (len <= 0)
		{
			/* we skip the char */
			pwcs++;
			continue;
		}

		if (dst != pwcs)
		{
			int			i;

			/* something was dropped earlier: copy the bytes down */
			for (i = 0; i < len; i++)
				*dst++ = *pwcs++;
		}
		else
		{
			/* nothing dropped so far: the bytes are already in place */
			pwcs += len;
			dst += len;
		}
	}

	if (dst != pwcs)
		*dst = '\0';
}
166
/*
 * public functions: pg_wcswidth, pg_wcssize, pg_wcsformat and mbvalidate
 */
170
/*
 * pg_wcswidth is the dumb display-width function.
 * It assumes that everything will appear on one line.
 * OTOH it is easier to use than pg_wcssize if this applies to you.
 *
 * pwcs		string to measure; exactly len bytes are considered
 * len		number of bytes in pwcs
 * encoding	encoding ID, passed through to PQmblen() and PQdsplen()
 *
 * Returns the sum of the widths PQdsplen() reports for each character.
 * Characters reported with a zero or negative width add nothing.
 * Scanning stops early if a character claims more bytes than remain.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
	int			width = 0;

	while (len > 0)
	{
		int			chlen = PQmblen(pwcs, encoding);
		int			chwidth;

		if (len < (size_t) chlen)
			break;				/* Invalid string */

		chwidth = PQdsplen(pwcs, encoding);
		if (chwidth > 0)
			width += chwidth;

		pwcs += chlen;
		len -= chlen;
	}

	return width;
}
199
/*
 * pg_wcssize takes the given string in the given encoding and returns three
 * values:
 *	  result_width: Width in display characters of the longest line in string
 *	  result_height: Number of lines in display output
 *	  result_format_size: Number of bytes required to store formatted
 *		representation of string
 *
 * Any of the three result pointers may be NULL, in which case that value
 * is not stored.
 *
 * Scanning stops at the first NUL byte, after len bytes, or when a
 * character claims more bytes than remain, whichever comes first.
 *
 * Per-character accounting:
 *	  newline				ends the line; 1 byte (the line's NUL)
 *	  carriage return		2 columns, 2 bytes
 *	  tab					advances to the next multiple of 8 columns,
 *							1 byte per column
 *	  other 1-byte char with negative display width
 *							4 columns, 4 bytes
 *	  multibyte char with negative display width
 *							6 columns, 6 bytes
 *	  anything else			its display width; its own byte length
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
		   int *result_width, int *result_height, int *result_format_size)
{
	int			w,
				chlen = 0,
				linewidth = 0;
	int			width = 0;
	int			height = 1;
	int			format_size = 0;

	for (; *pwcs && len > 0; pwcs += chlen)
	{
		chlen = PQmblen((const char *) pwcs, encoding);
		if (len < (size_t) chlen)
			break;
		w = PQdsplen((const char *) pwcs, encoding);

		if (chlen == 1)			/* single-byte char */
		{
			if (*pwcs == '\n')	/* Newline */
			{
				if (linewidth > width)
					width = linewidth;
				linewidth = 0;
				height += 1;
				format_size += 1;	/* For NUL char */
			}
			else if (*pwcs == '\r') /* Carriage return */
			{
				linewidth += 2;
				format_size += 2;
			}
			else if (*pwcs == '\t') /* Tab */
			{
				do
				{
					linewidth++;
					format_size++;
				} while (linewidth % 8 != 0);
			}
			else if (w < 0)		/* Other control char */
			{
				linewidth += 4;
				format_size += 4;
			}
			else				/* Output it as-is */
			{
				linewidth += w;
				format_size += 1;
			}
		}
		else if (w < 0)			/* Non-ascii control char */
		{
			linewidth += 6;		/* \u0000 */
			format_size += 6;
		}
		else					/* All other chars */
		{
			linewidth += w;
			format_size += chlen;
		}
		len -= chlen;
	}

	/* account for the last (or only) line */
	if (linewidth > width)
		width = linewidth;
	format_size += 1;			/* For NUL char */

	/* Set results */
	if (result_width)
		*result_width = width;
	if (result_height)
		*result_height = height;
	if (result_format_size)
		*result_format_size = format_size;
}
286
287	/*
288	* Format a string into one or more "struct lineptr" lines.
289	* lines[i].ptr == NULL indicates the end of the array.
290	*
291	* This MUST be kept in sync with pg_wcssize!
292	*/
293	void
294	pg_wcsformat(const unsigned char pwcs, size_t len, int* encoding,
295	struct lineptr lines, int* count)
296	{
297	int w,
298	chlen = `0`;
299	int linewidth = `0`;
300	unsigned char ptr = lines->ptr; /* Pointer to data area /
301
302	for (; *pwcs && len > `0`; pwcs += chlen)
303	{
304	chlen = PQmblen((const char *) pwcs, encoding);
305	if (len < (size_t) chlen)
306	break;
307	w = PQdsplen((const char *) pwcs, encoding);
308
309	if (chlen == `1`) / single-byte char /
310	{
311	if (pwcs == `'\n'`) /* Newline /
312	{
313	*ptr++ = `'\0'`;
314	lines->width = linewidth;
315	linewidth = `0`;
316	lines++;
317	count--;
318	if (count <= `0`)
319	exit(`1`); / Screwup /
320
321	/ make next line point to remaining memory /
322	lines->ptr = ptr;
323	}
324	else if (pwcs == `'\r'`) /* Linefeed /
325	{
326	strcpy((char *) ptr, "\\r");
327	linewidth += `2`;
328	ptr += `2`;
329	}
330	else if (pwcs == `'\t'`) /* Tab /
331	{
332	do
333	{
334	*ptr++ = `' '`;
335	linewidth++;
336	} while (linewidth % `8` != `0`);
337	}
338	else if (w < `0`) / Other control char /
339	{
340	sprintf((char ) ptr, "\\x%02X", pwcs);
341	linewidth += `4`;
342	ptr += `4`;
343	}
344	else / Output it as-is /
345	{
346	linewidth += w;
347	ptr++ = pwcs;
348	}
349	}
350	else if (w < `0`) / Non-ascii control char /
351	{
352	if (encoding == PG_UTF8)
353	sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
354	else
355	{
356	/*
357	* This case cannot happen in the current code because only
358	* UTF-8 signals multibyte control characters. But we may need
359	* to support it at some stage
360	*/
361	sprintf((char *) ptr, "\\u????");
362	}
363	ptr += `6`;
364	linewidth += `6`;
365	}
366	else / All other chars /
367	{
368	int i;
369
370	for (i = `0`; i < chlen; i++)
371	*ptr++ = pwcs[i];
372	linewidth += w;
373	}
374	len -= chlen;
375	}
376	lines->width = linewidth;
377	ptr++ = `'\0'`; /* Terminate formatted string /
378
379	if (count <= `0`)
380	exit(`1`); / Screwup /
381
382	(lines + `1`)->ptr = NULL; / terminate line array /
383	}
384
385
386	/*
387	* Encoding validation: delete any unvalidatable characters from the string
388	*
389	* This seems redundant with existing functionality elsewhere?
390	*/
391	unsigned char *
392	mbvalidate(unsigned char pwcs, int* encoding)
393	{
394	if (encoding == PG_UTF8)
395	mb_utf_validate(pwcs);
396	else
397	{
398	/*
399	* other encodings needing validation should add their own routines
400	* here
401	*/
402	}
403
404	return pwcs;
405	}
406

Browse the source code of PostgreSQL/src/fe_utils/mbprint.c