1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * Multibyte character printing support for frontend code |
4 | * |
5 | * |
6 | * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group |
7 | * Portions Copyright (c) 1994, Regents of the University of California |
8 | * |
9 | * src/fe_utils/mbprint.c |
10 | * |
11 | *------------------------------------------------------------------------- |
12 | */ |
13 | #include "postgres_fe.h" |
14 | |
15 | #include "fe_utils/mbprint.h" |
16 | |
17 | #include "libpq-fe.h" |
18 | |
19 | |
/*
 * Nothing from pg_wchar.h may be used in this file.  The encoding IDs that
 * matter here are the ones known to the libpq.so we end up linked with, and
 * those need not agree with the numbers visible at compile time.  (The
 * version-skew hazard would disappear if this file lived inside libpq.)
 *
 * We therefore carry a private definition of pg_wchar and look up any
 * encoding ID we need at run time.
 */

typedef unsigned int pg_wchar;
32 | |
/*
 * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
 *
 * The answer is cached in a function-local static once a non-negative ID
 * has been obtained; while the cached value is still negative, every call
 * repeats the lookup.
 */
static int
pg_get_utf8_id(void)
{
	static int	cached_id = -1;

	if (cached_id >= 0)
		return cached_id;

	cached_id = pg_char_to_encoding("utf8");
	return cached_id;
}

/* Run-time stand-in for the compile-time PG_UTF8 constant */
#define PG_UTF8		pg_get_utf8_id()
44 | |
45 | |
46 | /* |
47 | * Convert a UTF-8 character to a Unicode code point. |
48 | * This is a one-character version of pg_utf2wchar_with_len. |
49 | * |
50 | * No error checks here, c must point to a long-enough string. |
51 | */ |
52 | static pg_wchar |
53 | utf8_to_unicode(const unsigned char *c) |
54 | { |
55 | if ((*c & 0x80) == 0) |
56 | return (pg_wchar) c[0]; |
57 | else if ((*c & 0xe0) == 0xc0) |
58 | return (pg_wchar) (((c[0] & 0x1f) << 6) | |
59 | (c[1] & 0x3f)); |
60 | else if ((*c & 0xf0) == 0xe0) |
61 | return (pg_wchar) (((c[0] & 0x0f) << 12) | |
62 | ((c[1] & 0x3f) << 6) | |
63 | (c[2] & 0x3f)); |
64 | else if ((*c & 0xf8) == 0xf0) |
65 | return (pg_wchar) (((c[0] & 0x07) << 18) | |
66 | ((c[1] & 0x3f) << 12) | |
67 | ((c[2] & 0x3f) << 6) | |
68 | (c[3] & 0x3f)); |
69 | else |
70 | /* that is an invalid code on purpose */ |
71 | return 0xffffffff; |
72 | } |
73 | |
74 | |
/*
 * Validate the UTF-8 sequence that starts at c.
 *
 * Returns the length in bytes (1 to 4) of a well-formed sequence, or -1 if
 * the sequence is malformed or encodes a value we refuse to accept:
 *	- overlong encodings (a value that fits in a shorter sequence)
 *	- ucs > 0x10ffff
 *	- (ucs & 0xfffe) == 0xfffe, i.e. 0xnfffe and 0xnffff in every plane
 *	- 0xfdd0 <= ucs <= 0xfdef (noncharacters)
 *	- 0xd800 <= ucs <= 0xdfff (surrogates)
 *
 * c must point into a NUL-terminated string.  Each trailing byte is looked
 * at only after the preceding one has been confirmed to be a continuation
 * byte (10xxxxxx), so we never read beyond the terminating NUL.
 */
static int
utf_charcheck(const unsigned char *c)
{
	if ((*c & 0x80) == 0)
		return 1;
	else if ((*c & 0xe0) == 0xc0)
	{
		/* two-byte char; lead bytes 0xc0 and 0xc1 would be overlong */
		if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
			return 2;
		return -1;
	}
	else if ((*c & 0xf0) == 0xe0)
	{
		/* three-byte char; the middle test rejects overlong (< 0x800) */
		if (((c[1] & 0xc0) == 0x80) &&
			(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
			((c[2] & 0xc0) == 0x80))
		{
			/*
			 * Decode the full code point.  The low six bits come from the
			 * third byte, not from the lead byte.
			 */
			unsigned int ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
				((unsigned int) (c[1] & 0x3f) << 6) |
				(unsigned int) (c[2] & 0x3f);

			/* 0xfffe and 0xffff */
			if ((ucs & 0xfffe) == 0xfffe)
				return -1;
			/* noncharacter block 0xfdd0..0xfdef */
			if (ucs >= 0xfdd0 && ucs <= 0xfdef)
				return -1;
			/* surrogates 0xd800..0xdfff */
			if (ucs >= 0xd800 && ucs <= 0xdfff)
				return -1;
			return 3;
		}
		return -1;
	}
	else if ((*c & 0xf8) == 0xf0)
	{
		/* plane number: the top five bits of the 21-bit code point */
		int			u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);

		/* four-byte char; u == 0 is overlong, u > 0x10 is past 0x10ffff */
		if (((c[1] & 0xc0) == 0x80) &&
			(u > 0x00) && (u <= 0x10) &&
			((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
		{
			/* test for 0xzzzzfffe/0xzzzzffff */
			if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
				((c[3] & 0x3e) == 0x3e))
				return -1;
			return 4;
		}
		return -1;
	}
	return -1;
}
133 | |
134 | |
/*
 * Strip, in place, every byte of the NUL-terminated string pwcs that does
 * not begin a sequence accepted by utf_charcheck().
 *
 * A read cursor and a write cursor walk the same buffer.  Accepted sequences
 * are moved down to the write cursor; a rejected byte advances only the read
 * cursor.  A new terminator is stored only if something was dropped.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
	unsigned char *src = pwcs;	/* read cursor */
	unsigned char *dst = pwcs;	/* write cursor, never ahead of src */

	while (*src != '\0')
	{
		int			seqlen = utf_charcheck(src);

		if (seqlen <= 0)
		{
			/* we skip the char */
			src++;
			continue;
		}

		if (dst == src)
		{
			/* nothing dropped so far: the bytes are already in place */
			src += seqlen;
			dst += seqlen;
		}
		else
		{
			while (seqlen-- > 0)
				*dst++ = *src++;
		}
	}

	if (dst != src)
		*dst = '\0';
}
166 | |
167 | /* |
168 | * public functions : wcswidth and mbvalidate |
169 | */ |
170 | |
/*
 * pg_wcswidth: the simple-minded display-width function.
 *
 * Adds up the positive PQdsplen() results for the characters found in the
 * first len bytes of pwcs, treating the whole string as a single line.
 * Characters with zero or negative reported width add nothing.  Scanning
 * stops early if PQmblen() reports a character longer than the bytes that
 * remain (invalid string).
 *
 * Easier to use than pg_wcssize when the one-line assumption holds.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
	const char *cur = pwcs;
	size_t		remaining = len;
	int			total = 0;

	while (remaining != 0)
	{
		int			nbytes = PQmblen(cur, encoding);
		int			cols;

		if ((size_t) nbytes > remaining)
			break;				/* Invalid string */

		cols = PQdsplen(cur, encoding);
		total += (cols > 0) ? cols : 0;

		cur += nbytes;
		remaining -= nbytes;
	}

	return total;
}
199 | |
/*
 * pg_wcssize measures the given string in the given encoding and hands back
 * up to three values (any of the result pointers may be NULL):
 *	result_width:		display width of the widest line in the string
 *	result_height:		number of lines of display output
 *	result_format_size:	number of bytes needed to hold the formatted
 *						representation of the string
 *
 * Scanning ends at a NUL byte, after len bytes, or when PQmblen() reports a
 * character longer than the bytes that remain.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
		   int *result_width, int *result_height, int *result_format_size)
{
	int			widest = 0;		/* widest completed line so far */
	int			nlines = 1;
	int			nbytes_out = 0;
	int			col = 0;		/* width of the line being scanned */

	while (*pwcs != '\0' && len > 0)
	{
		int			clen = PQmblen((const char *) pwcs, encoding);
		int			dsp;

		if ((size_t) clen > len)
			break;
		dsp = PQdsplen((const char *) pwcs, encoding);

		if (clen != 1)
		{
			if (dsp < 0)
			{
				/* Non-ascii control char, shown as \u0000 */
				col += 6;
				nbytes_out += 6;
			}
			else
			{
				/* All other chars */
				col += dsp;
				nbytes_out += clen;
			}
		}
		else
		{
			/* single-byte char */
			switch (*pwcs)
			{
				case '\n':
					/* Newline closes the current line */
					if (col > widest)
						widest = col;
					col = 0;
					nlines += 1;
					nbytes_out += 1;	/* For NUL char */
					break;

				case '\r':
					/* Carriage return takes two columns */
					col += 2;
					nbytes_out += 2;
					break;

				case '\t':
					/* Tab: advance to the next multiple of 8 columns */
					do
					{
						col++;
						nbytes_out++;
					} while (col % 8 != 0);
					break;

				default:
					if (dsp < 0)
					{
						/* Other control char takes four columns */
						col += 4;
						nbytes_out += 4;
					}
					else
					{
						/* Output it as-is */
						col += dsp;
						nbytes_out += 1;
					}
					break;
			}
		}

		len -= clen;
		pwcs += clen;
	}

	/* account for the last (possibly only) line and its terminator */
	if (col > widest)
		widest = col;
	nbytes_out += 1;			/* For NUL char */

	/* Set results */
	if (result_width)
		*result_width = widest;
	if (result_height)
		*result_height = nlines;
	if (result_format_size)
		*result_format_size = nbytes_out;
}
286 | |
287 | /* |
288 | * Format a string into one or more "struct lineptr" lines. |
289 | * lines[i].ptr == NULL indicates the end of the array. |
290 | * |
291 | * This MUST be kept in sync with pg_wcssize! |
292 | */ |
293 | void |
294 | pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, |
295 | struct lineptr *lines, int count) |
296 | { |
297 | int w, |
298 | chlen = 0; |
299 | int linewidth = 0; |
300 | unsigned char *ptr = lines->ptr; /* Pointer to data area */ |
301 | |
302 | for (; *pwcs && len > 0; pwcs += chlen) |
303 | { |
304 | chlen = PQmblen((const char *) pwcs, encoding); |
305 | if (len < (size_t) chlen) |
306 | break; |
307 | w = PQdsplen((const char *) pwcs, encoding); |
308 | |
309 | if (chlen == 1) /* single-byte char */ |
310 | { |
311 | if (*pwcs == '\n') /* Newline */ |
312 | { |
313 | *ptr++ = '\0'; |
314 | lines->width = linewidth; |
315 | linewidth = 0; |
316 | lines++; |
317 | count--; |
318 | if (count <= 0) |
319 | exit(1); /* Screwup */ |
320 | |
321 | /* make next line point to remaining memory */ |
322 | lines->ptr = ptr; |
323 | } |
324 | else if (*pwcs == '\r') /* Linefeed */ |
325 | { |
326 | strcpy((char *) ptr, "\\r" ); |
327 | linewidth += 2; |
328 | ptr += 2; |
329 | } |
330 | else if (*pwcs == '\t') /* Tab */ |
331 | { |
332 | do |
333 | { |
334 | *ptr++ = ' '; |
335 | linewidth++; |
336 | } while (linewidth % 8 != 0); |
337 | } |
338 | else if (w < 0) /* Other control char */ |
339 | { |
340 | sprintf((char *) ptr, "\\x%02X" , *pwcs); |
341 | linewidth += 4; |
342 | ptr += 4; |
343 | } |
344 | else /* Output it as-is */ |
345 | { |
346 | linewidth += w; |
347 | *ptr++ = *pwcs; |
348 | } |
349 | } |
350 | else if (w < 0) /* Non-ascii control char */ |
351 | { |
352 | if (encoding == PG_UTF8) |
353 | sprintf((char *) ptr, "\\u%04X" , utf8_to_unicode(pwcs)); |
354 | else |
355 | { |
356 | /* |
357 | * This case cannot happen in the current code because only |
358 | * UTF-8 signals multibyte control characters. But we may need |
359 | * to support it at some stage |
360 | */ |
361 | sprintf((char *) ptr, "\\u????" ); |
362 | } |
363 | ptr += 6; |
364 | linewidth += 6; |
365 | } |
366 | else /* All other chars */ |
367 | { |
368 | int i; |
369 | |
370 | for (i = 0; i < chlen; i++) |
371 | *ptr++ = pwcs[i]; |
372 | linewidth += w; |
373 | } |
374 | len -= chlen; |
375 | } |
376 | lines->width = linewidth; |
377 | *ptr++ = '\0'; /* Terminate formatted string */ |
378 | |
379 | if (count <= 0) |
380 | exit(1); /* Screwup */ |
381 | |
382 | (lines + 1)->ptr = NULL; /* terminate line array */ |
383 | } |
384 | |
385 | |
386 | /* |
387 | * Encoding validation: delete any unvalidatable characters from the string |
388 | * |
389 | * This seems redundant with existing functionality elsewhere? |
390 | */ |
391 | unsigned char * |
392 | mbvalidate(unsigned char *pwcs, int encoding) |
393 | { |
394 | if (encoding == PG_UTF8) |
395 | mb_utf_validate(pwcs); |
396 | else |
397 | { |
398 | /* |
399 | * other encodings needing validation should add their own routines |
400 | * here |
401 | */ |
402 | } |
403 | |
404 | return pwcs; |
405 | } |
406 | |