1 | /*-------------------------------------------------------------------- |
2 | * Symbols referenced in this file: |
3 | * - truncate_identifier |
4 | * - downcase_truncate_identifier |
5 | * - downcase_identifier |
6 | * - scanner_isspace |
7 | *-------------------------------------------------------------------- |
8 | */ |
9 | |
10 | /*------------------------------------------------------------------------- |
11 | * |
12 | * scansup.c |
13 | * support routines for the lex/flex scanner, used by both the normal |
14 | * backend as well as the bootstrap backend |
15 | * |
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
17 | * Portions Copyright (c) 1994, Regents of the University of California |
18 | * |
19 | * |
20 | * IDENTIFICATION |
21 | * src/backend/parser/scansup.c |
22 | * |
23 | *------------------------------------------------------------------------- |
24 | */ |
25 | #include "pg_functions.hpp" |
26 | #include <string.h> |
27 | |
28 | #include <ctype.h> |
29 | |
30 | #include "parser/scansup.hpp" |
31 | #include "mb/pg_wchar.hpp" |
32 | |
33 | namespace duckdb_libpgquery { |
34 | |
/* ----------------
 * scanstr
 *
 * if the string passed in has escaped codes, map the escape codes to actual
 * chars
 *
 * the string returned is palloc'd and should eventually be pfree'd by the
 * caller!
 *
 * NOTE: no definition of scanstr follows in this file; this header appears
 * to be left over from a removed function and does not describe the code
 * below.
 * ----------------
 */
45 | |
46 | /* |
47 | * downcase_truncate_identifier() --- do appropriate downcasing and |
48 | * truncation of an unquoted identifier. Optionally warn of truncation. |
49 | * |
50 | * Returns a palloc'd string containing the adjusted identifier. |
51 | * |
52 | * Note: in some usages the passed string is not null-terminated. |
53 | * |
54 | * Note: the API of this function is designed to allow for downcasing |
55 | * transformations that increase the string length, but we don't yet |
56 | * support that. If you want to implement it, you'll need to fix |
57 | * SplitIdentifierString() in utils/adt/varlena.c. |
58 | */ |
59 | char *downcase_truncate_identifier(const char *ident, int len, bool warn) { |
60 | return downcase_identifier(ident, len, warn, truncate: true); |
61 | } |
62 | |
/*
 * Per-thread switch consulted by downcase_identifier(): when true,
 * identifiers are copied without any case folding.  Starts out false.
 */
static __thread bool pg_preserve_identifier_case = false;

/*
 * set_preserve_identifier_case() --- store a new value for the calling
 * thread's case-preservation switch.
 */
void set_preserve_identifier_case(bool preserve)
{
	pg_preserve_identifier_case = preserve;
}

/*
 * get_preserve_identifier_case() --- read back the calling thread's
 * case-preservation switch.
 */
bool get_preserve_identifier_case(void)
{
	return pg_preserve_identifier_case;
}
72 | |
73 | /* |
74 | * a workhorse for downcase_truncate_identifier |
75 | */ |
76 | char *downcase_identifier(const char *ident, int len, bool warn, bool truncate) { |
77 | char *result; |
78 | int i; |
79 | bool enc_is_single_byte; |
80 | |
81 | result = (char *)palloc(n: len + 1); |
82 | enc_is_single_byte = pg_database_encoding_max_length() == 1; |
83 | |
84 | /* |
85 | * SQL99 specifies Unicode-aware case normalization, which we don't yet |
86 | * have the infrastructure for. Instead we use tolower() to provide a |
87 | * locale-aware translation. However, there are some locales where this |
88 | * is not right either (eg, Turkish may do strange things with 'i' and |
89 | * 'I'). Our current compromise is to use tolower() for characters with |
90 | * the high bit set, as long as they aren't part of a multi-byte |
91 | * character, and use an ASCII-only downcasing for 7-bit characters. |
92 | */ |
93 | for (i = 0; i < len; i++) { |
94 | unsigned char ch = (unsigned char)ident[i]; |
95 | |
96 | if (!get_preserve_identifier_case()) { |
97 | if (ch >= 'A' && ch <= 'Z') |
98 | ch += 'a' - 'A'; |
99 | else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch)) |
100 | ch = tolower(c: ch); |
101 | } |
102 | result[i] = (char)ch; |
103 | } |
104 | result[i] = '\0'; |
105 | |
106 | return result; |
107 | } |
108 | |
/*
 * scanner_isspace() --- report whether the flex scanner treats ch as
 * whitespace.
 *
 * Prefer this over isspace() whenever the answer has to agree with the
 * lexer: isspace() may vary with the locale, whereas this test is a fixed
 * list of characters.
 *
 * Analogous helpers (for isalnum and friends) could be added if they are
 * ever required; so far only the whitespace test has been needed.
 */
bool scanner_isspace(char ch) {
	/* Keep this list in sync with the {space} characters in scan.l */
	switch (ch) {
	case ' ':
	case '\t':
	case '\n':
	case '\r':
	case '\f':
		return true;
	default:
		return false;
	}
}
124 | } |