1 | //===----------------------------------------------------------------------===// |
2 | // DuckDB |
3 | // |
4 | // duckdb/common/string_util.hpp |
5 | // |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #pragma once |
10 | |
11 | #include "duckdb/common/constants.hpp" |
12 | #include "duckdb/common/exception.hpp" |
13 | #include "duckdb/common/vector.hpp" |
14 | |
15 | #include <cstring> |
16 | |
17 | namespace duckdb { |
18 | |
19 | /** |
20 | * String Utility Functions |
21 | * Note that these are not the most efficient implementations (i.e., they copy |
22 | * memory) and therefore they should only be used for debug messages and other |
23 | * such things. |
24 | */ |
25 | class StringUtil { |
26 | public: |
27 | static string GenerateRandomName(idx_t length = 16); |
28 | |
29 | static uint8_t GetHexValue(char c) { |
30 | if (c >= '0' && c <= '9') { |
31 | return c - '0'; |
32 | } |
33 | if (c >= 'a' && c <= 'f') { |
34 | return c - 'a' + 10; |
35 | } |
36 | if (c >= 'A' && c <= 'F') { |
37 | return c - 'A' + 10; |
38 | } |
39 | throw InvalidInputException("Invalid input for hex digit: %s" , string(c, 1)); |
40 | } |
41 | |
42 | static uint8_t GetBinaryValue(char c) { |
43 | if (c >= '0' && c <= '1') { |
44 | return c - '0'; |
45 | } |
46 | throw InvalidInputException("Invalid input for binary digit: %s" , string(c, 1)); |
47 | } |
48 | |
49 | static bool CharacterIsSpace(char c) { |
50 | return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; |
51 | } |
52 | static bool CharacterIsNewline(char c) { |
53 | return c == '\n' || c == '\r'; |
54 | } |
55 | static bool CharacterIsDigit(char c) { |
56 | return c >= '0' && c <= '9'; |
57 | } |
58 | static bool CharacterIsHex(char c) { |
59 | return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); |
60 | } |
61 | static char CharacterToLower(char c) { |
62 | if (c >= 'A' && c <= 'Z') { |
63 | return c - ('A' - 'a'); |
64 | } |
65 | return c; |
66 | } |
67 | static char CharacterIsAlpha(char c) { |
68 | return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); |
69 | } |
70 | static bool CharacterIsOperator(char c) { |
71 | if (c == '_') { |
72 | return false; |
73 | } |
74 | if (c >= '!' && c <= '/') { |
75 | return true; |
76 | } |
77 | if (c >= ':' && c <= '@') { |
78 | return true; |
79 | } |
80 | if (c >= '[' && c <= '`') { |
81 | return true; |
82 | } |
83 | if (c >= '{' && c <= '~') { |
84 | return true; |
85 | } |
86 | return false; |
87 | } |
88 | |
89 | template <class TO> |
90 | static vector<TO> ConvertStrings(const vector<string> &strings) { |
91 | vector<TO> result; |
92 | for (auto &string : strings) { |
93 | result.emplace_back(string); |
94 | } |
95 | return result; |
96 | } |
97 | |
98 | static vector<SQLIdentifier> ConvertToSQLIdentifiers(const vector<string> &strings) { |
99 | return ConvertStrings<SQLIdentifier>(strings); |
100 | } |
101 | |
102 | static vector<SQLString> ConvertToSQLStrings(const vector<string> &strings) { |
103 | return ConvertStrings<SQLString>(strings); |
104 | } |
105 | |
106 | //! Returns true if the needle string exists in the haystack |
107 | DUCKDB_API static bool Contains(const string &haystack, const string &needle); |
108 | |
109 | //! Returns true if the target string starts with the given prefix |
110 | DUCKDB_API static bool StartsWith(string str, string prefix); |
111 | |
112 | //! Returns true if the target string <b>ends</b> with the given suffix. |
113 | DUCKDB_API static bool EndsWith(const string &str, const string &suffix); |
114 | |
115 | //! Repeat a string multiple times |
116 | DUCKDB_API static string Repeat(const string &str, const idx_t n); |
117 | |
118 | //! Split the input string based on newline char |
119 | DUCKDB_API static vector<string> Split(const string &str, char delimiter); |
120 | |
121 | //! Split the input string allong a quote. Note that any escaping is NOT supported. |
122 | DUCKDB_API static vector<string> SplitWithQuote(const string &str, char delimiter = ',', char quote = '"'); |
123 | |
124 | //! Join multiple strings into one string. Components are concatenated by the given separator |
125 | DUCKDB_API static string Join(const vector<string> &input, const string &separator); |
126 | |
127 | template <class T> |
128 | static string ToString(const vector<T> &input, const string &separator) { |
129 | vector<string> input_list; |
130 | for (auto &i : input) { |
131 | input_list.push_back(i.ToString()); |
132 | } |
133 | return StringUtil::Join(input: input_list, separator); |
134 | } |
135 | |
136 | //! Join multiple items of container with given size, transformed to string |
137 | //! using function, into one string using the given separator |
138 | template <typename C, typename S, typename Func> |
139 | static string Join(const C &input, S count, const string &separator, Func f) { |
140 | // The result |
141 | std::string result; |
142 | |
143 | // If the input isn't empty, append the first element. We do this so we |
144 | // don't need to introduce an if into the loop. |
145 | if (count > 0) { |
146 | result += f(input[0]); |
147 | } |
148 | |
149 | // Append the remaining input components, after the first |
150 | for (size_t i = 1; i < count; i++) { |
151 | result += separator + f(input[i]); |
152 | } |
153 | |
154 | return result; |
155 | } |
156 | |
157 | //! Return a string that formats the give number of bytes |
158 | DUCKDB_API static string BytesToHumanReadableString(idx_t bytes); |
159 | |
160 | //! Convert a string to uppercase |
161 | DUCKDB_API static string Upper(const string &str); |
162 | |
163 | //! Convert a string to lowercase |
164 | DUCKDB_API static string Lower(const string &str); |
165 | |
166 | DUCKDB_API static bool IsLower(const string &str); |
167 | |
168 | //! Case insensitive hash |
169 | DUCKDB_API static uint64_t CIHash(const string &str); |
170 | |
171 | //! Case insensitive equals |
172 | DUCKDB_API static bool CIEquals(const string &l1, const string &l2); |
173 | |
174 | //! Format a string using printf semantics |
175 | template <typename... Args> |
176 | static string Format(const string fmt_str, Args... params) { |
177 | return Exception::ConstructMessage(fmt_str, params...); |
178 | } |
179 | |
180 | //! Split the input string into a vector of strings based on the split string |
181 | DUCKDB_API static vector<string> Split(const string &input, const string &split); |
182 | |
183 | //! Remove the whitespace char in the left end of the string |
184 | DUCKDB_API static void LTrim(string &str); |
185 | //! Remove the whitespace char in the right end of the string |
186 | DUCKDB_API static void RTrim(string &str); |
187 | //! Remove the all chars from chars_to_trim char in the right end of the string |
188 | DUCKDB_API static void RTrim(string &str, const string &chars_to_trim); |
189 | //! Remove the whitespace char in the left and right end of the string |
190 | DUCKDB_API static void Trim(string &str); |
191 | |
192 | DUCKDB_API static string Replace(string source, const string &from, const string &to); |
193 | |
194 | //! Get the levenshtein distance from two strings |
195 | //! The not_equal_penalty is the penalty given when two characters in a string are not equal |
196 | //! The regular levenshtein distance has a not equal penalty of 1, which means changing a character is as expensive |
197 | //! as adding or removing one For similarity searches we often want to give extra weight to changing a character For |
198 | //! example: with an equal penalty of 1, "pg_am" is closer to "depdelay" than "depdelay_minutes" |
199 | //! with an equal penalty of 3, "depdelay_minutes" is closer to "depdelay" than to "pg_am" |
200 | DUCKDB_API static idx_t LevenshteinDistance(const string &s1, const string &s2, idx_t not_equal_penalty = 1); |
201 | |
202 | //! Returns the similarity score between two strings |
203 | DUCKDB_API static idx_t SimilarityScore(const string &s1, const string &s2); |
204 | //! Get the top-n strings (sorted by the given score distance) from a set of scores. |
205 | //! At least one entry is returned (if there is one). |
206 | //! Strings are only returned if they have a score less than the threshold. |
207 | DUCKDB_API static vector<string> TopNStrings(vector<std::pair<string, idx_t>> scores, idx_t n = 5, |
208 | idx_t threshold = 5); |
209 | //! Computes the levenshtein distance of each string in strings, and compares it to target, then returns TopNStrings |
210 | //! with the given params. |
211 | DUCKDB_API static vector<string> TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n = 5, |
212 | idx_t threshold = 5); |
213 | DUCKDB_API static string CandidatesMessage(const vector<string> &candidates, |
214 | const string &candidate = "Candidate bindings" ); |
215 | |
216 | //! Generate an error message in the form of "{message_prefix}: nearest_string, nearest_string2, ... |
217 | //! Equivalent to calling TopNLevenshtein followed by CandidatesMessage |
218 | DUCKDB_API static string CandidatesErrorMessage(const vector<string> &strings, const string &target, |
219 | const string &message_prefix, idx_t n = 5); |
220 | |
221 | //! Returns true if two null-terminated strings are equal or point to the same address. |
222 | //! Returns false if only one of the strings is nullptr |
223 | static bool Equals(const char *s1, const char *s2) { |
224 | if (s1 == s2) { |
225 | return true; |
226 | } |
227 | if (s1 == nullptr || s2 == nullptr) { |
228 | return false; |
229 | } |
230 | return strcmp(s1: s1, s2: s2) == 0; |
231 | } |
232 | }; |
233 | |
234 | } // namespace duckdb |
235 | |