1 | #include "duckdb/common/string_util.hpp" |
2 | |
3 | #include "duckdb/common/exception.hpp" |
4 | #include "duckdb/common/pair.hpp" |
5 | #include "duckdb/common/to_string.hpp" |
6 | #include "duckdb/common/helper.hpp" |
7 | |
8 | #include <algorithm> |
9 | #include <cctype> |
10 | #include <iomanip> |
11 | #include <memory> |
12 | #include <sstream> |
13 | #include <stdarg.h> |
14 | #include <string.h> |
15 | #include <random> |
16 | |
17 | namespace duckdb { |
18 | |
19 | string StringUtil::GenerateRandomName(idx_t length) { |
20 | std::random_device rd; |
21 | std::mt19937 gen(rd()); |
22 | std::uniform_int_distribution<> dis(0, 15); |
23 | |
24 | std::stringstream ss; |
25 | ss << std::hex; |
26 | for (idx_t i = 0; i < length; i++) { |
27 | ss << dis(gen); |
28 | } |
29 | return ss.str(); |
30 | } |
31 | |
32 | bool StringUtil::Contains(const string &haystack, const string &needle) { |
33 | return (haystack.find(str: needle) != string::npos); |
34 | } |
35 | |
36 | void StringUtil::LTrim(string &str) { |
37 | auto it = str.begin(); |
38 | while (it != str.end() && CharacterIsSpace(c: *it)) { |
39 | it++; |
40 | } |
41 | str.erase(first: str.begin(), last: it); |
42 | } |
43 | |
44 | // Remove trailing ' ', '\f', '\n', '\r', '\t', '\v' |
45 | void StringUtil::RTrim(string &str) { |
46 | str.erase(first: find_if(first: str.rbegin(), last: str.rend(), pred: [](int ch) { return ch > 0 && !CharacterIsSpace(c: ch); }).base(), |
47 | last: str.end()); |
48 | } |
49 | |
50 | void StringUtil::RTrim(string &str, const string &chars_to_trim) { |
51 | str.erase(first: find_if(first: str.rbegin(), last: str.rend(), |
52 | pred: [&chars_to_trim](int ch) { return ch > 0 && chars_to_trim.find(c: ch) == string::npos; }) |
53 | .base(), |
54 | last: str.end()); |
55 | } |
56 | |
57 | void StringUtil::Trim(string &str) { |
58 | StringUtil::LTrim(str); |
59 | StringUtil::RTrim(str); |
60 | } |
61 | |
62 | bool StringUtil::StartsWith(string str, string prefix) { |
63 | if (prefix.size() > str.size()) { |
64 | return false; |
65 | } |
66 | return equal(prefix.begin(), prefix.end(), str.begin()); |
67 | } |
68 | |
69 | bool StringUtil::EndsWith(const string &str, const string &suffix) { |
70 | if (suffix.size() > str.size()) { |
71 | return false; |
72 | } |
73 | return equal(suffix.rbegin(), suffix.rend(), str.rbegin()); |
74 | } |
75 | |
76 | string StringUtil::Repeat(const string &str, idx_t n) { |
77 | std::ostringstream os; |
78 | for (idx_t i = 0; i < n; i++) { |
79 | os << str; |
80 | } |
81 | return (os.str()); |
82 | } |
83 | |
84 | vector<string> StringUtil::Split(const string &str, char delimiter) { |
85 | std::stringstream ss(str); |
86 | vector<string> lines; |
87 | string temp; |
88 | while (getline(in&: ss, str&: temp, delim: delimiter)) { |
89 | lines.push_back(x: temp); |
90 | } |
91 | return (lines); |
92 | } |
93 | |
94 | namespace string_util_internal { |
95 | |
96 | inline void SkipSpaces(const string &str, idx_t &index) { |
97 | while (index < str.size() && std::isspace(str[index])) { |
98 | index++; |
99 | } |
100 | } |
101 | |
102 | inline void ConsumeLetter(const string &str, idx_t &index, char expected) { |
103 | if (index >= str.size() || str[index] != expected) { |
104 | throw ParserException("Invalid quoted list: %s" , str); |
105 | } |
106 | |
107 | index++; |
108 | } |
109 | |
110 | template <typename F> |
111 | inline void TakeWhile(const string &str, idx_t &index, const F &cond, string &taker) { |
112 | while (index < str.size() && cond(str[index])) { |
113 | taker.push_back(c: str[index]); |
114 | index++; |
115 | } |
116 | } |
117 | |
118 | inline string TakePossiblyQuotedItem(const string &str, idx_t &index, char delimiter, char quote) { |
119 | string entry; |
120 | |
121 | if (str[index] == quote) { |
122 | index++; |
123 | TakeWhile( |
124 | str, index, cond: [quote](char c) { return c != quote; }, taker&: entry); |
125 | ConsumeLetter(str, index, expected: quote); |
126 | } else { |
127 | TakeWhile( |
128 | str, index, cond: [delimiter, quote](char c) { return c != delimiter && c != quote && !std::isspace(c); }, taker&: entry); |
129 | } |
130 | |
131 | return entry; |
132 | } |
133 | |
134 | } // namespace string_util_internal |
135 | |
136 | vector<string> StringUtil::SplitWithQuote(const string &str, char delimiter, char quote) { |
137 | vector<string> entries; |
138 | idx_t i = 0; |
139 | |
140 | string_util_internal::SkipSpaces(str, index&: i); |
141 | while (i < str.size()) { |
142 | if (!entries.empty()) { |
143 | string_util_internal::ConsumeLetter(str, index&: i, expected: delimiter); |
144 | } |
145 | |
146 | entries.emplace_back(args: string_util_internal::TakePossiblyQuotedItem(str, index&: i, delimiter, quote)); |
147 | string_util_internal::SkipSpaces(str, index&: i); |
148 | } |
149 | |
150 | return entries; |
151 | } |
152 | |
153 | string StringUtil::Join(const vector<string> &input, const string &separator) { |
154 | return StringUtil::Join(input, count: input.size(), separator, f: [](const string &s) { return s; }); |
155 | } |
156 | |
157 | string StringUtil::BytesToHumanReadableString(idx_t bytes) { |
158 | string db_size; |
159 | auto kilobytes = bytes / 1000; |
160 | auto megabytes = kilobytes / 1000; |
161 | kilobytes -= megabytes * 1000; |
162 | auto gigabytes = megabytes / 1000; |
163 | megabytes -= gigabytes * 1000; |
164 | auto terabytes = gigabytes / 1000; |
165 | gigabytes -= terabytes * 1000; |
166 | auto petabytes = terabytes / 1000; |
167 | terabytes -= petabytes * 1000; |
168 | if (petabytes > 0) { |
169 | return to_string(val: petabytes) + "." + to_string(val: terabytes / 100) + "PB" ; |
170 | } |
171 | if (terabytes > 0) { |
172 | return to_string(val: terabytes) + "." + to_string(val: gigabytes / 100) + "TB" ; |
173 | } else if (gigabytes > 0) { |
174 | return to_string(val: gigabytes) + "." + to_string(val: megabytes / 100) + "GB" ; |
175 | } else if (megabytes > 0) { |
176 | return to_string(val: megabytes) + "." + to_string(val: kilobytes / 100) + "MB" ; |
177 | } else if (kilobytes > 0) { |
178 | return to_string(val: kilobytes) + "KB" ; |
179 | } else { |
180 | return to_string(val: bytes) + (bytes == 1 ? " byte" : " bytes" ); |
181 | } |
182 | } |
183 | |
184 | string StringUtil::Upper(const string &str) { |
185 | string copy(str); |
186 | transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return std::toupper(c: c); }); |
187 | return (copy); |
188 | } |
189 | |
190 | string StringUtil::Lower(const string &str) { |
191 | string copy(str); |
192 | transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return StringUtil::CharacterToLower(c); }); |
193 | return (copy); |
194 | } |
195 | |
196 | bool StringUtil::IsLower(const string &str) { |
197 | return str == Lower(str); |
198 | } |
199 | |
200 | // Jenkins hash function: https://en.wikipedia.org/wiki/Jenkins_hash_function |
201 | uint64_t StringUtil::CIHash(const string &str) { |
202 | uint32_t hash = 0; |
203 | for (auto c : str) { |
204 | hash += StringUtil::CharacterToLower(c); |
205 | hash += hash << 10; |
206 | hash ^= hash >> 6; |
207 | } |
208 | hash += hash << 3; |
209 | hash ^= hash >> 11; |
210 | hash += hash << 15; |
211 | return hash; |
212 | } |
213 | |
214 | bool StringUtil::CIEquals(const string &l1, const string &l2) { |
215 | if (l1.size() != l2.size()) { |
216 | return false; |
217 | } |
218 | for (idx_t c = 0; c < l1.size(); c++) { |
219 | if (StringUtil::CharacterToLower(c: l1[c]) != StringUtil::CharacterToLower(c: l2[c])) { |
220 | return false; |
221 | } |
222 | } |
223 | return true; |
224 | } |
225 | |
226 | vector<string> StringUtil::Split(const string &input, const string &split) { |
227 | vector<string> splits; |
228 | |
229 | idx_t last = 0; |
230 | idx_t input_len = input.size(); |
231 | idx_t split_len = split.size(); |
232 | while (last <= input_len) { |
233 | idx_t next = input.find(str: split, pos: last); |
234 | if (next == string::npos) { |
235 | next = input_len; |
236 | } |
237 | |
238 | // Push the substring [last, next) on to splits |
239 | string substr = input.substr(pos: last, n: next - last); |
240 | if (!substr.empty()) { |
241 | splits.push_back(x: substr); |
242 | } |
243 | last = next + split_len; |
244 | } |
245 | if (splits.empty()) { |
246 | splits.push_back(x: input); |
247 | } |
248 | return splits; |
249 | } |
250 | |
251 | string StringUtil::Replace(string source, const string &from, const string &to) { |
252 | if (from.empty()) { |
253 | throw InternalException("Invalid argument to StringUtil::Replace - empty FROM" ); |
254 | } |
255 | idx_t start_pos = 0; |
256 | while ((start_pos = source.find(str: from, pos: start_pos)) != string::npos) { |
257 | source.replace(pos: start_pos, n: from.length(), str: to); |
258 | start_pos += to.length(); // In case 'to' contains 'from', like |
259 | // replacing 'x' with 'yx' |
260 | } |
261 | return source; |
262 | } |
263 | |
264 | vector<string> StringUtil::TopNStrings(vector<pair<string, idx_t>> scores, idx_t n, idx_t threshold) { |
265 | if (scores.empty()) { |
266 | return vector<string>(); |
267 | } |
268 | sort(first: scores.begin(), last: scores.end(), comp: [](const pair<string, idx_t> &a, const pair<string, idx_t> &b) -> bool { |
269 | return a.second < b.second || (a.second == b.second && a.first.size() < b.first.size()); |
270 | }); |
271 | vector<string> result; |
272 | result.push_back(x: scores[0].first); |
273 | for (idx_t i = 1; i < MinValue<idx_t>(a: scores.size(), b: n); i++) { |
274 | if (scores[i].second > threshold) { |
275 | break; |
276 | } |
277 | result.push_back(x: scores[i].first); |
278 | } |
279 | return result; |
280 | } |
281 | |
282 | struct LevenshteinArray { |
283 | LevenshteinArray(idx_t len1, idx_t len2) : len1(len1) { |
284 | dist = make_unsafe_uniq_array<idx_t>(n: len1 * len2); |
285 | } |
286 | |
287 | idx_t &Score(idx_t i, idx_t j) { |
288 | return dist[GetIndex(i, j)]; |
289 | } |
290 | |
291 | private: |
292 | idx_t len1; |
293 | unsafe_unique_array<idx_t> dist; |
294 | |
295 | idx_t GetIndex(idx_t i, idx_t j) { |
296 | return j * len1 + i; |
297 | } |
298 | }; |
299 | |
300 | // adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++ |
301 | idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p, idx_t not_equal_penalty) { |
302 | auto s1 = StringUtil::Lower(str: s1_p); |
303 | auto s2 = StringUtil::Lower(str: s2_p); |
304 | idx_t len1 = s1.size(); |
305 | idx_t len2 = s2.size(); |
306 | if (len1 == 0) { |
307 | return len2; |
308 | } |
309 | if (len2 == 0) { |
310 | return len1; |
311 | } |
312 | LevenshteinArray array(len1 + 1, len2 + 1); |
313 | array.Score(i: 0, j: 0) = 0; |
314 | for (idx_t i = 0; i <= len1; i++) { |
315 | array.Score(i, j: 0) = i; |
316 | } |
317 | for (idx_t j = 0; j <= len2; j++) { |
318 | array.Score(i: 0, j) = j; |
319 | } |
320 | for (idx_t i = 1; i <= len1; i++) { |
321 | for (idx_t j = 1; j <= len2; j++) { |
322 | // d[i][j] = std::min({ d[i - 1][j] + 1, |
323 | // d[i][j - 1] + 1, |
324 | // d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) }); |
325 | int equal = s1[i - 1] == s2[j - 1] ? 0 : not_equal_penalty; |
326 | idx_t adjacent_score1 = array.Score(i: i - 1, j) + 1; |
327 | idx_t adjacent_score2 = array.Score(i, j: j - 1) + 1; |
328 | idx_t adjacent_score3 = array.Score(i: i - 1, j: j - 1) + equal; |
329 | |
330 | idx_t t = MinValue<idx_t>(a: adjacent_score1, b: adjacent_score2); |
331 | array.Score(i, j) = MinValue<idx_t>(a: t, b: adjacent_score3); |
332 | } |
333 | } |
334 | return array.Score(i: len1, j: len2); |
335 | } |
336 | |
337 | idx_t StringUtil::SimilarityScore(const string &s1, const string &s2) { |
338 | return LevenshteinDistance(s1_p: s1, s2_p: s2, not_equal_penalty: 3); |
339 | } |
340 | |
341 | vector<string> StringUtil::TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n, |
342 | idx_t threshold) { |
343 | vector<pair<string, idx_t>> scores; |
344 | scores.reserve(n: strings.size()); |
345 | for (auto &str : strings) { |
346 | if (target.size() < str.size()) { |
347 | scores.emplace_back(args: str, args: SimilarityScore(s1: str.substr(pos: 0, n: target.size()), s2: target)); |
348 | } else { |
349 | scores.emplace_back(args: str, args: SimilarityScore(s1: str, s2: target)); |
350 | } |
351 | } |
352 | return TopNStrings(scores, n, threshold); |
353 | } |
354 | |
355 | string StringUtil::CandidatesMessage(const vector<string> &candidates, const string &candidate) { |
356 | string result_str; |
357 | if (!candidates.empty()) { |
358 | result_str = "\n" + candidate + ": " ; |
359 | for (idx_t i = 0; i < candidates.size(); i++) { |
360 | if (i > 0) { |
361 | result_str += ", " ; |
362 | } |
363 | result_str += "\"" + candidates[i] + "\"" ; |
364 | } |
365 | } |
366 | return result_str; |
367 | } |
368 | |
369 | string StringUtil::CandidatesErrorMessage(const vector<string> &strings, const string &target, |
370 | const string &message_prefix, idx_t n) { |
371 | auto closest_strings = StringUtil::TopNLevenshtein(strings, target, n); |
372 | return StringUtil::CandidatesMessage(candidates: closest_strings, candidate: message_prefix); |
373 | } |
374 | |
375 | } // namespace duckdb |
376 | |