1#include "duckdb/common/string_util.hpp"
2
3#include "duckdb/common/exception.hpp"
4#include "duckdb/common/pair.hpp"
5#include "duckdb/common/to_string.hpp"
6#include "duckdb/common/helper.hpp"
7
8#include <algorithm>
9#include <cctype>
10#include <iomanip>
11#include <memory>
12#include <sstream>
13#include <stdarg.h>
14#include <string.h>
15#include <random>
16
17namespace duckdb {
18
19string StringUtil::GenerateRandomName(idx_t length) {
20 std::random_device rd;
21 std::mt19937 gen(rd());
22 std::uniform_int_distribution<> dis(0, 15);
23
24 std::stringstream ss;
25 ss << std::hex;
26 for (idx_t i = 0; i < length; i++) {
27 ss << dis(gen);
28 }
29 return ss.str();
30}
31
32bool StringUtil::Contains(const string &haystack, const string &needle) {
33 return (haystack.find(str: needle) != string::npos);
34}
35
36void StringUtil::LTrim(string &str) {
37 auto it = str.begin();
38 while (it != str.end() && CharacterIsSpace(c: *it)) {
39 it++;
40 }
41 str.erase(first: str.begin(), last: it);
42}
43
44// Remove trailing ' ', '\f', '\n', '\r', '\t', '\v'
45void StringUtil::RTrim(string &str) {
46 str.erase(first: find_if(first: str.rbegin(), last: str.rend(), pred: [](int ch) { return ch > 0 && !CharacterIsSpace(c: ch); }).base(),
47 last: str.end());
48}
49
50void StringUtil::RTrim(string &str, const string &chars_to_trim) {
51 str.erase(first: find_if(first: str.rbegin(), last: str.rend(),
52 pred: [&chars_to_trim](int ch) { return ch > 0 && chars_to_trim.find(c: ch) == string::npos; })
53 .base(),
54 last: str.end());
55}
56
57void StringUtil::Trim(string &str) {
58 StringUtil::LTrim(str);
59 StringUtil::RTrim(str);
60}
61
62bool StringUtil::StartsWith(string str, string prefix) {
63 if (prefix.size() > str.size()) {
64 return false;
65 }
66 return equal(prefix.begin(), prefix.end(), str.begin());
67}
68
69bool StringUtil::EndsWith(const string &str, const string &suffix) {
70 if (suffix.size() > str.size()) {
71 return false;
72 }
73 return equal(suffix.rbegin(), suffix.rend(), str.rbegin());
74}
75
76string StringUtil::Repeat(const string &str, idx_t n) {
77 std::ostringstream os;
78 for (idx_t i = 0; i < n; i++) {
79 os << str;
80 }
81 return (os.str());
82}
83
84vector<string> StringUtil::Split(const string &str, char delimiter) {
85 std::stringstream ss(str);
86 vector<string> lines;
87 string temp;
88 while (getline(in&: ss, str&: temp, delim: delimiter)) {
89 lines.push_back(x: temp);
90 }
91 return (lines);
92}
93
94namespace string_util_internal {
95
96inline void SkipSpaces(const string &str, idx_t &index) {
97 while (index < str.size() && std::isspace(str[index])) {
98 index++;
99 }
100}
101
102inline void ConsumeLetter(const string &str, idx_t &index, char expected) {
103 if (index >= str.size() || str[index] != expected) {
104 throw ParserException("Invalid quoted list: %s", str);
105 }
106
107 index++;
108}
109
110template <typename F>
111inline void TakeWhile(const string &str, idx_t &index, const F &cond, string &taker) {
112 while (index < str.size() && cond(str[index])) {
113 taker.push_back(c: str[index]);
114 index++;
115 }
116}
117
118inline string TakePossiblyQuotedItem(const string &str, idx_t &index, char delimiter, char quote) {
119 string entry;
120
121 if (str[index] == quote) {
122 index++;
123 TakeWhile(
124 str, index, cond: [quote](char c) { return c != quote; }, taker&: entry);
125 ConsumeLetter(str, index, expected: quote);
126 } else {
127 TakeWhile(
128 str, index, cond: [delimiter, quote](char c) { return c != delimiter && c != quote && !std::isspace(c); }, taker&: entry);
129 }
130
131 return entry;
132}
133
134} // namespace string_util_internal
135
136vector<string> StringUtil::SplitWithQuote(const string &str, char delimiter, char quote) {
137 vector<string> entries;
138 idx_t i = 0;
139
140 string_util_internal::SkipSpaces(str, index&: i);
141 while (i < str.size()) {
142 if (!entries.empty()) {
143 string_util_internal::ConsumeLetter(str, index&: i, expected: delimiter);
144 }
145
146 entries.emplace_back(args: string_util_internal::TakePossiblyQuotedItem(str, index&: i, delimiter, quote));
147 string_util_internal::SkipSpaces(str, index&: i);
148 }
149
150 return entries;
151}
152
153string StringUtil::Join(const vector<string> &input, const string &separator) {
154 return StringUtil::Join(input, count: input.size(), separator, f: [](const string &s) { return s; });
155}
156
157string StringUtil::BytesToHumanReadableString(idx_t bytes) {
158 string db_size;
159 auto kilobytes = bytes / 1000;
160 auto megabytes = kilobytes / 1000;
161 kilobytes -= megabytes * 1000;
162 auto gigabytes = megabytes / 1000;
163 megabytes -= gigabytes * 1000;
164 auto terabytes = gigabytes / 1000;
165 gigabytes -= terabytes * 1000;
166 auto petabytes = terabytes / 1000;
167 terabytes -= petabytes * 1000;
168 if (petabytes > 0) {
169 return to_string(val: petabytes) + "." + to_string(val: terabytes / 100) + "PB";
170 }
171 if (terabytes > 0) {
172 return to_string(val: terabytes) + "." + to_string(val: gigabytes / 100) + "TB";
173 } else if (gigabytes > 0) {
174 return to_string(val: gigabytes) + "." + to_string(val: megabytes / 100) + "GB";
175 } else if (megabytes > 0) {
176 return to_string(val: megabytes) + "." + to_string(val: kilobytes / 100) + "MB";
177 } else if (kilobytes > 0) {
178 return to_string(val: kilobytes) + "KB";
179 } else {
180 return to_string(val: bytes) + (bytes == 1 ? " byte" : " bytes");
181 }
182}
183
184string StringUtil::Upper(const string &str) {
185 string copy(str);
186 transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return std::toupper(c: c); });
187 return (copy);
188}
189
190string StringUtil::Lower(const string &str) {
191 string copy(str);
192 transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return StringUtil::CharacterToLower(c); });
193 return (copy);
194}
195
196bool StringUtil::IsLower(const string &str) {
197 return str == Lower(str);
198}
199
200// Jenkins hash function: https://en.wikipedia.org/wiki/Jenkins_hash_function
201uint64_t StringUtil::CIHash(const string &str) {
202 uint32_t hash = 0;
203 for (auto c : str) {
204 hash += StringUtil::CharacterToLower(c);
205 hash += hash << 10;
206 hash ^= hash >> 6;
207 }
208 hash += hash << 3;
209 hash ^= hash >> 11;
210 hash += hash << 15;
211 return hash;
212}
213
214bool StringUtil::CIEquals(const string &l1, const string &l2) {
215 if (l1.size() != l2.size()) {
216 return false;
217 }
218 for (idx_t c = 0; c < l1.size(); c++) {
219 if (StringUtil::CharacterToLower(c: l1[c]) != StringUtil::CharacterToLower(c: l2[c])) {
220 return false;
221 }
222 }
223 return true;
224}
225
226vector<string> StringUtil::Split(const string &input, const string &split) {
227 vector<string> splits;
228
229 idx_t last = 0;
230 idx_t input_len = input.size();
231 idx_t split_len = split.size();
232 while (last <= input_len) {
233 idx_t next = input.find(str: split, pos: last);
234 if (next == string::npos) {
235 next = input_len;
236 }
237
238 // Push the substring [last, next) on to splits
239 string substr = input.substr(pos: last, n: next - last);
240 if (!substr.empty()) {
241 splits.push_back(x: substr);
242 }
243 last = next + split_len;
244 }
245 if (splits.empty()) {
246 splits.push_back(x: input);
247 }
248 return splits;
249}
250
251string StringUtil::Replace(string source, const string &from, const string &to) {
252 if (from.empty()) {
253 throw InternalException("Invalid argument to StringUtil::Replace - empty FROM");
254 }
255 idx_t start_pos = 0;
256 while ((start_pos = source.find(str: from, pos: start_pos)) != string::npos) {
257 source.replace(pos: start_pos, n: from.length(), str: to);
258 start_pos += to.length(); // In case 'to' contains 'from', like
259 // replacing 'x' with 'yx'
260 }
261 return source;
262}
263
264vector<string> StringUtil::TopNStrings(vector<pair<string, idx_t>> scores, idx_t n, idx_t threshold) {
265 if (scores.empty()) {
266 return vector<string>();
267 }
268 sort(first: scores.begin(), last: scores.end(), comp: [](const pair<string, idx_t> &a, const pair<string, idx_t> &b) -> bool {
269 return a.second < b.second || (a.second == b.second && a.first.size() < b.first.size());
270 });
271 vector<string> result;
272 result.push_back(x: scores[0].first);
273 for (idx_t i = 1; i < MinValue<idx_t>(a: scores.size(), b: n); i++) {
274 if (scores[i].second > threshold) {
275 break;
276 }
277 result.push_back(x: scores[i].first);
278 }
279 return result;
280}
281
282struct LevenshteinArray {
283 LevenshteinArray(idx_t len1, idx_t len2) : len1(len1) {
284 dist = make_unsafe_uniq_array<idx_t>(n: len1 * len2);
285 }
286
287 idx_t &Score(idx_t i, idx_t j) {
288 return dist[GetIndex(i, j)];
289 }
290
291private:
292 idx_t len1;
293 unsafe_unique_array<idx_t> dist;
294
295 idx_t GetIndex(idx_t i, idx_t j) {
296 return j * len1 + i;
297 }
298};
299
300// adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++
301idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p, idx_t not_equal_penalty) {
302 auto s1 = StringUtil::Lower(str: s1_p);
303 auto s2 = StringUtil::Lower(str: s2_p);
304 idx_t len1 = s1.size();
305 idx_t len2 = s2.size();
306 if (len1 == 0) {
307 return len2;
308 }
309 if (len2 == 0) {
310 return len1;
311 }
312 LevenshteinArray array(len1 + 1, len2 + 1);
313 array.Score(i: 0, j: 0) = 0;
314 for (idx_t i = 0; i <= len1; i++) {
315 array.Score(i, j: 0) = i;
316 }
317 for (idx_t j = 0; j <= len2; j++) {
318 array.Score(i: 0, j) = j;
319 }
320 for (idx_t i = 1; i <= len1; i++) {
321 for (idx_t j = 1; j <= len2; j++) {
322 // d[i][j] = std::min({ d[i - 1][j] + 1,
323 // d[i][j - 1] + 1,
324 // d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) });
325 int equal = s1[i - 1] == s2[j - 1] ? 0 : not_equal_penalty;
326 idx_t adjacent_score1 = array.Score(i: i - 1, j) + 1;
327 idx_t adjacent_score2 = array.Score(i, j: j - 1) + 1;
328 idx_t adjacent_score3 = array.Score(i: i - 1, j: j - 1) + equal;
329
330 idx_t t = MinValue<idx_t>(a: adjacent_score1, b: adjacent_score2);
331 array.Score(i, j) = MinValue<idx_t>(a: t, b: adjacent_score3);
332 }
333 }
334 return array.Score(i: len1, j: len2);
335}
336
337idx_t StringUtil::SimilarityScore(const string &s1, const string &s2) {
338 return LevenshteinDistance(s1_p: s1, s2_p: s2, not_equal_penalty: 3);
339}
340
341vector<string> StringUtil::TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n,
342 idx_t threshold) {
343 vector<pair<string, idx_t>> scores;
344 scores.reserve(n: strings.size());
345 for (auto &str : strings) {
346 if (target.size() < str.size()) {
347 scores.emplace_back(args: str, args: SimilarityScore(s1: str.substr(pos: 0, n: target.size()), s2: target));
348 } else {
349 scores.emplace_back(args: str, args: SimilarityScore(s1: str, s2: target));
350 }
351 }
352 return TopNStrings(scores, n, threshold);
353}
354
355string StringUtil::CandidatesMessage(const vector<string> &candidates, const string &candidate) {
356 string result_str;
357 if (!candidates.empty()) {
358 result_str = "\n" + candidate + ": ";
359 for (idx_t i = 0; i < candidates.size(); i++) {
360 if (i > 0) {
361 result_str += ", ";
362 }
363 result_str += "\"" + candidates[i] + "\"";
364 }
365 }
366 return result_str;
367}
368
369string StringUtil::CandidatesErrorMessage(const vector<string> &strings, const string &target,
370 const string &message_prefix, idx_t n) {
371 auto closest_strings = StringUtil::TopNLevenshtein(strings, target, n);
372 return StringUtil::CandidatesMessage(candidates: closest_strings, candidate: message_prefix);
373}
374
375} // namespace duckdb
376