| 1 | #include "duckdb/common/string_util.hpp" |
| 2 | |
| 3 | #include "duckdb/common/exception.hpp" |
| 4 | #include "duckdb/common/pair.hpp" |
| 5 | #include "duckdb/common/to_string.hpp" |
| 6 | #include "duckdb/common/helper.hpp" |
| 7 | |
| 8 | #include <algorithm> |
| 9 | #include <cctype> |
| 10 | #include <iomanip> |
| 11 | #include <memory> |
| 12 | #include <sstream> |
| 13 | #include <stdarg.h> |
| 14 | #include <string.h> |
| 15 | #include <random> |
| 16 | |
| 17 | namespace duckdb { |
| 18 | |
| 19 | string StringUtil::GenerateRandomName(idx_t length) { |
| 20 | std::random_device rd; |
| 21 | std::mt19937 gen(rd()); |
| 22 | std::uniform_int_distribution<> dis(0, 15); |
| 23 | |
| 24 | std::stringstream ss; |
| 25 | ss << std::hex; |
| 26 | for (idx_t i = 0; i < length; i++) { |
| 27 | ss << dis(gen); |
| 28 | } |
| 29 | return ss.str(); |
| 30 | } |
| 31 | |
| 32 | bool StringUtil::Contains(const string &haystack, const string &needle) { |
| 33 | return (haystack.find(str: needle) != string::npos); |
| 34 | } |
| 35 | |
| 36 | void StringUtil::LTrim(string &str) { |
| 37 | auto it = str.begin(); |
| 38 | while (it != str.end() && CharacterIsSpace(c: *it)) { |
| 39 | it++; |
| 40 | } |
| 41 | str.erase(first: str.begin(), last: it); |
| 42 | } |
| 43 | |
| 44 | // Remove trailing ' ', '\f', '\n', '\r', '\t', '\v' |
| 45 | void StringUtil::RTrim(string &str) { |
| 46 | str.erase(first: find_if(first: str.rbegin(), last: str.rend(), pred: [](int ch) { return ch > 0 && !CharacterIsSpace(c: ch); }).base(), |
| 47 | last: str.end()); |
| 48 | } |
| 49 | |
| 50 | void StringUtil::RTrim(string &str, const string &chars_to_trim) { |
| 51 | str.erase(first: find_if(first: str.rbegin(), last: str.rend(), |
| 52 | pred: [&chars_to_trim](int ch) { return ch > 0 && chars_to_trim.find(c: ch) == string::npos; }) |
| 53 | .base(), |
| 54 | last: str.end()); |
| 55 | } |
| 56 | |
| 57 | void StringUtil::Trim(string &str) { |
| 58 | StringUtil::LTrim(str); |
| 59 | StringUtil::RTrim(str); |
| 60 | } |
| 61 | |
| 62 | bool StringUtil::StartsWith(string str, string prefix) { |
| 63 | if (prefix.size() > str.size()) { |
| 64 | return false; |
| 65 | } |
| 66 | return equal(prefix.begin(), prefix.end(), str.begin()); |
| 67 | } |
| 68 | |
| 69 | bool StringUtil::EndsWith(const string &str, const string &suffix) { |
| 70 | if (suffix.size() > str.size()) { |
| 71 | return false; |
| 72 | } |
| 73 | return equal(suffix.rbegin(), suffix.rend(), str.rbegin()); |
| 74 | } |
| 75 | |
| 76 | string StringUtil::Repeat(const string &str, idx_t n) { |
| 77 | std::ostringstream os; |
| 78 | for (idx_t i = 0; i < n; i++) { |
| 79 | os << str; |
| 80 | } |
| 81 | return (os.str()); |
| 82 | } |
| 83 | |
| 84 | vector<string> StringUtil::Split(const string &str, char delimiter) { |
| 85 | std::stringstream ss(str); |
| 86 | vector<string> lines; |
| 87 | string temp; |
| 88 | while (getline(in&: ss, str&: temp, delim: delimiter)) { |
| 89 | lines.push_back(x: temp); |
| 90 | } |
| 91 | return (lines); |
| 92 | } |
| 93 | |
| 94 | namespace string_util_internal { |
| 95 | |
| 96 | inline void SkipSpaces(const string &str, idx_t &index) { |
| 97 | while (index < str.size() && std::isspace(str[index])) { |
| 98 | index++; |
| 99 | } |
| 100 | } |
| 101 | |
| 102 | inline void ConsumeLetter(const string &str, idx_t &index, char expected) { |
| 103 | if (index >= str.size() || str[index] != expected) { |
| 104 | throw ParserException("Invalid quoted list: %s" , str); |
| 105 | } |
| 106 | |
| 107 | index++; |
| 108 | } |
| 109 | |
| 110 | template <typename F> |
| 111 | inline void TakeWhile(const string &str, idx_t &index, const F &cond, string &taker) { |
| 112 | while (index < str.size() && cond(str[index])) { |
| 113 | taker.push_back(c: str[index]); |
| 114 | index++; |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | inline string TakePossiblyQuotedItem(const string &str, idx_t &index, char delimiter, char quote) { |
| 119 | string entry; |
| 120 | |
| 121 | if (str[index] == quote) { |
| 122 | index++; |
| 123 | TakeWhile( |
| 124 | str, index, cond: [quote](char c) { return c != quote; }, taker&: entry); |
| 125 | ConsumeLetter(str, index, expected: quote); |
| 126 | } else { |
| 127 | TakeWhile( |
| 128 | str, index, cond: [delimiter, quote](char c) { return c != delimiter && c != quote && !std::isspace(c); }, taker&: entry); |
| 129 | } |
| 130 | |
| 131 | return entry; |
| 132 | } |
| 133 | |
| 134 | } // namespace string_util_internal |
| 135 | |
| 136 | vector<string> StringUtil::SplitWithQuote(const string &str, char delimiter, char quote) { |
| 137 | vector<string> entries; |
| 138 | idx_t i = 0; |
| 139 | |
| 140 | string_util_internal::SkipSpaces(str, index&: i); |
| 141 | while (i < str.size()) { |
| 142 | if (!entries.empty()) { |
| 143 | string_util_internal::ConsumeLetter(str, index&: i, expected: delimiter); |
| 144 | } |
| 145 | |
| 146 | entries.emplace_back(args: string_util_internal::TakePossiblyQuotedItem(str, index&: i, delimiter, quote)); |
| 147 | string_util_internal::SkipSpaces(str, index&: i); |
| 148 | } |
| 149 | |
| 150 | return entries; |
| 151 | } |
| 152 | |
| 153 | string StringUtil::Join(const vector<string> &input, const string &separator) { |
| 154 | return StringUtil::Join(input, count: input.size(), separator, f: [](const string &s) { return s; }); |
| 155 | } |
| 156 | |
| 157 | string StringUtil::BytesToHumanReadableString(idx_t bytes) { |
| 158 | string db_size; |
| 159 | auto kilobytes = bytes / 1000; |
| 160 | auto megabytes = kilobytes / 1000; |
| 161 | kilobytes -= megabytes * 1000; |
| 162 | auto gigabytes = megabytes / 1000; |
| 163 | megabytes -= gigabytes * 1000; |
| 164 | auto terabytes = gigabytes / 1000; |
| 165 | gigabytes -= terabytes * 1000; |
| 166 | auto petabytes = terabytes / 1000; |
| 167 | terabytes -= petabytes * 1000; |
| 168 | if (petabytes > 0) { |
| 169 | return to_string(val: petabytes) + "." + to_string(val: terabytes / 100) + "PB" ; |
| 170 | } |
| 171 | if (terabytes > 0) { |
| 172 | return to_string(val: terabytes) + "." + to_string(val: gigabytes / 100) + "TB" ; |
| 173 | } else if (gigabytes > 0) { |
| 174 | return to_string(val: gigabytes) + "." + to_string(val: megabytes / 100) + "GB" ; |
| 175 | } else if (megabytes > 0) { |
| 176 | return to_string(val: megabytes) + "." + to_string(val: kilobytes / 100) + "MB" ; |
| 177 | } else if (kilobytes > 0) { |
| 178 | return to_string(val: kilobytes) + "KB" ; |
| 179 | } else { |
| 180 | return to_string(val: bytes) + (bytes == 1 ? " byte" : " bytes" ); |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | string StringUtil::Upper(const string &str) { |
| 185 | string copy(str); |
| 186 | transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return std::toupper(c: c); }); |
| 187 | return (copy); |
| 188 | } |
| 189 | |
| 190 | string StringUtil::Lower(const string &str) { |
| 191 | string copy(str); |
| 192 | transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return StringUtil::CharacterToLower(c); }); |
| 193 | return (copy); |
| 194 | } |
| 195 | |
| 196 | bool StringUtil::IsLower(const string &str) { |
| 197 | return str == Lower(str); |
| 198 | } |
| 199 | |
| 200 | // Jenkins hash function: https://en.wikipedia.org/wiki/Jenkins_hash_function |
| 201 | uint64_t StringUtil::CIHash(const string &str) { |
| 202 | uint32_t hash = 0; |
| 203 | for (auto c : str) { |
| 204 | hash += StringUtil::CharacterToLower(c); |
| 205 | hash += hash << 10; |
| 206 | hash ^= hash >> 6; |
| 207 | } |
| 208 | hash += hash << 3; |
| 209 | hash ^= hash >> 11; |
| 210 | hash += hash << 15; |
| 211 | return hash; |
| 212 | } |
| 213 | |
| 214 | bool StringUtil::CIEquals(const string &l1, const string &l2) { |
| 215 | if (l1.size() != l2.size()) { |
| 216 | return false; |
| 217 | } |
| 218 | for (idx_t c = 0; c < l1.size(); c++) { |
| 219 | if (StringUtil::CharacterToLower(c: l1[c]) != StringUtil::CharacterToLower(c: l2[c])) { |
| 220 | return false; |
| 221 | } |
| 222 | } |
| 223 | return true; |
| 224 | } |
| 225 | |
| 226 | vector<string> StringUtil::Split(const string &input, const string &split) { |
| 227 | vector<string> splits; |
| 228 | |
| 229 | idx_t last = 0; |
| 230 | idx_t input_len = input.size(); |
| 231 | idx_t split_len = split.size(); |
| 232 | while (last <= input_len) { |
| 233 | idx_t next = input.find(str: split, pos: last); |
| 234 | if (next == string::npos) { |
| 235 | next = input_len; |
| 236 | } |
| 237 | |
| 238 | // Push the substring [last, next) on to splits |
| 239 | string substr = input.substr(pos: last, n: next - last); |
| 240 | if (!substr.empty()) { |
| 241 | splits.push_back(x: substr); |
| 242 | } |
| 243 | last = next + split_len; |
| 244 | } |
| 245 | if (splits.empty()) { |
| 246 | splits.push_back(x: input); |
| 247 | } |
| 248 | return splits; |
| 249 | } |
| 250 | |
| 251 | string StringUtil::Replace(string source, const string &from, const string &to) { |
| 252 | if (from.empty()) { |
| 253 | throw InternalException("Invalid argument to StringUtil::Replace - empty FROM" ); |
| 254 | } |
| 255 | idx_t start_pos = 0; |
| 256 | while ((start_pos = source.find(str: from, pos: start_pos)) != string::npos) { |
| 257 | source.replace(pos: start_pos, n: from.length(), str: to); |
| 258 | start_pos += to.length(); // In case 'to' contains 'from', like |
| 259 | // replacing 'x' with 'yx' |
| 260 | } |
| 261 | return source; |
| 262 | } |
| 263 | |
| 264 | vector<string> StringUtil::TopNStrings(vector<pair<string, idx_t>> scores, idx_t n, idx_t threshold) { |
| 265 | if (scores.empty()) { |
| 266 | return vector<string>(); |
| 267 | } |
| 268 | sort(first: scores.begin(), last: scores.end(), comp: [](const pair<string, idx_t> &a, const pair<string, idx_t> &b) -> bool { |
| 269 | return a.second < b.second || (a.second == b.second && a.first.size() < b.first.size()); |
| 270 | }); |
| 271 | vector<string> result; |
| 272 | result.push_back(x: scores[0].first); |
| 273 | for (idx_t i = 1; i < MinValue<idx_t>(a: scores.size(), b: n); i++) { |
| 274 | if (scores[i].second > threshold) { |
| 275 | break; |
| 276 | } |
| 277 | result.push_back(x: scores[i].first); |
| 278 | } |
| 279 | return result; |
| 280 | } |
| 281 | |
| 282 | struct LevenshteinArray { |
| 283 | LevenshteinArray(idx_t len1, idx_t len2) : len1(len1) { |
| 284 | dist = make_unsafe_uniq_array<idx_t>(n: len1 * len2); |
| 285 | } |
| 286 | |
| 287 | idx_t &Score(idx_t i, idx_t j) { |
| 288 | return dist[GetIndex(i, j)]; |
| 289 | } |
| 290 | |
| 291 | private: |
| 292 | idx_t len1; |
| 293 | unsafe_unique_array<idx_t> dist; |
| 294 | |
| 295 | idx_t GetIndex(idx_t i, idx_t j) { |
| 296 | return j * len1 + i; |
| 297 | } |
| 298 | }; |
| 299 | |
| 300 | // adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++ |
| 301 | idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p, idx_t not_equal_penalty) { |
| 302 | auto s1 = StringUtil::Lower(str: s1_p); |
| 303 | auto s2 = StringUtil::Lower(str: s2_p); |
| 304 | idx_t len1 = s1.size(); |
| 305 | idx_t len2 = s2.size(); |
| 306 | if (len1 == 0) { |
| 307 | return len2; |
| 308 | } |
| 309 | if (len2 == 0) { |
| 310 | return len1; |
| 311 | } |
| 312 | LevenshteinArray array(len1 + 1, len2 + 1); |
| 313 | array.Score(i: 0, j: 0) = 0; |
| 314 | for (idx_t i = 0; i <= len1; i++) { |
| 315 | array.Score(i, j: 0) = i; |
| 316 | } |
| 317 | for (idx_t j = 0; j <= len2; j++) { |
| 318 | array.Score(i: 0, j) = j; |
| 319 | } |
| 320 | for (idx_t i = 1; i <= len1; i++) { |
| 321 | for (idx_t j = 1; j <= len2; j++) { |
| 322 | // d[i][j] = std::min({ d[i - 1][j] + 1, |
| 323 | // d[i][j - 1] + 1, |
| 324 | // d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) }); |
| 325 | int equal = s1[i - 1] == s2[j - 1] ? 0 : not_equal_penalty; |
| 326 | idx_t adjacent_score1 = array.Score(i: i - 1, j) + 1; |
| 327 | idx_t adjacent_score2 = array.Score(i, j: j - 1) + 1; |
| 328 | idx_t adjacent_score3 = array.Score(i: i - 1, j: j - 1) + equal; |
| 329 | |
| 330 | idx_t t = MinValue<idx_t>(a: adjacent_score1, b: adjacent_score2); |
| 331 | array.Score(i, j) = MinValue<idx_t>(a: t, b: adjacent_score3); |
| 332 | } |
| 333 | } |
| 334 | return array.Score(i: len1, j: len2); |
| 335 | } |
| 336 | |
| 337 | idx_t StringUtil::SimilarityScore(const string &s1, const string &s2) { |
| 338 | return LevenshteinDistance(s1_p: s1, s2_p: s2, not_equal_penalty: 3); |
| 339 | } |
| 340 | |
| 341 | vector<string> StringUtil::TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n, |
| 342 | idx_t threshold) { |
| 343 | vector<pair<string, idx_t>> scores; |
| 344 | scores.reserve(n: strings.size()); |
| 345 | for (auto &str : strings) { |
| 346 | if (target.size() < str.size()) { |
| 347 | scores.emplace_back(args: str, args: SimilarityScore(s1: str.substr(pos: 0, n: target.size()), s2: target)); |
| 348 | } else { |
| 349 | scores.emplace_back(args: str, args: SimilarityScore(s1: str, s2: target)); |
| 350 | } |
| 351 | } |
| 352 | return TopNStrings(scores, n, threshold); |
| 353 | } |
| 354 | |
| 355 | string StringUtil::CandidatesMessage(const vector<string> &candidates, const string &candidate) { |
| 356 | string result_str; |
| 357 | if (!candidates.empty()) { |
| 358 | result_str = "\n" + candidate + ": " ; |
| 359 | for (idx_t i = 0; i < candidates.size(); i++) { |
| 360 | if (i > 0) { |
| 361 | result_str += ", " ; |
| 362 | } |
| 363 | result_str += "\"" + candidates[i] + "\"" ; |
| 364 | } |
| 365 | } |
| 366 | return result_str; |
| 367 | } |
| 368 | |
| 369 | string StringUtil::CandidatesErrorMessage(const vector<string> &strings, const string &target, |
| 370 | const string &message_prefix, idx_t n) { |
| 371 | auto closest_strings = StringUtil::TopNLevenshtein(strings, target, n); |
| 372 | return StringUtil::CandidatesMessage(candidates: closest_strings, candidate: message_prefix); |
| 373 | } |
| 374 | |
| 375 | } // namespace duckdb |
| 376 | |