| 1 | //===----------------------------------------------------------------------===// |
| 2 | // DuckDB |
| 3 | // |
| 4 | // duckdb/common/string_util.hpp |
| 5 | // |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #pragma once |
| 10 | |
| 11 | #include "duckdb/common/constants.hpp" |
| 12 | #include "duckdb/common/exception.hpp" |
| 13 | #include "duckdb/common/vector.hpp" |
| 14 | |
| 15 | #include <cstring> |
| 16 | |
| 17 | namespace duckdb { |
| 18 | |
| 19 | /** |
| 20 | * String Utility Functions |
| 21 | * Note that these are not the most efficient implementations (i.e., they copy |
| 22 | * memory) and therefore they should only be used for debug messages and other |
| 23 | * such things. |
| 24 | */ |
| 25 | class StringUtil { |
| 26 | public: |
| 27 | static string GenerateRandomName(idx_t length = 16); |
| 28 | |
| 29 | static uint8_t GetHexValue(char c) { |
| 30 | if (c >= '0' && c <= '9') { |
| 31 | return c - '0'; |
| 32 | } |
| 33 | if (c >= 'a' && c <= 'f') { |
| 34 | return c - 'a' + 10; |
| 35 | } |
| 36 | if (c >= 'A' && c <= 'F') { |
| 37 | return c - 'A' + 10; |
| 38 | } |
| 39 | throw InvalidInputException("Invalid input for hex digit: %s" , string(c, 1)); |
| 40 | } |
| 41 | |
| 42 | static uint8_t GetBinaryValue(char c) { |
| 43 | if (c >= '0' && c <= '1') { |
| 44 | return c - '0'; |
| 45 | } |
| 46 | throw InvalidInputException("Invalid input for binary digit: %s" , string(c, 1)); |
| 47 | } |
| 48 | |
| 49 | static bool CharacterIsSpace(char c) { |
| 50 | return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; |
| 51 | } |
| 52 | static bool CharacterIsNewline(char c) { |
| 53 | return c == '\n' || c == '\r'; |
| 54 | } |
| 55 | static bool CharacterIsDigit(char c) { |
| 56 | return c >= '0' && c <= '9'; |
| 57 | } |
| 58 | static bool CharacterIsHex(char c) { |
| 59 | return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); |
| 60 | } |
| 61 | static char CharacterToLower(char c) { |
| 62 | if (c >= 'A' && c <= 'Z') { |
| 63 | return c - ('A' - 'a'); |
| 64 | } |
| 65 | return c; |
| 66 | } |
| 67 | static char CharacterIsAlpha(char c) { |
| 68 | return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); |
| 69 | } |
| 70 | static bool CharacterIsOperator(char c) { |
| 71 | if (c == '_') { |
| 72 | return false; |
| 73 | } |
| 74 | if (c >= '!' && c <= '/') { |
| 75 | return true; |
| 76 | } |
| 77 | if (c >= ':' && c <= '@') { |
| 78 | return true; |
| 79 | } |
| 80 | if (c >= '[' && c <= '`') { |
| 81 | return true; |
| 82 | } |
| 83 | if (c >= '{' && c <= '~') { |
| 84 | return true; |
| 85 | } |
| 86 | return false; |
| 87 | } |
| 88 | |
| 89 | template <class TO> |
| 90 | static vector<TO> ConvertStrings(const vector<string> &strings) { |
| 91 | vector<TO> result; |
| 92 | for (auto &string : strings) { |
| 93 | result.emplace_back(string); |
| 94 | } |
| 95 | return result; |
| 96 | } |
| 97 | |
| 98 | static vector<SQLIdentifier> ConvertToSQLIdentifiers(const vector<string> &strings) { |
| 99 | return ConvertStrings<SQLIdentifier>(strings); |
| 100 | } |
| 101 | |
| 102 | static vector<SQLString> ConvertToSQLStrings(const vector<string> &strings) { |
| 103 | return ConvertStrings<SQLString>(strings); |
| 104 | } |
| 105 | |
| 106 | //! Returns true if the needle string exists in the haystack |
| 107 | DUCKDB_API static bool Contains(const string &haystack, const string &needle); |
| 108 | |
| 109 | //! Returns true if the target string starts with the given prefix |
| 110 | DUCKDB_API static bool StartsWith(string str, string prefix); |
| 111 | |
| 112 | //! Returns true if the target string <b>ends</b> with the given suffix. |
| 113 | DUCKDB_API static bool EndsWith(const string &str, const string &suffix); |
| 114 | |
| 115 | //! Repeat a string multiple times |
| 116 | DUCKDB_API static string Repeat(const string &str, const idx_t n); |
| 117 | |
| 118 | //! Split the input string based on newline char |
| 119 | DUCKDB_API static vector<string> Split(const string &str, char delimiter); |
| 120 | |
| 121 | //! Split the input string allong a quote. Note that any escaping is NOT supported. |
| 122 | DUCKDB_API static vector<string> SplitWithQuote(const string &str, char delimiter = ',', char quote = '"'); |
| 123 | |
| 124 | //! Join multiple strings into one string. Components are concatenated by the given separator |
| 125 | DUCKDB_API static string Join(const vector<string> &input, const string &separator); |
| 126 | |
| 127 | template <class T> |
| 128 | static string ToString(const vector<T> &input, const string &separator) { |
| 129 | vector<string> input_list; |
| 130 | for (auto &i : input) { |
| 131 | input_list.push_back(i.ToString()); |
| 132 | } |
| 133 | return StringUtil::Join(input: input_list, separator); |
| 134 | } |
| 135 | |
| 136 | //! Join multiple items of container with given size, transformed to string |
| 137 | //! using function, into one string using the given separator |
| 138 | template <typename C, typename S, typename Func> |
| 139 | static string Join(const C &input, S count, const string &separator, Func f) { |
| 140 | // The result |
| 141 | std::string result; |
| 142 | |
| 143 | // If the input isn't empty, append the first element. We do this so we |
| 144 | // don't need to introduce an if into the loop. |
| 145 | if (count > 0) { |
| 146 | result += f(input[0]); |
| 147 | } |
| 148 | |
| 149 | // Append the remaining input components, after the first |
| 150 | for (size_t i = 1; i < count; i++) { |
| 151 | result += separator + f(input[i]); |
| 152 | } |
| 153 | |
| 154 | return result; |
| 155 | } |
| 156 | |
| 157 | //! Return a string that formats the give number of bytes |
| 158 | DUCKDB_API static string BytesToHumanReadableString(idx_t bytes); |
| 159 | |
| 160 | //! Convert a string to uppercase |
| 161 | DUCKDB_API static string Upper(const string &str); |
| 162 | |
| 163 | //! Convert a string to lowercase |
| 164 | DUCKDB_API static string Lower(const string &str); |
| 165 | |
| 166 | DUCKDB_API static bool IsLower(const string &str); |
| 167 | |
| 168 | //! Case insensitive hash |
| 169 | DUCKDB_API static uint64_t CIHash(const string &str); |
| 170 | |
| 171 | //! Case insensitive equals |
| 172 | DUCKDB_API static bool CIEquals(const string &l1, const string &l2); |
| 173 | |
| 174 | //! Format a string using printf semantics |
| 175 | template <typename... Args> |
| 176 | static string Format(const string fmt_str, Args... params) { |
| 177 | return Exception::ConstructMessage(fmt_str, params...); |
| 178 | } |
| 179 | |
| 180 | //! Split the input string into a vector of strings based on the split string |
| 181 | DUCKDB_API static vector<string> Split(const string &input, const string &split); |
| 182 | |
| 183 | //! Remove the whitespace char in the left end of the string |
| 184 | DUCKDB_API static void LTrim(string &str); |
| 185 | //! Remove the whitespace char in the right end of the string |
| 186 | DUCKDB_API static void RTrim(string &str); |
| 187 | //! Remove the all chars from chars_to_trim char in the right end of the string |
| 188 | DUCKDB_API static void RTrim(string &str, const string &chars_to_trim); |
| 189 | //! Remove the whitespace char in the left and right end of the string |
| 190 | DUCKDB_API static void Trim(string &str); |
| 191 | |
| 192 | DUCKDB_API static string Replace(string source, const string &from, const string &to); |
| 193 | |
| 194 | //! Get the levenshtein distance from two strings |
| 195 | //! The not_equal_penalty is the penalty given when two characters in a string are not equal |
| 196 | //! The regular levenshtein distance has a not equal penalty of 1, which means changing a character is as expensive |
| 197 | //! as adding or removing one For similarity searches we often want to give extra weight to changing a character For |
| 198 | //! example: with an equal penalty of 1, "pg_am" is closer to "depdelay" than "depdelay_minutes" |
| 199 | //! with an equal penalty of 3, "depdelay_minutes" is closer to "depdelay" than to "pg_am" |
| 200 | DUCKDB_API static idx_t LevenshteinDistance(const string &s1, const string &s2, idx_t not_equal_penalty = 1); |
| 201 | |
| 202 | //! Returns the similarity score between two strings |
| 203 | DUCKDB_API static idx_t SimilarityScore(const string &s1, const string &s2); |
| 204 | //! Get the top-n strings (sorted by the given score distance) from a set of scores. |
| 205 | //! At least one entry is returned (if there is one). |
| 206 | //! Strings are only returned if they have a score less than the threshold. |
| 207 | DUCKDB_API static vector<string> TopNStrings(vector<std::pair<string, idx_t>> scores, idx_t n = 5, |
| 208 | idx_t threshold = 5); |
| 209 | //! Computes the levenshtein distance of each string in strings, and compares it to target, then returns TopNStrings |
| 210 | //! with the given params. |
| 211 | DUCKDB_API static vector<string> TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n = 5, |
| 212 | idx_t threshold = 5); |
| 213 | DUCKDB_API static string CandidatesMessage(const vector<string> &candidates, |
| 214 | const string &candidate = "Candidate bindings" ); |
| 215 | |
| 216 | //! Generate an error message in the form of "{message_prefix}: nearest_string, nearest_string2, ... |
| 217 | //! Equivalent to calling TopNLevenshtein followed by CandidatesMessage |
| 218 | DUCKDB_API static string CandidatesErrorMessage(const vector<string> &strings, const string &target, |
| 219 | const string &message_prefix, idx_t n = 5); |
| 220 | |
| 221 | //! Returns true if two null-terminated strings are equal or point to the same address. |
| 222 | //! Returns false if only one of the strings is nullptr |
| 223 | static bool Equals(const char *s1, const char *s2) { |
| 224 | if (s1 == s2) { |
| 225 | return true; |
| 226 | } |
| 227 | if (s1 == nullptr || s2 == nullptr) { |
| 228 | return false; |
| 229 | } |
| 230 | return strcmp(s1: s1, s2: s2) == 0; |
| 231 | } |
| 232 | }; |
| 233 | |
| 234 | } // namespace duckdb |
| 235 | |