1//===----------------------------------------------------------------------===//
2// DuckDB
3//
4// duckdb/common/string_util.hpp
5//
6//
7//===----------------------------------------------------------------------===//
8
9#pragma once
10
11#include "duckdb/common/constants.hpp"
12#include "duckdb/common/exception.hpp"
13#include "duckdb/common/vector.hpp"
14
15#include <cstring>
16
17namespace duckdb {
18
19/**
20 * String Utility Functions
21 * Note that these are not the most efficient implementations (i.e., they copy
22 * memory) and therefore they should only be used for debug messages and other
23 * such things.
24 */
25class StringUtil {
26public:
27 static string GenerateRandomName(idx_t length = 16);
28
29 static uint8_t GetHexValue(char c) {
30 if (c >= '0' && c <= '9') {
31 return c - '0';
32 }
33 if (c >= 'a' && c <= 'f') {
34 return c - 'a' + 10;
35 }
36 if (c >= 'A' && c <= 'F') {
37 return c - 'A' + 10;
38 }
39 throw InvalidInputException("Invalid input for hex digit: %s", string(c, 1));
40 }
41
42 static uint8_t GetBinaryValue(char c) {
43 if (c >= '0' && c <= '1') {
44 return c - '0';
45 }
46 throw InvalidInputException("Invalid input for binary digit: %s", string(c, 1));
47 }
48
49 static bool CharacterIsSpace(char c) {
50 return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
51 }
52 static bool CharacterIsNewline(char c) {
53 return c == '\n' || c == '\r';
54 }
55 static bool CharacterIsDigit(char c) {
56 return c >= '0' && c <= '9';
57 }
58 static bool CharacterIsHex(char c) {
59 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
60 }
61 static char CharacterToLower(char c) {
62 if (c >= 'A' && c <= 'Z') {
63 return c - ('A' - 'a');
64 }
65 return c;
66 }
67 static char CharacterIsAlpha(char c) {
68 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
69 }
70 static bool CharacterIsOperator(char c) {
71 if (c == '_') {
72 return false;
73 }
74 if (c >= '!' && c <= '/') {
75 return true;
76 }
77 if (c >= ':' && c <= '@') {
78 return true;
79 }
80 if (c >= '[' && c <= '`') {
81 return true;
82 }
83 if (c >= '{' && c <= '~') {
84 return true;
85 }
86 return false;
87 }
88
89 template <class TO>
90 static vector<TO> ConvertStrings(const vector<string> &strings) {
91 vector<TO> result;
92 for (auto &string : strings) {
93 result.emplace_back(string);
94 }
95 return result;
96 }
97
98 static vector<SQLIdentifier> ConvertToSQLIdentifiers(const vector<string> &strings) {
99 return ConvertStrings<SQLIdentifier>(strings);
100 }
101
102 static vector<SQLString> ConvertToSQLStrings(const vector<string> &strings) {
103 return ConvertStrings<SQLString>(strings);
104 }
105
106 //! Returns true if the needle string exists in the haystack
107 DUCKDB_API static bool Contains(const string &haystack, const string &needle);
108
109 //! Returns true if the target string starts with the given prefix
110 DUCKDB_API static bool StartsWith(string str, string prefix);
111
112 //! Returns true if the target string <b>ends</b> with the given suffix.
113 DUCKDB_API static bool EndsWith(const string &str, const string &suffix);
114
115 //! Repeat a string multiple times
116 DUCKDB_API static string Repeat(const string &str, const idx_t n);
117
118 //! Split the input string based on newline char
119 DUCKDB_API static vector<string> Split(const string &str, char delimiter);
120
121 //! Split the input string allong a quote. Note that any escaping is NOT supported.
122 DUCKDB_API static vector<string> SplitWithQuote(const string &str, char delimiter = ',', char quote = '"');
123
124 //! Join multiple strings into one string. Components are concatenated by the given separator
125 DUCKDB_API static string Join(const vector<string> &input, const string &separator);
126
127 template <class T>
128 static string ToString(const vector<T> &input, const string &separator) {
129 vector<string> input_list;
130 for (auto &i : input) {
131 input_list.push_back(i.ToString());
132 }
133 return StringUtil::Join(input: input_list, separator);
134 }
135
136 //! Join multiple items of container with given size, transformed to string
137 //! using function, into one string using the given separator
138 template <typename C, typename S, typename Func>
139 static string Join(const C &input, S count, const string &separator, Func f) {
140 // The result
141 std::string result;
142
143 // If the input isn't empty, append the first element. We do this so we
144 // don't need to introduce an if into the loop.
145 if (count > 0) {
146 result += f(input[0]);
147 }
148
149 // Append the remaining input components, after the first
150 for (size_t i = 1; i < count; i++) {
151 result += separator + f(input[i]);
152 }
153
154 return result;
155 }
156
157 //! Return a string that formats the give number of bytes
158 DUCKDB_API static string BytesToHumanReadableString(idx_t bytes);
159
160 //! Convert a string to uppercase
161 DUCKDB_API static string Upper(const string &str);
162
163 //! Convert a string to lowercase
164 DUCKDB_API static string Lower(const string &str);
165
166 DUCKDB_API static bool IsLower(const string &str);
167
168 //! Case insensitive hash
169 DUCKDB_API static uint64_t CIHash(const string &str);
170
171 //! Case insensitive equals
172 DUCKDB_API static bool CIEquals(const string &l1, const string &l2);
173
174 //! Format a string using printf semantics
175 template <typename... Args>
176 static string Format(const string fmt_str, Args... params) {
177 return Exception::ConstructMessage(fmt_str, params...);
178 }
179
180 //! Split the input string into a vector of strings based on the split string
181 DUCKDB_API static vector<string> Split(const string &input, const string &split);
182
183 //! Remove the whitespace char in the left end of the string
184 DUCKDB_API static void LTrim(string &str);
185 //! Remove the whitespace char in the right end of the string
186 DUCKDB_API static void RTrim(string &str);
187 //! Remove the all chars from chars_to_trim char in the right end of the string
188 DUCKDB_API static void RTrim(string &str, const string &chars_to_trim);
189 //! Remove the whitespace char in the left and right end of the string
190 DUCKDB_API static void Trim(string &str);
191
192 DUCKDB_API static string Replace(string source, const string &from, const string &to);
193
194 //! Get the levenshtein distance from two strings
195 //! The not_equal_penalty is the penalty given when two characters in a string are not equal
196 //! The regular levenshtein distance has a not equal penalty of 1, which means changing a character is as expensive
197 //! as adding or removing one For similarity searches we often want to give extra weight to changing a character For
198 //! example: with an equal penalty of 1, "pg_am" is closer to "depdelay" than "depdelay_minutes"
199 //! with an equal penalty of 3, "depdelay_minutes" is closer to "depdelay" than to "pg_am"
200 DUCKDB_API static idx_t LevenshteinDistance(const string &s1, const string &s2, idx_t not_equal_penalty = 1);
201
202 //! Returns the similarity score between two strings
203 DUCKDB_API static idx_t SimilarityScore(const string &s1, const string &s2);
204 //! Get the top-n strings (sorted by the given score distance) from a set of scores.
205 //! At least one entry is returned (if there is one).
206 //! Strings are only returned if they have a score less than the threshold.
207 DUCKDB_API static vector<string> TopNStrings(vector<std::pair<string, idx_t>> scores, idx_t n = 5,
208 idx_t threshold = 5);
209 //! Computes the levenshtein distance of each string in strings, and compares it to target, then returns TopNStrings
210 //! with the given params.
211 DUCKDB_API static vector<string> TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n = 5,
212 idx_t threshold = 5);
213 DUCKDB_API static string CandidatesMessage(const vector<string> &candidates,
214 const string &candidate = "Candidate bindings");
215
216 //! Generate an error message in the form of "{message_prefix}: nearest_string, nearest_string2, ...
217 //! Equivalent to calling TopNLevenshtein followed by CandidatesMessage
218 DUCKDB_API static string CandidatesErrorMessage(const vector<string> &strings, const string &target,
219 const string &message_prefix, idx_t n = 5);
220
221 //! Returns true if two null-terminated strings are equal or point to the same address.
222 //! Returns false if only one of the strings is nullptr
223 static bool Equals(const char *s1, const char *s2) {
224 if (s1 == s2) {
225 return true;
226 }
227 if (s1 == nullptr || s2 == nullptr) {
228 return false;
229 }
230 return strcmp(s1: s1, s2: s2) == 0;
231 }
232};
233
234} // namespace duckdb
235