string_util.cpp source code [Velox/build/_deps/duckdb-src/src/common/string_util.cpp]

1	#include "duckdb/common/string_util.hpp"
2
3	#include "duckdb/common/exception.hpp"
4	#include "duckdb/common/pair.hpp"
5	#include "duckdb/common/to_string.hpp"
6	#include "duckdb/common/helper.hpp"
7
8	#include <algorithm>
9	#include <cctype>
10	#include <iomanip>
11	#include <memory>
12	#include <sstream>
13	#include <stdarg.h>
14	#include <string.h>
15	#include <random>
16
17	namespace duckdb {
18
19	string StringUtil::GenerateRandomName(idx_t length) {
20	std::random_device rd;
21	std::mt19937 gen(rd ());
22	std::uniform_int_distribution<> dis(`0`, `15`);
23
24	std::stringstream ss;
25	ss << std::hex;
26	for (idx_t i = `0`; i < length; i++) {
27	ss << dis (gen);
28	}
29	return ss.str();
30	}
31
32	bool StringUtil::Contains(const string &haystack, const string &needle) {
33	return (haystack.find(str: needle) != string::npos);
34	}
35
36	void StringUtil::LTrim(string &str) {
37	auto it = str.begin();
38	while (it != str.end() && CharacterIsSpace(c: *it)) {
39	it ++;
40	}
41	str.erase(first: str.begin(), last: it);
42	}
43
44	// Remove trailing ' ', '\f', '\n', '\r', '\t', '\v'
45	void StringUtil::RTrim(string &str) {
46	str.erase(first: find_if(first: str.rbegin(), last: str.rend(), pred: [](int ch) { return ch > `0` && !CharacterIsSpace(c: ch); }).base(),
47	last: str.end());
48	}
49
50	void StringUtil::RTrim(string &str, const string &chars_to_trim) {
51	str.erase(first: find_if(first: str.rbegin(), last: str.rend(),
52	pred: [&chars_to_trim](int ch) { return ch > `0` && chars_to_trim.find(c: ch) == string::npos; })
53	.base(),
54	last: str.end());
55	}
56
57	void StringUtil::Trim(string &str) {
58	StringUtil::LTrim(str);
59	StringUtil::RTrim(str);
60	}
61
62	bool StringUtil::StartsWith(string str, string prefix) {
63	if (prefix.size() > str.size()) {
64	return false;
65	}
66	return equal(prefix.begin(), prefix.end(), str.begin());
67	}
68
69	bool StringUtil::EndsWith(const string &str, const string &suffix) {
70	if (suffix.size() > str.size()) {
71	return false;
72	}
73	return equal(suffix.rbegin(), suffix.rend(), str.rbegin());
74	}
75
76	string StringUtil::Repeat(const string &str, idx_t n) {
77	std::ostringstream os;
78	for (idx_t i = `0`; i < n; i++) {
79	os << str;
80	}
81	return (os.str());
82	}
83
84	vector<string> StringUtil::Split(const string &str, char delimiter) {
85	std::stringstream ss(str);
86	vector<string> lines;
87	string temp;
88	while (getline(in&: ss, str&: temp, delim: delimiter)) {
89	lines.push_back(x: temp);
90	}
91	return (lines);
92	}
93
94	namespace string_util_internal {
95
96	inline void SkipSpaces(const string &str, idx_t &index) {
97	while (index < str.size() && std::isspace(str [index])) {
98	index++;
99	}
100	}
101
102	inline void ConsumeLetter(const string &str, idx_t &index, char expected) {
103	if (index >= str.size() \|\| str [index] != expected) {
104	throw ParserException ("Invalid quoted list: %s", str);
105	}
106
107	index++;
108	}
109
110	template <typename F>
111	inline void TakeWhile(const string &str, idx_t &index, const F &cond, string &taker) {
112	while (index < str.size() && cond(str [index])) {
113	taker.push_back(c: str [index]);
114	index++;
115	}
116	}
117
118	inline string TakePossiblyQuotedItem(const string &str, idx_t &index, char delimiter, char quote) {
119	string entry;
120
121	if (str [index] == quote) {
122	index++;
123	TakeWhile(
124	str, index, cond: [quote](char c) { return c != quote; }, taker&: entry);
125	ConsumeLetter(str, index, expected: quote);
126	} else {
127	TakeWhile(
128	str, index, cond: [delimiter, quote](char c) { return c != delimiter && c != quote && !std::isspace(c); }, taker&: entry);
129	}
130
131	return entry;
132	}
133
134	} // namespace string_util_internal
135
136	vector<string> StringUtil::SplitWithQuote(const string &str, char delimiter, char quote) {
137	vector<string> entries;
138	idx_t i = `0`;
139
140	string_util_internal::SkipSpaces(str, index&: i);
141	while (i < str.size()) {
142	if (!entries.empty()) {
143	string_util_internal::ConsumeLetter(str, index&: i, expected: delimiter);
144	}
145
146	entries.emplace_back(args: string_util_internal::TakePossiblyQuotedItem(str, index&: i, delimiter, quote));
147	string_util_internal::SkipSpaces(str, index&: i);
148	}
149
150	return entries;
151	}
152
153	string StringUtil::Join(const vector<string> &input, const string &separator) {
154	return StringUtil::Join(input, count: input.size(), separator, f: [](const string &s) { return s; });
155	}
156
157	string StringUtil::BytesToHumanReadableString(idx_t bytes) {
158	string db_size;
159	auto kilobytes = bytes / `1000`;
160	auto megabytes = kilobytes / `1000`;
161	kilobytes -= megabytes * `1000`;
162	auto gigabytes = megabytes / `1000`;
163	megabytes -= gigabytes * `1000`;
164	auto terabytes = gigabytes / `1000`;
165	gigabytes -= terabytes * `1000`;
166	auto petabytes = terabytes / `1000`;
167	terabytes -= petabytes * `1000`;
168	if (petabytes > `0`) {
169	return to_string(val: petabytes) + "." + to_string(val: terabytes / `100`) + "PB";
170	}
171	if (terabytes > `0`) {
172	return to_string(val: terabytes) + "." + to_string(val: gigabytes / `100`) + "TB";
173	} else if (gigabytes > `0`) {
174	return to_string(val: gigabytes) + "." + to_string(val: megabytes / `100`) + "GB";
175	} else if (megabytes > `0`) {
176	return to_string(val: megabytes) + "." + to_string(val: kilobytes / `100`) + "MB";
177	} else if (kilobytes > `0`) {
178	return to_string(val: kilobytes) + "KB";
179	} else {
180	return to_string(val: bytes) + (bytes == `1` ? " byte" : " bytes");
181	}
182	}
183
184	string StringUtil::Upper(const string &str) {
185	string copy(str);
186	transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return std::toupper(c: c); });
187	return (copy);
188	}
189
190	string StringUtil::Lower(const string &str) {
191	string copy(str);
192	transform(first: copy.begin(), last: copy.end(), result: copy.begin(), unary_op: [](unsigned char c) { return StringUtil::CharacterToLower(c); });
193	return (copy);
194	}
195
196	bool StringUtil::IsLower(const string &str) {
197	return str == Lower(str);
198	}
199
200	// Jenkins hash function: https://en.wikipedia.org/wiki/Jenkins_hash_function
201	uint64_t StringUtil::CIHash(const string &str) {
202	uint32_t hash = `0`;
203	for (auto c : str) {
204	hash += StringUtil::CharacterToLower(c);
205	hash += hash << `10`;
206	hash ^= hash >> `6`;
207	}
208	hash += hash << `3`;
209	hash ^= hash >> `11`;
210	hash += hash << `15`;
211	return hash;
212	}
213
214	bool StringUtil::CIEquals(const string &l1, const string &l2) {
215	if (l1.size() != l2.size()) {
216	return false;
217	}
218	for (idx_t c = `0`; c < l1.size(); c++) {
219	if (StringUtil::CharacterToLower(c: l1 [c]) != StringUtil::CharacterToLower(c: l2 [c])) {
220	return false;
221	}
222	}
223	return true;
224	}
225
226	vector<string> StringUtil::Split(const string &input, const string &split) {
227	vector<string> splits;
228
229	idx_t last = `0`;
230	idx_t input_len = input.size();
231	idx_t split_len = split.size();
232	while (last <= input_len) {
233	idx_t next = input.find(str: split, pos: last);
234	if (next == string::npos) {
235	next = input_len;
236	}
237
238	// Push the substring [last, next) on to splits
239	string substr = input.substr(pos: last, n: next - last);
240	if (!substr.empty()) {
241	splits.push_back(x: substr);
242	}
243	last = next + split_len;
244	}
245	if (splits.empty()) {
246	splits.push_back(x: input);
247	}
248	return splits;
249	}
250
251	string StringUtil::Replace(string source, const string &from, const string &to) {
252	if (from.empty()) {
253	throw InternalException ("Invalid argument to StringUtil::Replace - empty FROM");
254	}
255	idx_t start_pos = `0`;
256	while ((start_pos = source.find(str: from, pos: start_pos)) != string::npos) {
257	source.replace(pos: start_pos, n: from.length(), str: to);
258	start_pos += to.length(); // In case 'to' contains 'from', like
259	// replacing 'x' with 'yx'
260	}
261	return source;
262	}
263
264	vector<string> StringUtil::TopNStrings(vector<pair<string, idx_t>> scores, idx_t n, idx_t threshold) {
265	if (scores.empty()) {
266	return vector<string>();
267	}
268	sort(first: scores.begin(), last: scores.end(), comp: [](const pair<string, idx_t> &a, const pair<string, idx_t> &b) -> bool {
269	return a.second < b.second \|\| (a.second == b.second && a.first.size() < b.first.size());
270	});
271	vector<string> result;
272	result.push_back(x: scores [`0`].first);
273	for (idx_t i = `1`; i < MinValue<idx_t>(a: scores.size(), b: n); i++) {
274	if (scores [i].second > threshold) {
275	break;
276	}
277	result.push_back(x: scores [i].first);
278	}
279	return result;
280	}
281
282	struct LevenshteinArray {
283	LevenshteinArray(idx_t len1, idx_t len2) : len1(len1) {
284	dist = make_unsafe_uniq_array<idx_t>(n: len1 * len2);
285	}
286
287	idx_t &Score(idx_t i, idx_t j) {
288	return dist [GetIndex(i, j)];
289	}
290
291	private:
292	idx_t len1;
293	unsafe_unique_array<idx_t> dist;
294
295	idx_t GetIndex(idx_t i, idx_t j) {
296	return j * len1 + i;
297	}
298	};
299
300	// adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++
301	idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p, idx_t not_equal_penalty) {
302	auto s1 = StringUtil::Lower(str: s1_p);
303	auto s2 = StringUtil::Lower(str: s2_p);
304	idx_t len1 = s1.size();
305	idx_t len2 = s2.size();
306	if (len1 == `0`) {
307	return len2;
308	}
309	if (len2 == `0`) {
310	return len1;
311	}
312	LevenshteinArray array(len1 + `1`, len2 + `1`);
313	array.Score(i: `0`, j: `0`) = `0`;
314	for (idx_t i = `0`; i <= len1; i++) {
315	array.Score(i, j: `0`) = i;
316	}
317	for (idx_t j = `0`; j <= len2; j++) {
318	array.Score(i: `0`, j) = j;
319	}
320	for (idx_t i = `1`; i <= len1; i++) {
321	for (idx_t j = `1`; j <= len2; j++) {
322	// d[i][j] = std::min({ d[i - 1][j] + 1,
323	// d[i][j - 1] + 1,
324	// d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) });
325	int equal = s1 [i - `1`] == s2 [j - `1`] ? `0` : not_equal_penalty;
326	idx_t adjacent_score1 = array.Score(i: i - `1`, j) + `1`;
327	idx_t adjacent_score2 = array.Score(i, j: j - `1`) + `1`;
328	idx_t adjacent_score3 = array.Score(i: i - `1`, j: j - `1`) + equal;
329
330	idx_t t = MinValue<idx_t>(a: adjacent_score1, b: adjacent_score2);
331	array.Score(i, j) = MinValue<idx_t>(a: t, b: adjacent_score3);
332	}
333	}
334	return array.Score(i: len1, j: len2);
335	}
336
337	idx_t StringUtil::SimilarityScore(const string &s1, const string &s2) {
338	return LevenshteinDistance(s1_p: s1, s2_p: s2, not_equal_penalty: `3`);
339	}
340
341	vector<string> StringUtil::TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n,
342	idx_t threshold) {
343	vector<pair<string, idx_t>> scores;
344	scores.reserve(n: strings.size());
345	for (auto &str : strings) {
346	if (target.size() < str.size()) {
347	scores.emplace_back(args: str, args: SimilarityScore(s1: str.substr(pos: `0`, n: target.size()), s2: target));
348	} else {
349	scores.emplace_back(args: str, args: SimilarityScore(s1: str, s2: target));
350	}
351	}
352	return TopNStrings(scores, n, threshold);
353	}
354
355	string StringUtil::CandidatesMessage(const vector<string> &candidates, const string &candidate) {
356	string result_str;
357	if (!candidates.empty()) {
358	result_str = "\n" + candidate + ": ";
359	for (idx_t i = `0`; i < candidates.size(); i++) {
360	if (i > `0`) {
361	result_str += ", ";
362	}
363	result_str += "\"" + candidates [i] + "\"";
364	}
365	}
366	return result_str;
367	}
368
369	string StringUtil::CandidatesErrorMessage(const vector<string> &strings, const string &target,
370	const string &message_prefix, idx_t n) {
371	auto closest_strings = StringUtil::TopNLevenshtein(strings, target, n);
372	return StringUtil::CandidatesMessage(candidates: closest_strings, candidate: message_prefix);
373	}
374
375	} // namespace duckdb
376

Browse the source code of Velox/build/_deps/duckdb-src/src/common/string_util.cpp