1#pragma once
2
3#include <DataTypes/DataTypeArray.h>
4#include <DataTypes/DataTypeString.h>
5#include <Columns/ColumnString.h>
6#include <Columns/ColumnFixedString.h>
7#include <Columns/ColumnConst.h>
8#include <Columns/ColumnArray.h>
9#include <Common/StringUtils/StringUtils.h>
10#include <Common/typeid_cast.h>
11#include <Common/assert_cast.h>
12#include <Functions/IFunctionImpl.h>
13#include <Functions/Regexps.h>
14#include <Functions/FunctionHelpers.h>
15#include <IO/WriteHelpers.h>
16
17
18namespace DB
19{
20
/// Error codes thrown by the functions in this file.
/// These are declarations only (`extern`); the constants themselves are defined elsewhere.
namespace ErrorCodes
{
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    extern const int BAD_ARGUMENTS;
    /// Used by every checkArguments / getReturnTypeImpl below; it must be declared here
    /// so that this header does not rely on another include to provide it.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int ILLEGAL_COLUMN;
}
27
28
/** Functions that split strings into an array of strings or vice versa.
  *
  * splitByChar(sep, s)
  * splitByString(sep, s)
  * splitByRegexp(regexp, s)
  *
  * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp:
  * - the first subpattern, if the regexp has a subpattern;
  * - the zero subpattern (the whole match) otherwise;
  * - an empty array if nothing matches.
  *
  * arrayStringConcat(arr)
  * arrayStringConcat(arr, delimiter)
  * - join an array of strings into one string via a separator.
  *
  * alphaTokens(s) - select from the string the subsequences matching `[a-zA-Z]+`.
  *
  * URL functions are located separately.
  */
48
49
/// Read-only pointer to a byte of the string currently being tokenized.
using Pos = char const *;
51
52
53/// Substring generators. All of them have a common interface.
54
55class AlphaTokensImpl
56{
57private:
58 Pos pos;
59 Pos end;
60
61public:
62 /// Get the name of the function.
63 static constexpr auto name = "alphaTokens";
64 static String getName() { return name; }
65
66 static size_t getNumberOfArguments() { return 1; }
67
68 /// Check the type of the function's arguments.
69 static void checkArguments(const DataTypes & arguments)
70 {
71 if (!isString(arguments[0]))
72 throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
73 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
74 }
75
76 /// Initialize by the function arguments.
77 void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {}
78
79 /// Called for each next string.
80 void set(Pos pos_, Pos end_)
81 {
82 pos = pos_;
83 end = end_;
84 }
85
86 /// Returns the position of the argument, that is the column of strings
87 size_t getStringsArgumentPosition()
88 {
89 return 0;
90 }
91
92 /// Get the next token, if any, or return false.
93 bool get(Pos & token_begin, Pos & token_end)
94 {
95 /// Skip garbage
96 while (pos < end && !isAlphaASCII(*pos))
97 ++pos;
98
99 if (pos == end)
100 return false;
101
102 token_begin = pos;
103
104 while (pos < end && isAlphaASCII(*pos))
105 ++pos;
106
107 token_end = pos;
108
109 return true;
110 }
111};
112
113
114class SplitByCharImpl
115{
116private:
117 Pos pos;
118 Pos end;
119
120 char sep;
121
122public:
123 static constexpr auto name = "splitByChar";
124 static String getName() { return name; }
125 static size_t getNumberOfArguments() { return 2; }
126
127 static void checkArguments(const DataTypes & arguments)
128 {
129 if (!isString(arguments[0]))
130 throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",
131 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
132
133 if (!isString(arguments[1]))
134 throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName() + ". Must be String.",
135 ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
136 }
137
138 void init(Block & block, const ColumnNumbers & arguments)
139 {
140 const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[0]).column.get());
141
142 if (!col)
143 throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
144 + " of first argument of function " + getName() + ". Must be constant string.",
145 ErrorCodes::ILLEGAL_COLUMN);
146
147 String sep_str = col->getValue<String>();
148
149 if (sep_str.size() != 1)
150 throw Exception("Illegal separator for function " + getName() + ". Must be exactly one byte.", ErrorCodes::BAD_ARGUMENTS);
151
152 sep = sep_str[0];
153 }
154
155 /// Returns the position of the argument, that is the column of strings
156 size_t getStringsArgumentPosition()
157 {
158 return 1;
159 }
160
161 void set(Pos pos_, Pos end_)
162 {
163 pos = pos_;
164 end = end_;
165 }
166
167 bool get(Pos & token_begin, Pos & token_end)
168 {
169 if (!pos)
170 return false;
171
172 token_begin = pos;
173 pos = reinterpret_cast<Pos>(memchr(pos, sep, end - pos));
174
175 if (pos)
176 {
177 token_end = pos;
178 ++pos;
179 }
180 else
181 token_end = end;
182
183 return true;
184 }
185};
186
187
188class SplitByStringImpl
189{
190private:
191 Pos pos;
192 Pos end;
193
194 String sep;
195
196public:
197 static constexpr auto name = "splitByString";
198 static String getName() { return name; }
199 static size_t getNumberOfArguments() { return 2; }
200
201 static void checkArguments(const DataTypes & arguments)
202 {
203 SplitByCharImpl::checkArguments(arguments);
204 }
205
206 void init(Block & block, const ColumnNumbers & arguments)
207 {
208 const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[0]).column.get());
209
210 if (!col)
211 throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
212 + " of first argument of function " + getName() + ". Must be constant string.",
213 ErrorCodes::ILLEGAL_COLUMN);
214
215 sep = col->getValue<String>();
216
217 if (sep.empty())
218 throw Exception("Illegal separator for function " + getName() + ". Must be not empty.", ErrorCodes::BAD_ARGUMENTS);
219 }
220
221 /// Returns the position of the argument that is the column of strings
222 size_t getStringsArgumentPosition()
223 {
224 return 1;
225 }
226
227 /// Called for each next string.
228 void set(Pos pos_, Pos end_)
229 {
230 pos = pos_;
231 end = end_;
232 }
233
234 /// Get the next token, if any, or return false.
235 bool get(Pos & token_begin, Pos & token_end)
236 {
237 if (!pos)
238 return false;
239
240 token_begin = pos;
241 pos = reinterpret_cast<Pos>(memmem(pos, end - pos, sep.data(), sep.size()));
242
243 if (pos)
244 {
245 token_end = pos;
246 pos += sep.size();
247 }
248 else
249 token_end = end;
250
251 return true;
252 }
253};
254
255class ExtractAllImpl
256{
257private:
258 Regexps::Pool::Pointer re;
259 OptimizedRegularExpression::MatchVec matches;
260 size_t capture;
261
262 Pos pos;
263 Pos end;
264public:
265 static constexpr auto name = "extractAll";
266 static String getName() { return name; }
267 static size_t getNumberOfArguments() { return 2; }
268
269 /// Check the type of function arguments.
270 static void checkArguments(const DataTypes & arguments)
271 {
272 SplitByStringImpl::checkArguments(arguments);
273 }
274
275 /// Initialize by the function arguments.
276 void init(Block & block, const ColumnNumbers & arguments)
277 {
278 const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[1]).column.get());
279
280 if (!col)
281 throw Exception("Illegal column " + block.getByPosition(arguments[1]).column->getName()
282 + " of first argument of function " + getName() + ". Must be constant string.",
283 ErrorCodes::ILLEGAL_COLUMN);
284
285 re = Regexps::get<false, false>(col->getValue<String>());
286 capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0;
287
288 matches.resize(capture + 1);
289 }
290
291 /// Returns the position of the argument that is the column of strings
292 size_t getStringsArgumentPosition()
293 {
294 return 0;
295 }
296
297 /// Called for each next string.
298 void set(Pos pos_, Pos end_)
299 {
300 pos = pos_;
301 end = end_;
302 }
303
304 /// Get the next token, if any, or return false.
305 bool get(Pos & token_begin, Pos & token_end)
306 {
307 if (!pos || pos > end)
308 return false;
309
310 if (!re->match(pos, end - pos, matches) || !matches[0].length)
311 return false;
312
313 if (matches[capture].offset == std::string::npos)
314 {
315 /// Empty match.
316 token_begin = pos;
317 token_end = pos;
318 }
319 else
320 {
321 token_begin = pos + matches[capture].offset;
322 token_end = token_begin + matches[capture].length;
323 }
324
325 pos += matches[0].offset + matches[0].length;
326
327 return true;
328 }
329};
330
331/// A function that takes a string, and returns an array of substrings created by some generator.
332template <typename Generator>
333class FunctionTokens : public IFunction
334{
335public:
336 static constexpr auto name = Generator::name;
337 static FunctionPtr create(const Context &) { return std::make_shared<FunctionTokens>(); }
338
339 String getName() const override
340 {
341 return name;
342 }
343
344 size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); }
345
346 DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
347 {
348 Generator::checkArguments(arguments);
349
350 return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
351 }
352
353 void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
354 {
355 Generator generator;
356 generator.init(block, arguments);
357 size_t array_argument_position = arguments[generator.getStringsArgumentPosition()];
358
359 const ColumnString * col_str = checkAndGetColumn<ColumnString>(block.getByPosition(array_argument_position).column.get());
360 const ColumnConst * col_const_str =
361 checkAndGetColumnConstStringOrFixedString(block.getByPosition(array_argument_position).column.get());
362
363 auto col_res = ColumnArray::create(ColumnString::create());
364 ColumnString & res_strings = typeid_cast<ColumnString &>(col_res->getData());
365 ColumnArray::Offsets & res_offsets = col_res->getOffsets();
366 ColumnString::Chars & res_strings_chars = res_strings.getChars();
367 ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets();
368
369 if (col_str)
370 {
371 const ColumnString::Chars & src_chars = col_str->getChars();
372 const ColumnString::Offsets & src_offsets = col_str->getOffsets();
373
374 res_offsets.reserve(src_offsets.size());
375 res_strings_offsets.reserve(src_offsets.size() * 5); /// Constant 5 - at random.
376 res_strings_chars.reserve(src_chars.size());
377
378 Pos token_begin = nullptr;
379 Pos token_end = nullptr;
380
381 size_t size = src_offsets.size();
382 ColumnString::Offset current_src_offset = 0;
383 ColumnArray::Offset current_dst_offset = 0;
384 ColumnString::Offset current_dst_strings_offset = 0;
385 for (size_t i = 0; i < size; ++i)
386 {
387 Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
388 current_src_offset = src_offsets[i];
389 Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1;
390
391 generator.set(pos, end);
392
393 size_t j = 0;
394 while (generator.get(token_begin, token_end))
395 {
396 size_t token_size = token_end - token_begin;
397
398 res_strings_chars.resize(res_strings_chars.size() + token_size + 1);
399 memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size);
400 res_strings_chars[current_dst_strings_offset + token_size] = 0;
401
402 current_dst_strings_offset += token_size + 1;
403 res_strings_offsets.push_back(current_dst_strings_offset);
404 ++j;
405 }
406
407 current_dst_offset += j;
408 res_offsets.push_back(current_dst_offset);
409 }
410
411 block.getByPosition(result).column = std::move(col_res);
412 }
413 else if (col_const_str)
414 {
415 String src = col_const_str->getValue<String>();
416 Array dst;
417
418 generator.set(src.data(), src.data() + src.size());
419 Pos token_begin = nullptr;
420 Pos token_end = nullptr;
421
422 while (generator.get(token_begin, token_end))
423 dst.push_back(String(token_begin, token_end - token_begin));
424
425 block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_const_str->size(), dst);
426 }
427 else
428 throw Exception("Illegal columns " + block.getByPosition(array_argument_position).column->getName()
429 + ", " + block.getByPosition(array_argument_position).column->getName()
430 + " of arguments of function " + getName(),
431 ErrorCodes::ILLEGAL_COLUMN);
432 }
433};
434
435
436/// Joins an array of strings into one string via a separator.
437class FunctionArrayStringConcat : public IFunction
438{
439private:
440 void executeInternal(
441 const ColumnString::Chars & src_chars,
442 const ColumnString::Offsets & src_string_offsets,
443 const ColumnArray::Offsets & src_array_offsets,
444 const char * delimiter, const size_t delimiter_size,
445 ColumnString::Chars & dst_chars,
446 ColumnString::Offsets & dst_string_offsets)
447 {
448 size_t size = src_array_offsets.size();
449
450 if (!size)
451 return;
452
453 /// With a small margin - as if the separator goes after the last string of the array.
454 dst_chars.resize(
455 src_chars.size()
456 + delimiter_size * src_string_offsets.size() /// Separators after each string...
457 + src_array_offsets.size() /// Zero byte after each joined string
458 - src_string_offsets.size()); /// The former zero byte after each string of the array
459
460 /// There will be as many strings as there were arrays.
461 dst_string_offsets.resize(src_array_offsets.size());
462
463 ColumnArray::Offset current_src_array_offset = 0;
464 ColumnString::Offset current_src_string_offset = 0;
465
466 ColumnString::Offset current_dst_string_offset = 0;
467
468 /// Loop through the array of strings.
469 for (size_t i = 0; i < size; ++i)
470 {
471 /// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1.
472 for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset)
473 {
474 size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - current_src_string_offset - 1;
475
476 memcpySmallAllowReadWriteOverflow15(
477 &dst_chars[current_dst_string_offset], &src_chars[current_src_string_offset], bytes_to_copy);
478
479 current_src_string_offset = src_string_offsets[current_src_array_offset];
480 current_dst_string_offset += bytes_to_copy;
481
482 if (current_src_array_offset + 1 != next_src_array_offset)
483 {
484 memcpy(&dst_chars[current_dst_string_offset], delimiter, delimiter_size);
485 current_dst_string_offset += delimiter_size;
486 }
487 }
488
489 dst_chars[current_dst_string_offset] = 0;
490 ++current_dst_string_offset;
491
492 dst_string_offsets[i] = current_dst_string_offset;
493 }
494
495 dst_chars.resize(dst_string_offsets.back());
496 }
497
498public:
499 static constexpr auto name = "arrayStringConcat";
500 static FunctionPtr create(const Context &) { return std::make_shared<FunctionArrayStringConcat>(); }
501
502 String getName() const override
503 {
504 return name;
505 }
506
507 bool isVariadic() const override { return true; }
508 size_t getNumberOfArguments() const override { return 0; }
509
510 DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
511 {
512 if (arguments.size() != 1 && arguments.size() != 2)
513 throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
514 + toString(arguments.size()) + ", should be 1 or 2.",
515 ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
516
517 const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get());
518 if (!array_type || !isString(array_type->getNestedType()))
519 throw Exception("First argument for function " + getName() + " must be array of strings.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
520
521 if (arguments.size() == 2
522 && !isString(arguments[1]))
523 throw Exception("Second argument for function " + getName() + " must be constant string.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
524
525 return std::make_shared<DataTypeString>();
526 }
527
528 void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
529 {
530 String delimiter;
531 if (arguments.size() == 2)
532 {
533 const ColumnConst * col_delim = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[1]).column.get());
534 if (!col_delim)
535 throw Exception("Second argument for function " + getName() + " must be constant string.", ErrorCodes::ILLEGAL_COLUMN);
536
537 delimiter = col_delim->getValue<String>();
538 }
539
540 if (const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(block.getByPosition(arguments[0]).column.get()))
541 {
542 Array src_arr = col_const_arr->getValue<Array>();
543 String dst_str;
544 for (size_t i = 0, size = src_arr.size(); i < size; ++i)
545 {
546 if (i != 0)
547 dst_str += delimiter;
548 dst_str += src_arr[i].get<const String &>();
549 }
550
551 block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_const_arr->size(), dst_str);
552 }
553 else
554 {
555 const ColumnArray & col_arr = assert_cast<const ColumnArray &>(*block.getByPosition(arguments[0]).column);
556 const ColumnString & col_string = assert_cast<const ColumnString &>(col_arr.getData());
557
558 auto col_res = ColumnString::create();
559
560 executeInternal(
561 col_string.getChars(), col_string.getOffsets(), col_arr.getOffsets(),
562 delimiter.data(), delimiter.size(),
563 col_res->getChars(), col_res->getOffsets());
564
565 block.getByPosition(result).column = std::move(col_res);
566 }
567 }
568};
569
570
571using FunctionAlphaTokens = FunctionTokens<AlphaTokensImpl>;
572using FunctionSplitByChar = FunctionTokens<SplitByCharImpl>;
573using FunctionSplitByString = FunctionTokens<SplitByStringImpl>;
574using FunctionExtractAll = FunctionTokens<ExtractAllImpl>;
575
576}
577