1 | #pragma once |
2 | |
3 | #include <DataTypes/DataTypeArray.h> |
4 | #include <DataTypes/DataTypeString.h> |
5 | #include <Columns/ColumnString.h> |
6 | #include <Columns/ColumnFixedString.h> |
7 | #include <Columns/ColumnConst.h> |
8 | #include <Columns/ColumnArray.h> |
9 | #include <Common/StringUtils/StringUtils.h> |
10 | #include <Common/typeid_cast.h> |
11 | #include <Common/assert_cast.h> |
12 | #include <Functions/IFunctionImpl.h> |
13 | #include <Functions/Regexps.h> |
14 | #include <Functions/FunctionHelpers.h> |
15 | #include <IO/WriteHelpers.h> |
16 | |
17 | |
18 | namespace DB |
19 | { |
20 | |
/// Error codes thrown by the functions in this header.
/// Each code used below is declared here explicitly so that the header does not rely on
/// another include happening to declare it (repeating an extern declaration is harmless).
namespace ErrorCodes
{
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    extern const int BAD_ARGUMENTS;
    extern const int ILLEGAL_COLUMN;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
27 | |
28 | |
/** Functions that split strings into an array of strings or vice versa.
  *
  * splitByChar(sep, s)
  * splitByString(sep, s)
  * splitByRegexp(regexp, s)
  *
  * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp:
  * - the first subpattern, if the regexp has a subpattern;
  * - the zero subpattern (the whole match) otherwise;
  * - an empty array, if nothing matches.
  *
  * arrayStringConcat(arr)
  * arrayStringConcat(arr, delimiter)
  * - join an array of strings into one string via a separator.
  *
  * alphaTokens(s) - select from the string the subsequences matching `[a-zA-Z]+`.
  *
  * URL functions are located separately.
  */
48 | |
49 | |
/// Position inside a string: a read-only pointer to a byte. Used for token boundaries below.
using Pos = const char *;
51 | |
52 | |
53 | /// Substring generators. All of them have a common interface. |
54 | |
55 | class AlphaTokensImpl |
56 | { |
57 | private: |
58 | Pos pos; |
59 | Pos end; |
60 | |
61 | public: |
62 | /// Get the name of the function. |
63 | static constexpr auto name = "alphaTokens" ; |
64 | static String getName() { return name; } |
65 | |
66 | static size_t getNumberOfArguments() { return 1; } |
67 | |
68 | /// Check the type of the function's arguments. |
69 | static void checkArguments(const DataTypes & arguments) |
70 | { |
71 | if (!isString(arguments[0])) |
72 | throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String." , |
73 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
74 | } |
75 | |
76 | /// Initialize by the function arguments. |
77 | void init(Block & /*block*/, const ColumnNumbers & /*arguments*/) {} |
78 | |
79 | /// Called for each next string. |
80 | void set(Pos pos_, Pos end_) |
81 | { |
82 | pos = pos_; |
83 | end = end_; |
84 | } |
85 | |
86 | /// Returns the position of the argument, that is the column of strings |
87 | size_t getStringsArgumentPosition() |
88 | { |
89 | return 0; |
90 | } |
91 | |
92 | /// Get the next token, if any, or return false. |
93 | bool get(Pos & token_begin, Pos & token_end) |
94 | { |
95 | /// Skip garbage |
96 | while (pos < end && !isAlphaASCII(*pos)) |
97 | ++pos; |
98 | |
99 | if (pos == end) |
100 | return false; |
101 | |
102 | token_begin = pos; |
103 | |
104 | while (pos < end && isAlphaASCII(*pos)) |
105 | ++pos; |
106 | |
107 | token_end = pos; |
108 | |
109 | return true; |
110 | } |
111 | }; |
112 | |
113 | |
114 | class SplitByCharImpl |
115 | { |
116 | private: |
117 | Pos pos; |
118 | Pos end; |
119 | |
120 | char sep; |
121 | |
122 | public: |
123 | static constexpr auto name = "splitByChar" ; |
124 | static String getName() { return name; } |
125 | static size_t getNumberOfArguments() { return 2; } |
126 | |
127 | static void checkArguments(const DataTypes & arguments) |
128 | { |
129 | if (!isString(arguments[0])) |
130 | throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String." , |
131 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
132 | |
133 | if (!isString(arguments[1])) |
134 | throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName() + ". Must be String." , |
135 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
136 | } |
137 | |
138 | void init(Block & block, const ColumnNumbers & arguments) |
139 | { |
140 | const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[0]).column.get()); |
141 | |
142 | if (!col) |
143 | throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() |
144 | + " of first argument of function " + getName() + ". Must be constant string." , |
145 | ErrorCodes::ILLEGAL_COLUMN); |
146 | |
147 | String sep_str = col->getValue<String>(); |
148 | |
149 | if (sep_str.size() != 1) |
150 | throw Exception("Illegal separator for function " + getName() + ". Must be exactly one byte." , ErrorCodes::BAD_ARGUMENTS); |
151 | |
152 | sep = sep_str[0]; |
153 | } |
154 | |
155 | /// Returns the position of the argument, that is the column of strings |
156 | size_t getStringsArgumentPosition() |
157 | { |
158 | return 1; |
159 | } |
160 | |
161 | void set(Pos pos_, Pos end_) |
162 | { |
163 | pos = pos_; |
164 | end = end_; |
165 | } |
166 | |
167 | bool get(Pos & token_begin, Pos & token_end) |
168 | { |
169 | if (!pos) |
170 | return false; |
171 | |
172 | token_begin = pos; |
173 | pos = reinterpret_cast<Pos>(memchr(pos, sep, end - pos)); |
174 | |
175 | if (pos) |
176 | { |
177 | token_end = pos; |
178 | ++pos; |
179 | } |
180 | else |
181 | token_end = end; |
182 | |
183 | return true; |
184 | } |
185 | }; |
186 | |
187 | |
188 | class SplitByStringImpl |
189 | { |
190 | private: |
191 | Pos pos; |
192 | Pos end; |
193 | |
194 | String sep; |
195 | |
196 | public: |
197 | static constexpr auto name = "splitByString" ; |
198 | static String getName() { return name; } |
199 | static size_t getNumberOfArguments() { return 2; } |
200 | |
201 | static void checkArguments(const DataTypes & arguments) |
202 | { |
203 | SplitByCharImpl::checkArguments(arguments); |
204 | } |
205 | |
206 | void init(Block & block, const ColumnNumbers & arguments) |
207 | { |
208 | const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[0]).column.get()); |
209 | |
210 | if (!col) |
211 | throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() |
212 | + " of first argument of function " + getName() + ". Must be constant string." , |
213 | ErrorCodes::ILLEGAL_COLUMN); |
214 | |
215 | sep = col->getValue<String>(); |
216 | |
217 | if (sep.empty()) |
218 | throw Exception("Illegal separator for function " + getName() + ". Must be not empty." , ErrorCodes::BAD_ARGUMENTS); |
219 | } |
220 | |
221 | /// Returns the position of the argument that is the column of strings |
222 | size_t getStringsArgumentPosition() |
223 | { |
224 | return 1; |
225 | } |
226 | |
227 | /// Called for each next string. |
228 | void set(Pos pos_, Pos end_) |
229 | { |
230 | pos = pos_; |
231 | end = end_; |
232 | } |
233 | |
234 | /// Get the next token, if any, or return false. |
235 | bool get(Pos & token_begin, Pos & token_end) |
236 | { |
237 | if (!pos) |
238 | return false; |
239 | |
240 | token_begin = pos; |
241 | pos = reinterpret_cast<Pos>(memmem(pos, end - pos, sep.data(), sep.size())); |
242 | |
243 | if (pos) |
244 | { |
245 | token_end = pos; |
246 | pos += sep.size(); |
247 | } |
248 | else |
249 | token_end = end; |
250 | |
251 | return true; |
252 | } |
253 | }; |
254 | |
255 | class |
256 | { |
257 | private: |
258 | Regexps::Pool::Pointer ; |
259 | OptimizedRegularExpression::MatchVec ; |
260 | size_t ; |
261 | |
262 | Pos ; |
263 | Pos ; |
264 | public: |
265 | static constexpr auto = "extractAll" ; |
266 | static String () { return name; } |
267 | static size_t () { return 2; } |
268 | |
269 | /// Check the type of function arguments. |
270 | static void (const DataTypes & arguments) |
271 | { |
272 | SplitByStringImpl::checkArguments(arguments); |
273 | } |
274 | |
275 | /// Initialize by the function arguments. |
276 | void (Block & block, const ColumnNumbers & arguments) |
277 | { |
278 | const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[1]).column.get()); |
279 | |
280 | if (!col) |
281 | throw Exception("Illegal column " + block.getByPosition(arguments[1]).column->getName() |
282 | + " of first argument of function " + getName() + ". Must be constant string." , |
283 | ErrorCodes::ILLEGAL_COLUMN); |
284 | |
285 | re = Regexps::get<false, false>(col->getValue<String>()); |
286 | capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0; |
287 | |
288 | matches.resize(capture + 1); |
289 | } |
290 | |
291 | /// Returns the position of the argument that is the column of strings |
292 | size_t () |
293 | { |
294 | return 0; |
295 | } |
296 | |
297 | /// Called for each next string. |
298 | void (Pos pos_, Pos end_) |
299 | { |
300 | pos = pos_; |
301 | end = end_; |
302 | } |
303 | |
304 | /// Get the next token, if any, or return false. |
305 | bool (Pos & token_begin, Pos & token_end) |
306 | { |
307 | if (!pos || pos > end) |
308 | return false; |
309 | |
310 | if (!re->match(pos, end - pos, matches) || !matches[0].length) |
311 | return false; |
312 | |
313 | if (matches[capture].offset == std::string::npos) |
314 | { |
315 | /// Empty match. |
316 | token_begin = pos; |
317 | token_end = pos; |
318 | } |
319 | else |
320 | { |
321 | token_begin = pos + matches[capture].offset; |
322 | token_end = token_begin + matches[capture].length; |
323 | } |
324 | |
325 | pos += matches[0].offset + matches[0].length; |
326 | |
327 | return true; |
328 | } |
329 | }; |
330 | |
331 | /// A function that takes a string, and returns an array of substrings created by some generator. |
332 | template <typename Generator> |
333 | class FunctionTokens : public IFunction |
334 | { |
335 | public: |
336 | static constexpr auto name = Generator::name; |
337 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionTokens>(); } |
338 | |
339 | String getName() const override |
340 | { |
341 | return name; |
342 | } |
343 | |
344 | size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); } |
345 | |
346 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
347 | { |
348 | Generator::checkArguments(arguments); |
349 | |
350 | return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>()); |
351 | } |
352 | |
353 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
354 | { |
355 | Generator generator; |
356 | generator.init(block, arguments); |
357 | size_t array_argument_position = arguments[generator.getStringsArgumentPosition()]; |
358 | |
359 | const ColumnString * col_str = checkAndGetColumn<ColumnString>(block.getByPosition(array_argument_position).column.get()); |
360 | const ColumnConst * col_const_str = |
361 | checkAndGetColumnConstStringOrFixedString(block.getByPosition(array_argument_position).column.get()); |
362 | |
363 | auto col_res = ColumnArray::create(ColumnString::create()); |
364 | ColumnString & res_strings = typeid_cast<ColumnString &>(col_res->getData()); |
365 | ColumnArray::Offsets & res_offsets = col_res->getOffsets(); |
366 | ColumnString::Chars & res_strings_chars = res_strings.getChars(); |
367 | ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets(); |
368 | |
369 | if (col_str) |
370 | { |
371 | const ColumnString::Chars & src_chars = col_str->getChars(); |
372 | const ColumnString::Offsets & src_offsets = col_str->getOffsets(); |
373 | |
374 | res_offsets.reserve(src_offsets.size()); |
375 | res_strings_offsets.reserve(src_offsets.size() * 5); /// Constant 5 - at random. |
376 | res_strings_chars.reserve(src_chars.size()); |
377 | |
378 | Pos token_begin = nullptr; |
379 | Pos token_end = nullptr; |
380 | |
381 | size_t size = src_offsets.size(); |
382 | ColumnString::Offset current_src_offset = 0; |
383 | ColumnArray::Offset current_dst_offset = 0; |
384 | ColumnString::Offset current_dst_strings_offset = 0; |
385 | for (size_t i = 0; i < size; ++i) |
386 | { |
387 | Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]); |
388 | current_src_offset = src_offsets[i]; |
389 | Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1; |
390 | |
391 | generator.set(pos, end); |
392 | |
393 | size_t j = 0; |
394 | while (generator.get(token_begin, token_end)) |
395 | { |
396 | size_t token_size = token_end - token_begin; |
397 | |
398 | res_strings_chars.resize(res_strings_chars.size() + token_size + 1); |
399 | memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size); |
400 | res_strings_chars[current_dst_strings_offset + token_size] = 0; |
401 | |
402 | current_dst_strings_offset += token_size + 1; |
403 | res_strings_offsets.push_back(current_dst_strings_offset); |
404 | ++j; |
405 | } |
406 | |
407 | current_dst_offset += j; |
408 | res_offsets.push_back(current_dst_offset); |
409 | } |
410 | |
411 | block.getByPosition(result).column = std::move(col_res); |
412 | } |
413 | else if (col_const_str) |
414 | { |
415 | String src = col_const_str->getValue<String>(); |
416 | Array dst; |
417 | |
418 | generator.set(src.data(), src.data() + src.size()); |
419 | Pos token_begin = nullptr; |
420 | Pos token_end = nullptr; |
421 | |
422 | while (generator.get(token_begin, token_end)) |
423 | dst.push_back(String(token_begin, token_end - token_begin)); |
424 | |
425 | block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_const_str->size(), dst); |
426 | } |
427 | else |
428 | throw Exception("Illegal columns " + block.getByPosition(array_argument_position).column->getName() |
429 | + ", " + block.getByPosition(array_argument_position).column->getName() |
430 | + " of arguments of function " + getName(), |
431 | ErrorCodes::ILLEGAL_COLUMN); |
432 | } |
433 | }; |
434 | |
435 | |
436 | /// Joins an array of strings into one string via a separator. |
437 | class FunctionArrayStringConcat : public IFunction |
438 | { |
439 | private: |
440 | void executeInternal( |
441 | const ColumnString::Chars & src_chars, |
442 | const ColumnString::Offsets & src_string_offsets, |
443 | const ColumnArray::Offsets & src_array_offsets, |
444 | const char * delimiter, const size_t delimiter_size, |
445 | ColumnString::Chars & dst_chars, |
446 | ColumnString::Offsets & dst_string_offsets) |
447 | { |
448 | size_t size = src_array_offsets.size(); |
449 | |
450 | if (!size) |
451 | return; |
452 | |
453 | /// With a small margin - as if the separator goes after the last string of the array. |
454 | dst_chars.resize( |
455 | src_chars.size() |
456 | + delimiter_size * src_string_offsets.size() /// Separators after each string... |
457 | + src_array_offsets.size() /// Zero byte after each joined string |
458 | - src_string_offsets.size()); /// The former zero byte after each string of the array |
459 | |
460 | /// There will be as many strings as there were arrays. |
461 | dst_string_offsets.resize(src_array_offsets.size()); |
462 | |
463 | ColumnArray::Offset current_src_array_offset = 0; |
464 | ColumnString::Offset current_src_string_offset = 0; |
465 | |
466 | ColumnString::Offset current_dst_string_offset = 0; |
467 | |
468 | /// Loop through the array of strings. |
469 | for (size_t i = 0; i < size; ++i) |
470 | { |
471 | /// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1. |
472 | for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset) |
473 | { |
474 | size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - current_src_string_offset - 1; |
475 | |
476 | memcpySmallAllowReadWriteOverflow15( |
477 | &dst_chars[current_dst_string_offset], &src_chars[current_src_string_offset], bytes_to_copy); |
478 | |
479 | current_src_string_offset = src_string_offsets[current_src_array_offset]; |
480 | current_dst_string_offset += bytes_to_copy; |
481 | |
482 | if (current_src_array_offset + 1 != next_src_array_offset) |
483 | { |
484 | memcpy(&dst_chars[current_dst_string_offset], delimiter, delimiter_size); |
485 | current_dst_string_offset += delimiter_size; |
486 | } |
487 | } |
488 | |
489 | dst_chars[current_dst_string_offset] = 0; |
490 | ++current_dst_string_offset; |
491 | |
492 | dst_string_offsets[i] = current_dst_string_offset; |
493 | } |
494 | |
495 | dst_chars.resize(dst_string_offsets.back()); |
496 | } |
497 | |
498 | public: |
499 | static constexpr auto name = "arrayStringConcat" ; |
500 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionArrayStringConcat>(); } |
501 | |
502 | String getName() const override |
503 | { |
504 | return name; |
505 | } |
506 | |
507 | bool isVariadic() const override { return true; } |
508 | size_t getNumberOfArguments() const override { return 0; } |
509 | |
510 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
511 | { |
512 | if (arguments.size() != 1 && arguments.size() != 2) |
513 | throw Exception("Number of arguments for function " + getName() + " doesn't match: passed " |
514 | + toString(arguments.size()) + ", should be 1 or 2." , |
515 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
516 | |
517 | const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get()); |
518 | if (!array_type || !isString(array_type->getNestedType())) |
519 | throw Exception("First argument for function " + getName() + " must be array of strings." , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
520 | |
521 | if (arguments.size() == 2 |
522 | && !isString(arguments[1])) |
523 | throw Exception("Second argument for function " + getName() + " must be constant string." , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
524 | |
525 | return std::make_shared<DataTypeString>(); |
526 | } |
527 | |
528 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
529 | { |
530 | String delimiter; |
531 | if (arguments.size() == 2) |
532 | { |
533 | const ColumnConst * col_delim = checkAndGetColumnConstStringOrFixedString(block.getByPosition(arguments[1]).column.get()); |
534 | if (!col_delim) |
535 | throw Exception("Second argument for function " + getName() + " must be constant string." , ErrorCodes::ILLEGAL_COLUMN); |
536 | |
537 | delimiter = col_delim->getValue<String>(); |
538 | } |
539 | |
540 | if (const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(block.getByPosition(arguments[0]).column.get())) |
541 | { |
542 | Array src_arr = col_const_arr->getValue<Array>(); |
543 | String dst_str; |
544 | for (size_t i = 0, size = src_arr.size(); i < size; ++i) |
545 | { |
546 | if (i != 0) |
547 | dst_str += delimiter; |
548 | dst_str += src_arr[i].get<const String &>(); |
549 | } |
550 | |
551 | block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_const_arr->size(), dst_str); |
552 | } |
553 | else |
554 | { |
555 | const ColumnArray & col_arr = assert_cast<const ColumnArray &>(*block.getByPosition(arguments[0]).column); |
556 | const ColumnString & col_string = assert_cast<const ColumnString &>(col_arr.getData()); |
557 | |
558 | auto col_res = ColumnString::create(); |
559 | |
560 | executeInternal( |
561 | col_string.getChars(), col_string.getOffsets(), col_arr.getOffsets(), |
562 | delimiter.data(), delimiter.size(), |
563 | col_res->getChars(), col_res->getOffsets()); |
564 | |
565 | block.getByPosition(result).column = std::move(col_res); |
566 | } |
567 | } |
568 | }; |
569 | |
570 | |
571 | using FunctionAlphaTokens = FunctionTokens<AlphaTokensImpl>; |
572 | using FunctionSplitByChar = FunctionTokens<SplitByCharImpl>; |
573 | using FunctionSplitByString = FunctionTokens<SplitByStringImpl>; |
574 | using = FunctionTokens<ExtractAllImpl>; |
575 | |
576 | } |
577 | |