1 | #include <DataTypes/DataTypeString.h> |
2 | #include <DataTypes/DataTypeDate.h> |
3 | #include <DataTypes/DataTypeDateTime.h> |
4 | #include <DataTypes/DataTypeDateTime64.h> |
5 | #include <Columns/ColumnString.h> |
6 | |
7 | #include <Functions/IFunctionImpl.h> |
8 | #include <Functions/FunctionHelpers.h> |
9 | #include <Functions/FunctionFactory.h> |
10 | #include <Functions/DateTimeTransforms.h> |
11 | #include <Functions/extractTimeZoneFromFunctionArguments.h> |
12 | |
13 | #include <IO/WriteHelpers.h> |
14 | |
15 | #include <common/DateLUTImpl.h> |
16 | #include <common/find_symbols.h> |
17 | #include <Core/DecimalFunctions.h> |
18 | |
19 | #include <type_traits> |
20 | |
21 | |
22 | namespace DB |
23 | { |
24 | |
25 | namespace ErrorCodes |
26 | { |
27 | extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; |
28 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
29 | extern const int NOT_IMPLEMENTED; |
30 | extern const int ILLEGAL_COLUMN; |
31 | extern const int BAD_ARGUMENTS; |
32 | } |
33 | |
34 | namespace |
35 | { |
36 | // in private namespace to avoid GCC 9 error: "explicit specialization in non-namespace scope" |
37 | template <typename DataType> struct ActionaValueTypeMap {}; |
38 | template <> struct ActionaValueTypeMap<DataTypeDate> { using ActionValueType = UInt16; }; |
39 | template <> struct ActionaValueTypeMap<DataTypeDateTime> { using ActionValueType = UInt32; }; |
40 | // TODO(vnemkov): once there is support for Int64 in LUT, make that Int64. |
41 | // TODO(vnemkov): to add sub-second format instruction, make that DateTime64 and do some math in Action<T>. |
42 | template <> struct ActionaValueTypeMap<DataTypeDateTime64> { using ActionValueType = UInt32; }; |
43 | } |
44 | |
45 | /** formatDateTime(time, 'pattern') |
46 | * Performs formatting of time, according to provided pattern. |
47 | * |
48 | * This function is optimized with an assumption, that the resulting strings are fixed width. |
49 | * (This assumption is fulfilled for currently supported formatting options). |
50 | * |
51 | * It is implemented in two steps. |
52 | * At first step, it creates a pattern of zeros, literal characters, whitespaces, etc. |
53 | * and quickly fills resulting character array (string column) with this pattern. |
54 | * At second step, it walks across the resulting character array and modifies/replaces specific charaters, |
55 | * by calling some functions by pointers and shifting cursor by specified amount. |
56 | * |
57 | * Advantages: |
58 | * - memcpy is mostly unrolled; |
59 | * - low number of arithmetic ops due to pre-filled pattern; |
60 | * - for somewhat reason, function by pointer call is faster than switch/case. |
61 | * |
62 | * Possible further optimization options: |
63 | * - slightly interleave first and second step for better cache locality |
64 | * (but it has no sense when character array fits in L1d cache); |
65 | * - avoid indirect function calls and inline functions with JIT compilation. |
66 | * |
67 | * Performance on Intel(R) Core(TM) i7-6700 CPU @ 3.40GHz: |
68 | * |
69 | * WITH formatDateTime(now() + number, '%H:%M:%S') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x); |
70 | * - 97 million rows per second per core; |
71 | * |
72 | * WITH formatDateTime(toDateTime('2018-01-01 00:00:00') + number, '%F %T') AS x SELECT count() FROM system.numbers WHERE NOT ignore(x) |
73 | * - 71 million rows per second per core; |
74 | * |
75 | * select count() from (select formatDateTime(t, '%m/%d/%Y %H:%M:%S') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000))); |
76 | * - 53 million rows per second per core; |
77 | * |
78 | * select count() from (select formatDateTime(t, 'Hello %Y World') from (select toDateTime('2018-01-01 00:00:00')+number as t from numbers(100000000))); |
79 | * - 138 million rows per second per core; |
80 | * |
81 | * PS. We can make this function to return FixedString. Currently it returns String. |
82 | */ |
83 | class FunctionFormatDateTime : public IFunction |
84 | { |
85 | private: |
86 | /// Time is either UInt32 for DateTime or UInt16 for Date. |
87 | template <typename Time> |
88 | class Action |
89 | { |
90 | public: |
91 | using Func = void (*)(char *, Time, const DateLUTImpl &); |
92 | |
93 | Func func; |
94 | size_t shift; |
95 | |
96 | Action(Func func_, size_t shift_ = 0) : func(func_), shift(shift_) {} |
97 | |
98 | void perform(char *& target, Time source, const DateLUTImpl & timezone) |
99 | { |
100 | func(target, source, timezone); |
101 | target += shift; |
102 | } |
103 | |
104 | private: |
105 | template <typename T> |
106 | static inline void writeNumber2(char * p, T v) |
107 | { |
108 | memcpy(p, &digits100[v * 2], 2); |
109 | } |
110 | |
111 | template <typename T> |
112 | static inline void writeNumber3(char * p, T v) |
113 | { |
114 | writeNumber2(p, v / 10); |
115 | p[2] += v % 10; |
116 | } |
117 | |
118 | template <typename T> |
119 | static inline void writeNumber4(char * p, T v) |
120 | { |
121 | writeNumber2(p, v / 100); |
122 | writeNumber2(p + 2, v % 100); |
123 | } |
124 | |
125 | public: |
126 | static void noop(char *, Time, const DateLUTImpl &) |
127 | { |
128 | } |
129 | |
130 | static void century(char * target, Time source, const DateLUTImpl & timezone) |
131 | { |
132 | auto year = ToYearImpl::execute(source, timezone); |
133 | auto century = year / 100; |
134 | writeNumber2(target, century); |
135 | } |
136 | |
137 | static void dayOfMonth(char * target, Time source, const DateLUTImpl & timezone) |
138 | { |
139 | writeNumber2(target, ToDayOfMonthImpl::execute(source, timezone)); |
140 | } |
141 | |
142 | static void americanDate(char * target, Time source, const DateLUTImpl & timezone) |
143 | { |
144 | writeNumber2(target, ToMonthImpl::execute(source, timezone)); |
145 | writeNumber2(target + 3, ToDayOfMonthImpl::execute(source, timezone)); |
146 | writeNumber2(target + 6, ToYearImpl::execute(source, timezone) % 100); |
147 | } |
148 | |
149 | static void dayOfMonthSpacePadded(char * target, Time source, const DateLUTImpl & timezone) |
150 | { |
151 | auto day = ToDayOfMonthImpl::execute(source, timezone); |
152 | if (day < 10) |
153 | target[1] += day; |
154 | else |
155 | writeNumber2(target, day); |
156 | } |
157 | |
158 | static void ISO8601Date(char * target, Time source, const DateLUTImpl & timezone) |
159 | { |
160 | writeNumber4(target, ToYearImpl::execute(source, timezone)); |
161 | writeNumber2(target + 5, ToMonthImpl::execute(source, timezone)); |
162 | writeNumber2(target + 8, ToDayOfMonthImpl::execute(source, timezone)); |
163 | } |
164 | |
165 | static void dayOfYear(char * target, Time source, const DateLUTImpl & timezone) |
166 | { |
167 | writeNumber3(target, ToDayOfYearImpl::execute(source, timezone)); |
168 | } |
169 | |
170 | static void month(char * target, Time source, const DateLUTImpl & timezone) |
171 | { |
172 | writeNumber2(target, ToMonthImpl::execute(source, timezone)); |
173 | } |
174 | |
175 | static void dayOfWeek(char * target, Time source, const DateLUTImpl & timezone) |
176 | { |
177 | *target += ToDayOfWeekImpl::execute(source, timezone); |
178 | } |
179 | |
180 | static void dayOfWeek0To6(char * target, Time source, const DateLUTImpl & timezone) |
181 | { |
182 | auto day = ToDayOfWeekImpl::execute(source, timezone); |
183 | *target += (day == 7 ? 0 : day); |
184 | } |
185 | |
186 | static void ISO8601Week(char * target, Time source, const DateLUTImpl & timezone) |
187 | { |
188 | writeNumber2(target, ToISOWeekImpl::execute(source, timezone)); |
189 | } |
190 | |
191 | static void year2(char * target, Time source, const DateLUTImpl & timezone) |
192 | { |
193 | writeNumber2(target, ToYearImpl::execute(source, timezone) % 100); |
194 | } |
195 | |
196 | static void year4(char * target, Time source, const DateLUTImpl & timezone) |
197 | { |
198 | writeNumber4(target, ToYearImpl::execute(source, timezone)); |
199 | } |
200 | |
201 | static void hour24(char * target, Time source, const DateLUTImpl & timezone) |
202 | { |
203 | writeNumber2(target, ToHourImpl::execute(source, timezone)); |
204 | } |
205 | |
206 | static void hour12(char * target, Time source, const DateLUTImpl & timezone) |
207 | { |
208 | auto x = ToHourImpl::execute(source, timezone); |
209 | writeNumber2(target, x == 0 ? 12 : (x > 12 ? x - 12 : x)); |
210 | } |
211 | |
212 | static void minute(char * target, Time source, const DateLUTImpl & timezone) |
213 | { |
214 | writeNumber2(target, ToMinuteImpl::execute(source, timezone)); |
215 | } |
216 | |
217 | static void AMPM(char * target, Time source, const DateLUTImpl & timezone) |
218 | { |
219 | auto hour = ToHourImpl::execute(source, timezone); |
220 | if (hour >= 12) |
221 | *target = 'P'; |
222 | } |
223 | |
224 | static void hhmm24(char * target, Time source, const DateLUTImpl & timezone) |
225 | { |
226 | writeNumber2(target, ToHourImpl::execute(source, timezone)); |
227 | writeNumber2(target + 3, ToMinuteImpl::execute(source, timezone)); |
228 | } |
229 | |
230 | static void second(char * target, Time source, const DateLUTImpl & timezone) |
231 | { |
232 | writeNumber2(target, ToSecondImpl::execute(source, timezone)); |
233 | } |
234 | |
235 | static void ISO8601Time(char * target, Time source, const DateLUTImpl & timezone) |
236 | { |
237 | writeNumber2(target, ToHourImpl::execute(source, timezone)); |
238 | writeNumber2(target + 3, ToMinuteImpl::execute(source, timezone)); |
239 | writeNumber2(target + 6, ToSecondImpl::execute(source, timezone)); |
240 | } |
241 | }; |
242 | |
243 | public: |
244 | static constexpr auto name = "formatDateTime" ; |
245 | |
246 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionFormatDateTime>(); } |
247 | |
248 | String getName() const override |
249 | { |
250 | return name; |
251 | } |
252 | |
253 | bool useDefaultImplementationForConstants() const override { return true; } |
254 | |
255 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } |
256 | |
257 | bool isVariadic() const override { return true; } |
258 | size_t getNumberOfArguments() const override { return 0; } |
259 | |
260 | DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override |
261 | { |
262 | if (arguments.size() != 2 && arguments.size() != 3) |
263 | throw Exception("Number of arguments for function " + getName() + " doesn't match: passed " |
264 | + toString(arguments.size()) + ", should be 2 or 3" , |
265 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); |
266 | |
267 | if (!WhichDataType(arguments[0].type).isDateOrDateTime()) |
268 | throw Exception("Illegal type " + arguments[0].type->getName() + " of 1 argument of function " + getName() + |
269 | ". Should be a date or a date with time" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
270 | |
271 | if (!WhichDataType(arguments[1].type).isString()) |
272 | throw Exception("Illegal type " + arguments[1].type->getName() + " of 2 argument of function " + getName() + ". Must be String." , |
273 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
274 | |
275 | if (arguments.size() == 3) |
276 | { |
277 | if (!WhichDataType(arguments[2].type).isString()) |
278 | throw Exception("Illegal type " + arguments[2].type->getName() + " of 3 argument of function " + getName() + ". Must be String." , |
279 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
280 | } |
281 | |
282 | return std::make_shared<DataTypeString>(); |
283 | } |
284 | |
285 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
286 | { |
287 | if (!executeType<DataTypeDate>(block, arguments, result) |
288 | && !executeType<DataTypeDateTime>(block, arguments, result) |
289 | && !executeType<DataTypeDateTime64>(block, arguments, result)) |
290 | throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName() |
291 | + " of function " + getName() + ", must be Date or DateTime" , |
292 | ErrorCodes::ILLEGAL_COLUMN); |
293 | } |
294 | |
295 | template <typename DataType> |
296 | bool executeType(Block & block, const ColumnNumbers & arguments, size_t result) |
297 | { |
298 | auto * times = checkAndGetColumn<typename DataType::ColumnType>(block.getByPosition(arguments[0]).column.get()); |
299 | if (!times) |
300 | return false; |
301 | |
302 | const ColumnConst * pattern_column = checkAndGetColumnConst<ColumnString>(block.getByPosition(arguments[1]).column.get()); |
303 | if (!pattern_column) |
304 | throw Exception("Illegal column " + block.getByPosition(arguments[1]).column->getName() |
305 | + " of second ('format') argument of function " + getName() |
306 | + ". Must be constant string." , |
307 | ErrorCodes::ILLEGAL_COLUMN); |
308 | |
309 | String pattern = pattern_column->getValue<String>(); |
310 | |
311 | using T = typename ActionaValueTypeMap<DataType>::ActionValueType; |
312 | std::vector<Action<T>> instructions; |
313 | String pattern_to_fill = parsePattern(pattern, instructions); |
314 | size_t result_size = pattern_to_fill.size(); |
315 | |
316 | const DateLUTImpl * time_zone_tmp = nullptr; |
317 | if (arguments.size() == 3) |
318 | time_zone_tmp = &extractTimeZoneFromFunctionArguments(block, arguments, 2, 0); |
319 | else |
320 | time_zone_tmp = &DateLUT::instance(); |
321 | |
322 | const DateLUTImpl & time_zone = *time_zone_tmp; |
323 | |
324 | const auto & vec = times->getData(); |
325 | |
326 | UInt32 scale [[maybe_unused]] = 0; |
327 | if constexpr (std::is_same_v<DataType, DataTypeDateTime64>) |
328 | { |
329 | scale = vec.getScale(); |
330 | } |
331 | |
332 | auto col_res = ColumnString::create(); |
333 | auto & dst_data = col_res->getChars(); |
334 | auto & dst_offsets = col_res->getOffsets(); |
335 | dst_data.resize(vec.size() * (result_size + 1)); |
336 | dst_offsets.resize(vec.size()); |
337 | |
338 | /// Fill result with literals. |
339 | { |
340 | UInt8 * begin = dst_data.data(); |
341 | UInt8 * end = begin + dst_data.size(); |
342 | UInt8 * pos = begin; |
343 | |
344 | if (pos < end) |
345 | { |
346 | memcpy(pos, pattern_to_fill.data(), result_size + 1); /// With zero terminator. |
347 | pos += result_size + 1; |
348 | } |
349 | |
350 | /// Fill by copying exponential growing ranges. |
351 | while (pos < end) |
352 | { |
353 | size_t bytes_to_copy = std::min(pos - begin, end - pos); |
354 | memcpy(pos, begin, bytes_to_copy); |
355 | pos += bytes_to_copy; |
356 | } |
357 | } |
358 | |
359 | auto begin = reinterpret_cast<char *>(dst_data.data()); |
360 | auto pos = begin; |
361 | |
362 | for (size_t i = 0; i < vec.size(); ++i) |
363 | { |
364 | if constexpr (std::is_same_v<DataType, DataTypeDateTime64>) |
365 | { |
366 | for (auto & instruction : instructions) |
367 | { |
368 | // since right now LUT does not support Int64-values and not format instructions for subsecond parts, |
369 | // treat DatTime64 values just as DateTime values by ignoring fractional and casting to UInt32. |
370 | const auto c = DecimalUtils::split(vec[i], scale); |
371 | instruction.perform(pos, static_cast<UInt32>(c.whole), time_zone); |
372 | } |
373 | } |
374 | else |
375 | { |
376 | for (auto & instruction : instructions) |
377 | instruction.perform(pos, vec[i], time_zone); |
378 | } |
379 | |
380 | dst_offsets[i] = pos - begin; |
381 | } |
382 | |
383 | dst_data.resize(pos - begin); |
384 | block.getByPosition(result).column = std::move(col_res); |
385 | return true; |
386 | } |
387 | |
388 | template <typename T> |
389 | String parsePattern(const String & pattern, std::vector<Action<T>> & instructions) const |
390 | { |
391 | String result; |
392 | |
393 | const char * pos = pattern.data(); |
394 | const char * end = pos + pattern.size(); |
395 | |
396 | /// Add shift to previous action; or if there were none, add noop action with shift. |
397 | auto addShift = [&](size_t amount) |
398 | { |
399 | if (instructions.empty()) |
400 | instructions.emplace_back(&Action<T>::noop); |
401 | instructions.back().shift += amount; |
402 | }; |
403 | |
404 | /// If the argument was DateTime, add instruction for printing. If it was date, just shift (the buffer is pre-filled with default values). |
405 | auto addInstructionOrShift = [&](typename Action<T>::Func func [[maybe_unused]], size_t shift) |
406 | { |
407 | if constexpr (std::is_same_v<T, UInt32>) |
408 | instructions.emplace_back(func, shift); |
409 | else |
410 | addShift(shift); |
411 | }; |
412 | |
413 | while (true) |
414 | { |
415 | const char * percent_pos = find_first_symbols<'%'>(pos, end); |
416 | |
417 | if (percent_pos < end) |
418 | { |
419 | if (pos < percent_pos) |
420 | { |
421 | result.append(pos, percent_pos); |
422 | addShift(percent_pos - pos); |
423 | } |
424 | |
425 | pos = percent_pos + 1; |
426 | |
427 | if (pos >= end) |
428 | throw Exception("Sign '%' is the last in pattern, if you need it, use '%%'" , ErrorCodes::BAD_ARGUMENTS); |
429 | |
430 | switch (*pos) |
431 | { |
432 | // Year, divided by 100, zero-padded |
433 | case 'C': |
434 | instructions.emplace_back(&Action<T>::century, 2); |
435 | result.append("00" ); |
436 | break; |
437 | |
438 | // Day of month, zero-padded (01-31) |
439 | case 'd': |
440 | instructions.emplace_back(&Action<T>::dayOfMonth, 2); |
441 | result.append("00" ); |
442 | break; |
443 | |
444 | // Short MM/DD/YY date, equivalent to %m/%d/%y |
445 | case 'D': |
446 | instructions.emplace_back(&Action<T>::americanDate, 8); |
447 | result.append("00/00/00" ); |
448 | break; |
449 | |
450 | // Day of month, space-padded ( 1-31) 23 |
451 | case 'e': |
452 | instructions.emplace_back(&Action<T>::dayOfMonthSpacePadded, 2); |
453 | result.append(" 0" ); |
454 | break; |
455 | |
456 | // Short YYYY-MM-DD date, equivalent to %Y-%m-%d 2001-08-23 |
457 | case 'F': |
458 | instructions.emplace_back(&Action<T>::ISO8601Date, 10); |
459 | result.append("0000-00-00" ); |
460 | break; |
461 | |
462 | // Day of the year (001-366) 235 |
463 | case 'j': |
464 | instructions.emplace_back(&Action<T>::dayOfYear, 3); |
465 | result.append("000" ); |
466 | break; |
467 | |
468 | // Month as a decimal number (01-12) |
469 | case 'm': |
470 | instructions.emplace_back(&Action<T>::month, 2); |
471 | result.append("00" ); |
472 | break; |
473 | |
474 | // ISO 8601 weekday as number with Monday as 1 (1-7) |
475 | case 'u': |
476 | instructions.emplace_back(&Action<T>::dayOfWeek, 1); |
477 | result.append("0" ); |
478 | break; |
479 | |
480 | // ISO 8601 week number (01-53) |
481 | case 'V': |
482 | instructions.emplace_back(&Action<T>::ISO8601Week, 2); |
483 | result.append("00" ); |
484 | break; |
485 | |
486 | // Weekday as a decimal number with Sunday as 0 (0-6) 4 |
487 | case 'w': |
488 | instructions.emplace_back(&Action<T>::dayOfWeek0To6, 1); |
489 | result.append("0" ); |
490 | break; |
491 | |
492 | // Two digits year |
493 | case 'y': |
494 | instructions.emplace_back(&Action<T>::year2, 2); |
495 | result.append("00" ); |
496 | break; |
497 | |
498 | // Four digits year |
499 | case 'Y': |
500 | instructions.emplace_back(&Action<T>::year4, 4); |
501 | result.append("0000" ); |
502 | break; |
503 | |
504 | /// Time components. If the argument is Date, not a DateTime, then this components will have default value. |
505 | |
506 | // Minute (00-59) |
507 | case 'M': |
508 | addInstructionOrShift(&Action<T>::minute, 2); |
509 | result.append("00" ); |
510 | break; |
511 | |
512 | // AM or PM |
513 | case 'p': |
514 | addInstructionOrShift(&Action<T>::AMPM, 2); |
515 | result.append("AM" ); |
516 | break; |
517 | |
518 | // 24-hour HH:MM time, equivalent to %H:%M 14:55 |
519 | case 'R': |
520 | addInstructionOrShift(&Action<T>::hhmm24, 5); |
521 | result.append("00:00" ); |
522 | break; |
523 | |
524 | // Seconds |
525 | case 'S': |
526 | addInstructionOrShift(&Action<T>::second, 2); |
527 | result.append("00" ); |
528 | break; |
529 | |
530 | // ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S 14:55:02 |
531 | case 'T': |
532 | addInstructionOrShift(&Action<T>::ISO8601Time, 8); |
533 | result.append("00:00:00" ); |
534 | break; |
535 | |
536 | // Hour in 24h format (00-23) |
537 | case 'H': |
538 | addInstructionOrShift(&Action<T>::hour24, 2); |
539 | result.append("00" ); |
540 | break; |
541 | |
542 | // Hour in 12h format (01-12) |
543 | case 'I': |
544 | addInstructionOrShift(&Action<T>::hour12, 2); |
545 | result.append("12" ); |
546 | break; |
547 | |
548 | /// Escaped literal characters. |
549 | case '%': |
550 | result += '%'; |
551 | addShift(1); |
552 | break; |
553 | case 't': |
554 | result += '\t'; |
555 | addShift(1); |
556 | break; |
557 | case 'n': |
558 | result += '\n'; |
559 | addShift(1); |
560 | break; |
561 | |
562 | // Unimplemented |
563 | case 'U': [[fallthrough]]; |
564 | case 'W': |
565 | throw Exception("Wrong pattern '" + pattern + "', symbol '" + *pos + " is not implemented ' for function " + getName(), |
566 | ErrorCodes::NOT_IMPLEMENTED); |
567 | |
568 | default: |
569 | throw Exception( |
570 | "Wrong pattern '" + pattern + "', unexpected symbol '" + *pos + "' for function " + getName(), ErrorCodes::ILLEGAL_COLUMN); |
571 | } |
572 | |
573 | ++pos; |
574 | } |
575 | else |
576 | { |
577 | result.append(pos, end); |
578 | addShift(end + 1 - pos); /// including zero terminator |
579 | break; |
580 | } |
581 | } |
582 | |
583 | return result; |
584 | } |
585 | }; |
586 | |
587 | void registerFunctionFormatDateTime(FunctionFactory & factory) |
588 | { |
589 | factory.registerFunction<FunctionFormatDateTime>(); |
590 | } |
591 | |
592 | } |
593 | |