1#pragma once
2
3#include <Columns/ColumnString.h>
4#include <Core/Types.h>
5#include <Common/Exception.h>
6#include <Common/StringUtils/StringUtils.h>
7#include <Common/memcpySmall.h>
8
9#include <algorithm>
10#include <optional>
11#include <string>
12#include <utility>
13#include <vector>
14
15
16namespace DB
17{
18namespace ErrorCodes
19{
20 extern const int LOGICAL_ERROR;
21}
22
23struct FormatImpl
24{
25 static constexpr size_t small_argument_threshold = 1024;
26 static constexpr size_t argument_threshold = std::numeric_limits<UInt32>::max();
27 static constexpr size_t right_padding = 15;
28
29 template <typename... Args>
30 static inline void formatExecute(bool possibly_has_column_string, bool possibly_has_column_fixed_string, Args &&... args)
31 {
32 if (possibly_has_column_string && possibly_has_column_fixed_string)
33 format<true, true>(std::forward<Args>(args)...);
34 else if (!possibly_has_column_string && possibly_has_column_fixed_string)
35 format<false, true>(std::forward<Args>(args)...);
36 else if (possibly_has_column_string && !possibly_has_column_fixed_string)
37 format<true, false>(std::forward<Args>(args)...);
38 else
39 format<false, false>(std::forward<Args>(args)...);
40 }
41
42 static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res)
43 {
44 res = 0;
45 for (UInt64 pos = l; pos < r; pos++)
46 {
47 if (!isNumericASCII(description[pos]))
48 throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::LOGICAL_ERROR);
49 res = res * 10 + description[pos] - '0';
50 if (res >= argument_threshold)
51 throw Exception(
52 "Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::LOGICAL_ERROR);
53 }
54 }
55
56 static inline void init(
57 const String & pattern,
58 const std::vector<const ColumnString::Chars *> & data,
59 size_t argument_number,
60 const std::vector<String> & constant_strings,
61 UInt64 * index_positions_ptr,
62 std::vector<String> & substrings)
63 {
64 /// Is current position after open curly brace.
65 bool is_open_curly = false;
66 /// The position of last open token.
67 size_t last_open = -1;
68
69 /// Is formatting in a plain {} token.
70 std::optional<bool> is_plain_numbering;
71 UInt64 index_if_plain = 0;
72
73 /// Left position of adding substrings, just to the closed brace position or the start of the string.
74 /// Invariant --- the start of substring is in this position.
75 size_t start_pos = 0;
76
77 /// A flag to decide whether we should glue the constant strings.
78 bool glue_to_next = false;
79
80 /// Handling double braces (escaping).
81 auto double_brace_removal = [](String & str)
82 {
83 size_t i = 0;
84 bool should_delete = true;
85 str.erase(
86 std::remove_if(
87 str.begin(),
88 str.end(),
89 [&i, &should_delete, &str](char)
90 {
91 bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}');
92 ++i;
93 if (is_double_brace && should_delete)
94 {
95 should_delete = false;
96 return true;
97 }
98 should_delete = true;
99 return false;
100 }),
101 str.end());
102 };
103
104 for (size_t i = 0; i < pattern.size(); ++i)
105 {
106 if (pattern[i] == '{')
107 {
108 /// Escaping handling
109 /// It is safe to access because of null termination
110 if (pattern[i + 1] == '{')
111 {
112 ++i;
113 continue;
114 }
115
116 if (is_open_curly)
117 throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR);
118
119 String to_add = String(pattern.data() + start_pos, i - start_pos);
120 double_brace_removal(to_add);
121 if (!glue_to_next)
122 substrings.emplace_back(to_add);
123 else
124 substrings.back() += to_add;
125
126 glue_to_next = false;
127
128 is_open_curly = true;
129 last_open = i + 1;
130 }
131 else if (pattern[i] == '}')
132 {
133 if (pattern[i + 1] == '}')
134 {
135 ++i;
136 continue;
137 }
138
139 if (!is_open_curly)
140 throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR);
141
142 is_open_curly = false;
143
144 if (last_open == i)
145 {
146 if (is_plain_numbering && !*is_plain_numbering)
147 throw Exception(
148 "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::LOGICAL_ERROR);
149 is_plain_numbering = true;
150 if (index_if_plain >= argument_number)
151 throw Exception("Argument is too big for formatting", ErrorCodes::LOGICAL_ERROR);
152 *index_positions_ptr = index_if_plain++;
153 }
154 else
155 {
156 if (is_plain_numbering && *is_plain_numbering)
157 throw Exception(
158 "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::LOGICAL_ERROR);
159 is_plain_numbering = false;
160
161 UInt64 arg;
162 parseNumber(pattern, last_open, i, arg);
163
164 if (arg >= argument_number)
165 throw Exception(
166 "Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::LOGICAL_ERROR);
167
168 *index_positions_ptr = arg;
169 }
170
171 /// Constant string.
172 if (!data[*index_positions_ptr])
173 {
174 /// The next string should be glued to last `A {} C`.format('B') -> `A B C`.
175 glue_to_next = true;
176 substrings.back() += constant_strings[*index_positions_ptr];
177 }
178 else
179 ++index_positions_ptr; /// Otherwise we commit arg number and proceed.
180
181 start_pos = i + 1;
182 }
183 }
184
185 if (is_open_curly)
186 throw Exception("Last open curly brace is not closed", ErrorCodes::LOGICAL_ERROR);
187
188 String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos);
189 double_brace_removal(to_add);
190
191 if (!glue_to_next)
192 substrings.emplace_back(to_add);
193 else
194 substrings.back() += to_add;
195 }
196
197 /// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string.
198 /// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString.
199 /// fixed_string_N for savings N to fixed strings.
200 /// constant_strings for constant strings. If data[i] is nullptr, than it is constant string.
201 /// res_data is result_data, res_offsets is offset result.
202 /// input_rows_count is the number of rows processed.
203 /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size().
204 template <bool has_column_string, bool has_column_fixed_string>
205 static inline void format(
206 String pattern,
207 const std::vector<const ColumnString::Chars *> & data,
208 const std::vector<const ColumnString::Offsets *> & offsets,
209 [[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector<size_t> & fixed_string_N,
210 const std::vector<String> & constant_strings,
211 ColumnString::Chars & res_data,
212 ColumnString::Offsets & res_offsets,
213 size_t input_rows_count)
214 {
215 const size_t argument_number = offsets.size();
216
217 UInt64 small_index_positions_buffer[small_argument_threshold];
218 /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices.
219 UInt64 * index_positions = small_index_positions_buffer;
220
221 std::unique_ptr<UInt64[]> big_index_positions_buffer;
222 if (argument_number > small_argument_threshold)
223 {
224 big_index_positions_buffer.reset(new UInt64[argument_number]);
225 index_positions = big_index_positions_buffer.get();
226 }
227
228 /// Vector of substrings of pattern that will be copied to the ans, not string view because of escaping and iterators invalidation.
229 /// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``].
230 std::vector<String> substrings;
231
232 init(pattern, data, argument_number, constant_strings, index_positions, substrings);
233
234 UInt64 final_size = 0;
235
236 for (String & str : substrings)
237 {
238 /// To use memcpySmallAllowReadWriteOverflow15 for substrings we should allocate a bit more to each string.
239 /// That was chosen due to perfomance issues.
240 if (!str.empty())
241 str.reserve(str.size() + right_padding);
242 final_size += str.size();
243 }
244
245 /// The substring number is repeated input_rows_times.
246 final_size *= input_rows_count;
247
248 /// Strings without null termination.
249 for (size_t i = 1; i < substrings.size(); ++i)
250 {
251 final_size += data[index_positions[i - 1]]->size();
252 /// Fixed strings do not have zero terminating character.
253 if (offsets[index_positions[i - 1]])
254 final_size -= input_rows_count;
255 }
256
257 /// Null termination characters.
258 final_size += input_rows_count;
259
260 res_data.resize(final_size);
261 res_offsets.resize(input_rows_count);
262
263 UInt64 offset = 0;
264 for (UInt64 i = 0; i < input_rows_count; ++i)
265 {
266 memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[0].data(), substrings[0].size());
267 offset += substrings[0].size();
268 /// All strings are constant, we should have substrings.size() == 1.
269 if constexpr (has_column_string || has_column_fixed_string)
270 {
271 for (size_t j = 1; j < substrings.size(); ++j)
272 {
273 UInt64 arg = index_positions[j - 1];
274 auto offset_ptr = offsets[arg];
275 UInt64 arg_offset = 0;
276 UInt64 size = 0;
277
278 if constexpr (has_column_string)
279 {
280 if (!has_column_fixed_string || offset_ptr)
281 {
282 arg_offset = (*offset_ptr)[i - 1];
283 size = (*offset_ptr)[i] - arg_offset - 1;
284 }
285 }
286
287 if constexpr (has_column_fixed_string)
288 {
289 if (!has_column_string || !offset_ptr)
290 {
291 arg_offset = fixed_string_N[arg] * i;
292 size = fixed_string_N[arg];
293 }
294 }
295
296 memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, data[arg]->data() + arg_offset, size);
297 offset += size;
298 memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[j].data(), substrings[j].size());
299 offset += substrings[j].size();
300 }
301 }
302 res_data[offset] = '\0';
303 ++offset;
304 res_offsets[i] = offset;
305 }
306
307 /*
308 * Invariant of `offset == final_size` must be held.
309 *
310 * if (offset != final_size)
311 * abort();
312 */
313 }
314};
315
316}
317