1 | #pragma once |
2 | |
3 | #include <Columns/ColumnString.h> |
4 | #include <Core/Types.h> |
5 | #include <Common/Exception.h> |
6 | #include <Common/StringUtils/StringUtils.h> |
7 | #include <Common/memcpySmall.h> |
8 | |
9 | #include <algorithm> |
10 | #include <optional> |
11 | #include <string> |
12 | #include <utility> |
13 | #include <vector> |
14 | |
15 | |
16 | namespace DB |
17 | { |
18 | namespace ErrorCodes |
19 | { |
20 | extern const int LOGICAL_ERROR; |
21 | } |
22 | |
23 | struct FormatImpl |
24 | { |
25 | static constexpr size_t small_argument_threshold = 1024; |
26 | static constexpr size_t argument_threshold = std::numeric_limits<UInt32>::max(); |
27 | static constexpr size_t right_padding = 15; |
28 | |
29 | template <typename... Args> |
30 | static inline void formatExecute(bool possibly_has_column_string, bool possibly_has_column_fixed_string, Args &&... args) |
31 | { |
32 | if (possibly_has_column_string && possibly_has_column_fixed_string) |
33 | format<true, true>(std::forward<Args>(args)...); |
34 | else if (!possibly_has_column_string && possibly_has_column_fixed_string) |
35 | format<false, true>(std::forward<Args>(args)...); |
36 | else if (possibly_has_column_string && !possibly_has_column_fixed_string) |
37 | format<true, false>(std::forward<Args>(args)...); |
38 | else |
39 | format<false, false>(std::forward<Args>(args)...); |
40 | } |
41 | |
42 | static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res) |
43 | { |
44 | res = 0; |
45 | for (UInt64 pos = l; pos < r; pos++) |
46 | { |
47 | if (!isNumericASCII(description[pos])) |
48 | throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::LOGICAL_ERROR); |
49 | res = res * 10 + description[pos] - '0'; |
50 | if (res >= argument_threshold) |
51 | throw Exception( |
52 | "Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::LOGICAL_ERROR); |
53 | } |
54 | } |
55 | |
56 | static inline void init( |
57 | const String & pattern, |
58 | const std::vector<const ColumnString::Chars *> & data, |
59 | size_t argument_number, |
60 | const std::vector<String> & constant_strings, |
61 | UInt64 * index_positions_ptr, |
62 | std::vector<String> & substrings) |
63 | { |
64 | /// Is current position after open curly brace. |
65 | bool is_open_curly = false; |
66 | /// The position of last open token. |
67 | size_t last_open = -1; |
68 | |
69 | /// Is formatting in a plain {} token. |
70 | std::optional<bool> is_plain_numbering; |
71 | UInt64 index_if_plain = 0; |
72 | |
73 | /// Left position of adding substrings, just to the closed brace position or the start of the string. |
74 | /// Invariant --- the start of substring is in this position. |
75 | size_t start_pos = 0; |
76 | |
77 | /// A flag to decide whether we should glue the constant strings. |
78 | bool glue_to_next = false; |
79 | |
80 | /// Handling double braces (escaping). |
81 | auto double_brace_removal = [](String & str) |
82 | { |
83 | size_t i = 0; |
84 | bool should_delete = true; |
85 | str.erase( |
86 | std::remove_if( |
87 | str.begin(), |
88 | str.end(), |
89 | [&i, &should_delete, &str](char) |
90 | { |
91 | bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}'); |
92 | ++i; |
93 | if (is_double_brace && should_delete) |
94 | { |
95 | should_delete = false; |
96 | return true; |
97 | } |
98 | should_delete = true; |
99 | return false; |
100 | }), |
101 | str.end()); |
102 | }; |
103 | |
104 | for (size_t i = 0; i < pattern.size(); ++i) |
105 | { |
106 | if (pattern[i] == '{') |
107 | { |
108 | /// Escaping handling |
109 | /// It is safe to access because of null termination |
110 | if (pattern[i + 1] == '{') |
111 | { |
112 | ++i; |
113 | continue; |
114 | } |
115 | |
116 | if (is_open_curly) |
117 | throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR); |
118 | |
119 | String to_add = String(pattern.data() + start_pos, i - start_pos); |
120 | double_brace_removal(to_add); |
121 | if (!glue_to_next) |
122 | substrings.emplace_back(to_add); |
123 | else |
124 | substrings.back() += to_add; |
125 | |
126 | glue_to_next = false; |
127 | |
128 | is_open_curly = true; |
129 | last_open = i + 1; |
130 | } |
131 | else if (pattern[i] == '}') |
132 | { |
133 | if (pattern[i + 1] == '}') |
134 | { |
135 | ++i; |
136 | continue; |
137 | } |
138 | |
139 | if (!is_open_curly) |
140 | throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR); |
141 | |
142 | is_open_curly = false; |
143 | |
144 | if (last_open == i) |
145 | { |
146 | if (is_plain_numbering && !*is_plain_numbering) |
147 | throw Exception( |
148 | "Cannot switch from automatic field numbering to manual field specification" , ErrorCodes::LOGICAL_ERROR); |
149 | is_plain_numbering = true; |
150 | if (index_if_plain >= argument_number) |
151 | throw Exception("Argument is too big for formatting" , ErrorCodes::LOGICAL_ERROR); |
152 | *index_positions_ptr = index_if_plain++; |
153 | } |
154 | else |
155 | { |
156 | if (is_plain_numbering && *is_plain_numbering) |
157 | throw Exception( |
158 | "Cannot switch from automatic field numbering to manual field specification" , ErrorCodes::LOGICAL_ERROR); |
159 | is_plain_numbering = false; |
160 | |
161 | UInt64 arg; |
162 | parseNumber(pattern, last_open, i, arg); |
163 | |
164 | if (arg >= argument_number) |
165 | throw Exception( |
166 | "Argument is too big for formatting. Note that indexing starts from zero" , ErrorCodes::LOGICAL_ERROR); |
167 | |
168 | *index_positions_ptr = arg; |
169 | } |
170 | |
171 | /// Constant string. |
172 | if (!data[*index_positions_ptr]) |
173 | { |
174 | /// The next string should be glued to last `A {} C`.format('B') -> `A B C`. |
175 | glue_to_next = true; |
176 | substrings.back() += constant_strings[*index_positions_ptr]; |
177 | } |
178 | else |
179 | ++index_positions_ptr; /// Otherwise we commit arg number and proceed. |
180 | |
181 | start_pos = i + 1; |
182 | } |
183 | } |
184 | |
185 | if (is_open_curly) |
186 | throw Exception("Last open curly brace is not closed" , ErrorCodes::LOGICAL_ERROR); |
187 | |
188 | String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos); |
189 | double_brace_removal(to_add); |
190 | |
191 | if (!glue_to_next) |
192 | substrings.emplace_back(to_add); |
193 | else |
194 | substrings.back() += to_add; |
195 | } |
196 | |
197 | /// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string. |
198 | /// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString. |
199 | /// fixed_string_N for savings N to fixed strings. |
200 | /// constant_strings for constant strings. If data[i] is nullptr, than it is constant string. |
201 | /// res_data is result_data, res_offsets is offset result. |
202 | /// input_rows_count is the number of rows processed. |
203 | /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size(). |
204 | template <bool has_column_string, bool has_column_fixed_string> |
205 | static inline void format( |
206 | String pattern, |
207 | const std::vector<const ColumnString::Chars *> & data, |
208 | const std::vector<const ColumnString::Offsets *> & offsets, |
209 | [[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector<size_t> & fixed_string_N, |
210 | const std::vector<String> & constant_strings, |
211 | ColumnString::Chars & res_data, |
212 | ColumnString::Offsets & res_offsets, |
213 | size_t input_rows_count) |
214 | { |
215 | const size_t argument_number = offsets.size(); |
216 | |
217 | UInt64 small_index_positions_buffer[small_argument_threshold]; |
218 | /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices. |
219 | UInt64 * index_positions = small_index_positions_buffer; |
220 | |
221 | std::unique_ptr<UInt64[]> big_index_positions_buffer; |
222 | if (argument_number > small_argument_threshold) |
223 | { |
224 | big_index_positions_buffer.reset(new UInt64[argument_number]); |
225 | index_positions = big_index_positions_buffer.get(); |
226 | } |
227 | |
228 | /// Vector of substrings of pattern that will be copied to the ans, not string view because of escaping and iterators invalidation. |
229 | /// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``]. |
230 | std::vector<String> substrings; |
231 | |
232 | init(pattern, data, argument_number, constant_strings, index_positions, substrings); |
233 | |
234 | UInt64 final_size = 0; |
235 | |
236 | for (String & str : substrings) |
237 | { |
238 | /// To use memcpySmallAllowReadWriteOverflow15 for substrings we should allocate a bit more to each string. |
239 | /// That was chosen due to perfomance issues. |
240 | if (!str.empty()) |
241 | str.reserve(str.size() + right_padding); |
242 | final_size += str.size(); |
243 | } |
244 | |
245 | /// The substring number is repeated input_rows_times. |
246 | final_size *= input_rows_count; |
247 | |
248 | /// Strings without null termination. |
249 | for (size_t i = 1; i < substrings.size(); ++i) |
250 | { |
251 | final_size += data[index_positions[i - 1]]->size(); |
252 | /// Fixed strings do not have zero terminating character. |
253 | if (offsets[index_positions[i - 1]]) |
254 | final_size -= input_rows_count; |
255 | } |
256 | |
257 | /// Null termination characters. |
258 | final_size += input_rows_count; |
259 | |
260 | res_data.resize(final_size); |
261 | res_offsets.resize(input_rows_count); |
262 | |
263 | UInt64 offset = 0; |
264 | for (UInt64 i = 0; i < input_rows_count; ++i) |
265 | { |
266 | memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[0].data(), substrings[0].size()); |
267 | offset += substrings[0].size(); |
268 | /// All strings are constant, we should have substrings.size() == 1. |
269 | if constexpr (has_column_string || has_column_fixed_string) |
270 | { |
271 | for (size_t j = 1; j < substrings.size(); ++j) |
272 | { |
273 | UInt64 arg = index_positions[j - 1]; |
274 | auto offset_ptr = offsets[arg]; |
275 | UInt64 arg_offset = 0; |
276 | UInt64 size = 0; |
277 | |
278 | if constexpr (has_column_string) |
279 | { |
280 | if (!has_column_fixed_string || offset_ptr) |
281 | { |
282 | arg_offset = (*offset_ptr)[i - 1]; |
283 | size = (*offset_ptr)[i] - arg_offset - 1; |
284 | } |
285 | } |
286 | |
287 | if constexpr (has_column_fixed_string) |
288 | { |
289 | if (!has_column_string || !offset_ptr) |
290 | { |
291 | arg_offset = fixed_string_N[arg] * i; |
292 | size = fixed_string_N[arg]; |
293 | } |
294 | } |
295 | |
296 | memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, data[arg]->data() + arg_offset, size); |
297 | offset += size; |
298 | memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[j].data(), substrings[j].size()); |
299 | offset += substrings[j].size(); |
300 | } |
301 | } |
302 | res_data[offset] = '\0'; |
303 | ++offset; |
304 | res_offsets[i] = offset; |
305 | } |
306 | |
307 | /* |
308 | * Invariant of `offset == final_size` must be held. |
309 | * |
310 | * if (offset != final_size) |
311 | * abort(); |
312 | */ |
313 | } |
314 | }; |
315 | |
316 | } |
317 | |