| 1 | #pragma once |
| 2 | |
| 3 | #include <Columns/ColumnString.h> |
| 4 | #include <Core/Types.h> |
| 5 | #include <Common/Exception.h> |
| 6 | #include <Common/StringUtils/StringUtils.h> |
| 7 | #include <Common/memcpySmall.h> |
| 8 | |
| 9 | #include <algorithm> |
| 10 | #include <optional> |
| 11 | #include <string> |
| 12 | #include <utility> |
| 13 | #include <vector> |
| 14 | |
| 15 | |
| 16 | namespace DB |
| 17 | { |
| 18 | namespace ErrorCodes |
| 19 | { |
| 20 | extern const int LOGICAL_ERROR; |
| 21 | } |
| 22 | |
| 23 | struct FormatImpl |
| 24 | { |
| 25 | static constexpr size_t small_argument_threshold = 1024; |
| 26 | static constexpr size_t argument_threshold = std::numeric_limits<UInt32>::max(); |
| 27 | static constexpr size_t right_padding = 15; |
| 28 | |
| 29 | template <typename... Args> |
| 30 | static inline void formatExecute(bool possibly_has_column_string, bool possibly_has_column_fixed_string, Args &&... args) |
| 31 | { |
| 32 | if (possibly_has_column_string && possibly_has_column_fixed_string) |
| 33 | format<true, true>(std::forward<Args>(args)...); |
| 34 | else if (!possibly_has_column_string && possibly_has_column_fixed_string) |
| 35 | format<false, true>(std::forward<Args>(args)...); |
| 36 | else if (possibly_has_column_string && !possibly_has_column_fixed_string) |
| 37 | format<true, false>(std::forward<Args>(args)...); |
| 38 | else |
| 39 | format<false, false>(std::forward<Args>(args)...); |
| 40 | } |
| 41 | |
| 42 | static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res) |
| 43 | { |
| 44 | res = 0; |
| 45 | for (UInt64 pos = l; pos < r; pos++) |
| 46 | { |
| 47 | if (!isNumericASCII(description[pos])) |
| 48 | throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::LOGICAL_ERROR); |
| 49 | res = res * 10 + description[pos] - '0'; |
| 50 | if (res >= argument_threshold) |
| 51 | throw Exception( |
| 52 | "Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::LOGICAL_ERROR); |
| 53 | } |
| 54 | } |
| 55 | |
| 56 | static inline void init( |
| 57 | const String & pattern, |
| 58 | const std::vector<const ColumnString::Chars *> & data, |
| 59 | size_t argument_number, |
| 60 | const std::vector<String> & constant_strings, |
| 61 | UInt64 * index_positions_ptr, |
| 62 | std::vector<String> & substrings) |
| 63 | { |
| 64 | /// Is current position after open curly brace. |
| 65 | bool is_open_curly = false; |
| 66 | /// The position of last open token. |
| 67 | size_t last_open = -1; |
| 68 | |
| 69 | /// Is formatting in a plain {} token. |
| 70 | std::optional<bool> is_plain_numbering; |
| 71 | UInt64 index_if_plain = 0; |
| 72 | |
| 73 | /// Left position of adding substrings, just to the closed brace position or the start of the string. |
| 74 | /// Invariant --- the start of substring is in this position. |
| 75 | size_t start_pos = 0; |
| 76 | |
| 77 | /// A flag to decide whether we should glue the constant strings. |
| 78 | bool glue_to_next = false; |
| 79 | |
| 80 | /// Handling double braces (escaping). |
| 81 | auto double_brace_removal = [](String & str) |
| 82 | { |
| 83 | size_t i = 0; |
| 84 | bool should_delete = true; |
| 85 | str.erase( |
| 86 | std::remove_if( |
| 87 | str.begin(), |
| 88 | str.end(), |
| 89 | [&i, &should_delete, &str](char) |
| 90 | { |
| 91 | bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}'); |
| 92 | ++i; |
| 93 | if (is_double_brace && should_delete) |
| 94 | { |
| 95 | should_delete = false; |
| 96 | return true; |
| 97 | } |
| 98 | should_delete = true; |
| 99 | return false; |
| 100 | }), |
| 101 | str.end()); |
| 102 | }; |
| 103 | |
| 104 | for (size_t i = 0; i < pattern.size(); ++i) |
| 105 | { |
| 106 | if (pattern[i] == '{') |
| 107 | { |
| 108 | /// Escaping handling |
| 109 | /// It is safe to access because of null termination |
| 110 | if (pattern[i + 1] == '{') |
| 111 | { |
| 112 | ++i; |
| 113 | continue; |
| 114 | } |
| 115 | |
| 116 | if (is_open_curly) |
| 117 | throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR); |
| 118 | |
| 119 | String to_add = String(pattern.data() + start_pos, i - start_pos); |
| 120 | double_brace_removal(to_add); |
| 121 | if (!glue_to_next) |
| 122 | substrings.emplace_back(to_add); |
| 123 | else |
| 124 | substrings.back() += to_add; |
| 125 | |
| 126 | glue_to_next = false; |
| 127 | |
| 128 | is_open_curly = true; |
| 129 | last_open = i + 1; |
| 130 | } |
| 131 | else if (pattern[i] == '}') |
| 132 | { |
| 133 | if (pattern[i + 1] == '}') |
| 134 | { |
| 135 | ++i; |
| 136 | continue; |
| 137 | } |
| 138 | |
| 139 | if (!is_open_curly) |
| 140 | throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::LOGICAL_ERROR); |
| 141 | |
| 142 | is_open_curly = false; |
| 143 | |
| 144 | if (last_open == i) |
| 145 | { |
| 146 | if (is_plain_numbering && !*is_plain_numbering) |
| 147 | throw Exception( |
| 148 | "Cannot switch from automatic field numbering to manual field specification" , ErrorCodes::LOGICAL_ERROR); |
| 149 | is_plain_numbering = true; |
| 150 | if (index_if_plain >= argument_number) |
| 151 | throw Exception("Argument is too big for formatting" , ErrorCodes::LOGICAL_ERROR); |
| 152 | *index_positions_ptr = index_if_plain++; |
| 153 | } |
| 154 | else |
| 155 | { |
| 156 | if (is_plain_numbering && *is_plain_numbering) |
| 157 | throw Exception( |
| 158 | "Cannot switch from automatic field numbering to manual field specification" , ErrorCodes::LOGICAL_ERROR); |
| 159 | is_plain_numbering = false; |
| 160 | |
| 161 | UInt64 arg; |
| 162 | parseNumber(pattern, last_open, i, arg); |
| 163 | |
| 164 | if (arg >= argument_number) |
| 165 | throw Exception( |
| 166 | "Argument is too big for formatting. Note that indexing starts from zero" , ErrorCodes::LOGICAL_ERROR); |
| 167 | |
| 168 | *index_positions_ptr = arg; |
| 169 | } |
| 170 | |
| 171 | /// Constant string. |
| 172 | if (!data[*index_positions_ptr]) |
| 173 | { |
| 174 | /// The next string should be glued to last `A {} C`.format('B') -> `A B C`. |
| 175 | glue_to_next = true; |
| 176 | substrings.back() += constant_strings[*index_positions_ptr]; |
| 177 | } |
| 178 | else |
| 179 | ++index_positions_ptr; /// Otherwise we commit arg number and proceed. |
| 180 | |
| 181 | start_pos = i + 1; |
| 182 | } |
| 183 | } |
| 184 | |
| 185 | if (is_open_curly) |
| 186 | throw Exception("Last open curly brace is not closed" , ErrorCodes::LOGICAL_ERROR); |
| 187 | |
| 188 | String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos); |
| 189 | double_brace_removal(to_add); |
| 190 | |
| 191 | if (!glue_to_next) |
| 192 | substrings.emplace_back(to_add); |
| 193 | else |
| 194 | substrings.back() += to_add; |
| 195 | } |
| 196 | |
| 197 | /// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string. |
| 198 | /// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString. |
| 199 | /// fixed_string_N for savings N to fixed strings. |
| 200 | /// constant_strings for constant strings. If data[i] is nullptr, than it is constant string. |
| 201 | /// res_data is result_data, res_offsets is offset result. |
| 202 | /// input_rows_count is the number of rows processed. |
| 203 | /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size(). |
| 204 | template <bool has_column_string, bool has_column_fixed_string> |
| 205 | static inline void format( |
| 206 | String pattern, |
| 207 | const std::vector<const ColumnString::Chars *> & data, |
| 208 | const std::vector<const ColumnString::Offsets *> & offsets, |
| 209 | [[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector<size_t> & fixed_string_N, |
| 210 | const std::vector<String> & constant_strings, |
| 211 | ColumnString::Chars & res_data, |
| 212 | ColumnString::Offsets & res_offsets, |
| 213 | size_t input_rows_count) |
| 214 | { |
| 215 | const size_t argument_number = offsets.size(); |
| 216 | |
| 217 | UInt64 small_index_positions_buffer[small_argument_threshold]; |
| 218 | /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices. |
| 219 | UInt64 * index_positions = small_index_positions_buffer; |
| 220 | |
| 221 | std::unique_ptr<UInt64[]> big_index_positions_buffer; |
| 222 | if (argument_number > small_argument_threshold) |
| 223 | { |
| 224 | big_index_positions_buffer.reset(new UInt64[argument_number]); |
| 225 | index_positions = big_index_positions_buffer.get(); |
| 226 | } |
| 227 | |
| 228 | /// Vector of substrings of pattern that will be copied to the ans, not string view because of escaping and iterators invalidation. |
| 229 | /// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``]. |
| 230 | std::vector<String> substrings; |
| 231 | |
| 232 | init(pattern, data, argument_number, constant_strings, index_positions, substrings); |
| 233 | |
| 234 | UInt64 final_size = 0; |
| 235 | |
| 236 | for (String & str : substrings) |
| 237 | { |
| 238 | /// To use memcpySmallAllowReadWriteOverflow15 for substrings we should allocate a bit more to each string. |
| 239 | /// That was chosen due to perfomance issues. |
| 240 | if (!str.empty()) |
| 241 | str.reserve(str.size() + right_padding); |
| 242 | final_size += str.size(); |
| 243 | } |
| 244 | |
| 245 | /// The substring number is repeated input_rows_times. |
| 246 | final_size *= input_rows_count; |
| 247 | |
| 248 | /// Strings without null termination. |
| 249 | for (size_t i = 1; i < substrings.size(); ++i) |
| 250 | { |
| 251 | final_size += data[index_positions[i - 1]]->size(); |
| 252 | /// Fixed strings do not have zero terminating character. |
| 253 | if (offsets[index_positions[i - 1]]) |
| 254 | final_size -= input_rows_count; |
| 255 | } |
| 256 | |
| 257 | /// Null termination characters. |
| 258 | final_size += input_rows_count; |
| 259 | |
| 260 | res_data.resize(final_size); |
| 261 | res_offsets.resize(input_rows_count); |
| 262 | |
| 263 | UInt64 offset = 0; |
| 264 | for (UInt64 i = 0; i < input_rows_count; ++i) |
| 265 | { |
| 266 | memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[0].data(), substrings[0].size()); |
| 267 | offset += substrings[0].size(); |
| 268 | /// All strings are constant, we should have substrings.size() == 1. |
| 269 | if constexpr (has_column_string || has_column_fixed_string) |
| 270 | { |
| 271 | for (size_t j = 1; j < substrings.size(); ++j) |
| 272 | { |
| 273 | UInt64 arg = index_positions[j - 1]; |
| 274 | auto offset_ptr = offsets[arg]; |
| 275 | UInt64 arg_offset = 0; |
| 276 | UInt64 size = 0; |
| 277 | |
| 278 | if constexpr (has_column_string) |
| 279 | { |
| 280 | if (!has_column_fixed_string || offset_ptr) |
| 281 | { |
| 282 | arg_offset = (*offset_ptr)[i - 1]; |
| 283 | size = (*offset_ptr)[i] - arg_offset - 1; |
| 284 | } |
| 285 | } |
| 286 | |
| 287 | if constexpr (has_column_fixed_string) |
| 288 | { |
| 289 | if (!has_column_string || !offset_ptr) |
| 290 | { |
| 291 | arg_offset = fixed_string_N[arg] * i; |
| 292 | size = fixed_string_N[arg]; |
| 293 | } |
| 294 | } |
| 295 | |
| 296 | memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, data[arg]->data() + arg_offset, size); |
| 297 | offset += size; |
| 298 | memcpySmallAllowReadWriteOverflow15(res_data.data() + offset, substrings[j].data(), substrings[j].size()); |
| 299 | offset += substrings[j].size(); |
| 300 | } |
| 301 | } |
| 302 | res_data[offset] = '\0'; |
| 303 | ++offset; |
| 304 | res_offsets[i] = offset; |
| 305 | } |
| 306 | |
| 307 | /* |
| 308 | * Invariant of `offset == final_size` must be held. |
| 309 | * |
| 310 | * if (offset != final_size) |
| 311 | * abort(); |
| 312 | */ |
| 313 | } |
| 314 | }; |
| 315 | |
| 316 | } |
| 317 | |