| 1 | #include <Formats/ParsedTemplateFormatString.h> |
| 2 | #include <Formats/verbosePrintString.h> |
| 3 | #include <IO/ReadBufferFromMemory.h> |
| 4 | #include <IO/Operators.h> |
| 5 | #include <IO/ReadBufferFromFile.h> |
| 6 | #include <Core/Settings.h> |
| 7 | #include <Interpreters/Context.h> |
| 8 | |
| 9 | namespace DB |
| 10 | { |
| 11 | |
/// Error codes used in this file. They are only declared here (extern); the definitions are elsewhere.
namespace ErrorCodes
{
    /// Thrown by throwInvalidFormat().
    extern const int INVALID_TEMPLATE_FORMAT;
    /// Thrown by stringToFormat() for an unknown field format name.
    extern const int BAD_ARGUMENTS;
}
| 16 | |
| 17 | ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name) |
| 18 | { |
| 19 | try |
| 20 | { |
| 21 | ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); |
| 22 | String format_string; |
| 23 | readStringUntilEOF(format_string, schema_file); |
| 24 | parse(format_string, idx_by_name); |
| 25 | } |
| 26 | catch (DB::Exception & e) |
| 27 | { |
| 28 | if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT) |
| 29 | throwInvalidFormat(e.message(), columnsCount()); |
| 30 | else |
| 31 | throw; |
| 32 | } |
| 33 | } |
| 34 | |
| 35 | |
| 36 | void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name) |
| 37 | { |
| 38 | enum ParserState |
| 39 | { |
| 40 | Delimiter, |
| 41 | Column, |
| 42 | Format |
| 43 | }; |
| 44 | |
| 45 | const char * pos = format_string.c_str(); |
| 46 | const char * end = format_string.c_str() + format_string.size(); |
| 47 | const char * token_begin = pos; |
| 48 | ParserState state = Delimiter; |
| 49 | delimiters.emplace_back(); |
| 50 | char * col_idx_end; |
| 51 | std::optional<size_t> column_idx; |
| 52 | for (; *pos; ++pos) |
| 53 | { |
| 54 | switch (state) |
| 55 | { |
| 56 | case Delimiter: |
| 57 | if (*pos == '$') |
| 58 | { |
| 59 | delimiters.back().append(token_begin, pos - token_begin); |
| 60 | ++pos; |
| 61 | if (*pos == '{') |
| 62 | { |
| 63 | token_begin = pos + 1; |
| 64 | state = Column; |
| 65 | } |
| 66 | else if (*pos == '$') |
| 67 | { |
| 68 | token_begin = pos; |
| 69 | } |
| 70 | else |
| 71 | throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + |
| 72 | ": Expected '{' or '$' after '$'" + |
| 73 | ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"" , columnsCount()); |
| 74 | } |
| 75 | break; |
| 76 | |
| 77 | case Column: |
| 78 | column_names.emplace_back(); |
| 79 | pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_names.back()); |
| 80 | |
| 81 | if (*pos == ':') |
| 82 | state = Format; |
| 83 | else if (*pos == '}') |
| 84 | { |
| 85 | formats.push_back(ColumnFormat::None); |
| 86 | delimiters.emplace_back(); |
| 87 | state = Delimiter; |
| 88 | } |
| 89 | else |
| 90 | throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + |
| 91 | ": Expected ':' or '}' after column name \"" + column_names.back() + "\"" + |
| 92 | ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"" , columnsCount()); |
| 93 | |
| 94 | token_begin = pos + 1; |
| 95 | column_idx.reset(); |
| 96 | if (!column_names.back().empty()) |
| 97 | { |
| 98 | col_idx_end = nullptr; |
| 99 | errno = 0; |
| 100 | column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10); |
| 101 | if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno) |
| 102 | column_idx = idx_by_name(column_names.back()); |
| 103 | } |
| 104 | format_idx_to_column_idx.emplace_back(column_idx); |
| 105 | break; |
| 106 | |
| 107 | case Format: |
| 108 | if (*pos == '}') |
| 109 | { |
| 110 | formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); |
| 111 | token_begin = pos + 1; |
| 112 | delimiters.emplace_back(); |
| 113 | state = Delimiter; |
| 114 | } |
| 115 | } |
| 116 | } |
| 117 | if (state != Delimiter) |
| 118 | throwInvalidFormat("Unbalanced parentheses" , columnsCount()); |
| 119 | delimiters.back().append(token_begin, pos - token_begin); |
| 120 | } |
| 121 | |
| 122 | |
| 123 | ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) |
| 124 | { |
| 125 | if (col_format.empty()) |
| 126 | return ColumnFormat::None; |
| 127 | else if (col_format == "None" ) |
| 128 | return ColumnFormat::None; |
| 129 | else if (col_format == "Escaped" ) |
| 130 | return ColumnFormat::Escaped; |
| 131 | else if (col_format == "Quoted" ) |
| 132 | return ColumnFormat::Quoted; |
| 133 | else if (col_format == "CSV" ) |
| 134 | return ColumnFormat::Csv; |
| 135 | else if (col_format == "JSON" ) |
| 136 | return ColumnFormat::Json; |
| 137 | else if (col_format == "XML" ) |
| 138 | return ColumnFormat::Xml; |
| 139 | else if (col_format == "Raw" ) |
| 140 | return ColumnFormat::Raw; |
| 141 | else |
| 142 | throw Exception("Unknown field format \"" + col_format + "\"" , ErrorCodes::BAD_ARGUMENTS); |
| 143 | } |
| 144 | |
| 145 | size_t ParsedTemplateFormatString::columnsCount() const |
| 146 | { |
| 147 | return format_idx_to_column_idx.size(); |
| 148 | } |
| 149 | |
| 150 | String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format) |
| 151 | { |
| 152 | switch (format) |
| 153 | { |
| 154 | case ColumnFormat::None: |
| 155 | return "None" ; |
| 156 | case ColumnFormat::Escaped: |
| 157 | return "Escaped" ; |
| 158 | case ColumnFormat::Quoted: |
| 159 | return "Quoted" ; |
| 160 | case ColumnFormat::Csv: |
| 161 | return "CSV" ; |
| 162 | case ColumnFormat::Json: |
| 163 | return "Json" ; |
| 164 | case ColumnFormat::Xml: |
| 165 | return "Xml" ; |
| 166 | case ColumnFormat::Raw: |
| 167 | return "Raw" ; |
| 168 | } |
| 169 | __builtin_unreachable(); |
| 170 | } |
| 171 | |
| 172 | const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s) |
| 173 | { |
| 174 | s.clear(); |
| 175 | if (!size) |
| 176 | return pos; |
| 177 | ReadBufferFromMemory buf{pos, size}; |
| 178 | if (*pos == '"') |
| 179 | readDoubleQuotedStringWithSQLStyle(s, buf); |
| 180 | else if (*pos == '`') |
| 181 | readBackQuotedStringWithSQLStyle(s, buf); |
| 182 | else if (isWordCharASCII(*pos)) |
| 183 | { |
| 184 | size_t name_size = 1; |
| 185 | while (name_size < size && isWordCharASCII(*(pos + name_size))) |
| 186 | ++name_size; |
| 187 | s = String{pos, name_size}; |
| 188 | return pos + name_size; |
| 189 | } |
| 190 | return pos + buf.count(); |
| 191 | } |
| 192 | |
| 193 | String ParsedTemplateFormatString::dump() const |
| 194 | { |
| 195 | WriteBufferFromOwnString res; |
| 196 | res << "Delimiter " << 0 << ": " ; |
| 197 | verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res); |
| 198 | |
| 199 | size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size()); |
| 200 | for (size_t i = 0; i < num_columns; ++i) |
| 201 | { |
| 202 | res << "\nColumn " << i << ": \"" ; |
| 203 | if (column_names.size() <= i) |
| 204 | res << "<ERROR>" ; |
| 205 | else if (column_names[i].empty()) |
| 206 | res << "<SKIPPED>" ; |
| 207 | else |
| 208 | res << column_names[i]; |
| 209 | |
| 210 | res << "\" (mapped to table column " ; |
| 211 | if (format_idx_to_column_idx.size() <= i) |
| 212 | res << "<ERROR>" ; |
| 213 | else if (!format_idx_to_column_idx[i]) |
| 214 | res << "<SKIPPED>" ; |
| 215 | else |
| 216 | res << *format_idx_to_column_idx[i]; |
| 217 | |
| 218 | res << "), Format " << (i < formats.size() ? formatToString(formats[i]) : "<ERROR>" ); |
| 219 | |
| 220 | res << "\nDelimiter " << i + 1 << ": " ; |
| 221 | if (delimiters.size() <= i + 1) |
| 222 | res << "<ERROR>" ; |
| 223 | else |
| 224 | verbosePrintString(delimiters[i + 1].c_str(), delimiters[i + 1].c_str() + delimiters[i + 1].size(), res); |
| 225 | } |
| 226 | |
| 227 | return res.str(); |
| 228 | } |
| 229 | |
| 230 | void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size_t column) const |
| 231 | { |
| 232 | throw Exception("Invalid format string for Template: " + message + " (near column " + std::to_string(column) + |
| 233 | ")" + ". Parsed format string:\n" + dump() + "\n" , |
| 234 | ErrorCodes::INVALID_TEMPLATE_FORMAT); |
| 235 | } |
| 236 | |
| 237 | ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedResultsetFormat(const FormatSettings::Custom & settings) |
| 238 | { |
| 239 | /// Set resultset format to "result_before_delimiter ${data} result_after_delimiter" |
| 240 | ParsedTemplateFormatString resultset_format; |
| 241 | resultset_format.delimiters.emplace_back(settings.result_before_delimiter); |
| 242 | resultset_format.delimiters.emplace_back(settings.result_after_delimiter); |
| 243 | resultset_format.formats.emplace_back(ParsedTemplateFormatString::ColumnFormat::None); |
| 244 | resultset_format.format_idx_to_column_idx.emplace_back(0); |
| 245 | resultset_format.column_names.emplace_back("data" ); |
| 246 | return resultset_format; |
| 247 | } |
| 248 | |
| 249 | ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedRowFormat(const FormatSettings::Custom & settings, const Block & sample) |
| 250 | { |
| 251 | /// Set row format to |
| 252 | /// "row_before_delimiter ${Col0:escaping} field_delimiter ${Col1:escaping} field_delimiter ... ${ColN:escaping} row_after_delimiter" |
| 253 | ParsedTemplateFormatString::ColumnFormat escaping = ParsedTemplateFormatString::stringToFormat(settings.escaping_rule); |
| 254 | ParsedTemplateFormatString row_format; |
| 255 | row_format.delimiters.emplace_back(settings.row_before_delimiter); |
| 256 | for (size_t i = 0; i < sample.columns(); ++i) |
| 257 | { |
| 258 | row_format.formats.emplace_back(escaping); |
| 259 | row_format.format_idx_to_column_idx.emplace_back(i); |
| 260 | row_format.column_names.emplace_back(sample.getByPosition(i).name); |
| 261 | bool last_column = i == sample.columns() - 1; |
| 262 | row_format.delimiters.emplace_back(last_column ? settings.row_after_delimiter : settings.field_delimiter); |
| 263 | } |
| 264 | return row_format; |
| 265 | } |
| 266 | |
| 267 | } |
| 268 | |