1 | #include <Formats/ParsedTemplateFormatString.h> |
2 | #include <Formats/verbosePrintString.h> |
3 | #include <IO/ReadBufferFromMemory.h> |
4 | #include <IO/Operators.h> |
5 | #include <IO/ReadBufferFromFile.h> |
6 | #include <Core/Settings.h> |
7 | #include <Interpreters/Context.h> |
8 | |
9 | namespace DB |
10 | { |
11 | |
/// Error codes used in this translation unit. Each code thrown below is declared here
/// explicitly so the file does not depend on another header happening to declare it.
namespace ErrorCodes
{
    /// Thrown by throwInvalidFormat() and tested in the constructor's catch block.
    extern const int INVALID_TEMPLATE_FORMAT;
    /// Thrown by stringToFormat() for an unknown format name.
    extern const int BAD_ARGUMENTS;
}
16 | |
17 | ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name) |
18 | { |
19 | try |
20 | { |
21 | ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096); |
22 | String format_string; |
23 | readStringUntilEOF(format_string, schema_file); |
24 | parse(format_string, idx_by_name); |
25 | } |
26 | catch (DB::Exception & e) |
27 | { |
28 | if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT) |
29 | throwInvalidFormat(e.message(), columnsCount()); |
30 | else |
31 | throw; |
32 | } |
33 | } |
34 | |
35 | |
36 | void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name) |
37 | { |
38 | enum ParserState |
39 | { |
40 | Delimiter, |
41 | Column, |
42 | Format |
43 | }; |
44 | |
45 | const char * pos = format_string.c_str(); |
46 | const char * end = format_string.c_str() + format_string.size(); |
47 | const char * token_begin = pos; |
48 | ParserState state = Delimiter; |
49 | delimiters.emplace_back(); |
50 | char * col_idx_end; |
51 | std::optional<size_t> column_idx; |
52 | for (; *pos; ++pos) |
53 | { |
54 | switch (state) |
55 | { |
56 | case Delimiter: |
57 | if (*pos == '$') |
58 | { |
59 | delimiters.back().append(token_begin, pos - token_begin); |
60 | ++pos; |
61 | if (*pos == '{') |
62 | { |
63 | token_begin = pos + 1; |
64 | state = Column; |
65 | } |
66 | else if (*pos == '$') |
67 | { |
68 | token_begin = pos; |
69 | } |
70 | else |
71 | throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + |
72 | ": Expected '{' or '$' after '$'" + |
73 | ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"" , columnsCount()); |
74 | } |
75 | break; |
76 | |
77 | case Column: |
78 | column_names.emplace_back(); |
79 | pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_names.back()); |
80 | |
81 | if (*pos == ':') |
82 | state = Format; |
83 | else if (*pos == '}') |
84 | { |
85 | formats.push_back(ColumnFormat::None); |
86 | delimiters.emplace_back(); |
87 | state = Delimiter; |
88 | } |
89 | else |
90 | throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) + |
91 | ": Expected ':' or '}' after column name \"" + column_names.back() + "\"" + |
92 | ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"" , columnsCount()); |
93 | |
94 | token_begin = pos + 1; |
95 | column_idx.reset(); |
96 | if (!column_names.back().empty()) |
97 | { |
98 | col_idx_end = nullptr; |
99 | errno = 0; |
100 | column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10); |
101 | if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno) |
102 | column_idx = idx_by_name(column_names.back()); |
103 | } |
104 | format_idx_to_column_idx.emplace_back(column_idx); |
105 | break; |
106 | |
107 | case Format: |
108 | if (*pos == '}') |
109 | { |
110 | formats.push_back(stringToFormat(String(token_begin, pos - token_begin))); |
111 | token_begin = pos + 1; |
112 | delimiters.emplace_back(); |
113 | state = Delimiter; |
114 | } |
115 | } |
116 | } |
117 | if (state != Delimiter) |
118 | throwInvalidFormat("Unbalanced parentheses" , columnsCount()); |
119 | delimiters.back().append(token_begin, pos - token_begin); |
120 | } |
121 | |
122 | |
123 | ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format) |
124 | { |
125 | if (col_format.empty()) |
126 | return ColumnFormat::None; |
127 | else if (col_format == "None" ) |
128 | return ColumnFormat::None; |
129 | else if (col_format == "Escaped" ) |
130 | return ColumnFormat::Escaped; |
131 | else if (col_format == "Quoted" ) |
132 | return ColumnFormat::Quoted; |
133 | else if (col_format == "CSV" ) |
134 | return ColumnFormat::Csv; |
135 | else if (col_format == "JSON" ) |
136 | return ColumnFormat::Json; |
137 | else if (col_format == "XML" ) |
138 | return ColumnFormat::Xml; |
139 | else if (col_format == "Raw" ) |
140 | return ColumnFormat::Raw; |
141 | else |
142 | throw Exception("Unknown field format \"" + col_format + "\"" , ErrorCodes::BAD_ARGUMENTS); |
143 | } |
144 | |
145 | size_t ParsedTemplateFormatString::columnsCount() const |
146 | { |
147 | return format_idx_to_column_idx.size(); |
148 | } |
149 | |
150 | String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format) |
151 | { |
152 | switch (format) |
153 | { |
154 | case ColumnFormat::None: |
155 | return "None" ; |
156 | case ColumnFormat::Escaped: |
157 | return "Escaped" ; |
158 | case ColumnFormat::Quoted: |
159 | return "Quoted" ; |
160 | case ColumnFormat::Csv: |
161 | return "CSV" ; |
162 | case ColumnFormat::Json: |
163 | return "Json" ; |
164 | case ColumnFormat::Xml: |
165 | return "Xml" ; |
166 | case ColumnFormat::Raw: |
167 | return "Raw" ; |
168 | } |
169 | __builtin_unreachable(); |
170 | } |
171 | |
172 | const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s) |
173 | { |
174 | s.clear(); |
175 | if (!size) |
176 | return pos; |
177 | ReadBufferFromMemory buf{pos, size}; |
178 | if (*pos == '"') |
179 | readDoubleQuotedStringWithSQLStyle(s, buf); |
180 | else if (*pos == '`') |
181 | readBackQuotedStringWithSQLStyle(s, buf); |
182 | else if (isWordCharASCII(*pos)) |
183 | { |
184 | size_t name_size = 1; |
185 | while (name_size < size && isWordCharASCII(*(pos + name_size))) |
186 | ++name_size; |
187 | s = String{pos, name_size}; |
188 | return pos + name_size; |
189 | } |
190 | return pos + buf.count(); |
191 | } |
192 | |
193 | String ParsedTemplateFormatString::dump() const |
194 | { |
195 | WriteBufferFromOwnString res; |
196 | res << "Delimiter " << 0 << ": " ; |
197 | verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res); |
198 | |
199 | size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size()); |
200 | for (size_t i = 0; i < num_columns; ++i) |
201 | { |
202 | res << "\nColumn " << i << ": \"" ; |
203 | if (column_names.size() <= i) |
204 | res << "<ERROR>" ; |
205 | else if (column_names[i].empty()) |
206 | res << "<SKIPPED>" ; |
207 | else |
208 | res << column_names[i]; |
209 | |
210 | res << "\" (mapped to table column " ; |
211 | if (format_idx_to_column_idx.size() <= i) |
212 | res << "<ERROR>" ; |
213 | else if (!format_idx_to_column_idx[i]) |
214 | res << "<SKIPPED>" ; |
215 | else |
216 | res << *format_idx_to_column_idx[i]; |
217 | |
218 | res << "), Format " << (i < formats.size() ? formatToString(formats[i]) : "<ERROR>" ); |
219 | |
220 | res << "\nDelimiter " << i + 1 << ": " ; |
221 | if (delimiters.size() <= i + 1) |
222 | res << "<ERROR>" ; |
223 | else |
224 | verbosePrintString(delimiters[i + 1].c_str(), delimiters[i + 1].c_str() + delimiters[i + 1].size(), res); |
225 | } |
226 | |
227 | return res.str(); |
228 | } |
229 | |
230 | void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size_t column) const |
231 | { |
232 | throw Exception("Invalid format string for Template: " + message + " (near column " + std::to_string(column) + |
233 | ")" + ". Parsed format string:\n" + dump() + "\n" , |
234 | ErrorCodes::INVALID_TEMPLATE_FORMAT); |
235 | } |
236 | |
237 | ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedResultsetFormat(const FormatSettings::Custom & settings) |
238 | { |
239 | /// Set resultset format to "result_before_delimiter ${data} result_after_delimiter" |
240 | ParsedTemplateFormatString resultset_format; |
241 | resultset_format.delimiters.emplace_back(settings.result_before_delimiter); |
242 | resultset_format.delimiters.emplace_back(settings.result_after_delimiter); |
243 | resultset_format.formats.emplace_back(ParsedTemplateFormatString::ColumnFormat::None); |
244 | resultset_format.format_idx_to_column_idx.emplace_back(0); |
245 | resultset_format.column_names.emplace_back("data" ); |
246 | return resultset_format; |
247 | } |
248 | |
249 | ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedRowFormat(const FormatSettings::Custom & settings, const Block & sample) |
250 | { |
251 | /// Set row format to |
252 | /// "row_before_delimiter ${Col0:escaping} field_delimiter ${Col1:escaping} field_delimiter ... ${ColN:escaping} row_after_delimiter" |
253 | ParsedTemplateFormatString::ColumnFormat escaping = ParsedTemplateFormatString::stringToFormat(settings.escaping_rule); |
254 | ParsedTemplateFormatString row_format; |
255 | row_format.delimiters.emplace_back(settings.row_before_delimiter); |
256 | for (size_t i = 0; i < sample.columns(); ++i) |
257 | { |
258 | row_format.formats.emplace_back(escaping); |
259 | row_format.format_idx_to_column_idx.emplace_back(i); |
260 | row_format.column_names.emplace_back(sample.getByPosition(i).name); |
261 | bool last_column = i == sample.columns() - 1; |
262 | row_format.delimiters.emplace_back(last_column ? settings.row_after_delimiter : settings.field_delimiter); |
263 | } |
264 | return row_format; |
265 | } |
266 | |
267 | } |
268 | |