1#include <Formats/ParsedTemplateFormatString.h>
2#include <Formats/verbosePrintString.h>
3#include <IO/ReadBufferFromMemory.h>
4#include <IO/Operators.h>
5#include <IO/ReadBufferFromFile.h>
6#include <Core/Settings.h>
7#include <Interpreters/Context.h>
8
9namespace DB
10{
11
/// Error codes used in this file. They are only declared here (`extern`);
/// the definitions live in another translation unit.
namespace ErrorCodes
{
    /// Thrown by stringToFormat() for an unknown format name. Declared explicitly so that
    /// this file does not depend on some included header happening to declare it.
    extern const int BAD_ARGUMENTS;
    /// Thrown by throwInvalidFormat().
    extern const int INVALID_TEMPLATE_FORMAT;
}
16
17ParsedTemplateFormatString::ParsedTemplateFormatString(const FormatSchemaInfo & schema, const ColumnIdxGetter & idx_by_name)
18{
19 try
20 {
21 ReadBufferFromFile schema_file(schema.absoluteSchemaPath(), 4096);
22 String format_string;
23 readStringUntilEOF(format_string, schema_file);
24 parse(format_string, idx_by_name);
25 }
26 catch (DB::Exception & e)
27 {
28 if (e.code() != ErrorCodes::INVALID_TEMPLATE_FORMAT)
29 throwInvalidFormat(e.message(), columnsCount());
30 else
31 throw;
32 }
33}
34
35
36void ParsedTemplateFormatString::parse(const String & format_string, const ColumnIdxGetter & idx_by_name)
37{
38 enum ParserState
39 {
40 Delimiter,
41 Column,
42 Format
43 };
44
45 const char * pos = format_string.c_str();
46 const char * end = format_string.c_str() + format_string.size();
47 const char * token_begin = pos;
48 ParserState state = Delimiter;
49 delimiters.emplace_back();
50 char * col_idx_end;
51 std::optional<size_t> column_idx;
52 for (; *pos; ++pos)
53 {
54 switch (state)
55 {
56 case Delimiter:
57 if (*pos == '$')
58 {
59 delimiters.back().append(token_begin, pos - token_begin);
60 ++pos;
61 if (*pos == '{')
62 {
63 token_begin = pos + 1;
64 state = Column;
65 }
66 else if (*pos == '$')
67 {
68 token_begin = pos;
69 }
70 else
71 throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) +
72 ": Expected '{' or '$' after '$'" +
73 ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"", columnsCount());
74 }
75 break;
76
77 case Column:
78 column_names.emplace_back();
79 pos = readMayBeQuotedColumnNameInto(pos, end - pos, column_names.back());
80
81 if (*pos == ':')
82 state = Format;
83 else if (*pos == '}')
84 {
85 formats.push_back(ColumnFormat::None);
86 delimiters.emplace_back();
87 state = Delimiter;
88 }
89 else
90 throwInvalidFormat("At pos " + std::to_string(pos - format_string.c_str()) +
91 ": Expected ':' or '}' after column name \"" + column_names.back() + "\"" +
92 ", got \"" + std::string(pos, std::min(end - pos, 16l)) + "\"", columnsCount());
93
94 token_begin = pos + 1;
95 column_idx.reset();
96 if (!column_names.back().empty())
97 {
98 col_idx_end = nullptr;
99 errno = 0;
100 column_idx = strtoull(column_names.back().c_str(), &col_idx_end, 10);
101 if (col_idx_end != column_names.back().c_str() + column_names.back().size() || errno)
102 column_idx = idx_by_name(column_names.back());
103 }
104 format_idx_to_column_idx.emplace_back(column_idx);
105 break;
106
107 case Format:
108 if (*pos == '}')
109 {
110 formats.push_back(stringToFormat(String(token_begin, pos - token_begin)));
111 token_begin = pos + 1;
112 delimiters.emplace_back();
113 state = Delimiter;
114 }
115 }
116 }
117 if (state != Delimiter)
118 throwInvalidFormat("Unbalanced parentheses", columnsCount());
119 delimiters.back().append(token_begin, pos - token_begin);
120}
121
122
123ParsedTemplateFormatString::ColumnFormat ParsedTemplateFormatString::stringToFormat(const String & col_format)
124{
125 if (col_format.empty())
126 return ColumnFormat::None;
127 else if (col_format == "None")
128 return ColumnFormat::None;
129 else if (col_format == "Escaped")
130 return ColumnFormat::Escaped;
131 else if (col_format == "Quoted")
132 return ColumnFormat::Quoted;
133 else if (col_format == "CSV")
134 return ColumnFormat::Csv;
135 else if (col_format == "JSON")
136 return ColumnFormat::Json;
137 else if (col_format == "XML")
138 return ColumnFormat::Xml;
139 else if (col_format == "Raw")
140 return ColumnFormat::Raw;
141 else
142 throw Exception("Unknown field format \"" + col_format + "\"", ErrorCodes::BAD_ARGUMENTS);
143}
144
145size_t ParsedTemplateFormatString::columnsCount() const
146{
147 return format_idx_to_column_idx.size();
148}
149
150String ParsedTemplateFormatString::formatToString(ParsedTemplateFormatString::ColumnFormat format)
151{
152 switch (format)
153 {
154 case ColumnFormat::None:
155 return "None";
156 case ColumnFormat::Escaped:
157 return "Escaped";
158 case ColumnFormat::Quoted:
159 return "Quoted";
160 case ColumnFormat::Csv:
161 return "CSV";
162 case ColumnFormat::Json:
163 return "Json";
164 case ColumnFormat::Xml:
165 return "Xml";
166 case ColumnFormat::Raw:
167 return "Raw";
168 }
169 __builtin_unreachable();
170}
171
172const char * ParsedTemplateFormatString::readMayBeQuotedColumnNameInto(const char * pos, size_t size, String & s)
173{
174 s.clear();
175 if (!size)
176 return pos;
177 ReadBufferFromMemory buf{pos, size};
178 if (*pos == '"')
179 readDoubleQuotedStringWithSQLStyle(s, buf);
180 else if (*pos == '`')
181 readBackQuotedStringWithSQLStyle(s, buf);
182 else if (isWordCharASCII(*pos))
183 {
184 size_t name_size = 1;
185 while (name_size < size && isWordCharASCII(*(pos + name_size)))
186 ++name_size;
187 s = String{pos, name_size};
188 return pos + name_size;
189 }
190 return pos + buf.count();
191}
192
193String ParsedTemplateFormatString::dump() const
194{
195 WriteBufferFromOwnString res;
196 res << "Delimiter " << 0 << ": ";
197 verbosePrintString(delimiters.front().c_str(), delimiters.front().c_str() + delimiters.front().size(), res);
198
199 size_t num_columns = std::max(formats.size(), format_idx_to_column_idx.size());
200 for (size_t i = 0; i < num_columns; ++i)
201 {
202 res << "\nColumn " << i << ": \"";
203 if (column_names.size() <= i)
204 res << "<ERROR>";
205 else if (column_names[i].empty())
206 res << "<SKIPPED>";
207 else
208 res << column_names[i];
209
210 res << "\" (mapped to table column ";
211 if (format_idx_to_column_idx.size() <= i)
212 res << "<ERROR>";
213 else if (!format_idx_to_column_idx[i])
214 res << "<SKIPPED>";
215 else
216 res << *format_idx_to_column_idx[i];
217
218 res << "), Format " << (i < formats.size() ? formatToString(formats[i]) : "<ERROR>");
219
220 res << "\nDelimiter " << i + 1 << ": ";
221 if (delimiters.size() <= i + 1)
222 res << "<ERROR>";
223 else
224 verbosePrintString(delimiters[i + 1].c_str(), delimiters[i + 1].c_str() + delimiters[i + 1].size(), res);
225 }
226
227 return res.str();
228}
229
230void ParsedTemplateFormatString::throwInvalidFormat(const String & message, size_t column) const
231{
232 throw Exception("Invalid format string for Template: " + message + " (near column " + std::to_string(column) +
233 ")" + ". Parsed format string:\n" + dump() + "\n",
234 ErrorCodes::INVALID_TEMPLATE_FORMAT);
235}
236
237ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedResultsetFormat(const FormatSettings::Custom & settings)
238{
239 /// Set resultset format to "result_before_delimiter ${data} result_after_delimiter"
240 ParsedTemplateFormatString resultset_format;
241 resultset_format.delimiters.emplace_back(settings.result_before_delimiter);
242 resultset_format.delimiters.emplace_back(settings.result_after_delimiter);
243 resultset_format.formats.emplace_back(ParsedTemplateFormatString::ColumnFormat::None);
244 resultset_format.format_idx_to_column_idx.emplace_back(0);
245 resultset_format.column_names.emplace_back("data");
246 return resultset_format;
247}
248
249ParsedTemplateFormatString ParsedTemplateFormatString::setupCustomSeparatedRowFormat(const FormatSettings::Custom & settings, const Block & sample)
250{
251 /// Set row format to
252 /// "row_before_delimiter ${Col0:escaping} field_delimiter ${Col1:escaping} field_delimiter ... ${ColN:escaping} row_after_delimiter"
253 ParsedTemplateFormatString::ColumnFormat escaping = ParsedTemplateFormatString::stringToFormat(settings.escaping_rule);
254 ParsedTemplateFormatString row_format;
255 row_format.delimiters.emplace_back(settings.row_before_delimiter);
256 for (size_t i = 0; i < sample.columns(); ++i)
257 {
258 row_format.formats.emplace_back(escaping);
259 row_format.format_idx_to_column_idx.emplace_back(i);
260 row_format.column_names.emplace_back(sample.getByPosition(i).name);
261 bool last_column = i == sample.columns() - 1;
262 row_format.delimiters.emplace_back(last_column ? settings.row_after_delimiter : settings.field_delimiter);
263 }
264 return row_format;
265}
266
267}
268