1 | #include "duckdb/execution/operator/persistent/csv_reader_options.hpp" |
2 | #include "duckdb/common/bind_helpers.hpp" |
3 | #include "duckdb/common/vector_size.hpp" |
4 | #include "duckdb/common/string_util.hpp" |
5 | |
6 | namespace duckdb { |
7 | |
8 | static bool ParseBoolean(const Value &value, const string &loption); |
9 | |
10 | static bool ParseBoolean(const vector<Value> &set, const string &loption) { |
11 | if (set.empty()) { |
12 | // no option specified: default to true |
13 | return true; |
14 | } |
15 | if (set.size() > 1) { |
16 | throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)" , loption); |
17 | } |
18 | return ParseBoolean(value: set[0], loption); |
19 | } |
20 | |
21 | static bool ParseBoolean(const Value &value, const string &loption) { |
22 | |
23 | if (value.type().id() == LogicalTypeId::LIST) { |
24 | auto &children = ListValue::GetChildren(value); |
25 | return ParseBoolean(set: children, loption); |
26 | } |
27 | if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE || |
28 | value.type().id() == LogicalTypeId::DECIMAL) { |
29 | throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)" , loption); |
30 | } |
31 | return BooleanValue::Get(value: value.DefaultCastAs(target_type: LogicalType::BOOLEAN)); |
32 | } |
33 | |
34 | static string ParseString(const Value &value, const string &loption) { |
35 | if (value.IsNull()) { |
36 | return string(); |
37 | } |
38 | if (value.type().id() == LogicalTypeId::LIST) { |
39 | auto &children = ListValue::GetChildren(value); |
40 | if (children.size() != 1) { |
41 | throw BinderException("\"%s\" expects a single argument as a string value" , loption); |
42 | } |
43 | return ParseString(value: children[0], loption); |
44 | } |
45 | if (value.type().id() != LogicalTypeId::VARCHAR) { |
46 | throw BinderException("\"%s\" expects a string argument!" , loption); |
47 | } |
48 | return value.GetValue<string>(); |
49 | } |
50 | |
51 | static int64_t ParseInteger(const Value &value, const string &loption) { |
52 | if (value.type().id() == LogicalTypeId::LIST) { |
53 | auto &children = ListValue::GetChildren(value); |
54 | if (children.size() != 1) { |
55 | // no option specified or multiple options specified |
56 | throw BinderException("\"%s\" expects a single argument as an integer value" , loption); |
57 | } |
58 | return ParseInteger(value: children[0], loption); |
59 | } |
60 | return value.GetValue<int64_t>(); |
61 | } |
62 | |
63 | void BufferedCSVReaderOptions::(bool input) { |
64 | this->header = input; |
65 | this->has_header = true; |
66 | } |
67 | |
68 | void BufferedCSVReaderOptions::SetCompression(const string &compression_p) { |
69 | this->compression = FileCompressionTypeFromString(input: compression_p); |
70 | } |
71 | |
72 | void BufferedCSVReaderOptions::SetEscape(const string &input) { |
73 | this->escape = input; |
74 | this->has_escape = true; |
75 | } |
76 | |
77 | void BufferedCSVReaderOptions::SetDelimiter(const string &input) { |
78 | this->delimiter = StringUtil::Replace(source: input, from: "\\t" , to: "\t" ); |
79 | this->has_delimiter = true; |
80 | if (input.empty()) { |
81 | this->delimiter = string("\0" , 1); |
82 | } |
83 | } |
84 | |
85 | void BufferedCSVReaderOptions::SetQuote(const string "e_p) { |
86 | this->quote = quote_p; |
87 | this->has_quote = true; |
88 | } |
89 | |
90 | void BufferedCSVReaderOptions::SetNewline(const string &input) { |
91 | if (input == "\\n" || input == "\\r" ) { |
92 | new_line = NewLineIdentifier::SINGLE; |
93 | } else if (input == "\\r\\n" ) { |
94 | new_line = NewLineIdentifier::CARRY_ON; |
95 | } else { |
96 | throw InvalidInputException("This is not accepted as a newline: " + input); |
97 | } |
98 | has_newline = true; |
99 | } |
100 | |
101 | void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) { |
102 | string error; |
103 | if (read_format) { |
104 | error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: date_format[type]); |
105 | date_format[type].format_specifier = format; |
106 | } else { |
107 | error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: write_date_format[type]); |
108 | } |
109 | if (!error.empty()) { |
110 | throw InvalidInputException("Could not parse DATEFORMAT: %s" , error.c_str()); |
111 | } |
112 | has_format[type] = true; |
113 | } |
114 | |
115 | void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value, |
116 | vector<string> &expected_names) { |
117 | if (SetBaseOption(loption, value)) { |
118 | return; |
119 | } |
120 | if (loption == "auto_detect" ) { |
121 | auto_detect = ParseBoolean(value, loption); |
122 | } else if (loption == "sample_size" ) { |
123 | int64_t sample_size = ParseInteger(value, loption); |
124 | if (sample_size < 1 && sample_size != -1) { |
125 | throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1" ); |
126 | } |
127 | if (sample_size == -1) { |
128 | sample_chunks = std::numeric_limits<uint64_t>::max(); |
129 | sample_chunk_size = STANDARD_VECTOR_SIZE; |
130 | } else if (sample_size <= STANDARD_VECTOR_SIZE) { |
131 | sample_chunk_size = sample_size; |
132 | sample_chunks = 1; |
133 | } else { |
134 | sample_chunk_size = STANDARD_VECTOR_SIZE; |
135 | sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1; |
136 | } |
137 | } else if (loption == "skip" ) { |
138 | skip_rows = ParseInteger(value, loption); |
139 | skip_rows_set = true; |
140 | } else if (loption == "max_line_size" || loption == "maximum_line_size" ) { |
141 | maximum_line_size = ParseInteger(value, loption); |
142 | } else if (loption == "sample_chunk_size" ) { |
143 | sample_chunk_size = ParseInteger(value, loption); |
144 | if (sample_chunk_size > STANDARD_VECTOR_SIZE) { |
145 | throw BinderException( |
146 | "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d" , |
147 | STANDARD_VECTOR_SIZE); |
148 | } else if (sample_chunk_size < 1) { |
149 | throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1" ); |
150 | } |
151 | } else if (loption == "sample_chunks" ) { |
152 | sample_chunks = ParseInteger(value, loption); |
153 | if (sample_chunks < 1) { |
154 | throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1" ); |
155 | } |
156 | } else if (loption == "force_not_null" ) { |
157 | force_not_null = ParseColumnList(value, names&: expected_names, option_name: loption); |
158 | } else if (loption == "date_format" || loption == "dateformat" ) { |
159 | string format = ParseString(value, loption); |
160 | SetDateFormat(type: LogicalTypeId::DATE, format, read_format: true); |
161 | } else if (loption == "timestamp_format" || loption == "timestampformat" ) { |
162 | string format = ParseString(value, loption); |
163 | SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: true); |
164 | } else if (loption == "ignore_errors" ) { |
165 | ignore_errors = ParseBoolean(value, loption); |
166 | } else if (loption == "buffer_size" ) { |
167 | buffer_size = ParseInteger(value, loption); |
168 | if (buffer_size == 0) { |
169 | throw InvalidInputException("Buffer Size option must be higher than 0" ); |
170 | } |
171 | } else if (loption == "decimal_separator" ) { |
172 | decimal_separator = ParseString(value, loption); |
173 | if (decimal_separator != "." && decimal_separator != "," ) { |
174 | throw BinderException("Unsupported parameter for DECIMAL_SEPARATOR: should be '.' or ','" ); |
175 | } |
176 | } else if (loption == "null_padding" ) { |
177 | null_padding = ParseBoolean(value, loption); |
178 | } else if (loption == "allow_quoted_nulls" ) { |
179 | allow_quoted_nulls = ParseBoolean(value, loption); |
180 | } else if (loption == "parallel" ) { |
181 | parallel_mode = ParseBoolean(value, loption) ? ParallelMode::PARALLEL : ParallelMode::SINGLE_THREADED; |
182 | } else { |
183 | throw BinderException("Unrecognized option for CSV reader \"%s\"" , loption); |
184 | } |
185 | } |
186 | |
187 | void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) { |
188 | if (loption == "new_line" ) { |
189 | // Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY) |
190 | write_newline = ParseString(value, loption); |
191 | return; |
192 | } |
193 | |
194 | if (SetBaseOption(loption, value)) { |
195 | return; |
196 | } |
197 | |
198 | if (loption == "force_quote" ) { |
199 | force_quote = ParseColumnList(value, names&: name_list, option_name: loption); |
200 | } else if (loption == "date_format" || loption == "dateformat" ) { |
201 | string format = ParseString(value, loption); |
202 | SetDateFormat(type: LogicalTypeId::DATE, format, read_format: false); |
203 | } else if (loption == "timestamp_format" || loption == "timestampformat" ) { |
204 | string format = ParseString(value, loption); |
205 | if (StringUtil::Lower(str: format) == "iso" ) { |
206 | format = "%Y-%m-%dT%H:%M:%S.%fZ" ; |
207 | } |
208 | SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: false); |
209 | SetDateFormat(type: LogicalTypeId::TIMESTAMP_TZ, format, read_format: false); |
210 | } else if (loption == "prefix" ) { |
211 | prefix = ParseString(value, loption); |
212 | } else if (loption == "suffix" ) { |
213 | suffix = ParseString(value, loption); |
214 | } else { |
215 | throw BinderException("Unrecognized option CSV writer \"%s\"" , loption); |
216 | } |
217 | } |
218 | |
219 | bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) { |
220 | // Make sure this function was only called after the option was turned into lowercase |
221 | D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper)); |
222 | |
223 | if (StringUtil::StartsWith(str: loption, prefix: "delim" ) || StringUtil::StartsWith(str: loption, prefix: "sep" )) { |
224 | SetDelimiter(ParseString(value, loption)); |
225 | } else if (loption == "quote" ) { |
226 | SetQuote(ParseString(value, loption)); |
227 | } else if (loption == "new_line" ) { |
228 | SetNewline(ParseString(value, loption)); |
229 | } else if (loption == "escape" ) { |
230 | SetEscape(ParseString(value, loption)); |
231 | } else if (loption == "header" ) { |
232 | SetHeader(ParseBoolean(value, loption)); |
233 | } else if (loption == "null" || loption == "nullstr" ) { |
234 | null_str = ParseString(value, loption); |
235 | } else if (loption == "encoding" ) { |
236 | auto encoding = StringUtil::Lower(str: ParseString(value, loption)); |
237 | if (encoding != "utf8" && encoding != "utf-8" ) { |
238 | throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'" ); |
239 | } |
240 | } else if (loption == "compression" ) { |
241 | SetCompression(ParseString(value, loption)); |
242 | } else { |
243 | // unrecognized option in base CSV |
244 | return false; |
245 | } |
246 | return true; |
247 | } |
248 | |
249 | std::string BufferedCSVReaderOptions::ToString() const { |
250 | return " file=" + file_path + "\n delimiter='" + delimiter + |
251 | (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)" )) + "\n quote='" + quote + |
252 | (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)" )) + "\n escape='" + escape + |
253 | (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)" )) + |
254 | "\n header=" + std::to_string(val: header) + |
255 | (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)" )) + |
256 | "\n sample_size=" + std::to_string(val: sample_chunk_size * sample_chunks) + |
257 | "\n ignore_errors=" + std::to_string(val: ignore_errors) + "\n all_varchar=" + std::to_string(val: all_varchar); |
258 | } |
259 | |
260 | } // namespace duckdb |
261 | |