1#include "duckdb/execution/operator/persistent/csv_reader_options.hpp"
2#include "duckdb/common/bind_helpers.hpp"
3#include "duckdb/common/vector_size.hpp"
4#include "duckdb/common/string_util.hpp"
5
6namespace duckdb {
7
8static bool ParseBoolean(const Value &value, const string &loption);
9
10static bool ParseBoolean(const vector<Value> &set, const string &loption) {
11 if (set.empty()) {
12 // no option specified: default to true
13 return true;
14 }
15 if (set.size() > 1) {
16 throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)", loption);
17 }
18 return ParseBoolean(value: set[0], loption);
19}
20
21static bool ParseBoolean(const Value &value, const string &loption) {
22
23 if (value.type().id() == LogicalTypeId::LIST) {
24 auto &children = ListValue::GetChildren(value);
25 return ParseBoolean(set: children, loption);
26 }
27 if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE ||
28 value.type().id() == LogicalTypeId::DECIMAL) {
29 throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)", loption);
30 }
31 return BooleanValue::Get(value: value.DefaultCastAs(target_type: LogicalType::BOOLEAN));
32}
33
34static string ParseString(const Value &value, const string &loption) {
35 if (value.IsNull()) {
36 return string();
37 }
38 if (value.type().id() == LogicalTypeId::LIST) {
39 auto &children = ListValue::GetChildren(value);
40 if (children.size() != 1) {
41 throw BinderException("\"%s\" expects a single argument as a string value", loption);
42 }
43 return ParseString(value: children[0], loption);
44 }
45 if (value.type().id() != LogicalTypeId::VARCHAR) {
46 throw BinderException("\"%s\" expects a string argument!", loption);
47 }
48 return value.GetValue<string>();
49}
50
51static int64_t ParseInteger(const Value &value, const string &loption) {
52 if (value.type().id() == LogicalTypeId::LIST) {
53 auto &children = ListValue::GetChildren(value);
54 if (children.size() != 1) {
55 // no option specified or multiple options specified
56 throw BinderException("\"%s\" expects a single argument as an integer value", loption);
57 }
58 return ParseInteger(value: children[0], loption);
59 }
60 return value.GetValue<int64_t>();
61}
62
63void BufferedCSVReaderOptions::SetHeader(bool input) {
64 this->header = input;
65 this->has_header = true;
66}
67
68void BufferedCSVReaderOptions::SetCompression(const string &compression_p) {
69 this->compression = FileCompressionTypeFromString(input: compression_p);
70}
71
72void BufferedCSVReaderOptions::SetEscape(const string &input) {
73 this->escape = input;
74 this->has_escape = true;
75}
76
77void BufferedCSVReaderOptions::SetDelimiter(const string &input) {
78 this->delimiter = StringUtil::Replace(source: input, from: "\\t", to: "\t");
79 this->has_delimiter = true;
80 if (input.empty()) {
81 this->delimiter = string("\0", 1);
82 }
83}
84
85void BufferedCSVReaderOptions::SetQuote(const string &quote_p) {
86 this->quote = quote_p;
87 this->has_quote = true;
88}
89
90void BufferedCSVReaderOptions::SetNewline(const string &input) {
91 if (input == "\\n" || input == "\\r") {
92 new_line = NewLineIdentifier::SINGLE;
93 } else if (input == "\\r\\n") {
94 new_line = NewLineIdentifier::CARRY_ON;
95 } else {
96 throw InvalidInputException("This is not accepted as a newline: " + input);
97 }
98 has_newline = true;
99}
100
101void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) {
102 string error;
103 if (read_format) {
104 error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: date_format[type]);
105 date_format[type].format_specifier = format;
106 } else {
107 error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: write_date_format[type]);
108 }
109 if (!error.empty()) {
110 throw InvalidInputException("Could not parse DATEFORMAT: %s", error.c_str());
111 }
112 has_format[type] = true;
113}
114
115void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value,
116 vector<string> &expected_names) {
117 if (SetBaseOption(loption, value)) {
118 return;
119 }
120 if (loption == "auto_detect") {
121 auto_detect = ParseBoolean(value, loption);
122 } else if (loption == "sample_size") {
123 int64_t sample_size = ParseInteger(value, loption);
124 if (sample_size < 1 && sample_size != -1) {
125 throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1");
126 }
127 if (sample_size == -1) {
128 sample_chunks = std::numeric_limits<uint64_t>::max();
129 sample_chunk_size = STANDARD_VECTOR_SIZE;
130 } else if (sample_size <= STANDARD_VECTOR_SIZE) {
131 sample_chunk_size = sample_size;
132 sample_chunks = 1;
133 } else {
134 sample_chunk_size = STANDARD_VECTOR_SIZE;
135 sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1;
136 }
137 } else if (loption == "skip") {
138 skip_rows = ParseInteger(value, loption);
139 skip_rows_set = true;
140 } else if (loption == "max_line_size" || loption == "maximum_line_size") {
141 maximum_line_size = ParseInteger(value, loption);
142 } else if (loption == "sample_chunk_size") {
143 sample_chunk_size = ParseInteger(value, loption);
144 if (sample_chunk_size > STANDARD_VECTOR_SIZE) {
145 throw BinderException(
146 "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d",
147 STANDARD_VECTOR_SIZE);
148 } else if (sample_chunk_size < 1) {
149 throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1");
150 }
151 } else if (loption == "sample_chunks") {
152 sample_chunks = ParseInteger(value, loption);
153 if (sample_chunks < 1) {
154 throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1");
155 }
156 } else if (loption == "force_not_null") {
157 force_not_null = ParseColumnList(value, names&: expected_names, option_name: loption);
158 } else if (loption == "date_format" || loption == "dateformat") {
159 string format = ParseString(value, loption);
160 SetDateFormat(type: LogicalTypeId::DATE, format, read_format: true);
161 } else if (loption == "timestamp_format" || loption == "timestampformat") {
162 string format = ParseString(value, loption);
163 SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: true);
164 } else if (loption == "ignore_errors") {
165 ignore_errors = ParseBoolean(value, loption);
166 } else if (loption == "buffer_size") {
167 buffer_size = ParseInteger(value, loption);
168 if (buffer_size == 0) {
169 throw InvalidInputException("Buffer Size option must be higher than 0");
170 }
171 } else if (loption == "decimal_separator") {
172 decimal_separator = ParseString(value, loption);
173 if (decimal_separator != "." && decimal_separator != ",") {
174 throw BinderException("Unsupported parameter for DECIMAL_SEPARATOR: should be '.' or ','");
175 }
176 } else if (loption == "null_padding") {
177 null_padding = ParseBoolean(value, loption);
178 } else if (loption == "allow_quoted_nulls") {
179 allow_quoted_nulls = ParseBoolean(value, loption);
180 } else if (loption == "parallel") {
181 parallel_mode = ParseBoolean(value, loption) ? ParallelMode::PARALLEL : ParallelMode::SINGLE_THREADED;
182 } else {
183 throw BinderException("Unrecognized option for CSV reader \"%s\"", loption);
184 }
185}
186
187void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) {
188 if (loption == "new_line") {
189 // Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY)
190 write_newline = ParseString(value, loption);
191 return;
192 }
193
194 if (SetBaseOption(loption, value)) {
195 return;
196 }
197
198 if (loption == "force_quote") {
199 force_quote = ParseColumnList(value, names&: name_list, option_name: loption);
200 } else if (loption == "date_format" || loption == "dateformat") {
201 string format = ParseString(value, loption);
202 SetDateFormat(type: LogicalTypeId::DATE, format, read_format: false);
203 } else if (loption == "timestamp_format" || loption == "timestampformat") {
204 string format = ParseString(value, loption);
205 if (StringUtil::Lower(str: format) == "iso") {
206 format = "%Y-%m-%dT%H:%M:%S.%fZ";
207 }
208 SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: false);
209 SetDateFormat(type: LogicalTypeId::TIMESTAMP_TZ, format, read_format: false);
210 } else if (loption == "prefix") {
211 prefix = ParseString(value, loption);
212 } else if (loption == "suffix") {
213 suffix = ParseString(value, loption);
214 } else {
215 throw BinderException("Unrecognized option CSV writer \"%s\"", loption);
216 }
217}
218
219bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) {
220 // Make sure this function was only called after the option was turned into lowercase
221 D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper));
222
223 if (StringUtil::StartsWith(str: loption, prefix: "delim") || StringUtil::StartsWith(str: loption, prefix: "sep")) {
224 SetDelimiter(ParseString(value, loption));
225 } else if (loption == "quote") {
226 SetQuote(ParseString(value, loption));
227 } else if (loption == "new_line") {
228 SetNewline(ParseString(value, loption));
229 } else if (loption == "escape") {
230 SetEscape(ParseString(value, loption));
231 } else if (loption == "header") {
232 SetHeader(ParseBoolean(value, loption));
233 } else if (loption == "null" || loption == "nullstr") {
234 null_str = ParseString(value, loption);
235 } else if (loption == "encoding") {
236 auto encoding = StringUtil::Lower(str: ParseString(value, loption));
237 if (encoding != "utf8" && encoding != "utf-8") {
238 throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'");
239 }
240 } else if (loption == "compression") {
241 SetCompression(ParseString(value, loption));
242 } else {
243 // unrecognized option in base CSV
244 return false;
245 }
246 return true;
247}
248
249std::string BufferedCSVReaderOptions::ToString() const {
250 return " file=" + file_path + "\n delimiter='" + delimiter +
251 (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n quote='" + quote +
252 (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) + "\n escape='" + escape +
253 (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)")) +
254 "\n header=" + std::to_string(val: header) +
255 (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)")) +
256 "\n sample_size=" + std::to_string(val: sample_chunk_size * sample_chunks) +
257 "\n ignore_errors=" + std::to_string(val: ignore_errors) + "\n all_varchar=" + std::to_string(val: all_varchar);
258}
259
260} // namespace duckdb
261