| 1 | #include "duckdb/execution/operator/persistent/csv_reader_options.hpp" |
| 2 | #include "duckdb/common/bind_helpers.hpp" |
| 3 | #include "duckdb/common/vector_size.hpp" |
| 4 | #include "duckdb/common/string_util.hpp" |
| 5 | |
| 6 | namespace duckdb { |
| 7 | |
| 8 | static bool ParseBoolean(const Value &value, const string &loption); |
| 9 | |
| 10 | static bool ParseBoolean(const vector<Value> &set, const string &loption) { |
| 11 | if (set.empty()) { |
| 12 | // no option specified: default to true |
| 13 | return true; |
| 14 | } |
| 15 | if (set.size() > 1) { |
| 16 | throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)" , loption); |
| 17 | } |
| 18 | return ParseBoolean(value: set[0], loption); |
| 19 | } |
| 20 | |
| 21 | static bool ParseBoolean(const Value &value, const string &loption) { |
| 22 | |
| 23 | if (value.type().id() == LogicalTypeId::LIST) { |
| 24 | auto &children = ListValue::GetChildren(value); |
| 25 | return ParseBoolean(set: children, loption); |
| 26 | } |
| 27 | if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE || |
| 28 | value.type().id() == LogicalTypeId::DECIMAL) { |
| 29 | throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)" , loption); |
| 30 | } |
| 31 | return BooleanValue::Get(value: value.DefaultCastAs(target_type: LogicalType::BOOLEAN)); |
| 32 | } |
| 33 | |
| 34 | static string ParseString(const Value &value, const string &loption) { |
| 35 | if (value.IsNull()) { |
| 36 | return string(); |
| 37 | } |
| 38 | if (value.type().id() == LogicalTypeId::LIST) { |
| 39 | auto &children = ListValue::GetChildren(value); |
| 40 | if (children.size() != 1) { |
| 41 | throw BinderException("\"%s\" expects a single argument as a string value" , loption); |
| 42 | } |
| 43 | return ParseString(value: children[0], loption); |
| 44 | } |
| 45 | if (value.type().id() != LogicalTypeId::VARCHAR) { |
| 46 | throw BinderException("\"%s\" expects a string argument!" , loption); |
| 47 | } |
| 48 | return value.GetValue<string>(); |
| 49 | } |
| 50 | |
| 51 | static int64_t ParseInteger(const Value &value, const string &loption) { |
| 52 | if (value.type().id() == LogicalTypeId::LIST) { |
| 53 | auto &children = ListValue::GetChildren(value); |
| 54 | if (children.size() != 1) { |
| 55 | // no option specified or multiple options specified |
| 56 | throw BinderException("\"%s\" expects a single argument as an integer value" , loption); |
| 57 | } |
| 58 | return ParseInteger(value: children[0], loption); |
| 59 | } |
| 60 | return value.GetValue<int64_t>(); |
| 61 | } |
| 62 | |
| 63 | void BufferedCSVReaderOptions::(bool input) { |
| 64 | this->header = input; |
| 65 | this->has_header = true; |
| 66 | } |
| 67 | |
| 68 | void BufferedCSVReaderOptions::SetCompression(const string &compression_p) { |
| 69 | this->compression = FileCompressionTypeFromString(input: compression_p); |
| 70 | } |
| 71 | |
| 72 | void BufferedCSVReaderOptions::SetEscape(const string &input) { |
| 73 | this->escape = input; |
| 74 | this->has_escape = true; |
| 75 | } |
| 76 | |
| 77 | void BufferedCSVReaderOptions::SetDelimiter(const string &input) { |
| 78 | this->delimiter = StringUtil::Replace(source: input, from: "\\t" , to: "\t" ); |
| 79 | this->has_delimiter = true; |
| 80 | if (input.empty()) { |
| 81 | this->delimiter = string("\0" , 1); |
| 82 | } |
| 83 | } |
| 84 | |
| 85 | void BufferedCSVReaderOptions::SetQuote(const string "e_p) { |
| 86 | this->quote = quote_p; |
| 87 | this->has_quote = true; |
| 88 | } |
| 89 | |
| 90 | void BufferedCSVReaderOptions::SetNewline(const string &input) { |
| 91 | if (input == "\\n" || input == "\\r" ) { |
| 92 | new_line = NewLineIdentifier::SINGLE; |
| 93 | } else if (input == "\\r\\n" ) { |
| 94 | new_line = NewLineIdentifier::CARRY_ON; |
| 95 | } else { |
| 96 | throw InvalidInputException("This is not accepted as a newline: " + input); |
| 97 | } |
| 98 | has_newline = true; |
| 99 | } |
| 100 | |
| 101 | void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) { |
| 102 | string error; |
| 103 | if (read_format) { |
| 104 | error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: date_format[type]); |
| 105 | date_format[type].format_specifier = format; |
| 106 | } else { |
| 107 | error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: write_date_format[type]); |
| 108 | } |
| 109 | if (!error.empty()) { |
| 110 | throw InvalidInputException("Could not parse DATEFORMAT: %s" , error.c_str()); |
| 111 | } |
| 112 | has_format[type] = true; |
| 113 | } |
| 114 | |
| 115 | void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value, |
| 116 | vector<string> &expected_names) { |
| 117 | if (SetBaseOption(loption, value)) { |
| 118 | return; |
| 119 | } |
| 120 | if (loption == "auto_detect" ) { |
| 121 | auto_detect = ParseBoolean(value, loption); |
| 122 | } else if (loption == "sample_size" ) { |
| 123 | int64_t sample_size = ParseInteger(value, loption); |
| 124 | if (sample_size < 1 && sample_size != -1) { |
| 125 | throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1" ); |
| 126 | } |
| 127 | if (sample_size == -1) { |
| 128 | sample_chunks = std::numeric_limits<uint64_t>::max(); |
| 129 | sample_chunk_size = STANDARD_VECTOR_SIZE; |
| 130 | } else if (sample_size <= STANDARD_VECTOR_SIZE) { |
| 131 | sample_chunk_size = sample_size; |
| 132 | sample_chunks = 1; |
| 133 | } else { |
| 134 | sample_chunk_size = STANDARD_VECTOR_SIZE; |
| 135 | sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1; |
| 136 | } |
| 137 | } else if (loption == "skip" ) { |
| 138 | skip_rows = ParseInteger(value, loption); |
| 139 | skip_rows_set = true; |
| 140 | } else if (loption == "max_line_size" || loption == "maximum_line_size" ) { |
| 141 | maximum_line_size = ParseInteger(value, loption); |
| 142 | } else if (loption == "sample_chunk_size" ) { |
| 143 | sample_chunk_size = ParseInteger(value, loption); |
| 144 | if (sample_chunk_size > STANDARD_VECTOR_SIZE) { |
| 145 | throw BinderException( |
| 146 | "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d" , |
| 147 | STANDARD_VECTOR_SIZE); |
| 148 | } else if (sample_chunk_size < 1) { |
| 149 | throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1" ); |
| 150 | } |
| 151 | } else if (loption == "sample_chunks" ) { |
| 152 | sample_chunks = ParseInteger(value, loption); |
| 153 | if (sample_chunks < 1) { |
| 154 | throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1" ); |
| 155 | } |
| 156 | } else if (loption == "force_not_null" ) { |
| 157 | force_not_null = ParseColumnList(value, names&: expected_names, option_name: loption); |
| 158 | } else if (loption == "date_format" || loption == "dateformat" ) { |
| 159 | string format = ParseString(value, loption); |
| 160 | SetDateFormat(type: LogicalTypeId::DATE, format, read_format: true); |
| 161 | } else if (loption == "timestamp_format" || loption == "timestampformat" ) { |
| 162 | string format = ParseString(value, loption); |
| 163 | SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: true); |
| 164 | } else if (loption == "ignore_errors" ) { |
| 165 | ignore_errors = ParseBoolean(value, loption); |
| 166 | } else if (loption == "buffer_size" ) { |
| 167 | buffer_size = ParseInteger(value, loption); |
| 168 | if (buffer_size == 0) { |
| 169 | throw InvalidInputException("Buffer Size option must be higher than 0" ); |
| 170 | } |
| 171 | } else if (loption == "decimal_separator" ) { |
| 172 | decimal_separator = ParseString(value, loption); |
| 173 | if (decimal_separator != "." && decimal_separator != "," ) { |
| 174 | throw BinderException("Unsupported parameter for DECIMAL_SEPARATOR: should be '.' or ','" ); |
| 175 | } |
| 176 | } else if (loption == "null_padding" ) { |
| 177 | null_padding = ParseBoolean(value, loption); |
| 178 | } else if (loption == "allow_quoted_nulls" ) { |
| 179 | allow_quoted_nulls = ParseBoolean(value, loption); |
| 180 | } else if (loption == "parallel" ) { |
| 181 | parallel_mode = ParseBoolean(value, loption) ? ParallelMode::PARALLEL : ParallelMode::SINGLE_THREADED; |
| 182 | } else { |
| 183 | throw BinderException("Unrecognized option for CSV reader \"%s\"" , loption); |
| 184 | } |
| 185 | } |
| 186 | |
| 187 | void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) { |
| 188 | if (loption == "new_line" ) { |
| 189 | // Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY) |
| 190 | write_newline = ParseString(value, loption); |
| 191 | return; |
| 192 | } |
| 193 | |
| 194 | if (SetBaseOption(loption, value)) { |
| 195 | return; |
| 196 | } |
| 197 | |
| 198 | if (loption == "force_quote" ) { |
| 199 | force_quote = ParseColumnList(value, names&: name_list, option_name: loption); |
| 200 | } else if (loption == "date_format" || loption == "dateformat" ) { |
| 201 | string format = ParseString(value, loption); |
| 202 | SetDateFormat(type: LogicalTypeId::DATE, format, read_format: false); |
| 203 | } else if (loption == "timestamp_format" || loption == "timestampformat" ) { |
| 204 | string format = ParseString(value, loption); |
| 205 | if (StringUtil::Lower(str: format) == "iso" ) { |
| 206 | format = "%Y-%m-%dT%H:%M:%S.%fZ" ; |
| 207 | } |
| 208 | SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: false); |
| 209 | SetDateFormat(type: LogicalTypeId::TIMESTAMP_TZ, format, read_format: false); |
| 210 | } else if (loption == "prefix" ) { |
| 211 | prefix = ParseString(value, loption); |
| 212 | } else if (loption == "suffix" ) { |
| 213 | suffix = ParseString(value, loption); |
| 214 | } else { |
| 215 | throw BinderException("Unrecognized option CSV writer \"%s\"" , loption); |
| 216 | } |
| 217 | } |
| 218 | |
| 219 | bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) { |
| 220 | // Make sure this function was only called after the option was turned into lowercase |
| 221 | D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper)); |
| 222 | |
| 223 | if (StringUtil::StartsWith(str: loption, prefix: "delim" ) || StringUtil::StartsWith(str: loption, prefix: "sep" )) { |
| 224 | SetDelimiter(ParseString(value, loption)); |
| 225 | } else if (loption == "quote" ) { |
| 226 | SetQuote(ParseString(value, loption)); |
| 227 | } else if (loption == "new_line" ) { |
| 228 | SetNewline(ParseString(value, loption)); |
| 229 | } else if (loption == "escape" ) { |
| 230 | SetEscape(ParseString(value, loption)); |
| 231 | } else if (loption == "header" ) { |
| 232 | SetHeader(ParseBoolean(value, loption)); |
| 233 | } else if (loption == "null" || loption == "nullstr" ) { |
| 234 | null_str = ParseString(value, loption); |
| 235 | } else if (loption == "encoding" ) { |
| 236 | auto encoding = StringUtil::Lower(str: ParseString(value, loption)); |
| 237 | if (encoding != "utf8" && encoding != "utf-8" ) { |
| 238 | throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'" ); |
| 239 | } |
| 240 | } else if (loption == "compression" ) { |
| 241 | SetCompression(ParseString(value, loption)); |
| 242 | } else { |
| 243 | // unrecognized option in base CSV |
| 244 | return false; |
| 245 | } |
| 246 | return true; |
| 247 | } |
| 248 | |
| 249 | std::string BufferedCSVReaderOptions::ToString() const { |
| 250 | return " file=" + file_path + "\n delimiter='" + delimiter + |
| 251 | (has_delimiter ? "'" : (auto_detect ? "' (auto detected)" : "' (default)" )) + "\n quote='" + quote + |
| 252 | (has_quote ? "'" : (auto_detect ? "' (auto detected)" : "' (default)" )) + "\n escape='" + escape + |
| 253 | (has_escape ? "'" : (auto_detect ? "' (auto detected)" : "' (default)" )) + |
| 254 | "\n header=" + std::to_string(val: header) + |
| 255 | (has_header ? "" : (auto_detect ? " (auto detected)" : "' (default)" )) + |
| 256 | "\n sample_size=" + std::to_string(val: sample_chunk_size * sample_chunks) + |
| 257 | "\n ignore_errors=" + std::to_string(val: ignore_errors) + "\n all_varchar=" + std::to_string(val: all_varchar); |
| 258 | } |
| 259 | |
| 260 | } // namespace duckdb |
| 261 | |