| 1 | #include "duckdb/execution/operator/persistent/csv_reader_options.hpp" | 
| 2 | #include "duckdb/common/bind_helpers.hpp" | 
| 3 | #include "duckdb/common/vector_size.hpp" | 
| 4 | #include "duckdb/common/string_util.hpp" | 
| 5 |  | 
| 6 | namespace duckdb { | 
| 7 |  | 
| 8 | static bool ParseBoolean(const Value &value, const string &loption); | 
| 9 |  | 
| 10 | static bool ParseBoolean(const vector<Value> &set, const string &loption) { | 
| 11 | 	if (set.empty()) { | 
| 12 | 		// no option specified: default to true | 
| 13 | 		return true; | 
| 14 | 	} | 
| 15 | 	if (set.size() > 1) { | 
| 16 | 		throw BinderException("\"%s\" expects a single argument as a boolean value (e.g. TRUE or 1)" , loption); | 
| 17 | 	} | 
| 18 | 	return ParseBoolean(value: set[0], loption); | 
| 19 | } | 
| 20 |  | 
| 21 | static bool ParseBoolean(const Value &value, const string &loption) { | 
| 22 |  | 
| 23 | 	if (value.type().id() == LogicalTypeId::LIST) { | 
| 24 | 		auto &children = ListValue::GetChildren(value); | 
| 25 | 		return ParseBoolean(set: children, loption); | 
| 26 | 	} | 
| 27 | 	if (value.type() == LogicalType::FLOAT || value.type() == LogicalType::DOUBLE || | 
| 28 | 	    value.type().id() == LogicalTypeId::DECIMAL) { | 
| 29 | 		throw BinderException("\"%s\" expects a boolean value (e.g. TRUE or 1)" , loption); | 
| 30 | 	} | 
| 31 | 	return BooleanValue::Get(value: value.DefaultCastAs(target_type: LogicalType::BOOLEAN)); | 
| 32 | } | 
| 33 |  | 
| 34 | static string ParseString(const Value &value, const string &loption) { | 
| 35 | 	if (value.IsNull()) { | 
| 36 | 		return string(); | 
| 37 | 	} | 
| 38 | 	if (value.type().id() == LogicalTypeId::LIST) { | 
| 39 | 		auto &children = ListValue::GetChildren(value); | 
| 40 | 		if (children.size() != 1) { | 
| 41 | 			throw BinderException("\"%s\" expects a single argument as a string value" , loption); | 
| 42 | 		} | 
| 43 | 		return ParseString(value: children[0], loption); | 
| 44 | 	} | 
| 45 | 	if (value.type().id() != LogicalTypeId::VARCHAR) { | 
| 46 | 		throw BinderException("\"%s\" expects a string argument!" , loption); | 
| 47 | 	} | 
| 48 | 	return value.GetValue<string>(); | 
| 49 | } | 
| 50 |  | 
| 51 | static int64_t ParseInteger(const Value &value, const string &loption) { | 
| 52 | 	if (value.type().id() == LogicalTypeId::LIST) { | 
| 53 | 		auto &children = ListValue::GetChildren(value); | 
| 54 | 		if (children.size() != 1) { | 
| 55 | 			// no option specified or multiple options specified | 
| 56 | 			throw BinderException("\"%s\" expects a single argument as an integer value" , loption); | 
| 57 | 		} | 
| 58 | 		return ParseInteger(value: children[0], loption); | 
| 59 | 	} | 
| 60 | 	return value.GetValue<int64_t>(); | 
| 61 | } | 
| 62 |  | 
| 63 | void BufferedCSVReaderOptions::(bool input) { | 
| 64 | 	this->header = input; | 
| 65 | 	this->has_header = true; | 
| 66 | } | 
| 67 |  | 
| 68 | void BufferedCSVReaderOptions::SetCompression(const string &compression_p) { | 
| 69 | 	this->compression = FileCompressionTypeFromString(input: compression_p); | 
| 70 | } | 
| 71 |  | 
| 72 | void BufferedCSVReaderOptions::SetEscape(const string &input) { | 
| 73 | 	this->escape = input; | 
| 74 | 	this->has_escape = true; | 
| 75 | } | 
| 76 |  | 
| 77 | void BufferedCSVReaderOptions::SetDelimiter(const string &input) { | 
| 78 | 	this->delimiter = StringUtil::Replace(source: input, from: "\\t" , to: "\t" ); | 
| 79 | 	this->has_delimiter = true; | 
| 80 | 	if (input.empty()) { | 
| 81 | 		this->delimiter = string("\0" , 1); | 
| 82 | 	} | 
| 83 | } | 
| 84 |  | 
| 85 | void BufferedCSVReaderOptions::SetQuote(const string "e_p) { | 
| 86 | 	this->quote = quote_p; | 
| 87 | 	this->has_quote = true; | 
| 88 | } | 
| 89 |  | 
| 90 | void BufferedCSVReaderOptions::SetNewline(const string &input) { | 
| 91 | 	if (input == "\\n"  || input == "\\r" ) { | 
| 92 | 		new_line = NewLineIdentifier::SINGLE; | 
| 93 | 	} else if (input == "\\r\\n" ) { | 
| 94 | 		new_line = NewLineIdentifier::CARRY_ON; | 
| 95 | 	} else { | 
| 96 | 		throw InvalidInputException("This is not accepted as a newline: "  + input); | 
| 97 | 	} | 
| 98 | 	has_newline = true; | 
| 99 | } | 
| 100 |  | 
| 101 | void BufferedCSVReaderOptions::SetDateFormat(LogicalTypeId type, const string &format, bool read_format) { | 
| 102 | 	string error; | 
| 103 | 	if (read_format) { | 
| 104 | 		error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: date_format[type]); | 
| 105 | 		date_format[type].format_specifier = format; | 
| 106 | 	} else { | 
| 107 | 		error = StrTimeFormat::ParseFormatSpecifier(format_string: format, format&: write_date_format[type]); | 
| 108 | 	} | 
| 109 | 	if (!error.empty()) { | 
| 110 | 		throw InvalidInputException("Could not parse DATEFORMAT: %s" , error.c_str()); | 
| 111 | 	} | 
| 112 | 	has_format[type] = true; | 
| 113 | } | 
| 114 |  | 
| 115 | void BufferedCSVReaderOptions::SetReadOption(const string &loption, const Value &value, | 
| 116 |                                              vector<string> &expected_names) { | 
| 117 | 	if (SetBaseOption(loption, value)) { | 
| 118 | 		return; | 
| 119 | 	} | 
| 120 | 	if (loption == "auto_detect" ) { | 
| 121 | 		auto_detect = ParseBoolean(value, loption); | 
| 122 | 	} else if (loption == "sample_size" ) { | 
| 123 | 		int64_t sample_size = ParseInteger(value, loption); | 
| 124 | 		if (sample_size < 1 && sample_size != -1) { | 
| 125 | 			throw BinderException("Unsupported parameter for SAMPLE_SIZE: cannot be smaller than 1" ); | 
| 126 | 		} | 
| 127 | 		if (sample_size == -1) { | 
| 128 | 			sample_chunks = std::numeric_limits<uint64_t>::max(); | 
| 129 | 			sample_chunk_size = STANDARD_VECTOR_SIZE; | 
| 130 | 		} else if (sample_size <= STANDARD_VECTOR_SIZE) { | 
| 131 | 			sample_chunk_size = sample_size; | 
| 132 | 			sample_chunks = 1; | 
| 133 | 		} else { | 
| 134 | 			sample_chunk_size = STANDARD_VECTOR_SIZE; | 
| 135 | 			sample_chunks = sample_size / STANDARD_VECTOR_SIZE + 1; | 
| 136 | 		} | 
| 137 | 	} else if (loption == "skip" ) { | 
| 138 | 		skip_rows = ParseInteger(value, loption); | 
| 139 | 		skip_rows_set = true; | 
| 140 | 	} else if (loption == "max_line_size"  || loption == "maximum_line_size" ) { | 
| 141 | 		maximum_line_size = ParseInteger(value, loption); | 
| 142 | 	} else if (loption == "sample_chunk_size" ) { | 
| 143 | 		sample_chunk_size = ParseInteger(value, loption); | 
| 144 | 		if (sample_chunk_size > STANDARD_VECTOR_SIZE) { | 
| 145 | 			throw BinderException( | 
| 146 | 			    "Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be bigger than STANDARD_VECTOR_SIZE %d" , | 
| 147 | 			    STANDARD_VECTOR_SIZE); | 
| 148 | 		} else if (sample_chunk_size < 1) { | 
| 149 | 			throw BinderException("Unsupported parameter for SAMPLE_CHUNK_SIZE: cannot be smaller than 1" ); | 
| 150 | 		} | 
| 151 | 	} else if (loption == "sample_chunks" ) { | 
| 152 | 		sample_chunks = ParseInteger(value, loption); | 
| 153 | 		if (sample_chunks < 1) { | 
| 154 | 			throw BinderException("Unsupported parameter for SAMPLE_CHUNKS: cannot be smaller than 1" ); | 
| 155 | 		} | 
| 156 | 	} else if (loption == "force_not_null" ) { | 
| 157 | 		force_not_null = ParseColumnList(value, names&: expected_names, option_name: loption); | 
| 158 | 	} else if (loption == "date_format"  || loption == "dateformat" ) { | 
| 159 | 		string format = ParseString(value, loption); | 
| 160 | 		SetDateFormat(type: LogicalTypeId::DATE, format, read_format: true); | 
| 161 | 	} else if (loption == "timestamp_format"  || loption == "timestampformat" ) { | 
| 162 | 		string format = ParseString(value, loption); | 
| 163 | 		SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: true); | 
| 164 | 	} else if (loption == "ignore_errors" ) { | 
| 165 | 		ignore_errors = ParseBoolean(value, loption); | 
| 166 | 	} else if (loption == "buffer_size" ) { | 
| 167 | 		buffer_size = ParseInteger(value, loption); | 
| 168 | 		if (buffer_size == 0) { | 
| 169 | 			throw InvalidInputException("Buffer Size option must be higher than 0" ); | 
| 170 | 		} | 
| 171 | 	} else if (loption == "decimal_separator" ) { | 
| 172 | 		decimal_separator = ParseString(value, loption); | 
| 173 | 		if (decimal_separator != "."  && decimal_separator != "," ) { | 
| 174 | 			throw BinderException("Unsupported parameter for DECIMAL_SEPARATOR: should be '.' or ','" ); | 
| 175 | 		} | 
| 176 | 	} else if (loption == "null_padding" ) { | 
| 177 | 		null_padding = ParseBoolean(value, loption); | 
| 178 | 	} else if (loption == "allow_quoted_nulls" ) { | 
| 179 | 		allow_quoted_nulls = ParseBoolean(value, loption); | 
| 180 | 	} else if (loption == "parallel" ) { | 
| 181 | 		parallel_mode = ParseBoolean(value, loption) ? ParallelMode::PARALLEL : ParallelMode::SINGLE_THREADED; | 
| 182 | 	} else { | 
| 183 | 		throw BinderException("Unrecognized option for CSV reader \"%s\"" , loption); | 
| 184 | 	} | 
| 185 | } | 
| 186 |  | 
| 187 | void BufferedCSVReaderOptions::SetWriteOption(const string &loption, const Value &value) { | 
| 188 | 	if (loption == "new_line" ) { | 
| 189 | 		// Steal this from SetBaseOption so we can write different newlines (e.g., format JSON ARRAY) | 
| 190 | 		write_newline = ParseString(value, loption); | 
| 191 | 		return; | 
| 192 | 	} | 
| 193 |  | 
| 194 | 	if (SetBaseOption(loption, value)) { | 
| 195 | 		return; | 
| 196 | 	} | 
| 197 |  | 
| 198 | 	if (loption == "force_quote" ) { | 
| 199 | 		force_quote = ParseColumnList(value, names&: name_list, option_name: loption); | 
| 200 | 	} else if (loption == "date_format"  || loption == "dateformat" ) { | 
| 201 | 		string format = ParseString(value, loption); | 
| 202 | 		SetDateFormat(type: LogicalTypeId::DATE, format, read_format: false); | 
| 203 | 	} else if (loption == "timestamp_format"  || loption == "timestampformat" ) { | 
| 204 | 		string format = ParseString(value, loption); | 
| 205 | 		if (StringUtil::Lower(str: format) == "iso" ) { | 
| 206 | 			format = "%Y-%m-%dT%H:%M:%S.%fZ" ; | 
| 207 | 		} | 
| 208 | 		SetDateFormat(type: LogicalTypeId::TIMESTAMP, format, read_format: false); | 
| 209 | 		SetDateFormat(type: LogicalTypeId::TIMESTAMP_TZ, format, read_format: false); | 
| 210 | 	} else if (loption == "prefix" ) { | 
| 211 | 		prefix = ParseString(value, loption); | 
| 212 | 	} else if (loption == "suffix" ) { | 
| 213 | 		suffix = ParseString(value, loption); | 
| 214 | 	} else { | 
| 215 | 		throw BinderException("Unrecognized option CSV writer \"%s\"" , loption); | 
| 216 | 	} | 
| 217 | } | 
| 218 |  | 
| 219 | bool BufferedCSVReaderOptions::SetBaseOption(const string &loption, const Value &value) { | 
| 220 | 	// Make sure this function was only called after the option was turned into lowercase | 
| 221 | 	D_ASSERT(!std::any_of(loption.begin(), loption.end(), ::isupper)); | 
| 222 |  | 
| 223 | 	if (StringUtil::StartsWith(str: loption, prefix: "delim" ) || StringUtil::StartsWith(str: loption, prefix: "sep" )) { | 
| 224 | 		SetDelimiter(ParseString(value, loption)); | 
| 225 | 	} else if (loption == "quote" ) { | 
| 226 | 		SetQuote(ParseString(value, loption)); | 
| 227 | 	} else if (loption == "new_line" ) { | 
| 228 | 		SetNewline(ParseString(value, loption)); | 
| 229 | 	} else if (loption == "escape" ) { | 
| 230 | 		SetEscape(ParseString(value, loption)); | 
| 231 | 	} else if (loption == "header" ) { | 
| 232 | 		SetHeader(ParseBoolean(value, loption)); | 
| 233 | 	} else if (loption == "null"  || loption == "nullstr" ) { | 
| 234 | 		null_str = ParseString(value, loption); | 
| 235 | 	} else if (loption == "encoding" ) { | 
| 236 | 		auto encoding = StringUtil::Lower(str: ParseString(value, loption)); | 
| 237 | 		if (encoding != "utf8"  && encoding != "utf-8" ) { | 
| 238 | 			throw BinderException("Copy is only supported for UTF-8 encoded files, ENCODING 'UTF-8'" ); | 
| 239 | 		} | 
| 240 | 	} else if (loption == "compression" ) { | 
| 241 | 		SetCompression(ParseString(value, loption)); | 
| 242 | 	} else { | 
| 243 | 		// unrecognized option in base CSV | 
| 244 | 		return false; | 
| 245 | 	} | 
| 246 | 	return true; | 
| 247 | } | 
| 248 |  | 
| 249 | std::string BufferedCSVReaderOptions::ToString() const { | 
| 250 | 	return "  file="  + file_path + "\n  delimiter='"  + delimiter + | 
| 251 | 	       (has_delimiter ? "'"  : (auto_detect ? "' (auto detected)"  : "' (default)" )) + "\n  quote='"  + quote + | 
| 252 | 	       (has_quote ? "'"  : (auto_detect ? "' (auto detected)"  : "' (default)" )) + "\n  escape='"  + escape + | 
| 253 | 	       (has_escape ? "'"  : (auto_detect ? "' (auto detected)"  : "' (default)" )) + | 
| 254 | 	       "\n  header="  + std::to_string(val: header) + | 
| 255 | 	       (has_header ? ""  : (auto_detect ? " (auto detected)"  : "' (default)" )) + | 
| 256 | 	       "\n  sample_size="  + std::to_string(val: sample_chunk_size * sample_chunks) + | 
| 257 | 	       "\n  ignore_errors="  + std::to_string(val: ignore_errors) + "\n  all_varchar="  + std::to_string(val: all_varchar); | 
| 258 | } | 
| 259 |  | 
| 260 | } // namespace duckdb | 
| 261 |  |