| 1 | #include "duckdb/execution/operator/persistent/physical_copy_to_file.hpp" | 
|---|
| 2 | #include "duckdb/common/vector_operations/vector_operations.hpp" | 
|---|
| 3 |  | 
|---|
| 4 | #include <algorithm> | 
|---|
| 5 | #include <fstream> | 
|---|
| 6 |  | 
|---|
| 7 | using namespace duckdb; | 
|---|
| 8 | using namespace std; | 
|---|
| 9 |  | 
|---|
| 10 | class BufferedWriter { | 
|---|
| 11 | constexpr static idx_t BUFFER_SIZE = 4096 * 4; | 
|---|
| 12 |  | 
|---|
| 13 | public: | 
|---|
| 14 | BufferedWriter(string &path) : pos(0) { | 
|---|
| 15 | to_csv.open(path); | 
|---|
| 16 | if (to_csv.fail()) { | 
|---|
| 17 | throw IOException( "Could not open CSV file"); | 
|---|
| 18 | } | 
|---|
| 19 | } | 
|---|
| 20 |  | 
|---|
| 21 | void Flush() { | 
|---|
| 22 | if (pos > 0) { | 
|---|
| 23 | to_csv.write(buffer, pos); | 
|---|
| 24 | pos = 0; | 
|---|
| 25 | } | 
|---|
| 26 | } | 
|---|
| 27 |  | 
|---|
| 28 | void Close() { | 
|---|
| 29 | Flush(); | 
|---|
| 30 | to_csv.close(); | 
|---|
| 31 | } | 
|---|
| 32 |  | 
|---|
| 33 | void Write(const char *buf, idx_t len) { | 
|---|
| 34 | if (len >= BUFFER_SIZE) { | 
|---|
| 35 | Flush(); | 
|---|
| 36 | to_csv.write(buf, len); | 
|---|
| 37 | return; | 
|---|
| 38 | } | 
|---|
| 39 | if (pos + len > BUFFER_SIZE) { | 
|---|
| 40 | Flush(); | 
|---|
| 41 | } | 
|---|
| 42 | memcpy(buffer + pos, buf, len); | 
|---|
| 43 | pos += len; | 
|---|
| 44 | } | 
|---|
| 45 |  | 
|---|
| 46 | void Write(string &value) { | 
|---|
| 47 | Write(value.c_str(), value.size()); | 
|---|
| 48 | } | 
|---|
| 49 |  | 
|---|
| 50 | private: | 
|---|
| 51 | char buffer[BUFFER_SIZE]; | 
|---|
| 52 | idx_t pos = 0; | 
|---|
| 53 |  | 
|---|
| 54 | ofstream to_csv; | 
|---|
| 55 | }; | 
|---|
| 56 |  | 
|---|
//! Returns a copy of `val` in which `escape` has been inserted in front of every
//! non-overlapping occurrence of `to_be_escaped`. The occurrence itself is kept.
//!
//! \param to_be_escaped the token to look for; if empty, `val` is returned unchanged
//! \param escape        the text inserted before each occurrence
//! \param val           the text to process
std::string AddEscapes(std::string &to_be_escaped, std::string escape, std::string val) {
    // std::string::find with an empty needle matches at every position; there is nothing
    // meaningful to escape in that case (and with an empty escape the scan would never advance)
    if (to_be_escaped.empty()) {
        return val;
    }

    std::string new_val;
    new_val.reserve(val.size() + escape.size());

    // everything in val before `copied` has already been appended to new_val
    std::string::size_type copied = 0;
    std::string::size_type found = val.find(to_be_escaped);
    while (found != std::string::npos) {
        // copy the text in front of the occurrence, then insert the escape
        new_val.append(val, copied, found - copied);
        new_val += escape;
        copied = found;
        // resume the search right after the occurrence itself: stepping by the length of the
        // escape would skip or double-count occurrences whenever the two lengths differ
        found = val.find(to_be_escaped, found + to_be_escaped.length());
    }
    // copy the remainder (including the last occurrence, if any)
    new_val.append(val, copied, std::string::npos);
    return new_val;
}
|---|
| 76 |  | 
|---|
| 77 | static void WriteQuotedString(BufferedWriter &writer, string_t str_value, string &delimiter, string "e, | 
|---|
| 78 | string &escape, string &null_str, bool write_quoted) { | 
|---|
| 79 | // used for adding escapes | 
|---|
| 80 | bool add_escapes = false; | 
|---|
| 81 | auto str_data = str_value.GetData(); | 
|---|
| 82 | string new_val(str_data, str_value.GetSize()); | 
|---|
| 83 |  | 
|---|
| 84 | // check for \n, \r, \n\r in string | 
|---|
| 85 | if (!write_quoted) { | 
|---|
| 86 | for (idx_t i = 0; i < str_value.GetSize(); i++) { | 
|---|
| 87 | if (str_data[i] == '\n' || str_data[i] == '\r') { | 
|---|
| 88 | // newline, write a quoted string | 
|---|
| 89 | write_quoted = true; | 
|---|
| 90 | } | 
|---|
| 91 | } | 
|---|
| 92 | } | 
|---|
| 93 |  | 
|---|
| 94 | // check if value is null string | 
|---|
| 95 | if (!write_quoted) { | 
|---|
| 96 | if (new_val == null_str) { | 
|---|
| 97 | write_quoted = true; | 
|---|
| 98 | } | 
|---|
| 99 | } | 
|---|
| 100 |  | 
|---|
| 101 | // check for delimiter | 
|---|
| 102 | if (!write_quoted) { | 
|---|
| 103 | if (new_val.find(delimiter) != string::npos) { | 
|---|
| 104 | write_quoted = true; | 
|---|
| 105 | } | 
|---|
| 106 | } | 
|---|
| 107 |  | 
|---|
| 108 | // check for quote | 
|---|
| 109 | if (new_val.find(quote) != string::npos) { | 
|---|
| 110 | write_quoted = true; | 
|---|
| 111 | add_escapes = true; | 
|---|
| 112 | } | 
|---|
| 113 |  | 
|---|
| 114 | // check for escapes in quoted string | 
|---|
| 115 | if (write_quoted && !add_escapes) { | 
|---|
| 116 | if (new_val.find(escape) != string::npos) { | 
|---|
| 117 | add_escapes = true; | 
|---|
| 118 | } | 
|---|
| 119 | } | 
|---|
| 120 |  | 
|---|
| 121 | if (add_escapes) { | 
|---|
| 122 | new_val = AddEscapes(escape, escape, new_val); | 
|---|
| 123 | // also escape quotes | 
|---|
| 124 | if (escape != quote) { | 
|---|
| 125 | new_val = AddEscapes(quote, escape, new_val); | 
|---|
| 126 | } | 
|---|
| 127 | } | 
|---|
| 128 |  | 
|---|
| 129 | if (!write_quoted) { | 
|---|
| 130 | writer.Write(new_val); | 
|---|
| 131 | } else { | 
|---|
| 132 | writer.Write(quote); | 
|---|
| 133 | writer.Write(new_val); | 
|---|
| 134 | writer.Write(quote); | 
|---|
| 135 | } | 
|---|
| 136 | } | 
|---|
| 137 |  | 
|---|
| 138 | void PhysicalCopyToFile::GetChunkInternal(ClientContext &context, DataChunk &chunk, PhysicalOperatorState *state) { | 
|---|
| 139 | auto &info = *this->info; | 
|---|
| 140 | idx_t total = 0; | 
|---|
| 141 |  | 
|---|
| 142 | string newline = "\n"; | 
|---|
| 143 | BufferedWriter writer(info.file_path); | 
|---|
| 144 | if (info.header) { | 
|---|
| 145 | // write the header line | 
|---|
| 146 | for (idx_t i = 0; i < names.size(); i++) { | 
|---|
| 147 | if (i != 0) { | 
|---|
| 148 | writer.Write(info.delimiter); | 
|---|
| 149 | } | 
|---|
| 150 | WriteQuotedString(writer, names[i].c_str(), info.delimiter, info.quote, info.escape, info.null_str, false); | 
|---|
| 151 | } | 
|---|
| 152 | writer.Write(newline); | 
|---|
| 153 | } | 
|---|
| 154 | // create a chunk with VARCHAR columns | 
|---|
| 155 | vector<TypeId> types; | 
|---|
| 156 | for (idx_t col_idx = 0; col_idx < state->child_chunk.column_count(); col_idx++) { | 
|---|
| 157 | types.push_back(TypeId::VARCHAR); | 
|---|
| 158 | } | 
|---|
| 159 | DataChunk cast_chunk; | 
|---|
| 160 | cast_chunk.Initialize(types); | 
|---|
| 161 |  | 
|---|
| 162 | while (true) { | 
|---|
| 163 | children[0]->GetChunk(context, state->child_chunk, state->child_state.get()); | 
|---|
| 164 | if (state->child_chunk.size() == 0) { | 
|---|
| 165 | break; | 
|---|
| 166 | } | 
|---|
| 167 | // cast the columns of the chunk to varchar | 
|---|
| 168 | cast_chunk.SetCardinality(state->child_chunk); | 
|---|
| 169 | for (idx_t col_idx = 0; col_idx < state->child_chunk.column_count(); col_idx++) { | 
|---|
| 170 | if (sql_types[col_idx].id == SQLTypeId::VARCHAR || sql_types[col_idx].id == SQLTypeId::BLOB) { | 
|---|
| 171 | // VARCHAR, just create a reference | 
|---|
| 172 | cast_chunk.data[col_idx].Reference(state->child_chunk.data[col_idx]); | 
|---|
| 173 | } else { | 
|---|
| 174 | // non varchar column, perform the cast | 
|---|
| 175 | VectorOperations::Cast(state->child_chunk.data[col_idx], cast_chunk.data[col_idx], sql_types[col_idx], | 
|---|
| 176 | SQLType::VARCHAR, cast_chunk.size()); | 
|---|
| 177 | } | 
|---|
| 178 | } | 
|---|
| 179 | cast_chunk.Normalify(); | 
|---|
| 180 | // now loop over the vectors and output the values | 
|---|
| 181 | for (idx_t i = 0; i < cast_chunk.size(); i++) { | 
|---|
| 182 | // write values | 
|---|
| 183 | for (idx_t col_idx = 0; col_idx < state->child_chunk.column_count(); col_idx++) { | 
|---|
| 184 | if (col_idx != 0) { | 
|---|
| 185 | writer.Write(info.delimiter); | 
|---|
| 186 | } | 
|---|
| 187 | if (FlatVector::IsNull(cast_chunk.data[col_idx], i)) { | 
|---|
| 188 | // write null value | 
|---|
| 189 | writer.Write(info.null_str); | 
|---|
| 190 | continue; | 
|---|
| 191 | } | 
|---|
| 192 |  | 
|---|
| 193 | // non-null value, fetch the string value from the cast chunk | 
|---|
| 194 | auto str_data = FlatVector::GetData<string_t>(cast_chunk.data[col_idx]); | 
|---|
| 195 | auto str_value = str_data[i]; | 
|---|
| 196 | WriteQuotedString(writer, str_value, info.delimiter, info.quote, info.escape, info.null_str, | 
|---|
| 197 | info.force_quote[col_idx]); | 
|---|
| 198 | } | 
|---|
| 199 | writer.Write(newline); | 
|---|
| 200 | } | 
|---|
| 201 | total += cast_chunk.size(); | 
|---|
| 202 | } | 
|---|
| 203 | writer.Close(); | 
|---|
| 204 |  | 
|---|
| 205 | chunk.SetCardinality(1); | 
|---|
| 206 | chunk.SetValue(0, 0, Value::BIGINT(total)); | 
|---|
| 207 |  | 
|---|
| 208 | state->finished = true; | 
|---|
| 209 | } | 
|---|
| 210 |  | 
|---|