| 1 | #include "duckdb/common/types/data_chunk.hpp" |
| 2 | #include "duckdb/common/types/bit.hpp" |
| 3 | #include "duckdb/common/arrow/arrow.hpp" |
| 4 | #include "duckdb/common/arrow/arrow_converter.hpp" |
| 5 | #include "duckdb/common/exception.hpp" |
| 6 | #include "duckdb/common/helper.hpp" |
| 7 | #include "duckdb/common/serializer.hpp" |
| 8 | #include "duckdb/common/types/interval.hpp" |
| 9 | #include "duckdb/common/types/sel_cache.hpp" |
| 10 | #include "duckdb/common/types/vector_cache.hpp" |
| 11 | #include "duckdb/common/unordered_map.hpp" |
| 12 | #include "duckdb/common/vector.hpp" |
| 13 | #include <list> |
| 14 | #include "duckdb/common/arrow/arrow_appender.hpp" |
| 15 | |
| 16 | namespace duckdb { |
| 17 | |
| 18 | void ArrowConverter::ToArrowArray(DataChunk &input, ArrowArray *out_array, ArrowOptions options) { |
| 19 | ArrowAppender appender(input.GetTypes(), input.size(), std::move(options)); |
| 20 | appender.Append(input, from: 0, to: input.size(), input_size: input.size()); |
| 21 | *out_array = appender.Finalize(); |
| 22 | } |
| 23 | |
| 24 | //===--------------------------------------------------------------------===// |
| 25 | // Arrow Schema |
| 26 | //===--------------------------------------------------------------------===// |
| 27 | struct DuckDBArrowSchemaHolder { |
| 28 | // unused in children |
| 29 | vector<ArrowSchema> children; |
| 30 | // unused in children |
| 31 | vector<ArrowSchema *> children_ptrs; |
| 32 | //! used for nested structures |
| 33 | std::list<vector<ArrowSchema>> nested_children; |
| 34 | std::list<vector<ArrowSchema *>> nested_children_ptr; |
| 35 | //! This holds strings created to represent decimal types |
| 36 | vector<unsafe_unique_array<char>> owned_type_names; |
| 37 | }; |
| 38 | |
| 39 | static void ReleaseDuckDBArrowSchema(ArrowSchema *schema) { |
| 40 | if (!schema || !schema->release) { |
| 41 | return; |
| 42 | } |
| 43 | schema->release = nullptr; |
| 44 | auto holder = static_cast<DuckDBArrowSchemaHolder *>(schema->private_data); |
| 45 | delete holder; |
| 46 | } |
| 47 | |
| 48 | void InitializeChild(ArrowSchema &child, const string &name = "" ) { |
| 49 | //! Child is cleaned up by parent |
| 50 | child.private_data = nullptr; |
| 51 | child.release = ReleaseDuckDBArrowSchema; |
| 52 | |
| 53 | //! Store the child schema |
| 54 | child.flags = ARROW_FLAG_NULLABLE; |
| 55 | child.name = name.c_str(); |
| 56 | child.n_children = 0; |
| 57 | child.children = nullptr; |
| 58 | child.metadata = nullptr; |
| 59 | child.dictionary = nullptr; |
| 60 | } |
| 61 | void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type, |
| 62 | const ArrowOptions &options); |
| 63 | |
| 64 | void SetArrowMapFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type, |
| 65 | const ArrowOptions &options) { |
| 66 | child.format = "+m" ; |
| 67 | //! Map has one child which is a struct |
| 68 | child.n_children = 1; |
| 69 | root_holder.nested_children.emplace_back(); |
| 70 | root_holder.nested_children.back().resize(new_size: 1); |
| 71 | root_holder.nested_children_ptr.emplace_back(); |
| 72 | root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]); |
| 73 | InitializeChild(child&: root_holder.nested_children.back()[0]); |
| 74 | child.children = &root_holder.nested_children_ptr.back()[0]; |
| 75 | child.children[0]->name = "entries" ; |
| 76 | SetArrowFormat(root_holder, child&: **child.children, type: ListType::GetChildType(type), options); |
| 77 | } |
| 78 | |
| 79 | void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type, |
| 80 | const ArrowOptions &options) { |
| 81 | switch (type.id()) { |
| 82 | case LogicalTypeId::BOOLEAN: |
| 83 | child.format = "b" ; |
| 84 | break; |
| 85 | case LogicalTypeId::TINYINT: |
| 86 | child.format = "c" ; |
| 87 | break; |
| 88 | case LogicalTypeId::SMALLINT: |
| 89 | child.format = "s" ; |
| 90 | break; |
| 91 | case LogicalTypeId::INTEGER: |
| 92 | child.format = "i" ; |
| 93 | break; |
| 94 | case LogicalTypeId::BIGINT: |
| 95 | child.format = "l" ; |
| 96 | break; |
| 97 | case LogicalTypeId::UTINYINT: |
| 98 | child.format = "C" ; |
| 99 | break; |
| 100 | case LogicalTypeId::USMALLINT: |
| 101 | child.format = "S" ; |
| 102 | break; |
| 103 | case LogicalTypeId::UINTEGER: |
| 104 | child.format = "I" ; |
| 105 | break; |
| 106 | case LogicalTypeId::UBIGINT: |
| 107 | child.format = "L" ; |
| 108 | break; |
| 109 | case LogicalTypeId::FLOAT: |
| 110 | child.format = "f" ; |
| 111 | break; |
| 112 | case LogicalTypeId::HUGEINT: |
| 113 | child.format = "d:38,0" ; |
| 114 | break; |
| 115 | case LogicalTypeId::DOUBLE: |
| 116 | child.format = "g" ; |
| 117 | break; |
| 118 | case LogicalTypeId::UUID: |
| 119 | case LogicalTypeId::VARCHAR: |
| 120 | if (options.offset_size == ArrowOffsetSize::LARGE) { |
| 121 | child.format = "U" ; |
| 122 | } else { |
| 123 | child.format = "u" ; |
| 124 | } |
| 125 | break; |
| 126 | case LogicalTypeId::DATE: |
| 127 | child.format = "tdD" ; |
| 128 | break; |
| 129 | case LogicalTypeId::TIME: |
| 130 | case LogicalTypeId::TIME_TZ: |
| 131 | child.format = "ttu" ; |
| 132 | break; |
| 133 | case LogicalTypeId::TIMESTAMP: |
| 134 | child.format = "tsu:" ; |
| 135 | break; |
| 136 | case LogicalTypeId::TIMESTAMP_TZ: { |
| 137 | string format = "tsu:" + options.time_zone; |
| 138 | auto format_ptr = make_unsafe_uniq_array<char>(n: format.size() + 1); |
| 139 | for (size_t i = 0; i < format.size(); i++) { |
| 140 | format_ptr[i] = format[i]; |
| 141 | } |
| 142 | format_ptr[format.size()] = '\0'; |
| 143 | root_holder.owned_type_names.push_back(x: std::move(format_ptr)); |
| 144 | child.format = root_holder.owned_type_names.back().get(); |
| 145 | break; |
| 146 | } |
| 147 | case LogicalTypeId::TIMESTAMP_SEC: |
| 148 | child.format = "tss:" ; |
| 149 | break; |
| 150 | case LogicalTypeId::TIMESTAMP_NS: |
| 151 | child.format = "tsn:" ; |
| 152 | break; |
| 153 | case LogicalTypeId::TIMESTAMP_MS: |
| 154 | child.format = "tsm:" ; |
| 155 | break; |
| 156 | case LogicalTypeId::INTERVAL: |
| 157 | child.format = "tin" ; |
| 158 | break; |
| 159 | case LogicalTypeId::DECIMAL: { |
| 160 | uint8_t width, scale; |
| 161 | type.GetDecimalProperties(width, scale); |
| 162 | string format = "d:" + to_string(val: width) + "," + to_string(val: scale); |
| 163 | auto format_ptr = make_unsafe_uniq_array<char>(n: format.size() + 1); |
| 164 | for (size_t i = 0; i < format.size(); i++) { |
| 165 | format_ptr[i] = format[i]; |
| 166 | } |
| 167 | format_ptr[format.size()] = '\0'; |
| 168 | root_holder.owned_type_names.push_back(x: std::move(format_ptr)); |
| 169 | child.format = root_holder.owned_type_names.back().get(); |
| 170 | break; |
| 171 | } |
| 172 | case LogicalTypeId::SQLNULL: { |
| 173 | child.format = "n" ; |
| 174 | break; |
| 175 | } |
| 176 | case LogicalTypeId::BLOB: |
| 177 | case LogicalTypeId::BIT: { |
| 178 | if (options.offset_size == ArrowOffsetSize::LARGE) { |
| 179 | child.format = "Z" ; |
| 180 | } else { |
| 181 | child.format = "z" ; |
| 182 | } |
| 183 | break; |
| 184 | } |
| 185 | case LogicalTypeId::LIST: { |
| 186 | child.format = "+l" ; |
| 187 | child.n_children = 1; |
| 188 | root_holder.nested_children.emplace_back(); |
| 189 | root_holder.nested_children.back().resize(new_size: 1); |
| 190 | root_holder.nested_children_ptr.emplace_back(); |
| 191 | root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]); |
| 192 | InitializeChild(child&: root_holder.nested_children.back()[0]); |
| 193 | child.children = &root_holder.nested_children_ptr.back()[0]; |
| 194 | child.children[0]->name = "l" ; |
| 195 | SetArrowFormat(root_holder, child&: **child.children, type: ListType::GetChildType(type), options); |
| 196 | break; |
| 197 | } |
| 198 | case LogicalTypeId::STRUCT: { |
| 199 | child.format = "+s" ; |
| 200 | auto &child_types = StructType::GetChildTypes(type); |
| 201 | child.n_children = child_types.size(); |
| 202 | root_holder.nested_children.emplace_back(); |
| 203 | root_holder.nested_children.back().resize(new_size: child_types.size()); |
| 204 | root_holder.nested_children_ptr.emplace_back(); |
| 205 | root_holder.nested_children_ptr.back().resize(new_size: child_types.size()); |
| 206 | for (idx_t type_idx = 0; type_idx < child_types.size(); type_idx++) { |
| 207 | root_holder.nested_children_ptr.back()[type_idx] = &root_holder.nested_children.back()[type_idx]; |
| 208 | } |
| 209 | child.children = &root_holder.nested_children_ptr.back()[0]; |
| 210 | for (size_t type_idx = 0; type_idx < child_types.size(); type_idx++) { |
| 211 | |
| 212 | InitializeChild(child&: *child.children[type_idx]); |
| 213 | |
| 214 | auto &struct_col_name = child_types[type_idx].first; |
| 215 | auto name_ptr = make_unsafe_uniq_array<char>(n: struct_col_name.size() + 1); |
| 216 | for (size_t i = 0; i < struct_col_name.size(); i++) { |
| 217 | name_ptr[i] = struct_col_name[i]; |
| 218 | } |
| 219 | name_ptr[struct_col_name.size()] = '\0'; |
| 220 | root_holder.owned_type_names.push_back(x: std::move(name_ptr)); |
| 221 | |
| 222 | child.children[type_idx]->name = root_holder.owned_type_names.back().get(); |
| 223 | SetArrowFormat(root_holder, child&: *child.children[type_idx], type: child_types[type_idx].second, options); |
| 224 | } |
| 225 | break; |
| 226 | } |
| 227 | case LogicalTypeId::MAP: { |
| 228 | SetArrowMapFormat(root_holder, child, type, options); |
| 229 | break; |
| 230 | } |
| 231 | case LogicalTypeId::ENUM: { |
| 232 | // TODO what do we do with pointer enums here? |
| 233 | switch (EnumType::GetPhysicalType(type)) { |
| 234 | case PhysicalType::UINT8: |
| 235 | child.format = "C" ; |
| 236 | break; |
| 237 | case PhysicalType::UINT16: |
| 238 | child.format = "S" ; |
| 239 | break; |
| 240 | case PhysicalType::UINT32: |
| 241 | child.format = "I" ; |
| 242 | break; |
| 243 | default: |
| 244 | throw InternalException("Unsupported Enum Internal Type" ); |
| 245 | } |
| 246 | root_holder.nested_children.emplace_back(); |
| 247 | root_holder.nested_children.back().resize(new_size: 1); |
| 248 | root_holder.nested_children_ptr.emplace_back(); |
| 249 | root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]); |
| 250 | InitializeChild(child&: root_holder.nested_children.back()[0]); |
| 251 | child.dictionary = root_holder.nested_children_ptr.back()[0]; |
| 252 | child.dictionary->format = "u" ; |
| 253 | break; |
| 254 | } |
| 255 | default: |
| 256 | throw InternalException("Unsupported Arrow type " + type.ToString()); |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | void ArrowConverter::ToArrowSchema(ArrowSchema *out_schema, const vector<LogicalType> &types, |
| 261 | const vector<string> &names, const ArrowOptions &options) { |
| 262 | D_ASSERT(out_schema); |
| 263 | D_ASSERT(types.size() == names.size()); |
| 264 | idx_t column_count = types.size(); |
| 265 | // Allocate as unique_ptr first to cleanup properly on error |
| 266 | auto root_holder = make_uniq<DuckDBArrowSchemaHolder>(); |
| 267 | |
| 268 | // Allocate the children |
| 269 | root_holder->children.resize(new_size: column_count); |
| 270 | root_holder->children_ptrs.resize(new_size: column_count, x: nullptr); |
| 271 | for (size_t i = 0; i < column_count; ++i) { |
| 272 | root_holder->children_ptrs[i] = &root_holder->children[i]; |
| 273 | } |
| 274 | out_schema->children = root_holder->children_ptrs.data(); |
| 275 | out_schema->n_children = column_count; |
| 276 | |
| 277 | // Store the schema |
| 278 | out_schema->format = "+s" ; // struct apparently |
| 279 | out_schema->flags = 0; |
| 280 | out_schema->metadata = nullptr; |
| 281 | out_schema->name = "duckdb_query_result" ; |
| 282 | out_schema->dictionary = nullptr; |
| 283 | |
| 284 | // Configure all child schemas |
| 285 | for (idx_t col_idx = 0; col_idx < column_count; col_idx++) { |
| 286 | |
| 287 | auto &child = root_holder->children[col_idx]; |
| 288 | InitializeChild(child, name: names[col_idx]); |
| 289 | SetArrowFormat(root_holder&: *root_holder, child, type: types[col_idx], options); |
| 290 | } |
| 291 | |
| 292 | // Release ownership to caller |
| 293 | out_schema->private_data = root_holder.release(); |
| 294 | out_schema->release = ReleaseDuckDBArrowSchema; |
| 295 | } |
| 296 | |
| 297 | } // namespace duckdb |
| 298 | |