1 | #include "duckdb/common/types/data_chunk.hpp" |
2 | #include "duckdb/common/types/bit.hpp" |
3 | #include "duckdb/common/arrow/arrow.hpp" |
4 | #include "duckdb/common/arrow/arrow_converter.hpp" |
5 | #include "duckdb/common/exception.hpp" |
6 | #include "duckdb/common/helper.hpp" |
7 | #include "duckdb/common/serializer.hpp" |
8 | #include "duckdb/common/types/interval.hpp" |
9 | #include "duckdb/common/types/sel_cache.hpp" |
10 | #include "duckdb/common/types/vector_cache.hpp" |
11 | #include "duckdb/common/unordered_map.hpp" |
12 | #include "duckdb/common/vector.hpp" |
13 | #include <list> |
14 | #include "duckdb/common/arrow/arrow_appender.hpp" |
15 | |
16 | namespace duckdb { |
17 | |
18 | void ArrowConverter::ToArrowArray(DataChunk &input, ArrowArray *out_array, ArrowOptions options) { |
19 | ArrowAppender appender(input.GetTypes(), input.size(), std::move(options)); |
20 | appender.Append(input, from: 0, to: input.size(), input_size: input.size()); |
21 | *out_array = appender.Finalize(); |
22 | } |
23 | |
24 | //===--------------------------------------------------------------------===// |
25 | // Arrow Schema |
26 | //===--------------------------------------------------------------------===// |
//! Backing storage for an exported ArrowSchema. The root schema stores a pointer to this holder in
//! `private_data`, and ReleaseDuckDBArrowSchema deletes it.
struct DuckDBArrowSchemaHolder {
	//! Schemas of the top-level columns (unused in children)
	vector<ArrowSchema> children;
	//! Pointer array handed out as the root schema's `children` (unused in children)
	vector<ArrowSchema *> children_ptrs;
	//! used for nested structures
	//! (one vector of child schemas per nested type, plus the matching pointer arrays)
	std::list<vector<ArrowSchema>> nested_children;
	std::list<vector<ArrowSchema *>> nested_children_ptr;
	//! Owned, null-terminated strings that schemas point at, e.g. generated format strings
	//! (decimal, timestamp with time zone) and struct field names
	vector<unsafe_unique_array<char>> owned_type_names;
};
38 | |
39 | static void ReleaseDuckDBArrowSchema(ArrowSchema *schema) { |
40 | if (!schema || !schema->release) { |
41 | return; |
42 | } |
43 | schema->release = nullptr; |
44 | auto holder = static_cast<DuckDBArrowSchemaHolder *>(schema->private_data); |
45 | delete holder; |
46 | } |
47 | |
48 | void InitializeChild(ArrowSchema &child, const string &name = "" ) { |
49 | //! Child is cleaned up by parent |
50 | child.private_data = nullptr; |
51 | child.release = ReleaseDuckDBArrowSchema; |
52 | |
53 | //! Store the child schema |
54 | child.flags = ARROW_FLAG_NULLABLE; |
55 | child.name = name.c_str(); |
56 | child.n_children = 0; |
57 | child.children = nullptr; |
58 | child.metadata = nullptr; |
59 | child.dictionary = nullptr; |
60 | } |
61 | void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type, |
62 | const ArrowOptions &options); |
63 | |
64 | void SetArrowMapFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type, |
65 | const ArrowOptions &options) { |
66 | child.format = "+m" ; |
67 | //! Map has one child which is a struct |
68 | child.n_children = 1; |
69 | root_holder.nested_children.emplace_back(); |
70 | root_holder.nested_children.back().resize(new_size: 1); |
71 | root_holder.nested_children_ptr.emplace_back(); |
72 | root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]); |
73 | InitializeChild(child&: root_holder.nested_children.back()[0]); |
74 | child.children = &root_holder.nested_children_ptr.back()[0]; |
75 | child.children[0]->name = "entries" ; |
76 | SetArrowFormat(root_holder, child&: **child.children, type: ListType::GetChildType(type), options); |
77 | } |
78 | |
79 | void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type, |
80 | const ArrowOptions &options) { |
81 | switch (type.id()) { |
82 | case LogicalTypeId::BOOLEAN: |
83 | child.format = "b" ; |
84 | break; |
85 | case LogicalTypeId::TINYINT: |
86 | child.format = "c" ; |
87 | break; |
88 | case LogicalTypeId::SMALLINT: |
89 | child.format = "s" ; |
90 | break; |
91 | case LogicalTypeId::INTEGER: |
92 | child.format = "i" ; |
93 | break; |
94 | case LogicalTypeId::BIGINT: |
95 | child.format = "l" ; |
96 | break; |
97 | case LogicalTypeId::UTINYINT: |
98 | child.format = "C" ; |
99 | break; |
100 | case LogicalTypeId::USMALLINT: |
101 | child.format = "S" ; |
102 | break; |
103 | case LogicalTypeId::UINTEGER: |
104 | child.format = "I" ; |
105 | break; |
106 | case LogicalTypeId::UBIGINT: |
107 | child.format = "L" ; |
108 | break; |
109 | case LogicalTypeId::FLOAT: |
110 | child.format = "f" ; |
111 | break; |
112 | case LogicalTypeId::HUGEINT: |
113 | child.format = "d:38,0" ; |
114 | break; |
115 | case LogicalTypeId::DOUBLE: |
116 | child.format = "g" ; |
117 | break; |
118 | case LogicalTypeId::UUID: |
119 | case LogicalTypeId::VARCHAR: |
120 | if (options.offset_size == ArrowOffsetSize::LARGE) { |
121 | child.format = "U" ; |
122 | } else { |
123 | child.format = "u" ; |
124 | } |
125 | break; |
126 | case LogicalTypeId::DATE: |
127 | child.format = "tdD" ; |
128 | break; |
129 | case LogicalTypeId::TIME: |
130 | case LogicalTypeId::TIME_TZ: |
131 | child.format = "ttu" ; |
132 | break; |
133 | case LogicalTypeId::TIMESTAMP: |
134 | child.format = "tsu:" ; |
135 | break; |
136 | case LogicalTypeId::TIMESTAMP_TZ: { |
137 | string format = "tsu:" + options.time_zone; |
138 | auto format_ptr = make_unsafe_uniq_array<char>(n: format.size() + 1); |
139 | for (size_t i = 0; i < format.size(); i++) { |
140 | format_ptr[i] = format[i]; |
141 | } |
142 | format_ptr[format.size()] = '\0'; |
143 | root_holder.owned_type_names.push_back(x: std::move(format_ptr)); |
144 | child.format = root_holder.owned_type_names.back().get(); |
145 | break; |
146 | } |
147 | case LogicalTypeId::TIMESTAMP_SEC: |
148 | child.format = "tss:" ; |
149 | break; |
150 | case LogicalTypeId::TIMESTAMP_NS: |
151 | child.format = "tsn:" ; |
152 | break; |
153 | case LogicalTypeId::TIMESTAMP_MS: |
154 | child.format = "tsm:" ; |
155 | break; |
156 | case LogicalTypeId::INTERVAL: |
157 | child.format = "tin" ; |
158 | break; |
159 | case LogicalTypeId::DECIMAL: { |
160 | uint8_t width, scale; |
161 | type.GetDecimalProperties(width, scale); |
162 | string format = "d:" + to_string(val: width) + "," + to_string(val: scale); |
163 | auto format_ptr = make_unsafe_uniq_array<char>(n: format.size() + 1); |
164 | for (size_t i = 0; i < format.size(); i++) { |
165 | format_ptr[i] = format[i]; |
166 | } |
167 | format_ptr[format.size()] = '\0'; |
168 | root_holder.owned_type_names.push_back(x: std::move(format_ptr)); |
169 | child.format = root_holder.owned_type_names.back().get(); |
170 | break; |
171 | } |
172 | case LogicalTypeId::SQLNULL: { |
173 | child.format = "n" ; |
174 | break; |
175 | } |
176 | case LogicalTypeId::BLOB: |
177 | case LogicalTypeId::BIT: { |
178 | if (options.offset_size == ArrowOffsetSize::LARGE) { |
179 | child.format = "Z" ; |
180 | } else { |
181 | child.format = "z" ; |
182 | } |
183 | break; |
184 | } |
185 | case LogicalTypeId::LIST: { |
186 | child.format = "+l" ; |
187 | child.n_children = 1; |
188 | root_holder.nested_children.emplace_back(); |
189 | root_holder.nested_children.back().resize(new_size: 1); |
190 | root_holder.nested_children_ptr.emplace_back(); |
191 | root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]); |
192 | InitializeChild(child&: root_holder.nested_children.back()[0]); |
193 | child.children = &root_holder.nested_children_ptr.back()[0]; |
194 | child.children[0]->name = "l" ; |
195 | SetArrowFormat(root_holder, child&: **child.children, type: ListType::GetChildType(type), options); |
196 | break; |
197 | } |
198 | case LogicalTypeId::STRUCT: { |
199 | child.format = "+s" ; |
200 | auto &child_types = StructType::GetChildTypes(type); |
201 | child.n_children = child_types.size(); |
202 | root_holder.nested_children.emplace_back(); |
203 | root_holder.nested_children.back().resize(new_size: child_types.size()); |
204 | root_holder.nested_children_ptr.emplace_back(); |
205 | root_holder.nested_children_ptr.back().resize(new_size: child_types.size()); |
206 | for (idx_t type_idx = 0; type_idx < child_types.size(); type_idx++) { |
207 | root_holder.nested_children_ptr.back()[type_idx] = &root_holder.nested_children.back()[type_idx]; |
208 | } |
209 | child.children = &root_holder.nested_children_ptr.back()[0]; |
210 | for (size_t type_idx = 0; type_idx < child_types.size(); type_idx++) { |
211 | |
212 | InitializeChild(child&: *child.children[type_idx]); |
213 | |
214 | auto &struct_col_name = child_types[type_idx].first; |
215 | auto name_ptr = make_unsafe_uniq_array<char>(n: struct_col_name.size() + 1); |
216 | for (size_t i = 0; i < struct_col_name.size(); i++) { |
217 | name_ptr[i] = struct_col_name[i]; |
218 | } |
219 | name_ptr[struct_col_name.size()] = '\0'; |
220 | root_holder.owned_type_names.push_back(x: std::move(name_ptr)); |
221 | |
222 | child.children[type_idx]->name = root_holder.owned_type_names.back().get(); |
223 | SetArrowFormat(root_holder, child&: *child.children[type_idx], type: child_types[type_idx].second, options); |
224 | } |
225 | break; |
226 | } |
227 | case LogicalTypeId::MAP: { |
228 | SetArrowMapFormat(root_holder, child, type, options); |
229 | break; |
230 | } |
231 | case LogicalTypeId::ENUM: { |
232 | // TODO what do we do with pointer enums here? |
233 | switch (EnumType::GetPhysicalType(type)) { |
234 | case PhysicalType::UINT8: |
235 | child.format = "C" ; |
236 | break; |
237 | case PhysicalType::UINT16: |
238 | child.format = "S" ; |
239 | break; |
240 | case PhysicalType::UINT32: |
241 | child.format = "I" ; |
242 | break; |
243 | default: |
244 | throw InternalException("Unsupported Enum Internal Type" ); |
245 | } |
246 | root_holder.nested_children.emplace_back(); |
247 | root_holder.nested_children.back().resize(new_size: 1); |
248 | root_holder.nested_children_ptr.emplace_back(); |
249 | root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]); |
250 | InitializeChild(child&: root_holder.nested_children.back()[0]); |
251 | child.dictionary = root_holder.nested_children_ptr.back()[0]; |
252 | child.dictionary->format = "u" ; |
253 | break; |
254 | } |
255 | default: |
256 | throw InternalException("Unsupported Arrow type " + type.ToString()); |
257 | } |
258 | } |
259 | |
260 | void ArrowConverter::ToArrowSchema(ArrowSchema *out_schema, const vector<LogicalType> &types, |
261 | const vector<string> &names, const ArrowOptions &options) { |
262 | D_ASSERT(out_schema); |
263 | D_ASSERT(types.size() == names.size()); |
264 | idx_t column_count = types.size(); |
265 | // Allocate as unique_ptr first to cleanup properly on error |
266 | auto root_holder = make_uniq<DuckDBArrowSchemaHolder>(); |
267 | |
268 | // Allocate the children |
269 | root_holder->children.resize(new_size: column_count); |
270 | root_holder->children_ptrs.resize(new_size: column_count, x: nullptr); |
271 | for (size_t i = 0; i < column_count; ++i) { |
272 | root_holder->children_ptrs[i] = &root_holder->children[i]; |
273 | } |
274 | out_schema->children = root_holder->children_ptrs.data(); |
275 | out_schema->n_children = column_count; |
276 | |
277 | // Store the schema |
278 | out_schema->format = "+s" ; // struct apparently |
279 | out_schema->flags = 0; |
280 | out_schema->metadata = nullptr; |
281 | out_schema->name = "duckdb_query_result" ; |
282 | out_schema->dictionary = nullptr; |
283 | |
284 | // Configure all child schemas |
285 | for (idx_t col_idx = 0; col_idx < column_count; col_idx++) { |
286 | |
287 | auto &child = root_holder->children[col_idx]; |
288 | InitializeChild(child, name: names[col_idx]); |
289 | SetArrowFormat(root_holder&: *root_holder, child, type: types[col_idx], options); |
290 | } |
291 | |
292 | // Release ownership to caller |
293 | out_schema->private_data = root_holder.release(); |
294 | out_schema->release = ReleaseDuckDBArrowSchema; |
295 | } |
296 | |
297 | } // namespace duckdb |
298 | |