1#include "duckdb/common/types/data_chunk.hpp"
2#include "duckdb/common/types/bit.hpp"
3#include "duckdb/common/arrow/arrow.hpp"
4#include "duckdb/common/arrow/arrow_converter.hpp"
5#include "duckdb/common/exception.hpp"
6#include "duckdb/common/helper.hpp"
7#include "duckdb/common/serializer.hpp"
8#include "duckdb/common/types/interval.hpp"
9#include "duckdb/common/types/sel_cache.hpp"
10#include "duckdb/common/types/vector_cache.hpp"
11#include "duckdb/common/unordered_map.hpp"
12#include "duckdb/common/vector.hpp"
13#include <list>
14#include "duckdb/common/arrow/arrow_appender.hpp"
15
16namespace duckdb {
17
18void ArrowConverter::ToArrowArray(DataChunk &input, ArrowArray *out_array, ArrowOptions options) {
19 ArrowAppender appender(input.GetTypes(), input.size(), std::move(options));
20 appender.Append(input, from: 0, to: input.size(), input_size: input.size());
21 *out_array = appender.Finalize();
22}
23
24//===--------------------------------------------------------------------===//
25// Arrow Schema
26//===--------------------------------------------------------------------===//
27struct DuckDBArrowSchemaHolder {
28 // unused in children
29 vector<ArrowSchema> children;
30 // unused in children
31 vector<ArrowSchema *> children_ptrs;
32 //! used for nested structures
33 std::list<vector<ArrowSchema>> nested_children;
34 std::list<vector<ArrowSchema *>> nested_children_ptr;
35 //! This holds strings created to represent decimal types
36 vector<unsafe_unique_array<char>> owned_type_names;
37};
38
39static void ReleaseDuckDBArrowSchema(ArrowSchema *schema) {
40 if (!schema || !schema->release) {
41 return;
42 }
43 schema->release = nullptr;
44 auto holder = static_cast<DuckDBArrowSchemaHolder *>(schema->private_data);
45 delete holder;
46}
47
48void InitializeChild(ArrowSchema &child, const string &name = "") {
49 //! Child is cleaned up by parent
50 child.private_data = nullptr;
51 child.release = ReleaseDuckDBArrowSchema;
52
53 //! Store the child schema
54 child.flags = ARROW_FLAG_NULLABLE;
55 child.name = name.c_str();
56 child.n_children = 0;
57 child.children = nullptr;
58 child.metadata = nullptr;
59 child.dictionary = nullptr;
60}
61void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type,
62 const ArrowOptions &options);
63
64void SetArrowMapFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type,
65 const ArrowOptions &options) {
66 child.format = "+m";
67 //! Map has one child which is a struct
68 child.n_children = 1;
69 root_holder.nested_children.emplace_back();
70 root_holder.nested_children.back().resize(new_size: 1);
71 root_holder.nested_children_ptr.emplace_back();
72 root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]);
73 InitializeChild(child&: root_holder.nested_children.back()[0]);
74 child.children = &root_holder.nested_children_ptr.back()[0];
75 child.children[0]->name = "entries";
76 SetArrowFormat(root_holder, child&: **child.children, type: ListType::GetChildType(type), options);
77}
78
79void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, const LogicalType &type,
80 const ArrowOptions &options) {
81 switch (type.id()) {
82 case LogicalTypeId::BOOLEAN:
83 child.format = "b";
84 break;
85 case LogicalTypeId::TINYINT:
86 child.format = "c";
87 break;
88 case LogicalTypeId::SMALLINT:
89 child.format = "s";
90 break;
91 case LogicalTypeId::INTEGER:
92 child.format = "i";
93 break;
94 case LogicalTypeId::BIGINT:
95 child.format = "l";
96 break;
97 case LogicalTypeId::UTINYINT:
98 child.format = "C";
99 break;
100 case LogicalTypeId::USMALLINT:
101 child.format = "S";
102 break;
103 case LogicalTypeId::UINTEGER:
104 child.format = "I";
105 break;
106 case LogicalTypeId::UBIGINT:
107 child.format = "L";
108 break;
109 case LogicalTypeId::FLOAT:
110 child.format = "f";
111 break;
112 case LogicalTypeId::HUGEINT:
113 child.format = "d:38,0";
114 break;
115 case LogicalTypeId::DOUBLE:
116 child.format = "g";
117 break;
118 case LogicalTypeId::UUID:
119 case LogicalTypeId::VARCHAR:
120 if (options.offset_size == ArrowOffsetSize::LARGE) {
121 child.format = "U";
122 } else {
123 child.format = "u";
124 }
125 break;
126 case LogicalTypeId::DATE:
127 child.format = "tdD";
128 break;
129 case LogicalTypeId::TIME:
130 case LogicalTypeId::TIME_TZ:
131 child.format = "ttu";
132 break;
133 case LogicalTypeId::TIMESTAMP:
134 child.format = "tsu:";
135 break;
136 case LogicalTypeId::TIMESTAMP_TZ: {
137 string format = "tsu:" + options.time_zone;
138 auto format_ptr = make_unsafe_uniq_array<char>(n: format.size() + 1);
139 for (size_t i = 0; i < format.size(); i++) {
140 format_ptr[i] = format[i];
141 }
142 format_ptr[format.size()] = '\0';
143 root_holder.owned_type_names.push_back(x: std::move(format_ptr));
144 child.format = root_holder.owned_type_names.back().get();
145 break;
146 }
147 case LogicalTypeId::TIMESTAMP_SEC:
148 child.format = "tss:";
149 break;
150 case LogicalTypeId::TIMESTAMP_NS:
151 child.format = "tsn:";
152 break;
153 case LogicalTypeId::TIMESTAMP_MS:
154 child.format = "tsm:";
155 break;
156 case LogicalTypeId::INTERVAL:
157 child.format = "tin";
158 break;
159 case LogicalTypeId::DECIMAL: {
160 uint8_t width, scale;
161 type.GetDecimalProperties(width, scale);
162 string format = "d:" + to_string(val: width) + "," + to_string(val: scale);
163 auto format_ptr = make_unsafe_uniq_array<char>(n: format.size() + 1);
164 for (size_t i = 0; i < format.size(); i++) {
165 format_ptr[i] = format[i];
166 }
167 format_ptr[format.size()] = '\0';
168 root_holder.owned_type_names.push_back(x: std::move(format_ptr));
169 child.format = root_holder.owned_type_names.back().get();
170 break;
171 }
172 case LogicalTypeId::SQLNULL: {
173 child.format = "n";
174 break;
175 }
176 case LogicalTypeId::BLOB:
177 case LogicalTypeId::BIT: {
178 if (options.offset_size == ArrowOffsetSize::LARGE) {
179 child.format = "Z";
180 } else {
181 child.format = "z";
182 }
183 break;
184 }
185 case LogicalTypeId::LIST: {
186 child.format = "+l";
187 child.n_children = 1;
188 root_holder.nested_children.emplace_back();
189 root_holder.nested_children.back().resize(new_size: 1);
190 root_holder.nested_children_ptr.emplace_back();
191 root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]);
192 InitializeChild(child&: root_holder.nested_children.back()[0]);
193 child.children = &root_holder.nested_children_ptr.back()[0];
194 child.children[0]->name = "l";
195 SetArrowFormat(root_holder, child&: **child.children, type: ListType::GetChildType(type), options);
196 break;
197 }
198 case LogicalTypeId::STRUCT: {
199 child.format = "+s";
200 auto &child_types = StructType::GetChildTypes(type);
201 child.n_children = child_types.size();
202 root_holder.nested_children.emplace_back();
203 root_holder.nested_children.back().resize(new_size: child_types.size());
204 root_holder.nested_children_ptr.emplace_back();
205 root_holder.nested_children_ptr.back().resize(new_size: child_types.size());
206 for (idx_t type_idx = 0; type_idx < child_types.size(); type_idx++) {
207 root_holder.nested_children_ptr.back()[type_idx] = &root_holder.nested_children.back()[type_idx];
208 }
209 child.children = &root_holder.nested_children_ptr.back()[0];
210 for (size_t type_idx = 0; type_idx < child_types.size(); type_idx++) {
211
212 InitializeChild(child&: *child.children[type_idx]);
213
214 auto &struct_col_name = child_types[type_idx].first;
215 auto name_ptr = make_unsafe_uniq_array<char>(n: struct_col_name.size() + 1);
216 for (size_t i = 0; i < struct_col_name.size(); i++) {
217 name_ptr[i] = struct_col_name[i];
218 }
219 name_ptr[struct_col_name.size()] = '\0';
220 root_holder.owned_type_names.push_back(x: std::move(name_ptr));
221
222 child.children[type_idx]->name = root_holder.owned_type_names.back().get();
223 SetArrowFormat(root_holder, child&: *child.children[type_idx], type: child_types[type_idx].second, options);
224 }
225 break;
226 }
227 case LogicalTypeId::MAP: {
228 SetArrowMapFormat(root_holder, child, type, options);
229 break;
230 }
231 case LogicalTypeId::ENUM: {
232 // TODO what do we do with pointer enums here?
233 switch (EnumType::GetPhysicalType(type)) {
234 case PhysicalType::UINT8:
235 child.format = "C";
236 break;
237 case PhysicalType::UINT16:
238 child.format = "S";
239 break;
240 case PhysicalType::UINT32:
241 child.format = "I";
242 break;
243 default:
244 throw InternalException("Unsupported Enum Internal Type");
245 }
246 root_holder.nested_children.emplace_back();
247 root_holder.nested_children.back().resize(new_size: 1);
248 root_holder.nested_children_ptr.emplace_back();
249 root_holder.nested_children_ptr.back().push_back(x: &root_holder.nested_children.back()[0]);
250 InitializeChild(child&: root_holder.nested_children.back()[0]);
251 child.dictionary = root_holder.nested_children_ptr.back()[0];
252 child.dictionary->format = "u";
253 break;
254 }
255 default:
256 throw InternalException("Unsupported Arrow type " + type.ToString());
257 }
258}
259
260void ArrowConverter::ToArrowSchema(ArrowSchema *out_schema, const vector<LogicalType> &types,
261 const vector<string> &names, const ArrowOptions &options) {
262 D_ASSERT(out_schema);
263 D_ASSERT(types.size() == names.size());
264 idx_t column_count = types.size();
265 // Allocate as unique_ptr first to cleanup properly on error
266 auto root_holder = make_uniq<DuckDBArrowSchemaHolder>();
267
268 // Allocate the children
269 root_holder->children.resize(new_size: column_count);
270 root_holder->children_ptrs.resize(new_size: column_count, x: nullptr);
271 for (size_t i = 0; i < column_count; ++i) {
272 root_holder->children_ptrs[i] = &root_holder->children[i];
273 }
274 out_schema->children = root_holder->children_ptrs.data();
275 out_schema->n_children = column_count;
276
277 // Store the schema
278 out_schema->format = "+s"; // struct apparently
279 out_schema->flags = 0;
280 out_schema->metadata = nullptr;
281 out_schema->name = "duckdb_query_result";
282 out_schema->dictionary = nullptr;
283
284 // Configure all child schemas
285 for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
286
287 auto &child = root_holder->children[col_idx];
288 InitializeChild(child, name: names[col_idx]);
289 SetArrowFormat(root_holder&: *root_holder, child, type: types[col_idx], options);
290 }
291
292 // Release ownership to caller
293 out_schema->private_data = root_holder.release();
294 out_schema->release = ReleaseDuckDBArrowSchema;
295}
296
297} // namespace duckdb
298