1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_IPC_TEST_COMMON_H |
19 | #define ARROW_IPC_TEST_COMMON_H |
20 | |
#include <algorithm>
#include <cstdint>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>
27 | |
28 | #include "arrow/array.h" |
29 | #include "arrow/buffer.h" |
30 | #include "arrow/builder.h" |
31 | #include "arrow/memory_pool.h" |
32 | #include "arrow/pretty_print.h" |
33 | #include "arrow/record_batch.h" |
34 | #include "arrow/status.h" |
35 | #include "arrow/test-util.h" |
36 | #include "arrow/type.h" |
37 | #include "arrow/util/bit-util.h" |
38 | |
39 | namespace arrow { |
40 | namespace ipc { |
41 | |
42 | static inline void CompareArraysDetailed(int index, const Array& result, |
43 | const Array& expected) { |
44 | if (!expected.Equals(result)) { |
45 | std::stringstream pp_result; |
46 | std::stringstream pp_expected; |
47 | |
48 | ASSERT_OK(PrettyPrint(expected, 0, &pp_expected)); |
49 | ASSERT_OK(PrettyPrint(result, 0, &pp_result)); |
50 | |
51 | FAIL() << "Index: " << index << " Expected: " << pp_expected.str() |
52 | << "\nGot: " << pp_result.str(); |
53 | } |
54 | } |
55 | |
56 | static inline void CompareBatchColumnsDetailed(const RecordBatch& result, |
57 | const RecordBatch& expected) { |
58 | for (int i = 0; i < expected.num_columns(); ++i) { |
59 | auto left = result.column(i); |
60 | auto right = expected.column(i); |
61 | CompareArraysDetailed(i, *left, *right); |
62 | } |
63 | } |
64 | |
// List types shared by the list-based record batch factories in this header:
// kListInt32 is list(int32()), and kListListInt32 wraps it in one more list level.
const auto kListInt32 = list(int32());
const auto kListListInt32 = list(kListInt32);
67 | |
68 | Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool, |
69 | std::shared_ptr<Array>* out, uint32_t seed = 0) { |
70 | std::shared_ptr<ResizableBuffer> data; |
71 | RETURN_NOT_OK(MakeRandomBuffer<int32_t>(length, pool, &data, seed)); |
72 | Int32Builder builder(int32(), pool); |
73 | RETURN_NOT_OK(builder.Resize(length)); |
74 | if (include_nulls) { |
75 | std::shared_ptr<ResizableBuffer> valid_bytes; |
76 | RETURN_NOT_OK(MakeRandomByteBuffer(length, pool, &valid_bytes)); |
77 | RETURN_NOT_OK(builder.AppendValues(reinterpret_cast<const int32_t*>(data->data()), |
78 | length, valid_bytes->data())); |
79 | return builder.Finish(out); |
80 | } |
81 | RETURN_NOT_OK( |
82 | builder.AppendValues(reinterpret_cast<const int32_t*>(data->data()), length)); |
83 | return builder.Finish(out); |
84 | } |
85 | |
86 | Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists, |
87 | bool include_nulls, MemoryPool* pool, |
88 | std::shared_ptr<Array>* out) { |
89 | // Create the null list values |
90 | std::vector<uint8_t> valid_lists(num_lists); |
91 | const double null_percent = include_nulls ? 0.1 : 0; |
92 | random_null_bytes(num_lists, null_percent, valid_lists.data()); |
93 | |
94 | // Create list offsets |
95 | const int max_list_size = 10; |
96 | |
97 | std::vector<int32_t> list_sizes(num_lists, 0); |
98 | std::vector<int32_t> offsets( |
99 | num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below. |
100 | const uint32_t seed = static_cast<uint32_t>(child_array->length()); |
101 | |
102 | if (num_lists > 0) { |
103 | rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); |
104 | // make sure sizes are consistent with null |
105 | std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), |
106 | list_sizes.begin(), |
107 | [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); |
108 | std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); |
109 | |
110 | // Force invariants |
111 | const int32_t child_length = static_cast<int32_t>(child_array->length()); |
112 | offsets[0] = 0; |
113 | std::replace_if(offsets.begin(), offsets.end(), |
114 | [child_length](int32_t offset) { return offset > child_length; }, |
115 | child_length); |
116 | } |
117 | |
118 | offsets[num_lists] = static_cast<int32_t>(child_array->length()); |
119 | |
120 | /// TODO(wesm): Implement support for nulls in ListArray::FromArrays |
121 | std::shared_ptr<Buffer> null_bitmap, offsets_buffer; |
122 | RETURN_NOT_OK(GetBitmapFromVector(valid_lists, &null_bitmap)); |
123 | RETURN_NOT_OK(CopyBufferFromVector(offsets, pool, &offsets_buffer)); |
124 | |
125 | *out = std::make_shared<ListArray>(list(child_array->type()), num_lists, offsets_buffer, |
126 | child_array, null_bitmap, kUnknownNullCount); |
127 | return ValidateArray(**out); |
128 | } |
129 | |
130 | typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out); |
131 | |
132 | Status MakeRandomBooleanArray(const int length, bool include_nulls, |
133 | std::shared_ptr<Array>* out) { |
134 | std::vector<uint8_t> values(length); |
135 | random_null_bytes(length, 0.5, values.data()); |
136 | std::shared_ptr<Buffer> data; |
137 | RETURN_NOT_OK(BitUtil::BytesToBits(values, default_memory_pool(), &data)); |
138 | |
139 | if (include_nulls) { |
140 | std::vector<uint8_t> valid_bytes(length); |
141 | std::shared_ptr<Buffer> null_bitmap; |
142 | RETURN_NOT_OK(BitUtil::BytesToBits(valid_bytes, default_memory_pool(), &null_bitmap)); |
143 | random_null_bytes(length, 0.1, valid_bytes.data()); |
144 | *out = std::make_shared<BooleanArray>(length, data, null_bitmap, -1); |
145 | } else { |
146 | *out = std::make_shared<BooleanArray>(length, data, NULLPTR, 0); |
147 | } |
148 | return Status::OK(); |
149 | } |
150 | |
151 | Status MakeBooleanBatchSized(const int length, std::shared_ptr<RecordBatch>* out) { |
152 | // Make the schema |
153 | auto f0 = field("f0" , boolean()); |
154 | auto f1 = field("f1" , boolean()); |
155 | auto schema = ::arrow::schema({f0, f1}); |
156 | |
157 | std::shared_ptr<Array> a0, a1; |
158 | RETURN_NOT_OK(MakeRandomBooleanArray(length, true, &a0)); |
159 | RETURN_NOT_OK(MakeRandomBooleanArray(length, false, &a1)); |
160 | *out = RecordBatch::Make(schema, length, {a0, a1}); |
161 | return Status::OK(); |
162 | } |
163 | |
164 | Status MakeBooleanBatch(std::shared_ptr<RecordBatch>* out) { |
165 | return MakeBooleanBatchSized(1000, out); |
166 | } |
167 | |
168 | Status MakeIntBatchSized(int length, std::shared_ptr<RecordBatch>* out, |
169 | uint32_t seed = 0) { |
170 | // Make the schema |
171 | auto f0 = field("f0" , int32()); |
172 | auto f1 = field("f1" , int32()); |
173 | auto schema = ::arrow::schema({f0, f1}); |
174 | |
175 | // Example data |
176 | std::shared_ptr<Array> a0, a1; |
177 | MemoryPool* pool = default_memory_pool(); |
178 | RETURN_NOT_OK(MakeRandomInt32Array(length, false, pool, &a0, seed)); |
179 | RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &a1, seed + 1)); |
180 | *out = RecordBatch::Make(schema, length, {a0, a1}); |
181 | return Status::OK(); |
182 | } |
183 | |
184 | Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out) { |
185 | return MakeIntBatchSized(10, out); |
186 | } |
187 | |
188 | template <class Builder, class RawType> |
189 | Status MakeRandomBinaryArray(int64_t length, bool include_nulls, MemoryPool* pool, |
190 | std::shared_ptr<Array>* out) { |
191 | const std::vector<std::string> values = {"" , "" , "abc" , "123" , |
192 | "efg" , "456!@#!@#" , "12312" }; |
193 | Builder builder(pool); |
194 | const size_t values_len = values.size(); |
195 | for (int64_t i = 0; i < length; ++i) { |
196 | int64_t values_index = i % values_len; |
197 | if (include_nulls && values_index == 0) { |
198 | RETURN_NOT_OK(builder.AppendNull()); |
199 | } else { |
200 | const std::string& value = values[values_index]; |
201 | RETURN_NOT_OK(builder.Append(reinterpret_cast<const RawType*>(value.data()), |
202 | static_cast<int32_t>(value.size()))); |
203 | } |
204 | } |
205 | return builder.Finish(out); |
206 | } |
207 | |
208 | template <class Builder, class RawType> |
209 | Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls, |
210 | MemoryPool* pool, std::shared_ptr<Array>* out) { |
211 | Builder builder(pool); |
212 | for (int64_t i = 0; i < length; ++i) { |
213 | if (include_nulls && (i % 7 == 0)) { |
214 | RETURN_NOT_OK(builder.AppendNull()); |
215 | } else { |
216 | const std::string value = std::to_string(i); |
217 | RETURN_NOT_OK(builder.Append(reinterpret_cast<const RawType*>(value.data()), |
218 | static_cast<int32_t>(value.size()))); |
219 | } |
220 | } |
221 | return builder.Finish(out); |
222 | } |
223 | |
224 | Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out, |
225 | bool with_nulls = true) { |
226 | const int64_t length = 500; |
227 | auto string_type = utf8(); |
228 | auto binary_type = binary(); |
229 | auto f0 = field("f0" , string_type); |
230 | auto f1 = field("f1" , binary_type); |
231 | auto schema = ::arrow::schema({f0, f1}); |
232 | |
233 | std::shared_ptr<Array> a0, a1; |
234 | MemoryPool* pool = default_memory_pool(); |
235 | |
236 | // Quirk with RETURN_NOT_OK macro and templated functions |
237 | { |
238 | auto s = MakeBinaryArrayWithUniqueValues<StringBuilder, char>(length, with_nulls, |
239 | pool, &a0); |
240 | RETURN_NOT_OK(s); |
241 | } |
242 | |
243 | { |
244 | auto s = MakeBinaryArrayWithUniqueValues<BinaryBuilder, uint8_t>(length, with_nulls, |
245 | pool, &a1); |
246 | RETURN_NOT_OK(s); |
247 | } |
248 | *out = RecordBatch::Make(schema, length, {a0, a1}); |
249 | return Status::OK(); |
250 | } |
251 | |
252 | Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr<RecordBatch>* out) { |
253 | return MakeStringTypesRecordBatch(out, true); |
254 | } |
255 | |
256 | Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* out) { |
257 | const int64_t length = 500; |
258 | auto f0 = field("f0" , null()); |
259 | auto schema = ::arrow::schema({f0}); |
260 | std::shared_ptr<Array> a0 = std::make_shared<NullArray>(length); |
261 | *out = RecordBatch::Make(schema, length, {a0}); |
262 | return Status::OK(); |
263 | } |
264 | |
265 | Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out) { |
266 | // Make the schema |
267 | auto f0 = field("f0" , kListInt32); |
268 | auto f1 = field("f1" , kListListInt32); |
269 | auto f2 = field("f2" , int32()); |
270 | auto schema = ::arrow::schema({f0, f1, f2}); |
271 | |
272 | // Example data |
273 | |
274 | MemoryPool* pool = default_memory_pool(); |
275 | const int length = 200; |
276 | std::shared_ptr<Array> leaf_values, list_array, list_list_array, flat_array; |
277 | const bool include_nulls = true; |
278 | RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); |
279 | RETURN_NOT_OK( |
280 | MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); |
281 | RETURN_NOT_OK( |
282 | MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); |
283 | RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); |
284 | *out = RecordBatch::Make(schema, length, {list_array, list_list_array, flat_array}); |
285 | return Status::OK(); |
286 | } |
287 | |
288 | Status MakeZeroLengthRecordBatch(std::shared_ptr<RecordBatch>* out) { |
289 | // Make the schema |
290 | auto f0 = field("f0" , kListInt32); |
291 | auto f1 = field("f1" , kListListInt32); |
292 | auto f2 = field("f2" , int32()); |
293 | auto schema = ::arrow::schema({f0, f1, f2}); |
294 | |
295 | // Example data |
296 | MemoryPool* pool = default_memory_pool(); |
297 | const bool include_nulls = true; |
298 | std::shared_ptr<Array> leaf_values, list_array, list_list_array, flat_array; |
299 | RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &leaf_values)); |
300 | RETURN_NOT_OK(MakeRandomListArray(leaf_values, 0, include_nulls, pool, &list_array)); |
301 | RETURN_NOT_OK( |
302 | MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); |
303 | RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); |
304 | *out = RecordBatch::Make(schema, 0, {list_array, list_list_array, flat_array}); |
305 | return Status::OK(); |
306 | } |
307 | |
308 | Status MakeNonNullRecordBatch(std::shared_ptr<RecordBatch>* out) { |
309 | // Make the schema |
310 | auto f0 = field("f0" , kListInt32); |
311 | auto f1 = field("f1" , kListListInt32); |
312 | auto f2 = field("f2" , int32()); |
313 | auto schema = ::arrow::schema({f0, f1, f2}); |
314 | |
315 | // Example data |
316 | MemoryPool* pool = default_memory_pool(); |
317 | const int length = 50; |
318 | std::shared_ptr<Array> leaf_values, list_array, list_list_array, flat_array; |
319 | |
320 | RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values)); |
321 | bool include_nulls = false; |
322 | RETURN_NOT_OK( |
323 | MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); |
324 | RETURN_NOT_OK( |
325 | MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); |
326 | RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); |
327 | *out = RecordBatch::Make(schema, length, {list_array, list_list_array, flat_array}); |
328 | return Status::OK(); |
329 | } |
330 | |
331 | Status MakeDeeplyNestedList(std::shared_ptr<RecordBatch>* out) { |
332 | const int batch_length = 5; |
333 | auto type = int32(); |
334 | |
335 | MemoryPool* pool = default_memory_pool(); |
336 | std::shared_ptr<Array> array; |
337 | const bool include_nulls = true; |
338 | RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); |
339 | for (int i = 0; i < 63; ++i) { |
340 | type = std::static_pointer_cast<DataType>(list(type)); |
341 | RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); |
342 | } |
343 | |
344 | auto f0 = field("f0" , type); |
345 | auto schema = ::arrow::schema({f0}); |
346 | std::vector<std::shared_ptr<Array>> arrays = {array}; |
347 | *out = RecordBatch::Make(schema, batch_length, arrays); |
348 | return Status::OK(); |
349 | } |
350 | |
351 | Status MakeStruct(std::shared_ptr<RecordBatch>* out) { |
352 | // reuse constructed list columns |
353 | std::shared_ptr<RecordBatch> list_batch; |
354 | RETURN_NOT_OK(MakeListRecordBatch(&list_batch)); |
355 | std::vector<std::shared_ptr<Array>> columns = { |
356 | list_batch->column(0), list_batch->column(1), list_batch->column(2)}; |
357 | auto list_schema = list_batch->schema(); |
358 | |
359 | // Define schema |
360 | std::shared_ptr<DataType> type(new StructType( |
361 | {list_schema->field(0), list_schema->field(1), list_schema->field(2)})); |
362 | auto f0 = field("non_null_struct" , type); |
363 | auto f1 = field("null_struct" , type); |
364 | auto schema = ::arrow::schema({f0, f1}); |
365 | |
366 | // construct individual nullable/non-nullable struct arrays |
367 | std::shared_ptr<Array> no_nulls(new StructArray(type, list_batch->num_rows(), columns)); |
368 | std::vector<uint8_t> null_bytes(list_batch->num_rows(), 1); |
369 | null_bytes[0] = 0; |
370 | std::shared_ptr<Buffer> null_bitmask; |
371 | RETURN_NOT_OK(BitUtil::BytesToBits(null_bytes, default_memory_pool(), &null_bitmask)); |
372 | std::shared_ptr<Array> with_nulls( |
373 | new StructArray(type, list_batch->num_rows(), columns, null_bitmask, 1)); |
374 | |
375 | // construct batch |
376 | std::vector<std::shared_ptr<Array>> arrays = {no_nulls, with_nulls}; |
377 | *out = RecordBatch::Make(schema, list_batch->num_rows(), arrays); |
378 | return Status::OK(); |
379 | } |
380 | |
381 | Status MakeUnion(std::shared_ptr<RecordBatch>* out) { |
382 | // Define schema |
383 | std::vector<std::shared_ptr<Field>> union_types( |
384 | {field("u0" , int32()), field("u1" , uint8())}); |
385 | |
386 | std::vector<uint8_t> type_codes = {5, 10}; |
387 | auto sparse_type = |
388 | std::make_shared<UnionType>(union_types, type_codes, UnionMode::SPARSE); |
389 | |
390 | auto dense_type = |
391 | std::make_shared<UnionType>(union_types, type_codes, UnionMode::DENSE); |
392 | |
393 | auto f0 = field("sparse_nonnull" , sparse_type, false); |
394 | auto f1 = field("sparse" , sparse_type); |
395 | auto f2 = field("dense" , dense_type); |
396 | |
397 | auto schema = ::arrow::schema({f0, f1, f2}); |
398 | |
399 | // Create data |
400 | std::vector<std::shared_ptr<Array>> sparse_children(2); |
401 | std::vector<std::shared_ptr<Array>> dense_children(2); |
402 | |
403 | const int64_t length = 7; |
404 | |
405 | std::shared_ptr<Buffer> type_ids_buffer; |
406 | std::vector<uint8_t> type_ids = {5, 10, 5, 5, 10, 10, 5}; |
407 | RETURN_NOT_OK(CopyBufferFromVector(type_ids, default_memory_pool(), &type_ids_buffer)); |
408 | |
409 | std::vector<int32_t> u0_values = {0, 1, 2, 3, 4, 5, 6}; |
410 | ArrayFromVector<Int32Type, int32_t>(u0_values, &sparse_children[0]); |
411 | |
412 | std::vector<uint8_t> u1_values = {10, 11, 12, 13, 14, 15, 16}; |
413 | ArrayFromVector<UInt8Type, uint8_t>(u1_values, &sparse_children[1]); |
414 | |
415 | // dense children |
416 | u0_values = {0, 2, 3, 7}; |
417 | ArrayFromVector<Int32Type, int32_t>(u0_values, &dense_children[0]); |
418 | |
419 | u1_values = {11, 14, 15}; |
420 | ArrayFromVector<UInt8Type, uint8_t>(u1_values, &dense_children[1]); |
421 | |
422 | std::shared_ptr<Buffer> offsets_buffer; |
423 | std::vector<int32_t> offsets = {0, 0, 1, 2, 1, 2, 3}; |
424 | RETURN_NOT_OK(CopyBufferFromVector(offsets, default_memory_pool(), &offsets_buffer)); |
425 | |
426 | std::vector<uint8_t> null_bytes(length, 1); |
427 | null_bytes[2] = 0; |
428 | std::shared_ptr<Buffer> null_bitmask; |
429 | RETURN_NOT_OK(BitUtil::BytesToBits(null_bytes, default_memory_pool(), &null_bitmask)); |
430 | |
431 | // construct individual nullable/non-nullable struct arrays |
432 | auto sparse_no_nulls = |
433 | std::make_shared<UnionArray>(sparse_type, length, sparse_children, type_ids_buffer); |
434 | auto sparse = std::make_shared<UnionArray>(sparse_type, length, sparse_children, |
435 | type_ids_buffer, NULLPTR, null_bitmask, 1); |
436 | |
437 | auto dense = |
438 | std::make_shared<UnionArray>(dense_type, length, dense_children, type_ids_buffer, |
439 | offsets_buffer, null_bitmask, 1); |
440 | |
441 | // construct batch |
442 | std::vector<std::shared_ptr<Array>> arrays = {sparse_no_nulls, sparse, dense}; |
443 | *out = RecordBatch::Make(schema, length, arrays); |
444 | return Status::OK(); |
445 | } |
446 | |
447 | Status MakeDictionary(std::shared_ptr<RecordBatch>* out) { |
448 | const int64_t length = 6; |
449 | |
450 | std::vector<bool> is_valid = {true, true, false, true, true, true}; |
451 | std::shared_ptr<Array> dict1, dict2; |
452 | |
453 | std::vector<std::string> dict1_values = {"foo" , "bar" , "baz" }; |
454 | std::vector<std::string> dict2_values = {"foo" , "bar" , "baz" , "qux" }; |
455 | |
456 | ArrayFromVector<StringType, std::string>(dict1_values, &dict1); |
457 | ArrayFromVector<StringType, std::string>(dict2_values, &dict2); |
458 | |
459 | auto f0_type = arrow::dictionary(arrow::int32(), dict1); |
460 | auto f1_type = arrow::dictionary(arrow::int8(), dict1, true); |
461 | auto f2_type = arrow::dictionary(arrow::int32(), dict2); |
462 | |
463 | std::shared_ptr<Array> indices0, indices1, indices2; |
464 | std::vector<int32_t> indices0_values = {1, 2, -1, 0, 2, 0}; |
465 | std::vector<int8_t> indices1_values = {0, 0, 2, 2, 1, 1}; |
466 | std::vector<int32_t> indices2_values = {3, 0, 2, 1, 0, 2}; |
467 | |
468 | ArrayFromVector<Int32Type, int32_t>(is_valid, indices0_values, &indices0); |
469 | ArrayFromVector<Int8Type, int8_t>(is_valid, indices1_values, &indices1); |
470 | ArrayFromVector<Int32Type, int32_t>(is_valid, indices2_values, &indices2); |
471 | |
472 | auto a0 = std::make_shared<DictionaryArray>(f0_type, indices0); |
473 | auto a1 = std::make_shared<DictionaryArray>(f1_type, indices1); |
474 | auto a2 = std::make_shared<DictionaryArray>(f2_type, indices2); |
475 | |
476 | // List of dictionary-encoded string |
477 | auto f3_type = list(f1_type); |
478 | |
479 | std::vector<int32_t> list_offsets = {0, 0, 2, 2, 5, 6, 9}; |
480 | std::shared_ptr<Array> offsets, indices3; |
481 | ArrayFromVector<Int32Type, int32_t>(std::vector<bool>(list_offsets.size(), true), |
482 | list_offsets, &offsets); |
483 | |
484 | std::vector<int8_t> indices3_values = {0, 1, 2, 0, 1, 2, 0, 1, 2}; |
485 | std::vector<bool> is_valid3(9, true); |
486 | ArrayFromVector<Int8Type, int8_t>(is_valid3, indices3_values, &indices3); |
487 | |
488 | std::shared_ptr<Buffer> null_bitmap; |
489 | RETURN_NOT_OK(GetBitmapFromVector(is_valid, &null_bitmap)); |
490 | |
491 | std::shared_ptr<Array> a3 = std::make_shared<ListArray>( |
492 | f3_type, length, std::static_pointer_cast<PrimitiveArray>(offsets)->values(), |
493 | std::make_shared<DictionaryArray>(f1_type, indices3), null_bitmap, 1); |
494 | |
495 | // Dictionary-encoded list of integer |
496 | auto f4_value_type = list(int8()); |
497 | |
498 | std::shared_ptr<Array> offsets4, values4, indices4; |
499 | |
500 | std::vector<int32_t> list_offsets4 = {0, 2, 2, 3}; |
501 | ArrayFromVector<Int32Type, int32_t>(std::vector<bool>(4, true), list_offsets4, |
502 | &offsets4); |
503 | |
504 | std::vector<int8_t> list_values4 = {0, 1, 2}; |
505 | ArrayFromVector<Int8Type, int8_t>(std::vector<bool>(3, true), list_values4, &values4); |
506 | |
507 | auto dict3 = std::make_shared<ListArray>( |
508 | f4_value_type, 3, std::static_pointer_cast<PrimitiveArray>(offsets4)->values(), |
509 | values4); |
510 | |
511 | std::vector<int8_t> indices4_values = {0, 1, 2, 0, 1, 2}; |
512 | ArrayFromVector<Int8Type, int8_t>(is_valid, indices4_values, &indices4); |
513 | |
514 | auto f4_type = dictionary(int8(), dict3); |
515 | auto a4 = std::make_shared<DictionaryArray>(f4_type, indices4); |
516 | |
517 | // construct batch |
518 | auto schema = ::arrow::schema( |
519 | {field("dict1" , f0_type), field("sparse" , f1_type), field("dense" , f2_type), |
520 | field("list of encoded string" , f3_type), field("encoded list<int8>" , f4_type)}); |
521 | |
522 | std::vector<std::shared_ptr<Array>> arrays = {a0, a1, a2, a3, a4}; |
523 | |
524 | *out = RecordBatch::Make(schema, length, arrays); |
525 | return Status::OK(); |
526 | } |
527 | |
528 | Status MakeDictionaryFlat(std::shared_ptr<RecordBatch>* out) { |
529 | const int64_t length = 6; |
530 | |
531 | std::vector<bool> is_valid = {true, true, false, true, true, true}; |
532 | std::shared_ptr<Array> dict1, dict2; |
533 | |
534 | std::vector<std::string> dict1_values = {"foo" , "bar" , "baz" }; |
535 | std::vector<std::string> dict2_values = {"foo" , "bar" , "baz" , "qux" }; |
536 | |
537 | ArrayFromVector<StringType, std::string>(dict1_values, &dict1); |
538 | ArrayFromVector<StringType, std::string>(dict2_values, &dict2); |
539 | |
540 | auto f0_type = arrow::dictionary(arrow::int32(), dict1); |
541 | auto f1_type = arrow::dictionary(arrow::int8(), dict1); |
542 | auto f2_type = arrow::dictionary(arrow::int32(), dict2); |
543 | |
544 | std::shared_ptr<Array> indices0, indices1, indices2; |
545 | std::vector<int32_t> indices0_values = {1, 2, -1, 0, 2, 0}; |
546 | std::vector<int8_t> indices1_values = {0, 0, 2, 2, 1, 1}; |
547 | std::vector<int32_t> indices2_values = {3, 0, 2, 1, 0, 2}; |
548 | |
549 | ArrayFromVector<Int32Type, int32_t>(is_valid, indices0_values, &indices0); |
550 | ArrayFromVector<Int8Type, int8_t>(is_valid, indices1_values, &indices1); |
551 | ArrayFromVector<Int32Type, int32_t>(is_valid, indices2_values, &indices2); |
552 | |
553 | auto a0 = std::make_shared<DictionaryArray>(f0_type, indices0); |
554 | auto a1 = std::make_shared<DictionaryArray>(f1_type, indices1); |
555 | auto a2 = std::make_shared<DictionaryArray>(f2_type, indices2); |
556 | |
557 | // construct batch |
558 | auto schema = ::arrow::schema( |
559 | {field("dict1" , f0_type), field("sparse" , f1_type), field("dense" , f2_type)}); |
560 | |
561 | std::vector<std::shared_ptr<Array>> arrays = {a0, a1, a2}; |
562 | *out = RecordBatch::Make(schema, length, arrays); |
563 | return Status::OK(); |
564 | } |
565 | |
566 | Status MakeDates(std::shared_ptr<RecordBatch>* out) { |
567 | std::vector<bool> is_valid = {true, true, true, false, true, true, true}; |
568 | auto f0 = field("f0" , date32()); |
569 | auto f1 = field("f1" , date64()); |
570 | auto schema = ::arrow::schema({f0, f1}); |
571 | |
572 | std::vector<int32_t> date32_values = {0, 1, 2, 3, 4, 5, 6}; |
573 | std::shared_ptr<Array> date32_array; |
574 | ArrayFromVector<Date32Type, int32_t>(is_valid, date32_values, &date32_array); |
575 | |
576 | std::vector<int64_t> date64_values = {1489269000000, 1489270000000, 1489271000000, |
577 | 1489272000000, 1489272000000, 1489273000000, |
578 | 1489274000000}; |
579 | std::shared_ptr<Array> date64_array; |
580 | ArrayFromVector<Date64Type, int64_t>(is_valid, date64_values, &date64_array); |
581 | |
582 | *out = RecordBatch::Make(schema, date32_array->length(), {date32_array, date64_array}); |
583 | return Status::OK(); |
584 | } |
585 | |
586 | Status MakeTimestamps(std::shared_ptr<RecordBatch>* out) { |
587 | std::vector<bool> is_valid = {true, true, true, false, true, true, true}; |
588 | auto f0 = field("f0" , timestamp(TimeUnit::MILLI)); |
589 | auto f1 = field("f1" , timestamp(TimeUnit::NANO, "America/New_York" )); |
590 | auto f2 = field("f2" , timestamp(TimeUnit::SECOND)); |
591 | auto schema = ::arrow::schema({f0, f1, f2}); |
592 | |
593 | std::vector<int64_t> ts_values = {1489269000000, 1489270000000, 1489271000000, |
594 | 1489272000000, 1489272000000, 1489273000000}; |
595 | |
596 | std::shared_ptr<Array> a0, a1, a2; |
597 | ArrayFromVector<TimestampType, int64_t>(f0->type(), is_valid, ts_values, &a0); |
598 | ArrayFromVector<TimestampType, int64_t>(f1->type(), is_valid, ts_values, &a1); |
599 | ArrayFromVector<TimestampType, int64_t>(f2->type(), is_valid, ts_values, &a2); |
600 | |
601 | *out = RecordBatch::Make(schema, a0->length(), {a0, a1, a2}); |
602 | return Status::OK(); |
603 | } |
604 | |
605 | Status MakeTimes(std::shared_ptr<RecordBatch>* out) { |
606 | std::vector<bool> is_valid = {true, true, true, false, true, true, true}; |
607 | auto f0 = field("f0" , time32(TimeUnit::MILLI)); |
608 | auto f1 = field("f1" , time64(TimeUnit::NANO)); |
609 | auto f2 = field("f2" , time32(TimeUnit::SECOND)); |
610 | auto f3 = field("f3" , time64(TimeUnit::NANO)); |
611 | auto schema = ::arrow::schema({f0, f1, f2, f3}); |
612 | |
613 | std::vector<int32_t> t32_values = {1489269000, 1489270000, 1489271000, |
614 | 1489272000, 1489272000, 1489273000}; |
615 | std::vector<int64_t> t64_values = {1489269000000, 1489270000000, 1489271000000, |
616 | 1489272000000, 1489272000000, 1489273000000}; |
617 | |
618 | std::shared_ptr<Array> a0, a1, a2, a3; |
619 | ArrayFromVector<Time32Type, int32_t>(f0->type(), is_valid, t32_values, &a0); |
620 | ArrayFromVector<Time64Type, int64_t>(f1->type(), is_valid, t64_values, &a1); |
621 | ArrayFromVector<Time32Type, int32_t>(f2->type(), is_valid, t32_values, &a2); |
622 | ArrayFromVector<Time64Type, int64_t>(f3->type(), is_valid, t64_values, &a3); |
623 | |
624 | *out = RecordBatch::Make(schema, a0->length(), {a0, a1, a2, a3}); |
625 | return Status::OK(); |
626 | } |
627 | |
// Append every element of `values` to `builder`, substituting a null wherever
// the flag at the same index in `is_valid` is false. `is_valid` must have at
// least as many entries as `values`. A failing append is reported through
// ASSERT_OK, which records a test failure and returns early; nothing is
// returned to the caller.
template <typename BuilderType, typename T>
void AppendValues(const std::vector<bool>& is_valid, const std::vector<T>& values,
                  BuilderType* builder) {
  const size_t count = values.size();
  size_t pos = 0;
  while (pos < count) {
    if (!is_valid[pos]) {
      ASSERT_OK(builder->AppendNull());
    } else {
      ASSERT_OK(builder->Append(values[pos]));
    }
    ++pos;
  }
}
639 | |
640 | Status MakeFWBinary(std::shared_ptr<RecordBatch>* out) { |
641 | std::vector<bool> is_valid = {true, true, true, false}; |
642 | auto f0 = field("f0" , fixed_size_binary(4)); |
643 | auto f1 = field("f1" , fixed_size_binary(0)); |
644 | auto schema = ::arrow::schema({f0, f1}); |
645 | |
646 | std::shared_ptr<Array> a1, a2; |
647 | |
648 | FixedSizeBinaryBuilder b1(f0->type()); |
649 | FixedSizeBinaryBuilder b2(f1->type()); |
650 | |
651 | std::vector<std::string> values1 = {"foo1" , "foo2" , "foo3" , "foo4" }; |
652 | AppendValues(is_valid, values1, &b1); |
653 | |
654 | std::vector<std::string> values2 = {"" , "" , "" , "" }; |
655 | AppendValues(is_valid, values2, &b2); |
656 | |
657 | RETURN_NOT_OK(b1.Finish(&a1)); |
658 | RETURN_NOT_OK(b2.Finish(&a2)); |
659 | |
660 | *out = RecordBatch::Make(schema, a1->length(), {a1, a2}); |
661 | return Status::OK(); |
662 | } |
663 | |
664 | Status MakeDecimal(std::shared_ptr<RecordBatch>* out) { |
665 | constexpr int kDecimalPrecision = 38; |
666 | auto type = decimal(kDecimalPrecision, 4); |
667 | auto f0 = field("f0" , type); |
668 | auto f1 = field("f1" , type); |
669 | auto schema = ::arrow::schema({f0, f1}); |
670 | |
671 | constexpr int kDecimalSize = 16; |
672 | constexpr int length = 10; |
673 | |
674 | std::shared_ptr<Buffer> data, is_valid; |
675 | std::vector<uint8_t> is_valid_bytes(length); |
676 | |
677 | RETURN_NOT_OK(AllocateBuffer(kDecimalSize * length, &data)); |
678 | |
679 | random_decimals(length, 1, kDecimalPrecision, data->mutable_data()); |
680 | random_null_bytes(length, 0.1, is_valid_bytes.data()); |
681 | |
682 | RETURN_NOT_OK(BitUtil::BytesToBits(is_valid_bytes, default_memory_pool(), &is_valid)); |
683 | |
684 | auto a1 = std::make_shared<Decimal128Array>(f0->type(), length, data, is_valid, |
685 | kUnknownNullCount); |
686 | |
687 | auto a2 = std::make_shared<Decimal128Array>(f1->type(), length, data); |
688 | |
689 | *out = RecordBatch::Make(schema, length, {a1, a2}); |
690 | return Status::OK(); |
691 | } |
692 | |
693 | Status MakeNull(std::shared_ptr<RecordBatch>* out) { |
694 | auto f0 = field("f0" , null()); |
695 | |
696 | // Also put a non-null field to make sure we handle the null array buffers properly |
697 | auto f1 = field("f1" , int64()); |
698 | |
699 | auto schema = ::arrow::schema({f0, f1}); |
700 | |
701 | auto a1 = std::make_shared<NullArray>(10); |
702 | |
703 | std::vector<int64_t> int_values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; |
704 | std::vector<bool> is_valid = {true, true, true, false, false, |
705 | true, true, true, true, true}; |
706 | std::shared_ptr<Array> a2; |
707 | ArrayFromVector<Int64Type, int64_t>(f1->type(), is_valid, int_values, &a2); |
708 | |
709 | *out = RecordBatch::Make(schema, a1->length(), {a1, a2}); |
710 | return Status::OK(); |
711 | } |
712 | |
713 | } // namespace ipc |
714 | } // namespace arrow |
715 | |
716 | #endif // ARROW_IPC_TEST_COMMON_H |
717 | |