1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/test-util.h"
19
20#ifndef _WIN32
21#include <sys/stat.h> // IWYU pragma: keep
22#include <sys/wait.h> // IWYU pragma: keep
23#include <unistd.h> // IWYU pragma: keep
24#endif
25
26#include <algorithm>
27#include <cstdint>
28#include <cstdlib>
29#include <iostream>
30#include <limits>
31#include <memory>
32#include <random>
33#include <sstream>
34#include <string>
35#include <vector>
36
37#include <gtest/gtest.h>
38
39#include "arrow/array.h"
40#include "arrow/buffer.h"
41#include "arrow/ipc/json-simple.h"
42#include "arrow/pretty_print.h"
43#include "arrow/status.h"
44#include "arrow/table.h"
45#include "arrow/type.h"
46#include "arrow/util/logging.h"
47
48namespace arrow {
49
50std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>& type,
51 const std::string& json) {
52 std::shared_ptr<Array> out;
53 ABORT_NOT_OK(ipc::internal::json::ArrayFromJSON(type, json, &out));
54 return out;
55}
56
/// \brief Fill `null_bytes[0..n)` with pseudo-random 0/1 flags.
///
/// Each slot is set to 1 when a uniform draw from [0, 1) is strictly greater
/// than `pct_null`, and to 0 otherwise. The engine is always seeded with 0,
/// so repeated calls with the same arguments produce the same sequence.
///
/// \param n number of bytes to write
/// \param pct_null threshold the uniform draw is compared against
/// \param null_bytes destination, must have room for `n` bytes
void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) {
  constexpr int kFixedSeed = 0;
  std::default_random_engine engine(kFixedSeed);
  std::uniform_real_distribution<double> unit_interval(0.0, 1.0);

  for (int64_t i = 0; i < n; ++i) {
    const bool above_threshold = unit_interval(engine) > pct_null;
    null_bytes[i] = static_cast<uint8_t>(above_threshold);
  }
}
64
/// \brief Resize `*is_valid` to `n` entries and fill it with pseudo-random flags.
///
/// An entry is true when a uniform draw from [0, 1) is strictly greater than
/// `pct_null`. The engine is always seeded with 0, so the output is
/// reproducible for a given `n` and `pct_null`.
///
/// \param n number of entries the vector holds afterwards
/// \param pct_null threshold the uniform draw is compared against
/// \param is_valid output vector; every entry is overwritten
void random_is_valid(int64_t n, double pct_null, std::vector<bool>* is_valid) {
  constexpr int kFixedSeed = 0;
  std::default_random_engine engine(kFixedSeed);
  std::uniform_real_distribution<double> unit_interval(0.0, 1.0);

  is_valid->resize(n, false);
  // std::vector<bool> yields proxy references, hence `auto&&`.
  for (auto&& flag : *is_valid) {
    flag = unit_interval(engine) > pct_null;
  }
}
73
/// \brief Fill `out[0..n)` with pseudo-random bytes.
///
/// Values are drawn uniformly from [0, 255] using an engine seeded with
/// `seed`, so the same seed yields the same byte sequence.
///
/// \param n number of bytes to write
/// \param seed seed for the random engine
/// \param out destination, must have room for `n` bytes
void random_bytes(int64_t n, uint32_t seed, uint8_t* out) {
  std::default_random_engine engine(seed);
  // Draw as uint32_t and narrow afterwards, as the original did.
  std::uniform_int_distribution<uint32_t> byte_values(
      0, std::numeric_limits<uint8_t>::max());

  for (int64_t i = 0; i < n; ++i) {
    out[i] = static_cast<uint8_t>(byte_values(engine));
  }
}
79
80int32_t DecimalSize(int32_t precision) {
81 DCHECK_GE(precision, 1) << "decimal precision must be greater than or equal to 1, got "
82 << precision;
83 DCHECK_LE(precision, 38) << "decimal precision must be less than or equal to 38, got "
84 << precision;
85
86 switch (precision) {
87 case 1:
88 case 2:
89 return 1; // 127
90 case 3:
91 case 4:
92 return 2; // 32,767
93 case 5:
94 case 6:
95 return 3; // 8,388,607
96 case 7:
97 case 8:
98 case 9:
99 return 4; // 2,147,483,427
100 case 10:
101 case 11:
102 return 5; // 549,755,813,887
103 case 12:
104 case 13:
105 case 14:
106 return 6; // 140,737,488,355,327
107 case 15:
108 case 16:
109 return 7; // 36,028,797,018,963,967
110 case 17:
111 case 18:
112 return 8; // 9,223,372,036,854,775,807
113 case 19:
114 case 20:
115 case 21:
116 return 9; // 2,361,183,241,434,822,606,847
117 case 22:
118 case 23:
119 return 10; // 604,462,909,807,314,587,353,087
120 case 24:
121 case 25:
122 case 26:
123 return 11; // 154,742,504,910,672,534,362,390,527
124 case 27:
125 case 28:
126 return 12; // 39,614,081,257,132,168,796,771,975,167
127 case 29:
128 case 30:
129 case 31:
130 return 13; // 10,141,204,801,825,835,211,973,625,643,007
131 case 32:
132 case 33:
133 return 14; // 2,596,148,429,267,413,814,265,248,164,610,047
134 case 34:
135 case 35:
136 return 15; // 664,613,997,892,457,936,451,903,530,140,172,287
137 case 36:
138 case 37:
139 case 38:
140 return 16; // 170,141,183,460,469,231,731,687,303,715,884,105,727
141 default:
142 DCHECK(false);
143 break;
144 }
145 return -1;
146}
147
148void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
149 std::default_random_engine gen(seed);
150 std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max());
151 const int32_t required_bytes = DecimalSize(precision);
152 constexpr int32_t byte_width = 16;
153 std::fill(out, out + byte_width * n, '\0');
154
155 for (int64_t i = 0; i < n; ++i, out += byte_width) {
156 std::generate(out, out + required_bytes,
157 [&d, &gen] { return static_cast<uint8_t>(d(gen)); });
158
159 // sign extend if the sign bit is set for the last byte generated
160 // 0b10000000 == 0x80 == 128
161 if ((out[required_bytes - 1] & '\x80') != 0) {
162 std::fill(out + required_bytes, out + byte_width, '\xFF');
163 }
164 }
165}
166
167void random_ascii(int64_t n, uint32_t seed, uint8_t* out) {
168 rand_uniform_int(n, seed, static_cast<int32_t>('A'), static_cast<int32_t>('z'), out);
169}
170
/// \brief Count the zero entries in `valid_bytes`.
///
/// \param valid_bytes one byte per slot; a zero byte is counted as a null
/// \return number of entries equal to zero
int64_t CountNulls(const std::vector<uint8_t>& valid_bytes) {
  int64_t null_count = 0;
  for (const uint8_t byte : valid_bytes) {
    if (byte == 0) {
      ++null_count;
    }
  }
  return null_count;
}
174
175Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool,
176 std::shared_ptr<ResizableBuffer>* out, uint32_t seed) {
177 std::shared_ptr<ResizableBuffer> result;
178 RETURN_NOT_OK(AllocateResizableBuffer(pool, length, &result));
179 random_bytes(length, seed, result->mutable_data());
180 *out = result;
181 return Status::OK();
182}
183
184void AssertArraysEqual(const Array& expected, const Array& actual) {
185 ASSERT_ARRAYS_EQUAL(expected, actual);
186}
187
188void AssertChunkedEqual(const ChunkedArray& expected, const ChunkedArray& actual) {
189 ASSERT_EQ(expected.num_chunks(), actual.num_chunks()) << "# chunks unequal";
190 if (!actual.Equals(expected)) {
191 std::stringstream pp_result;
192 std::stringstream pp_expected;
193
194 for (int i = 0; i < actual.num_chunks(); ++i) {
195 auto c1 = actual.chunk(i);
196 auto c2 = expected.chunk(i);
197 if (!c1->Equals(*c2)) {
198 EXPECT_OK(::arrow::PrettyPrint(*c1, 0, &pp_result));
199 EXPECT_OK(::arrow::PrettyPrint(*c2, 0, &pp_expected));
200 FAIL() << "Chunk " << i << " Got: " << pp_result.str()
201 << "\nExpected: " << pp_expected.str();
202 }
203 }
204 }
205}
206
207void AssertChunkedEqual(const ChunkedArray& actual, const ArrayVector& expected) {
208 AssertChunkedEqual(ChunkedArray(expected, actual.type()), actual);
209}
210
211void AssertBufferEqual(const Buffer& buffer, const std::vector<uint8_t>& expected) {
212 ASSERT_EQ(buffer.size(), expected.size()) << "Mismatching buffer size";
213 const uint8_t* buffer_data = buffer.data();
214 for (size_t i = 0; i < expected.size(); ++i) {
215 ASSERT_EQ(buffer_data[i], expected[i]);
216 }
217}
218
219void AssertBufferEqual(const Buffer& buffer, const std::string& expected) {
220 ASSERT_EQ(buffer.size(), expected.length()) << "Mismatching buffer size";
221 const uint8_t* buffer_data = buffer.data();
222 for (size_t i = 0; i < expected.size(); ++i) {
223 ASSERT_EQ(buffer_data[i], expected[i]);
224 }
225}
226
227void AssertBufferEqual(const Buffer& buffer, const Buffer& expected) {
228 ASSERT_EQ(buffer.size(), expected.size()) << "Mismatching buffer size";
229 ASSERT_TRUE(buffer.Equals(expected));
230}
231
232void AssertSchemaEqual(const Schema& lhs, const Schema& rhs) {
233 if (!lhs.Equals(rhs)) {
234 std::stringstream ss;
235 ss << "left schema: " << lhs.ToString() << std::endl
236 << "right schema: " << rhs.ToString() << std::endl;
237 FAIL() << ss.str();
238 }
239}
240
241void PrintColumn(const Column& col, std::stringstream* ss) {
242 const ChunkedArray& carr = *col.data();
243 for (int i = 0; i < carr.num_chunks(); ++i) {
244 auto c1 = carr.chunk(i);
245 *ss << "Chunk " << i << std::endl;
246 EXPECT_OK(::arrow::PrettyPrint(*c1, 0, ss));
247 *ss << std::endl;
248 }
249}
250
251void AssertTablesEqual(const Table& expected, const Table& actual,
252 bool same_chunk_layout) {
253 ASSERT_EQ(expected.num_columns(), actual.num_columns());
254
255 if (same_chunk_layout) {
256 for (int i = 0; i < actual.num_columns(); ++i) {
257 AssertChunkedEqual(*expected.column(i)->data(), *actual.column(i)->data());
258 }
259 } else {
260 std::stringstream ss;
261 if (!actual.Equals(expected)) {
262 for (int i = 0; i < expected.num_columns(); ++i) {
263 ss << "Actual column " << i << std::endl;
264 PrintColumn(*actual.column(i), &ss);
265
266 ss << "Expected column " << i << std::endl;
267 PrintColumn(*expected.column(i), &ss);
268 }
269 FAIL() << ss.str();
270 }
271 }
272}
273
274void CompareBatch(const RecordBatch& left, const RecordBatch& right) {
275 if (!left.schema()->Equals(*right.schema())) {
276 FAIL() << "Left schema: " << left.schema()->ToString()
277 << "\nRight schema: " << right.schema()->ToString();
278 }
279 ASSERT_EQ(left.num_columns(), right.num_columns())
280 << left.schema()->ToString() << " result: " << right.schema()->ToString();
281 ASSERT_EQ(left.num_rows(), right.num_rows());
282 for (int i = 0; i < left.num_columns(); ++i) {
283 if (!left.column(i)->Equals(right.column(i))) {
284 std::stringstream ss;
285 ss << "Idx: " << i << " Name: " << left.column_name(i);
286 ss << std::endl << "Left: ";
287 ASSERT_OK(PrettyPrint(*left.column(i), 0, &ss));
288 ss << std::endl << "Right: ";
289 ASSERT_OK(PrettyPrint(*right.column(i), 0, &ss));
290 FAIL() << ss.str();
291 }
292 }
293}
294
namespace {

// Sink for values computed only for the sake of reading memory. Being
// volatile, stores to it cannot be elided, which keeps the compiler from
// optimizing away otherwise side-effect-less statements.
volatile int throw_away = 0;

}  // namespace
301
302void AssertZeroPadded(const Array& array) {
303 for (const auto& buffer : array.data()->buffers) {
304 if (buffer) {
305 const int64_t padding = buffer->capacity() - buffer->size();
306 if (padding > 0) {
307 std::vector<uint8_t> zeros(padding);
308 ASSERT_EQ(0, memcmp(buffer->data() + buffer->size(), zeros.data(), padding));
309 }
310 }
311 }
312}
313
314void TestInitialized(const Array& array) {
315 for (const auto& buffer : array.data()->buffers) {
316 if (buffer && buffer->capacity() > 0) {
317 int total = 0;
318 auto data = buffer->data();
319 for (int64_t i = 0; i < buffer->size(); ++i) {
320 total ^= data[i];
321 }
322 throw_away = total;
323 }
324 }
325}
326
327} // namespace arrow
328