// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
17
18#include <memory>
19#include <string>
20#include <vector>
21
22#include <gtest/gtest.h>
23
24#include "arrow/csv/column-builder.h"
25#include "arrow/csv/options.h"
26#include "arrow/csv/test-common.h"
27#include "arrow/table.h"
28#include "arrow/test-util.h"
29#include "arrow/type.h"
30#include "arrow/util/task-group.h"
31#include "arrow/util/thread-pool.h"
32
33namespace arrow {
34namespace csv {
35
36class BlockParser;
37
38using internal::GetCpuThreadPool;
39using internal::TaskGroup;
40
41void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder,
42 const std::vector<std::vector<std::string>>& chunks,
43 std::shared_ptr<ChunkedArray>* out) {
44 for (const auto& chunk : chunks) {
45 std::shared_ptr<BlockParser> parser;
46 MakeColumnParser(chunk, &parser);
47 builder->Append(parser);
48 }
49 ASSERT_OK(builder->task_group()->Finish());
50 ASSERT_OK(builder->Finish(out));
51}
52
//////////////////////////////////////////////////////////////////////////
// Tests for fixed-type column builder
55
56TEST(ColumnBuilder, Empty) {
57 auto tg = TaskGroup::MakeSerial();
58 std::shared_ptr<ColumnBuilder> builder;
59 ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder));
60
61 std::shared_ptr<ChunkedArray> actual;
62 AssertBuilding(builder, {}, &actual);
63
64 ChunkedArray expected({}, int32());
65 AssertChunkedEqual(*actual, expected);
66}
67
68TEST(ColumnBuilder, Basics) {
69 auto tg = TaskGroup::MakeSerial();
70 std::shared_ptr<ColumnBuilder> builder;
71 ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder));
72
73 std::shared_ptr<ChunkedArray> actual;
74 AssertBuilding(builder, {{"123", "-456"}}, &actual);
75
76 std::shared_ptr<ChunkedArray> expected;
77 ChunkedArrayFromVector<Int32Type>({{123, -456}}, &expected);
78 AssertChunkedEqual(*actual, *expected);
79}
80
81TEST(ColumnBuilder, Insert) {
82 // Test ColumnBuilder::Insert()
83 auto tg = TaskGroup::MakeSerial();
84 std::shared_ptr<ColumnBuilder> builder;
85 ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder));
86
87 std::shared_ptr<BlockParser> parser;
88 std::shared_ptr<ChunkedArray> actual, expected;
89 MakeColumnParser({"456"}, &parser);
90 builder->Insert(1, parser);
91 MakeColumnParser({"123"}, &parser);
92 builder->Insert(0, parser);
93 ASSERT_OK(builder->task_group()->Finish());
94 ASSERT_OK(builder->Finish(&actual));
95
96 ChunkedArrayFromVector<Int32Type>({{123}, {456}}, &expected);
97 AssertChunkedEqual(*actual, *expected);
98}
99
100TEST(ColumnBuilder, MultipleChunks) {
101 auto tg = TaskGroup::MakeSerial();
102 std::shared_ptr<ColumnBuilder> builder;
103 ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder));
104
105 std::shared_ptr<ChunkedArray> actual;
106 AssertBuilding(builder, {{"1", "2", "3"}, {"4", "5"}}, &actual);
107
108 std::shared_ptr<ChunkedArray> expected;
109 ChunkedArrayFromVector<Int32Type>({{1, 2, 3}, {4, 5}}, &expected);
110 AssertChunkedEqual(*actual, *expected);
111}
112
113TEST(ColumnBuilder, MultipleChunksParallel) {
114 auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
115 std::shared_ptr<ColumnBuilder> builder;
116 ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder));
117
118 std::shared_ptr<ChunkedArray> actual;
119 AssertBuilding(builder, {{"1", "2"}, {"3"}, {"4", "5"}, {"6", "7"}}, &actual);
120
121 std::shared_ptr<ChunkedArray> expected;
122 ChunkedArrayFromVector<Int32Type>({{1, 2}, {3}, {4, 5}, {6, 7}}, &expected);
123 AssertChunkedEqual(*actual, *expected);
124}
125
//////////////////////////////////////////////////////////////////////////
// Tests for type-inferring column builder
128
129TEST(InferringColumnBuilder, Empty) {
130 auto tg = TaskGroup::MakeSerial();
131 std::shared_ptr<ColumnBuilder> builder;
132 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
133
134 std::shared_ptr<ChunkedArray> actual;
135 AssertBuilding(builder, {}, &actual);
136
137 ASSERT_EQ(actual->type()->id(), Type::NA);
138 ASSERT_EQ(actual->num_chunks(), 0);
139}
140
141TEST(InferringColumnBuilder, SingleChunkNull) {
142 auto tg = TaskGroup::MakeSerial();
143 std::shared_ptr<ColumnBuilder> builder;
144 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
145
146 std::shared_ptr<ChunkedArray> actual;
147 AssertBuilding(builder, {{"", "NA"}}, &actual);
148
149 ASSERT_EQ(actual->type()->id(), Type::NA);
150 ASSERT_EQ(actual->length(), 2);
151}
152
153TEST(InferringColumnBuilder, MultipleChunkNull) {
154 auto tg = TaskGroup::MakeSerial();
155 std::shared_ptr<ColumnBuilder> builder;
156 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
157
158 std::shared_ptr<ChunkedArray> actual;
159 AssertBuilding(builder, {{"", "NA"}, {""}, {"NaN"}}, &actual);
160
161 ASSERT_EQ(actual->type()->id(), Type::NA);
162 ASSERT_EQ(actual->length(), 4);
163}
164
165TEST(InferringColumnBuilder, SingleChunkInteger) {
166 auto tg = TaskGroup::MakeSerial();
167 std::shared_ptr<ColumnBuilder> builder;
168 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
169
170 std::shared_ptr<ChunkedArray> actual;
171 AssertBuilding(builder, {{"", "123", "456"}}, &actual);
172
173 std::shared_ptr<ChunkedArray> expected;
174 ChunkedArrayFromVector<Int64Type>({{false, true, true}}, {{0, 123, 456}}, &expected);
175 AssertChunkedEqual(*expected, *actual);
176}
177
178TEST(InferringColumnBuilder, MultipleChunkInteger) {
179 auto tg = TaskGroup::MakeSerial();
180 std::shared_ptr<ColumnBuilder> builder;
181 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
182
183 std::shared_ptr<ChunkedArray> actual;
184 AssertBuilding(builder, {{""}, {"NA", "123", "456"}}, &actual);
185
186 std::shared_ptr<ChunkedArray> expected;
187 ChunkedArrayFromVector<Int64Type>({{false}, {false, true, true}}, {{0}, {0, 123, 456}},
188 &expected);
189 AssertChunkedEqual(*expected, *actual);
190}
191
192TEST(InferringColumnBuilder, SingleChunkReal) {
193 auto tg = TaskGroup::MakeSerial();
194 std::shared_ptr<ColumnBuilder> builder;
195 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
196
197 std::shared_ptr<ChunkedArray> actual;
198 AssertBuilding(builder, {{"", "0.0", "12.5"}}, &actual);
199
200 std::shared_ptr<ChunkedArray> expected;
201 ChunkedArrayFromVector<DoubleType>({{false, true, true}}, {{0.0, 0.0, 12.5}},
202 &expected);
203 AssertChunkedEqual(*expected, *actual);
204}
205
206TEST(InferringColumnBuilder, MultipleChunkReal) {
207 auto tg = TaskGroup::MakeSerial();
208 std::shared_ptr<ColumnBuilder> builder;
209 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
210
211 std::shared_ptr<ChunkedArray> actual;
212 AssertBuilding(builder, {{""}, {"008"}, {"NaN", "12.5"}}, &actual);
213
214 std::shared_ptr<ChunkedArray> expected;
215 ChunkedArrayFromVector<DoubleType>({{false}, {true}, {false, true}},
216 {{0.0}, {8.0}, {0.0, 12.5}}, &expected);
217 AssertChunkedEqual(*expected, *actual);
218}
219
220TEST(InferringColumnBuilder, SingleChunkTimestamp) {
221 auto tg = TaskGroup::MakeSerial();
222 std::shared_ptr<ColumnBuilder> builder;
223 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
224
225 std::shared_ptr<ChunkedArray> actual;
226 AssertBuilding(builder, {{"", "1970-01-01", "2018-11-13 17:11:10"}}, &actual);
227
228 std::shared_ptr<ChunkedArray> expected;
229 ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
230 {{false, true, true}}, {{0, 0, 1542129070}},
231 &expected);
232 AssertChunkedEqual(*expected, *actual);
233}
234
235TEST(InferringColumnBuilder, MultipleChunkTimestamp) {
236 auto tg = TaskGroup::MakeSerial();
237 std::shared_ptr<ColumnBuilder> builder;
238 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
239
240 std::shared_ptr<ChunkedArray> actual;
241 AssertBuilding(builder, {{""}, {"1970-01-01"}, {"2018-11-13 17:11:10"}}, &actual);
242
243 std::shared_ptr<ChunkedArray> expected;
244 ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND),
245 {{false}, {true}, {true}},
246 {{0}, {0}, {1542129070}}, &expected);
247 AssertChunkedEqual(*expected, *actual);
248}
249
250TEST(InferringColumnBuilder, SingleChunkString) {
251 auto tg = TaskGroup::MakeSerial();
252 std::shared_ptr<ColumnBuilder> builder;
253 std::shared_ptr<ChunkedArray> actual;
254 std::shared_ptr<ChunkedArray> expected;
255
256 // With valid UTF8
257 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
258 AssertBuilding(builder, {{"", "foo", "baré"}}, &actual);
259
260 ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
261 {{"", "foo", "baré"}}, &expected);
262 AssertChunkedEqual(*expected, *actual);
263
264 // With invalid UTF8, non-checking
265 auto options = ConvertOptions::Defaults();
266 options.check_utf8 = false;
267 tg = TaskGroup::MakeSerial();
268 ASSERT_OK(ColumnBuilder::Make(0, options, tg, &builder));
269 AssertBuilding(builder, {{"", "foo\xff", "baré"}}, &actual);
270
271 ChunkedArrayFromVector<StringType, std::string>({{true, true, true}},
272 {{"", "foo\xff", "baré"}}, &expected);
273 AssertChunkedEqual(*expected, *actual);
274}
275
276TEST(InferringColumnBuilder, SingleChunkBinary) {
277 auto tg = TaskGroup::MakeSerial();
278 std::shared_ptr<ColumnBuilder> builder;
279 std::shared_ptr<ChunkedArray> actual;
280 std::shared_ptr<ChunkedArray> expected;
281
282 // With invalid UTF8, checking
283 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
284 AssertBuilding(builder, {{"", "foo\xff", "baré"}}, &actual);
285
286 ChunkedArrayFromVector<BinaryType, std::string>({{true, true, true}},
287 {{"", "foo\xff", "baré"}}, &expected);
288 AssertChunkedEqual(*expected, *actual);
289}
290
291TEST(InferringColumnBuilder, MultipleChunkString) {
292 auto tg = TaskGroup::MakeSerial();
293 std::shared_ptr<ColumnBuilder> builder;
294 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
295
296 std::shared_ptr<ChunkedArray> actual;
297 AssertBuilding(builder, {{""}, {"008"}, {"NaN", "baré"}}, &actual);
298
299 std::shared_ptr<ChunkedArray> expected;
300 ChunkedArrayFromVector<StringType, std::string>(
301 {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré"}}, &expected);
302 AssertChunkedEqual(*expected, *actual);
303}
304
305TEST(InferringColumnBuilder, MultipleChunkBinary) {
306 auto tg = TaskGroup::MakeSerial();
307 std::shared_ptr<ColumnBuilder> builder;
308 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
309
310 std::shared_ptr<ChunkedArray> actual;
311 AssertBuilding(builder, {{""}, {"008"}, {"NaN", "baré\xff"}}, &actual);
312
313 std::shared_ptr<ChunkedArray> expected;
314 ChunkedArrayFromVector<BinaryType, std::string>(
315 {{true}, {true}, {true, true}}, {{""}, {"008"}, {"NaN", "baré\xff"}}, &expected);
316 AssertChunkedEqual(*expected, *actual);
317}
318
319TEST(InferringColumnBuilder, MultipleChunkIntegerParallel) {
320 auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool());
321 std::shared_ptr<ColumnBuilder> builder;
322 ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder));
323
324 std::shared_ptr<ChunkedArray> actual;
325 AssertBuilding(builder, {{"1", "2"}, {"3"}, {"4", "5"}, {"6", "7"}}, &actual);
326
327 std::shared_ptr<ChunkedArray> expected;
328 ChunkedArrayFromVector<Int64Type>({{1, 2}, {3}, {4, 5}, {6, 7}}, &expected);
329 AssertChunkedEqual(*actual, *expected);
330}
331
332} // namespace csv
333} // namespace arrow
334