1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <memory> |
19 | #include <string> |
20 | #include <vector> |
21 | |
22 | #include <gtest/gtest.h> |
23 | |
24 | #include "arrow/csv/column-builder.h" |
25 | #include "arrow/csv/options.h" |
26 | #include "arrow/csv/test-common.h" |
27 | #include "arrow/table.h" |
28 | #include "arrow/test-util.h" |
29 | #include "arrow/type.h" |
30 | #include "arrow/util/task-group.h" |
31 | #include "arrow/util/thread-pool.h" |
32 | |
33 | namespace arrow { |
34 | namespace csv { |
35 | |
36 | class BlockParser; |
37 | |
38 | using internal::GetCpuThreadPool; |
39 | using internal::TaskGroup; |
40 | |
41 | void AssertBuilding(const std::shared_ptr<ColumnBuilder>& builder, |
42 | const std::vector<std::vector<std::string>>& chunks, |
43 | std::shared_ptr<ChunkedArray>* out) { |
44 | for (const auto& chunk : chunks) { |
45 | std::shared_ptr<BlockParser> parser; |
46 | MakeColumnParser(chunk, &parser); |
47 | builder->Append(parser); |
48 | } |
49 | ASSERT_OK(builder->task_group()->Finish()); |
50 | ASSERT_OK(builder->Finish(out)); |
51 | } |
52 | |
53 | ////////////////////////////////////////////////////////////////////////// |
54 | // Tests for fixed-type column builder |
55 | |
56 | TEST(ColumnBuilder, Empty) { |
57 | auto tg = TaskGroup::MakeSerial(); |
58 | std::shared_ptr<ColumnBuilder> builder; |
59 | ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder)); |
60 | |
61 | std::shared_ptr<ChunkedArray> actual; |
62 | AssertBuilding(builder, {}, &actual); |
63 | |
64 | ChunkedArray expected({}, int32()); |
65 | AssertChunkedEqual(*actual, expected); |
66 | } |
67 | |
68 | TEST(ColumnBuilder, Basics) { |
69 | auto tg = TaskGroup::MakeSerial(); |
70 | std::shared_ptr<ColumnBuilder> builder; |
71 | ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder)); |
72 | |
73 | std::shared_ptr<ChunkedArray> actual; |
74 | AssertBuilding(builder, {{"123" , "-456" }}, &actual); |
75 | |
76 | std::shared_ptr<ChunkedArray> expected; |
77 | ChunkedArrayFromVector<Int32Type>({{123, -456}}, &expected); |
78 | AssertChunkedEqual(*actual, *expected); |
79 | } |
80 | |
81 | TEST(ColumnBuilder, Insert) { |
82 | // Test ColumnBuilder::Insert() |
83 | auto tg = TaskGroup::MakeSerial(); |
84 | std::shared_ptr<ColumnBuilder> builder; |
85 | ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder)); |
86 | |
87 | std::shared_ptr<BlockParser> parser; |
88 | std::shared_ptr<ChunkedArray> actual, expected; |
89 | MakeColumnParser({"456" }, &parser); |
90 | builder->Insert(1, parser); |
91 | MakeColumnParser({"123" }, &parser); |
92 | builder->Insert(0, parser); |
93 | ASSERT_OK(builder->task_group()->Finish()); |
94 | ASSERT_OK(builder->Finish(&actual)); |
95 | |
96 | ChunkedArrayFromVector<Int32Type>({{123}, {456}}, &expected); |
97 | AssertChunkedEqual(*actual, *expected); |
98 | } |
99 | |
100 | TEST(ColumnBuilder, MultipleChunks) { |
101 | auto tg = TaskGroup::MakeSerial(); |
102 | std::shared_ptr<ColumnBuilder> builder; |
103 | ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder)); |
104 | |
105 | std::shared_ptr<ChunkedArray> actual; |
106 | AssertBuilding(builder, {{"1" , "2" , "3" }, {"4" , "5" }}, &actual); |
107 | |
108 | std::shared_ptr<ChunkedArray> expected; |
109 | ChunkedArrayFromVector<Int32Type>({{1, 2, 3}, {4, 5}}, &expected); |
110 | AssertChunkedEqual(*actual, *expected); |
111 | } |
112 | |
113 | TEST(ColumnBuilder, MultipleChunksParallel) { |
114 | auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool()); |
115 | std::shared_ptr<ColumnBuilder> builder; |
116 | ASSERT_OK(ColumnBuilder::Make(int32(), 0, ConvertOptions::Defaults(), tg, &builder)); |
117 | |
118 | std::shared_ptr<ChunkedArray> actual; |
119 | AssertBuilding(builder, {{"1" , "2" }, {"3" }, {"4" , "5" }, {"6" , "7" }}, &actual); |
120 | |
121 | std::shared_ptr<ChunkedArray> expected; |
122 | ChunkedArrayFromVector<Int32Type>({{1, 2}, {3}, {4, 5}, {6, 7}}, &expected); |
123 | AssertChunkedEqual(*actual, *expected); |
124 | } |
125 | |
126 | ////////////////////////////////////////////////////////////////////////// |
127 | // Tests for type-inferring column builder |
128 | |
129 | TEST(InferringColumnBuilder, Empty) { |
130 | auto tg = TaskGroup::MakeSerial(); |
131 | std::shared_ptr<ColumnBuilder> builder; |
132 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
133 | |
134 | std::shared_ptr<ChunkedArray> actual; |
135 | AssertBuilding(builder, {}, &actual); |
136 | |
137 | ASSERT_EQ(actual->type()->id(), Type::NA); |
138 | ASSERT_EQ(actual->num_chunks(), 0); |
139 | } |
140 | |
141 | TEST(InferringColumnBuilder, SingleChunkNull) { |
142 | auto tg = TaskGroup::MakeSerial(); |
143 | std::shared_ptr<ColumnBuilder> builder; |
144 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
145 | |
146 | std::shared_ptr<ChunkedArray> actual; |
147 | AssertBuilding(builder, {{"" , "NA" }}, &actual); |
148 | |
149 | ASSERT_EQ(actual->type()->id(), Type::NA); |
150 | ASSERT_EQ(actual->length(), 2); |
151 | } |
152 | |
153 | TEST(InferringColumnBuilder, MultipleChunkNull) { |
154 | auto tg = TaskGroup::MakeSerial(); |
155 | std::shared_ptr<ColumnBuilder> builder; |
156 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
157 | |
158 | std::shared_ptr<ChunkedArray> actual; |
159 | AssertBuilding(builder, {{"" , "NA" }, {"" }, {"NaN" }}, &actual); |
160 | |
161 | ASSERT_EQ(actual->type()->id(), Type::NA); |
162 | ASSERT_EQ(actual->length(), 4); |
163 | } |
164 | |
165 | TEST(InferringColumnBuilder, SingleChunkInteger) { |
166 | auto tg = TaskGroup::MakeSerial(); |
167 | std::shared_ptr<ColumnBuilder> builder; |
168 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
169 | |
170 | std::shared_ptr<ChunkedArray> actual; |
171 | AssertBuilding(builder, {{"" , "123" , "456" }}, &actual); |
172 | |
173 | std::shared_ptr<ChunkedArray> expected; |
174 | ChunkedArrayFromVector<Int64Type>({{false, true, true}}, {{0, 123, 456}}, &expected); |
175 | AssertChunkedEqual(*expected, *actual); |
176 | } |
177 | |
178 | TEST(InferringColumnBuilder, MultipleChunkInteger) { |
179 | auto tg = TaskGroup::MakeSerial(); |
180 | std::shared_ptr<ColumnBuilder> builder; |
181 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
182 | |
183 | std::shared_ptr<ChunkedArray> actual; |
184 | AssertBuilding(builder, {{"" }, {"NA" , "123" , "456" }}, &actual); |
185 | |
186 | std::shared_ptr<ChunkedArray> expected; |
187 | ChunkedArrayFromVector<Int64Type>({{false}, {false, true, true}}, {{0}, {0, 123, 456}}, |
188 | &expected); |
189 | AssertChunkedEqual(*expected, *actual); |
190 | } |
191 | |
192 | TEST(InferringColumnBuilder, SingleChunkReal) { |
193 | auto tg = TaskGroup::MakeSerial(); |
194 | std::shared_ptr<ColumnBuilder> builder; |
195 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
196 | |
197 | std::shared_ptr<ChunkedArray> actual; |
198 | AssertBuilding(builder, {{"" , "0.0" , "12.5" }}, &actual); |
199 | |
200 | std::shared_ptr<ChunkedArray> expected; |
201 | ChunkedArrayFromVector<DoubleType>({{false, true, true}}, {{0.0, 0.0, 12.5}}, |
202 | &expected); |
203 | AssertChunkedEqual(*expected, *actual); |
204 | } |
205 | |
206 | TEST(InferringColumnBuilder, MultipleChunkReal) { |
207 | auto tg = TaskGroup::MakeSerial(); |
208 | std::shared_ptr<ColumnBuilder> builder; |
209 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
210 | |
211 | std::shared_ptr<ChunkedArray> actual; |
212 | AssertBuilding(builder, {{"" }, {"008" }, {"NaN" , "12.5" }}, &actual); |
213 | |
214 | std::shared_ptr<ChunkedArray> expected; |
215 | ChunkedArrayFromVector<DoubleType>({{false}, {true}, {false, true}}, |
216 | {{0.0}, {8.0}, {0.0, 12.5}}, &expected); |
217 | AssertChunkedEqual(*expected, *actual); |
218 | } |
219 | |
220 | TEST(InferringColumnBuilder, SingleChunkTimestamp) { |
221 | auto tg = TaskGroup::MakeSerial(); |
222 | std::shared_ptr<ColumnBuilder> builder; |
223 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
224 | |
225 | std::shared_ptr<ChunkedArray> actual; |
226 | AssertBuilding(builder, {{"" , "1970-01-01" , "2018-11-13 17:11:10" }}, &actual); |
227 | |
228 | std::shared_ptr<ChunkedArray> expected; |
229 | ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND), |
230 | {{false, true, true}}, {{0, 0, 1542129070}}, |
231 | &expected); |
232 | AssertChunkedEqual(*expected, *actual); |
233 | } |
234 | |
235 | TEST(InferringColumnBuilder, MultipleChunkTimestamp) { |
236 | auto tg = TaskGroup::MakeSerial(); |
237 | std::shared_ptr<ColumnBuilder> builder; |
238 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
239 | |
240 | std::shared_ptr<ChunkedArray> actual; |
241 | AssertBuilding(builder, {{"" }, {"1970-01-01" }, {"2018-11-13 17:11:10" }}, &actual); |
242 | |
243 | std::shared_ptr<ChunkedArray> expected; |
244 | ChunkedArrayFromVector<TimestampType>(timestamp(TimeUnit::SECOND), |
245 | {{false}, {true}, {true}}, |
246 | {{0}, {0}, {1542129070}}, &expected); |
247 | AssertChunkedEqual(*expected, *actual); |
248 | } |
249 | |
250 | TEST(InferringColumnBuilder, SingleChunkString) { |
251 | auto tg = TaskGroup::MakeSerial(); |
252 | std::shared_ptr<ColumnBuilder> builder; |
253 | std::shared_ptr<ChunkedArray> actual; |
254 | std::shared_ptr<ChunkedArray> expected; |
255 | |
256 | // With valid UTF8 |
257 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
258 | AssertBuilding(builder, {{"" , "foo" , "baré" }}, &actual); |
259 | |
260 | ChunkedArrayFromVector<StringType, std::string>({{true, true, true}}, |
261 | {{"" , "foo" , "baré" }}, &expected); |
262 | AssertChunkedEqual(*expected, *actual); |
263 | |
264 | // With invalid UTF8, non-checking |
265 | auto options = ConvertOptions::Defaults(); |
266 | options.check_utf8 = false; |
267 | tg = TaskGroup::MakeSerial(); |
268 | ASSERT_OK(ColumnBuilder::Make(0, options, tg, &builder)); |
269 | AssertBuilding(builder, {{"" , "foo\xff" , "baré" }}, &actual); |
270 | |
271 | ChunkedArrayFromVector<StringType, std::string>({{true, true, true}}, |
272 | {{"" , "foo\xff" , "baré" }}, &expected); |
273 | AssertChunkedEqual(*expected, *actual); |
274 | } |
275 | |
276 | TEST(InferringColumnBuilder, SingleChunkBinary) { |
277 | auto tg = TaskGroup::MakeSerial(); |
278 | std::shared_ptr<ColumnBuilder> builder; |
279 | std::shared_ptr<ChunkedArray> actual; |
280 | std::shared_ptr<ChunkedArray> expected; |
281 | |
282 | // With invalid UTF8, checking |
283 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
284 | AssertBuilding(builder, {{"" , "foo\xff" , "baré" }}, &actual); |
285 | |
286 | ChunkedArrayFromVector<BinaryType, std::string>({{true, true, true}}, |
287 | {{"" , "foo\xff" , "baré" }}, &expected); |
288 | AssertChunkedEqual(*expected, *actual); |
289 | } |
290 | |
291 | TEST(InferringColumnBuilder, MultipleChunkString) { |
292 | auto tg = TaskGroup::MakeSerial(); |
293 | std::shared_ptr<ColumnBuilder> builder; |
294 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
295 | |
296 | std::shared_ptr<ChunkedArray> actual; |
297 | AssertBuilding(builder, {{"" }, {"008" }, {"NaN" , "baré" }}, &actual); |
298 | |
299 | std::shared_ptr<ChunkedArray> expected; |
300 | ChunkedArrayFromVector<StringType, std::string>( |
301 | {{true}, {true}, {true, true}}, {{"" }, {"008" }, {"NaN" , "baré" }}, &expected); |
302 | AssertChunkedEqual(*expected, *actual); |
303 | } |
304 | |
305 | TEST(InferringColumnBuilder, MultipleChunkBinary) { |
306 | auto tg = TaskGroup::MakeSerial(); |
307 | std::shared_ptr<ColumnBuilder> builder; |
308 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
309 | |
310 | std::shared_ptr<ChunkedArray> actual; |
311 | AssertBuilding(builder, {{"" }, {"008" }, {"NaN" , "baré\xff" }}, &actual); |
312 | |
313 | std::shared_ptr<ChunkedArray> expected; |
314 | ChunkedArrayFromVector<BinaryType, std::string>( |
315 | {{true}, {true}, {true, true}}, {{"" }, {"008" }, {"NaN" , "baré\xff" }}, &expected); |
316 | AssertChunkedEqual(*expected, *actual); |
317 | } |
318 | |
319 | TEST(InferringColumnBuilder, MultipleChunkIntegerParallel) { |
320 | auto tg = TaskGroup::MakeThreaded(GetCpuThreadPool()); |
321 | std::shared_ptr<ColumnBuilder> builder; |
322 | ASSERT_OK(ColumnBuilder::Make(0, ConvertOptions::Defaults(), tg, &builder)); |
323 | |
324 | std::shared_ptr<ChunkedArray> actual; |
325 | AssertBuilding(builder, {{"1" , "2" }, {"3" }, {"4" , "5" }, {"6" , "7" }}, &actual); |
326 | |
327 | std::shared_ptr<ChunkedArray> expected; |
328 | ChunkedArrayFromVector<Int64Type>({{1, 2}, {3}, {4, 5}, {6, 7}}, &expected); |
329 | AssertChunkedEqual(*actual, *expected); |
330 | } |
331 | |
332 | } // namespace csv |
333 | } // namespace arrow |
334 | |