1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <cstdint> |
19 | #include <memory> |
20 | #include <set> |
21 | #include <string> |
22 | #include <vector> |
23 | |
24 | #include <gtest/gtest.h> |
25 | |
26 | #include "arrow/array.h" |
27 | #include "arrow/csv/converter.h" |
28 | #include "arrow/csv/options.h" |
29 | #include "arrow/csv/test-common.h" |
30 | #include "arrow/status.h" |
31 | #include "arrow/test-util.h" |
32 | #include "arrow/type.h" |
33 | |
34 | namespace arrow { |
35 | namespace csv { |
36 | |
37 | class BlockParser; |
38 | |
// Returns every recognized non-empty null spelling, one per element, each
// terminated by a newline so it forms a complete single-cell CSV row.
std::vector<std::string> AllNulls() {
  static const char* const kNullRows[] = {
      "#N/A\n",  "#N/A N/A\n", "#NA\n",     "-1.#IND\n", "-1.#QNAN\n", "-NaN\n",
      "-nan\n",  "1.#IND\n",   "1.#QNAN\n", "N/A\n",     "NA\n",       "NULL\n",
      "NaN\n",   "n/a\n",      "nan\n",     "null\n"};
  return std::vector<std::string>(std::begin(kNullRows), std::end(kNullRows));
}
45 | |
46 | template <typename DATA_TYPE, typename C_TYPE> |
47 | void AssertConversion(const std::shared_ptr<DataType>& type, |
48 | const std::vector<std::string>& csv_string, |
49 | const std::vector<std::vector<C_TYPE>>& expected, |
50 | ConvertOptions options = ConvertOptions::Defaults()) { |
51 | std::shared_ptr<BlockParser> parser; |
52 | std::shared_ptr<Converter> converter; |
53 | std::shared_ptr<Array> array, expected_array; |
54 | |
55 | ASSERT_OK(Converter::Make(type, options, &converter)); |
56 | |
57 | MakeCSVParser(csv_string, &parser); |
58 | for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size()); |
59 | ++col_index) { |
60 | ASSERT_OK(converter->Convert(*parser, col_index, &array)); |
61 | ArrayFromVector<DATA_TYPE, C_TYPE>(type, expected[col_index], &expected_array); |
62 | AssertArraysEqual(*expected_array, *array); |
63 | } |
64 | } |
65 | |
66 | template <typename DATA_TYPE, typename C_TYPE> |
67 | void AssertConversion(const std::shared_ptr<DataType>& type, |
68 | const std::vector<std::string>& csv_string, |
69 | const std::vector<std::vector<C_TYPE>>& expected, |
70 | const std::vector<std::vector<bool>>& is_valid, |
71 | ConvertOptions options = ConvertOptions::Defaults()) { |
72 | std::shared_ptr<BlockParser> parser; |
73 | std::shared_ptr<Converter> converter; |
74 | std::shared_ptr<Array> array, expected_array; |
75 | |
76 | ASSERT_OK(Converter::Make(type, options, &converter)); |
77 | |
78 | MakeCSVParser(csv_string, &parser); |
79 | for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size()); |
80 | ++col_index) { |
81 | ASSERT_OK(converter->Convert(*parser, col_index, &array)); |
82 | ArrayFromVector<DATA_TYPE, C_TYPE>(type, is_valid[col_index], expected[col_index], |
83 | &expected_array); |
84 | AssertArraysEqual(*expected_array, *array); |
85 | } |
86 | } |
87 | |
88 | template <typename DATA_TYPE, typename C_TYPE> |
89 | void AssertConversionAllNulls(const std::shared_ptr<DataType>& type) { |
90 | std::vector<std::string> nulls = AllNulls(); |
91 | std::vector<bool> is_valid(nulls.size(), false); |
92 | std::vector<C_TYPE> values(nulls.size()); |
93 | AssertConversion<DATA_TYPE, C_TYPE>(type, nulls, {values}, {is_valid}); |
94 | } |
95 | |
96 | void AssertConversionError(const std::shared_ptr<DataType>& type, |
97 | const std::vector<std::string>& csv_string, |
98 | const std::set<int32_t>& invalid_columns, |
99 | ConvertOptions options = ConvertOptions::Defaults()) { |
100 | std::shared_ptr<BlockParser> parser; |
101 | std::shared_ptr<Converter> converter; |
102 | std::shared_ptr<Array> array; |
103 | |
104 | ASSERT_OK(Converter::Make(type, options, &converter)); |
105 | |
106 | MakeCSVParser(csv_string, &parser); |
107 | for (int32_t i = 0; i < parser->num_cols(); ++i) { |
108 | if (invalid_columns.find(i) == invalid_columns.end()) { |
109 | ASSERT_OK(converter->Convert(*parser, i, &array)); |
110 | } else { |
111 | ASSERT_RAISES(Invalid, converter->Convert(*parser, i, &array)); |
112 | } |
113 | } |
114 | } |
115 | |
116 | ////////////////////////////////////////////////////////////////////////// |
117 | // Test functions begin here |
118 | |
119 | TEST(BinaryConversion, Basics) { |
120 | AssertConversion<BinaryType, std::string>(binary(), {"ab,cdé\n" , ",\xffgh\n" }, |
121 | {{"ab" , "" }, {"cdé" , "\xffgh" }}); |
122 | } |
123 | |
124 | TEST(StringConversion, Basics) { |
125 | AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n" , ",gh\n" }, |
126 | {{"ab" , "" }, {"cdé" , "gh" }}); |
127 | |
128 | auto options = ConvertOptions::Defaults(); |
129 | options.check_utf8 = false; |
130 | AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n" , ",\xffgh\n" }, |
131 | {{"ab" , "" }, {"cdé" , "\xffgh" }}, options); |
132 | } |
133 | |
134 | TEST(StringConversion, Errors) { |
135 | // Invalid UTF8 in column 0 |
136 | AssertConversionError(utf8(), {"ab,cdé\n" , "\xff,gh\n" }, {0}); |
137 | } |
138 | |
139 | TEST(FixedSizeBinaryConversion, Basics) { |
140 | AssertConversion<FixedSizeBinaryType, std::string>( |
141 | fixed_size_binary(2), {"ab,cd\n" , "gh,ij\n" }, {{"ab" , "gh" }, {"cd" , "ij" }}); |
142 | } |
143 | |
144 | TEST(FixedSizeBinaryConversion, Errors) { |
145 | // Wrong-sized string in column 0 |
146 | AssertConversionError(fixed_size_binary(2), {"ab,cd\n" , "g,ij\n" }, {0}); |
147 | } |
148 | |
149 | TEST(NullConversion, Basics) { |
150 | std::shared_ptr<BlockParser> parser; |
151 | std::shared_ptr<Converter> converter; |
152 | std::shared_ptr<Array> array; |
153 | std::shared_ptr<DataType> type = null(); |
154 | |
155 | ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter)); |
156 | |
157 | MakeCSVParser({"NA,z\n" , ",0\n" }, &parser); |
158 | ASSERT_OK(converter->Convert(*parser, 0, &array)); |
159 | ASSERT_EQ(array->type()->id(), Type::NA); |
160 | ASSERT_EQ(array->length(), 2); |
161 | ASSERT_RAISES(Invalid, converter->Convert(*parser, 1, &array)); |
162 | } |
163 | |
164 | TEST(IntegerConversion, Basics) { |
165 | AssertConversion<Int8Type, int8_t>(int8(), {"12,34\n" , "0,-128\n" }, |
166 | {{12, 0}, {34, -128}}); |
167 | AssertConversion<Int64Type, int64_t>( |
168 | int64(), {"12,34\n" , "9223372036854775807,-9223372036854775808\n" }, |
169 | {{12, 9223372036854775807LL}, {34, -9223372036854775807LL - 1}}); |
170 | |
171 | AssertConversion<UInt16Type, uint16_t>(uint16(), {"12,34\n" , "0,65535\n" }, |
172 | {{12, 0}, {34, 65535}}); |
173 | AssertConversion<UInt64Type, uint64_t>(uint64(), |
174 | {"12,34\n" , "0,18446744073709551615\n" }, |
175 | {{12, 0}, {34, 18446744073709551615ULL}}); |
176 | } |
177 | |
178 | TEST(IntegerConversion, Nulls) { |
179 | AssertConversion<Int8Type, int8_t>(int8(), {"12,N/A\n" , ",-128\n" }, |
180 | {{12, 0}, {0, -128}}, |
181 | {{true, false}, {false, true}}); |
182 | |
183 | AssertConversionAllNulls<Int8Type, int8_t>(int8()); |
184 | } |
185 | |
186 | TEST(IntegerConversion, CustomNulls) { |
187 | auto options = ConvertOptions::Defaults(); |
188 | options.null_values = {"xxx" , "zzz" }; |
189 | |
190 | AssertConversion<Int8Type, int8_t>(int8(), {"12,xxx\n" , "zzz,-128\n" }, |
191 | {{12, 0}, {0, -128}}, {{true, false}, {false, true}}, |
192 | options); |
193 | |
194 | AssertConversionError(int8(), {",xxx,N/A\n" }, {0, 2}, options); |
195 | |
196 | // Duplicate nulls allowed |
197 | options.null_values = {"xxx" , "zzz" , "xxx" }; |
198 | AssertConversion<Int8Type, int8_t>(int8(), {"12,xxx\n" , "zzz,-128\n" }, |
199 | {{12, 0}, {0, -128}}, {{true, false}, {false, true}}, |
200 | options); |
201 | } |
202 | |
203 | TEST(IntegerConversion, Whitespace) { |
204 | AssertConversion<Int32Type, int32_t>(int32(), {" 12,34 \n" , " 56 ,78\n" }, |
205 | {{12, 56}, {34, 78}}); |
206 | } |
207 | |
208 | TEST(FloatingPointConversion, Basics) { |
209 | AssertConversion<FloatType, float>(float32(), {"12,34.5\n" , "0,-1e30\n" }, |
210 | {{12., 0.}, {34.5, -1e30f}}); |
211 | AssertConversion<DoubleType, double>(float64(), {"12,34.5\n" , "0,-1e100\n" }, |
212 | {{12., 0.}, {34.5, -1e100}}); |
213 | } |
214 | |
215 | TEST(FloatingPointConversion, Nulls) { |
216 | AssertConversion<FloatType, float>(float32(), {"1.5,0.\n" , ",-1e10\n" }, |
217 | {{1.5, 0.}, {0., -1e10f}}, |
218 | {{true, false}, {true, true}}); |
219 | |
220 | AssertConversionAllNulls<DoubleType, double>(float64()); |
221 | } |
222 | |
223 | TEST(FloatingPointConversion, CustomNulls) { |
224 | auto options = ConvertOptions::Defaults(); |
225 | options.null_values = {"xxx" , "zzz" }; |
226 | |
227 | AssertConversion<FloatType, float>(float32(), {"1.5,xxx\n" , "zzz,-1e10\n" }, |
228 | {{1.5, 0.}, {0., -1e10f}}, |
229 | {{true, false}, {false, true}}, options); |
230 | } |
231 | |
232 | TEST(FloatingPointConversion, Whitespace) { |
233 | AssertConversion<DoubleType, double>(float64(), {" 12,34.5\n" , " 0 ,-1e100 \n" }, |
234 | {{12., 0.}, {34.5, -1e100}}); |
235 | } |
236 | |
237 | TEST(BooleanConversion, Basics) { |
238 | // XXX we may want to accept more bool-like values |
239 | AssertConversion<BooleanType, bool>(boolean(), {"true,false\n" , "1,0\n" }, |
240 | {{true, true}, {false, false}}); |
241 | } |
242 | |
243 | TEST(BooleanConversion, Nulls) { |
244 | AssertConversion<BooleanType, bool>(boolean(), {"true,\n" , "1,0\n" }, |
245 | {{true, true}, {false, false}}, |
246 | {{true, true}, {false, true}}); |
247 | } |
248 | |
249 | TEST(BooleanConversion, CustomNulls) { |
250 | auto options = ConvertOptions::Defaults(); |
251 | options.null_values = {"xxx" , "zzz" }; |
252 | |
253 | AssertConversion<BooleanType, bool>(boolean(), {"true,xxx\n" , "zzz,0\n" }, |
254 | {{true, false}, {false, false}}, |
255 | {{true, false}, {false, true}}, options); |
256 | } |
257 | |
258 | TEST(TimestampConversion, Basics) { |
259 | auto type = timestamp(TimeUnit::SECOND); |
260 | |
261 | AssertConversion<TimestampType, int64_t>( |
262 | type, {"1970-01-01\n2000-02-29\n3989-07-14\n1900-02-28\n" }, |
263 | {{0, 951782400, 63730281600LL, -2203977600LL}}); |
264 | AssertConversion<TimestampType, int64_t>(type, |
265 | {"2018-11-13 17:11:10\n1900-02-28 12:34:56\n" }, |
266 | {{1542129070, -2203932304LL}}); |
267 | |
268 | type = timestamp(TimeUnit::NANO); |
269 | AssertConversion<TimestampType, int64_t>( |
270 | type, {"1970-01-01\n2000-02-29\n1900-02-28\n" }, |
271 | {{0, 951782400000000000LL, -2203977600000000000LL}}); |
272 | } |
273 | |
274 | TEST(TimestampConversion, Nulls) { |
275 | auto type = timestamp(TimeUnit::MILLI); |
276 | AssertConversion<TimestampType, int64_t>(type, {"1970-01-01 00:01:00,,N/A\n" }, |
277 | {{60000}, {0}, {0}}, |
278 | {{true}, {false}, {false}}); |
279 | } |
280 | |
281 | TEST(TimestampConversion, CustomNulls) { |
282 | auto options = ConvertOptions::Defaults(); |
283 | options.null_values = {"xxx" , "zzz" }; |
284 | |
285 | auto type = timestamp(TimeUnit::MILLI); |
286 | AssertConversion<TimestampType, int64_t>(type, {"1970-01-01 00:01:00,xxx,zzz\n" }, |
287 | {{60000}, {0}, {0}}, |
288 | {{true}, {false}, {false}}, options); |
289 | } |
290 | |
291 | TEST(DecimalConversion, NotImplemented) { |
292 | std::shared_ptr<Converter> converter; |
293 | ASSERT_RAISES(NotImplemented, |
294 | Converter::Make(decimal(12, 3), ConvertOptions::Defaults(), &converter)); |
295 | } |
296 | |
297 | } // namespace csv |
298 | } // namespace arrow |
299 | |