1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <cstdint>
19#include <memory>
20#include <set>
21#include <string>
22#include <vector>
23
24#include <gtest/gtest.h>
25
26#include "arrow/array.h"
27#include "arrow/csv/converter.h"
28#include "arrow/csv/options.h"
29#include "arrow/csv/test-common.h"
30#include "arrow/status.h"
31#include "arrow/test-util.h"
32#include "arrow/type.h"
33
34namespace arrow {
35namespace csv {
36
37class BlockParser;
38
// Returns every non-empty spelling that the tests expect to be read as null.
// Each spelling is followed by a newline so that it forms a one-field CSV line.
std::vector<std::string> AllNulls() {
  static const char* const kNullLines[] = {
      "#N/A\n",  "#N/A N/A\n", "#NA\n", "-1.#IND\n", "-1.#QNAN\n", "-NaN\n",
      "-nan\n",  "1.#IND\n",   "1.#QNAN\n", "N/A\n", "NA\n", "NULL\n",
      "NaN\n",   "n/a\n",      "nan\n", "null\n"};
  return std::vector<std::string>(std::begin(kNullLines), std::end(kNullLines));
}
45
46template <typename DATA_TYPE, typename C_TYPE>
47void AssertConversion(const std::shared_ptr<DataType>& type,
48 const std::vector<std::string>& csv_string,
49 const std::vector<std::vector<C_TYPE>>& expected,
50 ConvertOptions options = ConvertOptions::Defaults()) {
51 std::shared_ptr<BlockParser> parser;
52 std::shared_ptr<Converter> converter;
53 std::shared_ptr<Array> array, expected_array;
54
55 ASSERT_OK(Converter::Make(type, options, &converter));
56
57 MakeCSVParser(csv_string, &parser);
58 for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
59 ++col_index) {
60 ASSERT_OK(converter->Convert(*parser, col_index, &array));
61 ArrayFromVector<DATA_TYPE, C_TYPE>(type, expected[col_index], &expected_array);
62 AssertArraysEqual(*expected_array, *array);
63 }
64}
65
66template <typename DATA_TYPE, typename C_TYPE>
67void AssertConversion(const std::shared_ptr<DataType>& type,
68 const std::vector<std::string>& csv_string,
69 const std::vector<std::vector<C_TYPE>>& expected,
70 const std::vector<std::vector<bool>>& is_valid,
71 ConvertOptions options = ConvertOptions::Defaults()) {
72 std::shared_ptr<BlockParser> parser;
73 std::shared_ptr<Converter> converter;
74 std::shared_ptr<Array> array, expected_array;
75
76 ASSERT_OK(Converter::Make(type, options, &converter));
77
78 MakeCSVParser(csv_string, &parser);
79 for (int32_t col_index = 0; col_index < static_cast<int32_t>(expected.size());
80 ++col_index) {
81 ASSERT_OK(converter->Convert(*parser, col_index, &array));
82 ArrayFromVector<DATA_TYPE, C_TYPE>(type, is_valid[col_index], expected[col_index],
83 &expected_array);
84 AssertArraysEqual(*expected_array, *array);
85 }
86}
87
88template <typename DATA_TYPE, typename C_TYPE>
89void AssertConversionAllNulls(const std::shared_ptr<DataType>& type) {
90 std::vector<std::string> nulls = AllNulls();
91 std::vector<bool> is_valid(nulls.size(), false);
92 std::vector<C_TYPE> values(nulls.size());
93 AssertConversion<DATA_TYPE, C_TYPE>(type, nulls, {values}, {is_valid});
94}
95
96void AssertConversionError(const std::shared_ptr<DataType>& type,
97 const std::vector<std::string>& csv_string,
98 const std::set<int32_t>& invalid_columns,
99 ConvertOptions options = ConvertOptions::Defaults()) {
100 std::shared_ptr<BlockParser> parser;
101 std::shared_ptr<Converter> converter;
102 std::shared_ptr<Array> array;
103
104 ASSERT_OK(Converter::Make(type, options, &converter));
105
106 MakeCSVParser(csv_string, &parser);
107 for (int32_t i = 0; i < parser->num_cols(); ++i) {
108 if (invalid_columns.find(i) == invalid_columns.end()) {
109 ASSERT_OK(converter->Convert(*parser, i, &array));
110 } else {
111 ASSERT_RAISES(Invalid, converter->Convert(*parser, i, &array));
112 }
113 }
114}
115
116//////////////////////////////////////////////////////////////////////////
117// Test functions begin here
118
119TEST(BinaryConversion, Basics) {
120 AssertConversion<BinaryType, std::string>(binary(), {"ab,cdé\n", ",\xffgh\n"},
121 {{"ab", ""}, {"cdé", "\xffgh"}});
122}
123
124TEST(StringConversion, Basics) {
125 AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",gh\n"},
126 {{"ab", ""}, {"cdé", "gh"}});
127
128 auto options = ConvertOptions::Defaults();
129 options.check_utf8 = false;
130 AssertConversion<StringType, std::string>(utf8(), {"ab,cdé\n", ",\xffgh\n"},
131 {{"ab", ""}, {"cdé", "\xffgh"}}, options);
132}
133
134TEST(StringConversion, Errors) {
135 // Invalid UTF8 in column 0
136 AssertConversionError(utf8(), {"ab,cdé\n", "\xff,gh\n"}, {0});
137}
138
139TEST(FixedSizeBinaryConversion, Basics) {
140 AssertConversion<FixedSizeBinaryType, std::string>(
141 fixed_size_binary(2), {"ab,cd\n", "gh,ij\n"}, {{"ab", "gh"}, {"cd", "ij"}});
142}
143
144TEST(FixedSizeBinaryConversion, Errors) {
145 // Wrong-sized string in column 0
146 AssertConversionError(fixed_size_binary(2), {"ab,cd\n", "g,ij\n"}, {0});
147}
148
149TEST(NullConversion, Basics) {
150 std::shared_ptr<BlockParser> parser;
151 std::shared_ptr<Converter> converter;
152 std::shared_ptr<Array> array;
153 std::shared_ptr<DataType> type = null();
154
155 ASSERT_OK(Converter::Make(type, ConvertOptions::Defaults(), &converter));
156
157 MakeCSVParser({"NA,z\n", ",0\n"}, &parser);
158 ASSERT_OK(converter->Convert(*parser, 0, &array));
159 ASSERT_EQ(array->type()->id(), Type::NA);
160 ASSERT_EQ(array->length(), 2);
161 ASSERT_RAISES(Invalid, converter->Convert(*parser, 1, &array));
162}
163
164TEST(IntegerConversion, Basics) {
165 AssertConversion<Int8Type, int8_t>(int8(), {"12,34\n", "0,-128\n"},
166 {{12, 0}, {34, -128}});
167 AssertConversion<Int64Type, int64_t>(
168 int64(), {"12,34\n", "9223372036854775807,-9223372036854775808\n"},
169 {{12, 9223372036854775807LL}, {34, -9223372036854775807LL - 1}});
170
171 AssertConversion<UInt16Type, uint16_t>(uint16(), {"12,34\n", "0,65535\n"},
172 {{12, 0}, {34, 65535}});
173 AssertConversion<UInt64Type, uint64_t>(uint64(),
174 {"12,34\n", "0,18446744073709551615\n"},
175 {{12, 0}, {34, 18446744073709551615ULL}});
176}
177
178TEST(IntegerConversion, Nulls) {
179 AssertConversion<Int8Type, int8_t>(int8(), {"12,N/A\n", ",-128\n"},
180 {{12, 0}, {0, -128}},
181 {{true, false}, {false, true}});
182
183 AssertConversionAllNulls<Int8Type, int8_t>(int8());
184}
185
186TEST(IntegerConversion, CustomNulls) {
187 auto options = ConvertOptions::Defaults();
188 options.null_values = {"xxx", "zzz"};
189
190 AssertConversion<Int8Type, int8_t>(int8(), {"12,xxx\n", "zzz,-128\n"},
191 {{12, 0}, {0, -128}}, {{true, false}, {false, true}},
192 options);
193
194 AssertConversionError(int8(), {",xxx,N/A\n"}, {0, 2}, options);
195
196 // Duplicate nulls allowed
197 options.null_values = {"xxx", "zzz", "xxx"};
198 AssertConversion<Int8Type, int8_t>(int8(), {"12,xxx\n", "zzz,-128\n"},
199 {{12, 0}, {0, -128}}, {{true, false}, {false, true}},
200 options);
201}
202
203TEST(IntegerConversion, Whitespace) {
204 AssertConversion<Int32Type, int32_t>(int32(), {" 12,34 \n", " 56 ,78\n"},
205 {{12, 56}, {34, 78}});
206}
207
208TEST(FloatingPointConversion, Basics) {
209 AssertConversion<FloatType, float>(float32(), {"12,34.5\n", "0,-1e30\n"},
210 {{12., 0.}, {34.5, -1e30f}});
211 AssertConversion<DoubleType, double>(float64(), {"12,34.5\n", "0,-1e100\n"},
212 {{12., 0.}, {34.5, -1e100}});
213}
214
215TEST(FloatingPointConversion, Nulls) {
216 AssertConversion<FloatType, float>(float32(), {"1.5,0.\n", ",-1e10\n"},
217 {{1.5, 0.}, {0., -1e10f}},
218 {{true, false}, {true, true}});
219
220 AssertConversionAllNulls<DoubleType, double>(float64());
221}
222
223TEST(FloatingPointConversion, CustomNulls) {
224 auto options = ConvertOptions::Defaults();
225 options.null_values = {"xxx", "zzz"};
226
227 AssertConversion<FloatType, float>(float32(), {"1.5,xxx\n", "zzz,-1e10\n"},
228 {{1.5, 0.}, {0., -1e10f}},
229 {{true, false}, {false, true}}, options);
230}
231
232TEST(FloatingPointConversion, Whitespace) {
233 AssertConversion<DoubleType, double>(float64(), {" 12,34.5\n", " 0 ,-1e100 \n"},
234 {{12., 0.}, {34.5, -1e100}});
235}
236
237TEST(BooleanConversion, Basics) {
238 // XXX we may want to accept more bool-like values
239 AssertConversion<BooleanType, bool>(boolean(), {"true,false\n", "1,0\n"},
240 {{true, true}, {false, false}});
241}
242
243TEST(BooleanConversion, Nulls) {
244 AssertConversion<BooleanType, bool>(boolean(), {"true,\n", "1,0\n"},
245 {{true, true}, {false, false}},
246 {{true, true}, {false, true}});
247}
248
249TEST(BooleanConversion, CustomNulls) {
250 auto options = ConvertOptions::Defaults();
251 options.null_values = {"xxx", "zzz"};
252
253 AssertConversion<BooleanType, bool>(boolean(), {"true,xxx\n", "zzz,0\n"},
254 {{true, false}, {false, false}},
255 {{true, false}, {false, true}}, options);
256}
257
258TEST(TimestampConversion, Basics) {
259 auto type = timestamp(TimeUnit::SECOND);
260
261 AssertConversion<TimestampType, int64_t>(
262 type, {"1970-01-01\n2000-02-29\n3989-07-14\n1900-02-28\n"},
263 {{0, 951782400, 63730281600LL, -2203977600LL}});
264 AssertConversion<TimestampType, int64_t>(type,
265 {"2018-11-13 17:11:10\n1900-02-28 12:34:56\n"},
266 {{1542129070, -2203932304LL}});
267
268 type = timestamp(TimeUnit::NANO);
269 AssertConversion<TimestampType, int64_t>(
270 type, {"1970-01-01\n2000-02-29\n1900-02-28\n"},
271 {{0, 951782400000000000LL, -2203977600000000000LL}});
272}
273
274TEST(TimestampConversion, Nulls) {
275 auto type = timestamp(TimeUnit::MILLI);
276 AssertConversion<TimestampType, int64_t>(type, {"1970-01-01 00:01:00,,N/A\n"},
277 {{60000}, {0}, {0}},
278 {{true}, {false}, {false}});
279}
280
281TEST(TimestampConversion, CustomNulls) {
282 auto options = ConvertOptions::Defaults();
283 options.null_values = {"xxx", "zzz"};
284
285 auto type = timestamp(TimeUnit::MILLI);
286 AssertConversion<TimestampType, int64_t>(type, {"1970-01-01 00:01:00,xxx,zzz\n"},
287 {{60000}, {0}, {0}},
288 {{true}, {false}, {false}}, options);
289}
290
291TEST(DecimalConversion, NotImplemented) {
292 std::shared_ptr<Converter> converter;
293 ASSERT_RAISES(NotImplemented,
294 Converter::Make(decimal(12, 3), ConvertOptions::Defaults(), &converter));
295}
296
297} // namespace csv
298} // namespace arrow
299