1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <cstdint> |
19 | #include <string> |
20 | #include <utility> |
21 | #include <vector> |
22 | |
23 | #include <gtest/gtest.h> |
24 | |
25 | #include "arrow/csv/options.h" |
26 | #include "arrow/csv/parser.h" |
27 | #include "arrow/csv/test-common.h" |
28 | #include "arrow/status.h" |
29 | #include "arrow/test-util.h" |
30 | |
31 | namespace arrow { |
32 | namespace csv { |
33 | |
34 | // Read the column with the given index out of the BlockParser. |
35 | void GetColumn(const BlockParser& parser, int32_t col_index, |
36 | std::vector<std::string>* out, std::vector<bool>* out_quoted = nullptr) { |
37 | std::vector<std::string> values; |
38 | std::vector<bool> quoted_values; |
39 | auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { |
40 | values.push_back(std::string(reinterpret_cast<const char*>(data), size)); |
41 | if (out_quoted) { |
42 | quoted_values.push_back(quoted); |
43 | } |
44 | return Status::OK(); |
45 | }; |
46 | ASSERT_OK(parser.VisitColumn(col_index, visit)); |
47 | *out = std::move(values); |
48 | if (out_quoted) { |
49 | *out_quoted = std::move(quoted_values); |
50 | } |
51 | } |
52 | |
53 | Status Parse(BlockParser& parser, const std::string& str, uint32_t* out_size) { |
54 | const char* data = str.data(); |
55 | uint32_t size = static_cast<uint32_t>(str.length()); |
56 | return parser.Parse(data, size, out_size); |
57 | } |
58 | |
59 | Status ParseFinal(BlockParser& parser, const std::string& str, uint32_t* out_size) { |
60 | const char* data = str.data(); |
61 | uint32_t size = static_cast<uint32_t>(str.length()); |
62 | return parser.ParseFinal(data, size, out_size); |
63 | } |
64 | |
65 | void AssertParseOk(BlockParser& parser, const std::string& str) { |
66 | uint32_t parsed_size = static_cast<uint32_t>(-1); |
67 | ASSERT_OK(Parse(parser, str, &parsed_size)); |
68 | ASSERT_EQ(parsed_size, str.size()); |
69 | } |
70 | |
71 | void AssertParseFinal(BlockParser& parser, const std::string& str) { |
72 | uint32_t parsed_size = static_cast<uint32_t>(-1); |
73 | ASSERT_OK(ParseFinal(parser, str, &parsed_size)); |
74 | ASSERT_EQ(parsed_size, str.size()); |
75 | } |
76 | |
77 | void AssertParsePartial(BlockParser& parser, const std::string& str, |
78 | uint32_t expected_size) { |
79 | uint32_t parsed_size = static_cast<uint32_t>(-1); |
80 | ASSERT_OK(Parse(parser, str, &parsed_size)); |
81 | ASSERT_EQ(parsed_size, expected_size); |
82 | } |
83 | |
84 | void AssertColumnEq(const BlockParser& parser, int32_t col_index, |
85 | const std::vector<std::string> expected) { |
86 | std::vector<std::string> values; |
87 | GetColumn(parser, col_index, &values); |
88 | ASSERT_EQ(parser.num_rows(), expected.size()); |
89 | ASSERT_EQ(values, expected); |
90 | } |
91 | |
92 | void AssertColumnEq(const BlockParser& parser, int32_t col_index, |
93 | const std::vector<std::string> expected, |
94 | const std::vector<bool> expected_quoted) { |
95 | std::vector<std::string> values; |
96 | std::vector<bool> quoted; |
97 | GetColumn(parser, col_index, &values, "ed); |
98 | ASSERT_EQ(parser.num_rows(), expected.size()); |
99 | ASSERT_EQ(values, expected); |
100 | ASSERT_EQ(quoted, expected_quoted); |
101 | } |
102 | |
103 | void AssertColumnsEq(const BlockParser& parser, |
104 | const std::vector<std::vector<std::string>> expected) { |
105 | ASSERT_EQ(parser.num_cols(), expected.size()); |
106 | for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) { |
107 | AssertColumnEq(parser, col_index, expected[col_index]); |
108 | } |
109 | } |
110 | |
111 | void AssertColumnsEq(const BlockParser& parser, |
112 | const std::vector<std::vector<std::string>> expected, |
113 | const std::vector<std::vector<bool>> quoted) { |
114 | ASSERT_EQ(parser.num_cols(), expected.size()); |
115 | for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) { |
116 | AssertColumnEq(parser, col_index, expected[col_index], quoted[col_index]); |
117 | } |
118 | uint32_t total_bytes = 0; |
119 | for (const auto& col : expected) { |
120 | for (const auto& field : col) { |
121 | total_bytes += static_cast<uint32_t>(field.size()); |
122 | } |
123 | } |
124 | ASSERT_EQ(total_bytes, parser.num_bytes()); |
125 | } |
126 | |
127 | TEST(BlockParser, Basics) { |
128 | auto csv = MakeCSVData({"ab,cd,\n" , "ef,,gh\n" , ",ij,kl\n" }); |
129 | BlockParser parser(ParseOptions::Defaults()); |
130 | AssertParseOk(parser, csv); |
131 | AssertColumnsEq(parser, {{"ab" , "ef" , "" }, {"cd" , "" , "ij" }, {"" , "gh" , "kl" }}); |
132 | } |
133 | |
134 | TEST(BlockParser, EmptyHeader) { |
135 | // Cannot infer number of columns |
136 | uint32_t out_size; |
137 | { |
138 | auto csv = MakeCSVData({"" }); |
139 | BlockParser parser(ParseOptions::Defaults()); |
140 | ASSERT_RAISES(Invalid, ParseFinal(parser, csv, &out_size)); |
141 | } |
142 | { |
143 | auto csv = MakeCSVData({"\n" }); |
144 | BlockParser parser(ParseOptions::Defaults()); |
145 | ASSERT_RAISES(Invalid, ParseFinal(parser, csv, &out_size)); |
146 | } |
147 | } |
148 | |
149 | TEST(BlockParser, Empty) { |
150 | { |
151 | auto csv = MakeCSVData({",\n" }); |
152 | BlockParser parser(ParseOptions::Defaults()); |
153 | AssertParseOk(parser, csv); |
154 | AssertColumnsEq(parser, {{"" }, {"" }}); |
155 | } |
156 | { |
157 | auto csv = MakeCSVData({",\n,\n" }); |
158 | BlockParser parser(ParseOptions::Defaults()); |
159 | AssertParseOk(parser, csv); |
160 | AssertColumnsEq(parser, {{"" , "" }, {"" , "" }}); |
161 | } |
162 | } |
163 | |
164 | TEST(BlockParser, Whitespace) { |
165 | // Non-newline whitespace is preserved |
166 | auto csv = MakeCSVData({"a b, cd, \n" , " ef, \t,gh\n" }); |
167 | BlockParser parser(ParseOptions::Defaults()); |
168 | AssertParseOk(parser, csv); |
169 | AssertColumnsEq(parser, {{"a b" , " ef" }, {" cd" , " \t" }, {" " , "gh" }}); |
170 | } |
171 | |
172 | TEST(BlockParser, Newlines) { |
173 | auto csv = MakeCSVData({"a,b\n" , "c,d\r\n" , "e,f\r" , "g,h\r" }); |
174 | BlockParser parser(ParseOptions::Defaults()); |
175 | |
176 | AssertParseOk(parser, csv); |
177 | AssertColumnsEq(parser, {{"a" , "c" , "e" , "g" }, {"b" , "d" , "f" , "h" }}); |
178 | } |
179 | |
180 | TEST(BlockParser, MaxNumRows) { |
181 | auto csv = MakeCSVData({"a\n" , "b\n" , "c\n" , "d\n" }); |
182 | BlockParser parser(ParseOptions::Defaults(), -1, 3 /* max_num_rows */); |
183 | |
184 | AssertParsePartial(parser, csv, 6); |
185 | AssertColumnsEq(parser, {{"a" , "b" , "c" }}); |
186 | |
187 | AssertParseOk(parser, csv.substr(6)); |
188 | AssertColumnsEq(parser, {{"d" }}); |
189 | |
190 | AssertParseOk(parser, csv.substr(8)); |
191 | AssertColumnsEq(parser, {{}}); |
192 | } |
193 | |
194 | TEST(BlockParser, EmptyLinesWithOneColumn) { |
195 | auto csv = MakeCSVData({"a\n" , "\n" , "b\r" , "\r" , "c\r\n" , "\r\n" , "d\n" }); |
196 | { |
197 | BlockParser parser(ParseOptions::Defaults()); |
198 | AssertParseOk(parser, csv); |
199 | AssertColumnsEq(parser, {{"a" , "b" , "c" , "d" }}); |
200 | } |
201 | { |
202 | auto options = ParseOptions::Defaults(); |
203 | options.ignore_empty_lines = false; |
204 | BlockParser parser(options); |
205 | AssertParseOk(parser, csv); |
206 | AssertColumnsEq(parser, {{"a" , "" , "b" , "" , "c" , "" , "d" }}); |
207 | } |
208 | } |
209 | |
210 | TEST(BlockParser, EmptyLinesWithSeveralColumns) { |
211 | uint32_t out_size; |
212 | auto csv = MakeCSVData({"a,b\n" , "\n" , "c,d\r" , "\r" , "e,f\r\n" , "\r\n" , "g,h\n" }); |
213 | { |
214 | BlockParser parser(ParseOptions::Defaults()); |
215 | AssertParseOk(parser, csv); |
216 | AssertColumnsEq(parser, {{"a" , "c" , "e" , "g" }, {"b" , "d" , "f" , "h" }}); |
217 | } |
218 | { |
219 | // A non-ignored empty line is a single value, but two columns are expected |
220 | auto options = ParseOptions::Defaults(); |
221 | options.ignore_empty_lines = false; |
222 | BlockParser parser(options); |
223 | Status st = Parse(parser, csv, &out_size); |
224 | ASSERT_RAISES(Invalid, st); |
225 | } |
226 | } |
227 | |
228 | TEST(BlockParser, TruncatedData) { |
229 | BlockParser parser(ParseOptions::Defaults()); |
230 | auto csv = MakeCSVData({"a,b\n" , "c,d\n" }); |
231 | for (auto trim : {1, 2, 3}) { |
232 | AssertParsePartial(parser, csv.substr(0, csv.length() - trim), 4); |
233 | AssertColumnsEq(parser, {{"a" }, {"b" }}); |
234 | } |
235 | } |
236 | |
237 | TEST(BlockParser, Final) { |
238 | // Tests for ParseFinal() |
239 | BlockParser parser(ParseOptions::Defaults()); |
240 | auto csv = MakeCSVData({"ab,cd\n" , "ef,gh\n" }); |
241 | AssertParseFinal(parser, csv); |
242 | AssertColumnsEq(parser, {{"ab" , "ef" }, {"cd" , "gh" }}); |
243 | |
244 | // Same without newline |
245 | csv = MakeCSVData({"ab,cd\n" , "ef,gh" }); |
246 | AssertParseFinal(parser, csv); |
247 | AssertColumnsEq(parser, {{"ab" , "ef" }, {"cd" , "gh" }}); |
248 | |
249 | // Same with empty last item |
250 | csv = MakeCSVData({"ab,cd\n" , "ef," }); |
251 | AssertParseFinal(parser, csv); |
252 | AssertColumnsEq(parser, {{"ab" , "ef" }, {"cd" , "" }}); |
253 | |
254 | // Same with single line |
255 | csv = MakeCSVData({"ab,cd" }); |
256 | AssertParseFinal(parser, csv); |
257 | AssertColumnsEq(parser, {{"ab" }, {"cd" }}); |
258 | } |
259 | |
260 | TEST(BlockParser, FinalTruncatedData) { |
261 | // Test ParseFinal() with truncated data |
262 | uint32_t out_size; |
263 | BlockParser parser(ParseOptions::Defaults()); |
264 | auto csv = MakeCSVData({"ab,cd\n" , "ef" }); |
265 | Status st = ParseFinal(parser, csv, &out_size); |
266 | ASSERT_RAISES(Invalid, st); |
267 | } |
268 | |
269 | TEST(BlockParser, QuotingSimple) { |
270 | auto csv = MakeCSVData({"1,\",3,\",5\n" }); |
271 | |
272 | { |
273 | BlockParser parser(ParseOptions::Defaults()); |
274 | AssertParseOk(parser, csv); |
275 | AssertColumnsEq(parser, {{"1" }, {",3," }, {"5" }}, |
276 | {{false}, {true}, {false}} /* quoted */); |
277 | } |
278 | { |
279 | auto options = ParseOptions::Defaults(); |
280 | options.quoting = false; |
281 | BlockParser parser(options); |
282 | AssertParseOk(parser, csv); |
283 | AssertColumnsEq(parser, {{"1" }, {"\"" }, {"3" }, {"\"" }, {"5" }}, |
284 | {{false}, {false}, {false}, {false}, {false}} /* quoted */); |
285 | } |
286 | { |
287 | auto options = ParseOptions::Defaults(); |
288 | options.quote_char = 'Z'; |
289 | BlockParser parser(options); |
290 | AssertParseOk(parser, csv); |
291 | AssertColumnsEq(parser, {{"1" }, {"\"" }, {"3" }, {"\"" }, {"5" }}, |
292 | {{false}, {false}, {false}, {false}, {false}} /* quoted */); |
293 | } |
294 | } |
295 | |
296 | TEST(BlockParser, QuotingNewline) { |
297 | auto csv = MakeCSVData({"a,\"c \n d\",e\n" }); |
298 | BlockParser parser(ParseOptions::Defaults()); |
299 | AssertParseOk(parser, csv); |
300 | AssertColumnsEq(parser, {{"a" }, {"c \n d" }, {"e" }}, |
301 | {{false}, {true}, {false}} /* quoted */); |
302 | } |
303 | |
304 | TEST(BlockParser, QuotingUnbalanced) { |
305 | // Quote introduces a quoted field that doesn't end |
306 | auto csv = MakeCSVData({"a,b\n" , "1,\",3,,5\n" }); |
307 | BlockParser parser(ParseOptions::Defaults()); |
308 | AssertParsePartial(parser, csv, 4); |
309 | AssertColumnsEq(parser, {{"a" }, {"b" }}, {{false}, {false}} /* quoted */); |
310 | } |
311 | |
312 | TEST(BlockParser, QuotingEmpty) { |
313 | { |
314 | BlockParser parser(ParseOptions::Defaults()); |
315 | auto csv = MakeCSVData({"\"\"\n" }); |
316 | AssertParseOk(parser, csv); |
317 | AssertColumnsEq(parser, {{"" }}, {{true}} /* quoted */); |
318 | } |
319 | { |
320 | BlockParser parser(ParseOptions::Defaults()); |
321 | auto csv = MakeCSVData({",\"\"\n" }); |
322 | AssertParseOk(parser, csv); |
323 | AssertColumnsEq(parser, {{"" }, {"" }}, {{false}, {true}} /* quoted */); |
324 | } |
325 | { |
326 | BlockParser parser(ParseOptions::Defaults()); |
327 | auto csv = MakeCSVData({"\"\",\n" }); |
328 | AssertParseOk(parser, csv); |
329 | AssertColumnsEq(parser, {{"" }, {"" }}, {{true}, {false}} /* quoted */); |
330 | } |
331 | } |
332 | |
333 | TEST(BlockParser, QuotingDouble) { |
334 | { |
335 | BlockParser parser(ParseOptions::Defaults()); |
336 | // 4 quotes is a quoted quote |
337 | auto csv = MakeCSVData({"\"\"\"\"\n" }); |
338 | AssertParseOk(parser, csv); |
339 | AssertColumnsEq(parser, {{"\"" }}, {{true}} /* quoted */); |
340 | } |
341 | { |
342 | BlockParser parser(ParseOptions::Defaults()); |
343 | // 4 quotes is a quoted quote |
344 | auto csv = MakeCSVData({"a,\"\"\"\",b\n" }); |
345 | AssertParseOk(parser, csv); |
346 | AssertColumnsEq(parser, {{"a" }, {"\"" }, {"b" }}, |
347 | {{false}, {true}, {false}} /* quoted */); |
348 | } |
349 | { |
350 | BlockParser parser(ParseOptions::Defaults()); |
351 | // 6 quotes is two quoted quotes |
352 | auto csv = MakeCSVData({"\"\"\"\"\"\"\n" }); |
353 | AssertParseOk(parser, csv); |
354 | AssertColumnsEq(parser, {{"\"\"" }}, {{true}} /* quoted */); |
355 | } |
356 | { |
357 | BlockParser parser(ParseOptions::Defaults()); |
358 | // 6 quotes is two quoted quotes |
359 | auto csv = MakeCSVData({"a,\"\"\"\"\"\",b\n" }); |
360 | AssertParseOk(parser, csv); |
361 | AssertColumnsEq(parser, {{"a" }, {"\"\"" }, {"b" }}, |
362 | {{false}, {true}, {false}} /* quoted */); |
363 | } |
364 | } |
365 | |
366 | TEST(BlockParser, QuotesAndMore) { |
367 | // There may be trailing data after the quoted part of a field |
368 | { |
369 | BlockParser parser(ParseOptions::Defaults()); |
370 | auto csv = MakeCSVData({"a,\"b\"c,d\n" }); |
371 | AssertParseOk(parser, csv); |
372 | AssertColumnsEq(parser, {{"a" }, {"bc" }, {"d" }}, |
373 | {{false}, {true}, {false}} /* quoted */); |
374 | } |
375 | } |
376 | |
377 | TEST(BlockParser, QuotesSpecial) { |
378 | // Some non-trivial cases |
379 | { |
380 | BlockParser parser(ParseOptions::Defaults()); |
381 | auto csv = MakeCSVData({"a,b\"c,d\n" }); |
382 | AssertParseOk(parser, csv); |
383 | AssertColumnsEq(parser, {{"a" }, {"b\"c" }, {"d" }}, |
384 | {{false}, {false}, {false}} /* quoted */); |
385 | } |
386 | { |
387 | BlockParser parser(ParseOptions::Defaults()); |
388 | auto csv = MakeCSVData({"a,\"b\" \"c\",d\n" }); |
389 | AssertParseOk(parser, csv); |
390 | AssertColumnsEq(parser, {{"a" }, {"b \"c\"" }, {"d" }}, |
391 | {{false}, {true}, {false}} /* quoted */); |
392 | } |
393 | } |
394 | |
395 | TEST(BlockParser, MismatchingNumColumns) { |
396 | uint32_t out_size; |
397 | { |
398 | BlockParser parser(ParseOptions::Defaults()); |
399 | auto csv = MakeCSVData({"a,b\nc\n" }); |
400 | Status st = Parse(parser, csv, &out_size); |
401 | ASSERT_RAISES(Invalid, st); |
402 | } |
403 | { |
404 | BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */); |
405 | auto csv = MakeCSVData({"a\n" }); |
406 | Status st = Parse(parser, csv, &out_size); |
407 | ASSERT_RAISES(Invalid, st); |
408 | } |
409 | { |
410 | BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */); |
411 | auto csv = MakeCSVData({"a,b,c\n" }); |
412 | Status st = Parse(parser, csv, &out_size); |
413 | ASSERT_RAISES(Invalid, st); |
414 | } |
415 | } |
416 | |
417 | TEST(BlockParser, Escaping) { |
418 | auto options = ParseOptions::Defaults(); |
419 | options.escaping = true; |
420 | |
421 | { |
422 | auto csv = MakeCSVData({"a\\b,c\n" }); |
423 | { |
424 | BlockParser parser(ParseOptions::Defaults()); |
425 | AssertParseOk(parser, csv); |
426 | AssertColumnsEq(parser, {{"a\\b" }, {"c" }}); |
427 | } |
428 | { |
429 | BlockParser parser(options); |
430 | AssertParseOk(parser, csv); |
431 | AssertColumnsEq(parser, {{"ab" }, {"c" }}); |
432 | } |
433 | } |
434 | { |
435 | auto csv = MakeCSVData({"a\\,b,c\n" }); |
436 | BlockParser parser(options); |
437 | AssertParseOk(parser, csv); |
438 | AssertColumnsEq(parser, {{"a,b" }, {"c" }}); |
439 | } |
440 | } |
441 | |
442 | TEST(BlockParser, QuotedEscape) { |
443 | auto options = ParseOptions::Defaults(); |
444 | options.escaping = true; |
445 | |
446 | { |
447 | auto csv = MakeCSVData({"\"a\\,b\",c\n" }); |
448 | BlockParser parser(options); |
449 | AssertParseOk(parser, csv); |
450 | AssertColumnsEq(parser, {{"a,b" }, {"c" }}, {{true}, {false}} /* quoted */); |
451 | } |
452 | { |
453 | auto csv = MakeCSVData({"\"a\\\"b\",c\n" }); |
454 | BlockParser parser(options); |
455 | AssertParseOk(parser, csv); |
456 | AssertColumnsEq(parser, {{"a\"b" }, {"c" }}, {{true}, {false}} /* quoted */); |
457 | } |
458 | } |
459 | |
460 | } // namespace csv |
461 | } // namespace arrow |
462 | |