1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <cstdint>
19#include <string>
20#include <utility>
21#include <vector>
22
23#include <gtest/gtest.h>
24
25#include "arrow/csv/options.h"
26#include "arrow/csv/parser.h"
27#include "arrow/csv/test-common.h"
28#include "arrow/status.h"
29#include "arrow/test-util.h"
30
31namespace arrow {
32namespace csv {
33
34// Read the column with the given index out of the BlockParser.
35void GetColumn(const BlockParser& parser, int32_t col_index,
36 std::vector<std::string>* out, std::vector<bool>* out_quoted = nullptr) {
37 std::vector<std::string> values;
38 std::vector<bool> quoted_values;
39 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
40 values.push_back(std::string(reinterpret_cast<const char*>(data), size));
41 if (out_quoted) {
42 quoted_values.push_back(quoted);
43 }
44 return Status::OK();
45 };
46 ASSERT_OK(parser.VisitColumn(col_index, visit));
47 *out = std::move(values);
48 if (out_quoted) {
49 *out_quoted = std::move(quoted_values);
50 }
51}
52
53Status Parse(BlockParser& parser, const std::string& str, uint32_t* out_size) {
54 const char* data = str.data();
55 uint32_t size = static_cast<uint32_t>(str.length());
56 return parser.Parse(data, size, out_size);
57}
58
59Status ParseFinal(BlockParser& parser, const std::string& str, uint32_t* out_size) {
60 const char* data = str.data();
61 uint32_t size = static_cast<uint32_t>(str.length());
62 return parser.ParseFinal(data, size, out_size);
63}
64
65void AssertParseOk(BlockParser& parser, const std::string& str) {
66 uint32_t parsed_size = static_cast<uint32_t>(-1);
67 ASSERT_OK(Parse(parser, str, &parsed_size));
68 ASSERT_EQ(parsed_size, str.size());
69}
70
71void AssertParseFinal(BlockParser& parser, const std::string& str) {
72 uint32_t parsed_size = static_cast<uint32_t>(-1);
73 ASSERT_OK(ParseFinal(parser, str, &parsed_size));
74 ASSERT_EQ(parsed_size, str.size());
75}
76
77void AssertParsePartial(BlockParser& parser, const std::string& str,
78 uint32_t expected_size) {
79 uint32_t parsed_size = static_cast<uint32_t>(-1);
80 ASSERT_OK(Parse(parser, str, &parsed_size));
81 ASSERT_EQ(parsed_size, expected_size);
82}
83
84void AssertColumnEq(const BlockParser& parser, int32_t col_index,
85 const std::vector<std::string> expected) {
86 std::vector<std::string> values;
87 GetColumn(parser, col_index, &values);
88 ASSERT_EQ(parser.num_rows(), expected.size());
89 ASSERT_EQ(values, expected);
90}
91
92void AssertColumnEq(const BlockParser& parser, int32_t col_index,
93 const std::vector<std::string> expected,
94 const std::vector<bool> expected_quoted) {
95 std::vector<std::string> values;
96 std::vector<bool> quoted;
97 GetColumn(parser, col_index, &values, &quoted);
98 ASSERT_EQ(parser.num_rows(), expected.size());
99 ASSERT_EQ(values, expected);
100 ASSERT_EQ(quoted, expected_quoted);
101}
102
103void AssertColumnsEq(const BlockParser& parser,
104 const std::vector<std::vector<std::string>> expected) {
105 ASSERT_EQ(parser.num_cols(), expected.size());
106 for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) {
107 AssertColumnEq(parser, col_index, expected[col_index]);
108 }
109}
110
111void AssertColumnsEq(const BlockParser& parser,
112 const std::vector<std::vector<std::string>> expected,
113 const std::vector<std::vector<bool>> quoted) {
114 ASSERT_EQ(parser.num_cols(), expected.size());
115 for (int32_t col_index = 0; col_index < parser.num_cols(); ++col_index) {
116 AssertColumnEq(parser, col_index, expected[col_index], quoted[col_index]);
117 }
118 uint32_t total_bytes = 0;
119 for (const auto& col : expected) {
120 for (const auto& field : col) {
121 total_bytes += static_cast<uint32_t>(field.size());
122 }
123 }
124 ASSERT_EQ(total_bytes, parser.num_bytes());
125}
126
127TEST(BlockParser, Basics) {
128 auto csv = MakeCSVData({"ab,cd,\n", "ef,,gh\n", ",ij,kl\n"});
129 BlockParser parser(ParseOptions::Defaults());
130 AssertParseOk(parser, csv);
131 AssertColumnsEq(parser, {{"ab", "ef", ""}, {"cd", "", "ij"}, {"", "gh", "kl"}});
132}
133
134TEST(BlockParser, EmptyHeader) {
135 // Cannot infer number of columns
136 uint32_t out_size;
137 {
138 auto csv = MakeCSVData({""});
139 BlockParser parser(ParseOptions::Defaults());
140 ASSERT_RAISES(Invalid, ParseFinal(parser, csv, &out_size));
141 }
142 {
143 auto csv = MakeCSVData({"\n"});
144 BlockParser parser(ParseOptions::Defaults());
145 ASSERT_RAISES(Invalid, ParseFinal(parser, csv, &out_size));
146 }
147}
148
149TEST(BlockParser, Empty) {
150 {
151 auto csv = MakeCSVData({",\n"});
152 BlockParser parser(ParseOptions::Defaults());
153 AssertParseOk(parser, csv);
154 AssertColumnsEq(parser, {{""}, {""}});
155 }
156 {
157 auto csv = MakeCSVData({",\n,\n"});
158 BlockParser parser(ParseOptions::Defaults());
159 AssertParseOk(parser, csv);
160 AssertColumnsEq(parser, {{"", ""}, {"", ""}});
161 }
162}
163
164TEST(BlockParser, Whitespace) {
165 // Non-newline whitespace is preserved
166 auto csv = MakeCSVData({"a b, cd, \n", " ef, \t,gh\n"});
167 BlockParser parser(ParseOptions::Defaults());
168 AssertParseOk(parser, csv);
169 AssertColumnsEq(parser, {{"a b", " ef"}, {" cd", " \t"}, {" ", "gh"}});
170}
171
172TEST(BlockParser, Newlines) {
173 auto csv = MakeCSVData({"a,b\n", "c,d\r\n", "e,f\r", "g,h\r"});
174 BlockParser parser(ParseOptions::Defaults());
175
176 AssertParseOk(parser, csv);
177 AssertColumnsEq(parser, {{"a", "c", "e", "g"}, {"b", "d", "f", "h"}});
178}
179
180TEST(BlockParser, MaxNumRows) {
181 auto csv = MakeCSVData({"a\n", "b\n", "c\n", "d\n"});
182 BlockParser parser(ParseOptions::Defaults(), -1, 3 /* max_num_rows */);
183
184 AssertParsePartial(parser, csv, 6);
185 AssertColumnsEq(parser, {{"a", "b", "c"}});
186
187 AssertParseOk(parser, csv.substr(6));
188 AssertColumnsEq(parser, {{"d"}});
189
190 AssertParseOk(parser, csv.substr(8));
191 AssertColumnsEq(parser, {{}});
192}
193
194TEST(BlockParser, EmptyLinesWithOneColumn) {
195 auto csv = MakeCSVData({"a\n", "\n", "b\r", "\r", "c\r\n", "\r\n", "d\n"});
196 {
197 BlockParser parser(ParseOptions::Defaults());
198 AssertParseOk(parser, csv);
199 AssertColumnsEq(parser, {{"a", "b", "c", "d"}});
200 }
201 {
202 auto options = ParseOptions::Defaults();
203 options.ignore_empty_lines = false;
204 BlockParser parser(options);
205 AssertParseOk(parser, csv);
206 AssertColumnsEq(parser, {{"a", "", "b", "", "c", "", "d"}});
207 }
208}
209
210TEST(BlockParser, EmptyLinesWithSeveralColumns) {
211 uint32_t out_size;
212 auto csv = MakeCSVData({"a,b\n", "\n", "c,d\r", "\r", "e,f\r\n", "\r\n", "g,h\n"});
213 {
214 BlockParser parser(ParseOptions::Defaults());
215 AssertParseOk(parser, csv);
216 AssertColumnsEq(parser, {{"a", "c", "e", "g"}, {"b", "d", "f", "h"}});
217 }
218 {
219 // A non-ignored empty line is a single value, but two columns are expected
220 auto options = ParseOptions::Defaults();
221 options.ignore_empty_lines = false;
222 BlockParser parser(options);
223 Status st = Parse(parser, csv, &out_size);
224 ASSERT_RAISES(Invalid, st);
225 }
226}
227
228TEST(BlockParser, TruncatedData) {
229 BlockParser parser(ParseOptions::Defaults());
230 auto csv = MakeCSVData({"a,b\n", "c,d\n"});
231 for (auto trim : {1, 2, 3}) {
232 AssertParsePartial(parser, csv.substr(0, csv.length() - trim), 4);
233 AssertColumnsEq(parser, {{"a"}, {"b"}});
234 }
235}
236
237TEST(BlockParser, Final) {
238 // Tests for ParseFinal()
239 BlockParser parser(ParseOptions::Defaults());
240 auto csv = MakeCSVData({"ab,cd\n", "ef,gh\n"});
241 AssertParseFinal(parser, csv);
242 AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", "gh"}});
243
244 // Same without newline
245 csv = MakeCSVData({"ab,cd\n", "ef,gh"});
246 AssertParseFinal(parser, csv);
247 AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", "gh"}});
248
249 // Same with empty last item
250 csv = MakeCSVData({"ab,cd\n", "ef,"});
251 AssertParseFinal(parser, csv);
252 AssertColumnsEq(parser, {{"ab", "ef"}, {"cd", ""}});
253
254 // Same with single line
255 csv = MakeCSVData({"ab,cd"});
256 AssertParseFinal(parser, csv);
257 AssertColumnsEq(parser, {{"ab"}, {"cd"}});
258}
259
260TEST(BlockParser, FinalTruncatedData) {
261 // Test ParseFinal() with truncated data
262 uint32_t out_size;
263 BlockParser parser(ParseOptions::Defaults());
264 auto csv = MakeCSVData({"ab,cd\n", "ef"});
265 Status st = ParseFinal(parser, csv, &out_size);
266 ASSERT_RAISES(Invalid, st);
267}
268
269TEST(BlockParser, QuotingSimple) {
270 auto csv = MakeCSVData({"1,\",3,\",5\n"});
271
272 {
273 BlockParser parser(ParseOptions::Defaults());
274 AssertParseOk(parser, csv);
275 AssertColumnsEq(parser, {{"1"}, {",3,"}, {"5"}},
276 {{false}, {true}, {false}} /* quoted */);
277 }
278 {
279 auto options = ParseOptions::Defaults();
280 options.quoting = false;
281 BlockParser parser(options);
282 AssertParseOk(parser, csv);
283 AssertColumnsEq(parser, {{"1"}, {"\""}, {"3"}, {"\""}, {"5"}},
284 {{false}, {false}, {false}, {false}, {false}} /* quoted */);
285 }
286 {
287 auto options = ParseOptions::Defaults();
288 options.quote_char = 'Z';
289 BlockParser parser(options);
290 AssertParseOk(parser, csv);
291 AssertColumnsEq(parser, {{"1"}, {"\""}, {"3"}, {"\""}, {"5"}},
292 {{false}, {false}, {false}, {false}, {false}} /* quoted */);
293 }
294}
295
296TEST(BlockParser, QuotingNewline) {
297 auto csv = MakeCSVData({"a,\"c \n d\",e\n"});
298 BlockParser parser(ParseOptions::Defaults());
299 AssertParseOk(parser, csv);
300 AssertColumnsEq(parser, {{"a"}, {"c \n d"}, {"e"}},
301 {{false}, {true}, {false}} /* quoted */);
302}
303
304TEST(BlockParser, QuotingUnbalanced) {
305 // Quote introduces a quoted field that doesn't end
306 auto csv = MakeCSVData({"a,b\n", "1,\",3,,5\n"});
307 BlockParser parser(ParseOptions::Defaults());
308 AssertParsePartial(parser, csv, 4);
309 AssertColumnsEq(parser, {{"a"}, {"b"}}, {{false}, {false}} /* quoted */);
310}
311
312TEST(BlockParser, QuotingEmpty) {
313 {
314 BlockParser parser(ParseOptions::Defaults());
315 auto csv = MakeCSVData({"\"\"\n"});
316 AssertParseOk(parser, csv);
317 AssertColumnsEq(parser, {{""}}, {{true}} /* quoted */);
318 }
319 {
320 BlockParser parser(ParseOptions::Defaults());
321 auto csv = MakeCSVData({",\"\"\n"});
322 AssertParseOk(parser, csv);
323 AssertColumnsEq(parser, {{""}, {""}}, {{false}, {true}} /* quoted */);
324 }
325 {
326 BlockParser parser(ParseOptions::Defaults());
327 auto csv = MakeCSVData({"\"\",\n"});
328 AssertParseOk(parser, csv);
329 AssertColumnsEq(parser, {{""}, {""}}, {{true}, {false}} /* quoted */);
330 }
331}
332
333TEST(BlockParser, QuotingDouble) {
334 {
335 BlockParser parser(ParseOptions::Defaults());
336 // 4 quotes is a quoted quote
337 auto csv = MakeCSVData({"\"\"\"\"\n"});
338 AssertParseOk(parser, csv);
339 AssertColumnsEq(parser, {{"\""}}, {{true}} /* quoted */);
340 }
341 {
342 BlockParser parser(ParseOptions::Defaults());
343 // 4 quotes is a quoted quote
344 auto csv = MakeCSVData({"a,\"\"\"\",b\n"});
345 AssertParseOk(parser, csv);
346 AssertColumnsEq(parser, {{"a"}, {"\""}, {"b"}},
347 {{false}, {true}, {false}} /* quoted */);
348 }
349 {
350 BlockParser parser(ParseOptions::Defaults());
351 // 6 quotes is two quoted quotes
352 auto csv = MakeCSVData({"\"\"\"\"\"\"\n"});
353 AssertParseOk(parser, csv);
354 AssertColumnsEq(parser, {{"\"\""}}, {{true}} /* quoted */);
355 }
356 {
357 BlockParser parser(ParseOptions::Defaults());
358 // 6 quotes is two quoted quotes
359 auto csv = MakeCSVData({"a,\"\"\"\"\"\",b\n"});
360 AssertParseOk(parser, csv);
361 AssertColumnsEq(parser, {{"a"}, {"\"\""}, {"b"}},
362 {{false}, {true}, {false}} /* quoted */);
363 }
364}
365
366TEST(BlockParser, QuotesAndMore) {
367 // There may be trailing data after the quoted part of a field
368 {
369 BlockParser parser(ParseOptions::Defaults());
370 auto csv = MakeCSVData({"a,\"b\"c,d\n"});
371 AssertParseOk(parser, csv);
372 AssertColumnsEq(parser, {{"a"}, {"bc"}, {"d"}},
373 {{false}, {true}, {false}} /* quoted */);
374 }
375}
376
377TEST(BlockParser, QuotesSpecial) {
378 // Some non-trivial cases
379 {
380 BlockParser parser(ParseOptions::Defaults());
381 auto csv = MakeCSVData({"a,b\"c,d\n"});
382 AssertParseOk(parser, csv);
383 AssertColumnsEq(parser, {{"a"}, {"b\"c"}, {"d"}},
384 {{false}, {false}, {false}} /* quoted */);
385 }
386 {
387 BlockParser parser(ParseOptions::Defaults());
388 auto csv = MakeCSVData({"a,\"b\" \"c\",d\n"});
389 AssertParseOk(parser, csv);
390 AssertColumnsEq(parser, {{"a"}, {"b \"c\""}, {"d"}},
391 {{false}, {true}, {false}} /* quoted */);
392 }
393}
394
395TEST(BlockParser, MismatchingNumColumns) {
396 uint32_t out_size;
397 {
398 BlockParser parser(ParseOptions::Defaults());
399 auto csv = MakeCSVData({"a,b\nc\n"});
400 Status st = Parse(parser, csv, &out_size);
401 ASSERT_RAISES(Invalid, st);
402 }
403 {
404 BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */);
405 auto csv = MakeCSVData({"a\n"});
406 Status st = Parse(parser, csv, &out_size);
407 ASSERT_RAISES(Invalid, st);
408 }
409 {
410 BlockParser parser(ParseOptions::Defaults(), 2 /* num_cols */);
411 auto csv = MakeCSVData({"a,b,c\n"});
412 Status st = Parse(parser, csv, &out_size);
413 ASSERT_RAISES(Invalid, st);
414 }
415}
416
417TEST(BlockParser, Escaping) {
418 auto options = ParseOptions::Defaults();
419 options.escaping = true;
420
421 {
422 auto csv = MakeCSVData({"a\\b,c\n"});
423 {
424 BlockParser parser(ParseOptions::Defaults());
425 AssertParseOk(parser, csv);
426 AssertColumnsEq(parser, {{"a\\b"}, {"c"}});
427 }
428 {
429 BlockParser parser(options);
430 AssertParseOk(parser, csv);
431 AssertColumnsEq(parser, {{"ab"}, {"c"}});
432 }
433 }
434 {
435 auto csv = MakeCSVData({"a\\,b,c\n"});
436 BlockParser parser(options);
437 AssertParseOk(parser, csv);
438 AssertColumnsEq(parser, {{"a,b"}, {"c"}});
439 }
440}
441
442TEST(BlockParser, QuotedEscape) {
443 auto options = ParseOptions::Defaults();
444 options.escaping = true;
445
446 {
447 auto csv = MakeCSVData({"\"a\\,b\",c\n"});
448 BlockParser parser(options);
449 AssertParseOk(parser, csv);
450 AssertColumnsEq(parser, {{"a,b"}, {"c"}}, {{true}, {false}} /* quoted */);
451 }
452 {
453 auto csv = MakeCSVData({"\"a\\\"b\",c\n"});
454 BlockParser parser(options);
455 AssertParseOk(parser, csv);
456 AssertColumnsEq(parser, {{"a\"b"}, {"c"}}, {{true}, {false}} /* quoted */);
457 }
458}
459
460} // namespace csv
461} // namespace arrow
462