1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include <cstdint>
19#include <numeric>
20#include <string>
21
22#include <gtest/gtest.h>
23
24#include "arrow/csv/chunker.h"
25#include "arrow/csv/options.h"
26#include "arrow/csv/test-common.h"
27#include "arrow/test-util.h"
28
29namespace arrow {
30namespace csv {
31
32void AssertChunkSize(Chunker& chunker, const std::string& str, uint32_t chunk_size) {
33 uint32_t actual_chunk_size;
34 ASSERT_OK(
35 chunker.Process(str.data(), static_cast<uint32_t>(str.size()), &actual_chunk_size));
36 ASSERT_EQ(actual_chunk_size, chunk_size);
37}
38
39template <typename IntContainer>
40void AssertChunking(Chunker& chunker, const std::string& str,
41 const IntContainer& lengths) {
42 uint32_t expected_chunk_size;
43
44 // First chunkize whole CSV block
45 expected_chunk_size =
46 static_cast<uint32_t>(std::accumulate(lengths.begin(), lengths.end(), 0ULL));
47 AssertChunkSize(chunker, str, expected_chunk_size);
48
49 // Then chunkize incomplete substrings of the block
50 expected_chunk_size = 0;
51 for (const auto length : lengths) {
52 AssertChunkSize(chunker, str.substr(0, expected_chunk_size + length - 1),
53 expected_chunk_size);
54
55 expected_chunk_size += static_cast<uint32_t>(length);
56 AssertChunkSize(chunker, str.substr(0, expected_chunk_size), expected_chunk_size);
57 }
58}
59
60class BaseChunkerTest : public ::testing::TestWithParam<bool> {
61 protected:
62 void SetUp() override {
63 options_ = ParseOptions::Defaults();
64 options_.newlines_in_values = GetParam();
65 }
66
67 ParseOptions options_;
68};
69
70INSTANTIATE_TEST_CASE_P(ChunkerTest, BaseChunkerTest, ::testing::Values(true));
71
72INSTANTIATE_TEST_CASE_P(NoNewlineChunkerTest, BaseChunkerTest, ::testing::Values(false));
73
74TEST_P(BaseChunkerTest, Basics) {
75 auto csv = MakeCSVData({"ab,c,\n", "def,,gh\n", ",ij,kl\n"});
76 auto lengths = {6, 8, 7};
77 Chunker chunker(options_);
78
79 AssertChunking(chunker, csv, lengths);
80}
81
82TEST_P(BaseChunkerTest, Empty) {
83 Chunker chunker(options_);
84 {
85 auto csv = MakeCSVData({"\n"});
86 auto lengths = {1};
87 AssertChunking(chunker, csv, lengths);
88 }
89 {
90 auto csv = MakeCSVData({"\n\n"});
91 auto lengths = {1, 1};
92 AssertChunking(chunker, csv, lengths);
93 }
94 {
95 auto csv = MakeCSVData({",\n"});
96 auto lengths = {2};
97 AssertChunking(chunker, csv, lengths);
98 }
99 {
100 auto csv = MakeCSVData({",\n,\n"});
101 auto lengths = {2, 2};
102 AssertChunking(chunker, csv, lengths);
103 }
104}
105
106TEST_P(BaseChunkerTest, Newlines) {
107 Chunker chunker(options_);
108 {
109 auto csv = MakeCSVData({"a\n", "b\r", "c,d\r\n"});
110 AssertChunkSize(chunker, csv, static_cast<uint32_t>(csv.size()));
111 // Trailing \n after \r is optional
112 AssertChunkSize(chunker, csv.substr(0, csv.size() - 1),
113 static_cast<uint32_t>(csv.size() - 1));
114 }
115}
116
117TEST_P(BaseChunkerTest, QuotingSimple) {
118 auto csv = MakeCSVData({"1,\",3,\",5\n"});
119 {
120 Chunker chunker(options_);
121 auto lengths = {csv.size()};
122 AssertChunking(chunker, csv, lengths);
123 }
124 {
125 options_.quoting = false;
126 Chunker chunker(options_);
127 auto lengths = {csv.size()};
128 AssertChunking(chunker, csv, lengths);
129 }
130}
131
132TEST_P(BaseChunkerTest, QuotingNewline) {
133 auto csv = MakeCSVData({"a,\"c \n d\",e\n"});
134 if (options_.newlines_in_values) {
135 Chunker chunker(options_);
136 auto lengths = {12};
137 AssertChunking(chunker, csv, lengths);
138 }
139 {
140 options_.quoting = false;
141 Chunker chunker(options_);
142 auto lengths = {6, 6};
143 AssertChunking(chunker, csv, lengths);
144 }
145}
146
147TEST_P(BaseChunkerTest, QuotingUnbalanced) {
148 // Quote introduces a quoted field that doesn't end
149 auto csv = MakeCSVData({"a,b\n", "1,\",3,,5\n", "c,d\n"});
150 if (options_.newlines_in_values) {
151 Chunker chunker(options_);
152 auto lengths = {4};
153 AssertChunking(chunker, csv, lengths);
154 }
155 {
156 options_.quoting = false;
157 Chunker chunker(options_);
158 auto lengths = {4, 9, 4};
159 AssertChunking(chunker, csv, lengths);
160 }
161}
162
163TEST_P(BaseChunkerTest, QuotingEmpty) {
164 Chunker chunker(options_);
165 {
166 auto csv = MakeCSVData({"\"\"\n", "a\n"});
167 auto lengths = {3, 2};
168 AssertChunking(chunker, csv, lengths);
169 }
170 {
171 auto csv = MakeCSVData({",\"\"\n", "a\n"});
172 auto lengths = {4, 2};
173 AssertChunking(chunker, csv, lengths);
174 }
175 {
176 auto csv = MakeCSVData({"\"\",\n", "a\n"});
177 auto lengths = {4, 2};
178 AssertChunking(chunker, csv, lengths);
179 }
180}
181
182TEST_P(BaseChunkerTest, QuotingDouble) {
183 {
184 Chunker chunker(options_);
185 // 4 quotes is a quoted quote
186 auto csv = MakeCSVData({"\"\"\"\"\n", "a\n"});
187 auto lengths = {5, 2};
188 AssertChunking(chunker, csv, lengths);
189 }
190}
191
192TEST_P(BaseChunkerTest, QuotesSpecial) {
193 // Some non-trivial cases
194 {
195 Chunker chunker(options_);
196 auto csv = MakeCSVData({"a,b\"c,d\n", "e\n"});
197 auto lengths = {8, 2};
198 AssertChunking(chunker, csv, lengths);
199 }
200 {
201 Chunker chunker(options_);
202 auto csv = MakeCSVData({"a,\"b\" \"c\",d\n", "e\n"});
203 auto lengths = {12, 2};
204 AssertChunking(chunker, csv, lengths);
205 }
206}
207
208TEST_P(BaseChunkerTest, Escaping) {
209 {
210 auto csv = MakeCSVData({"a\\b,c\n", "d\n"});
211 auto lengths = {6, 2};
212 {
213 options_.escaping = false;
214 Chunker chunker(options_);
215 AssertChunking(chunker, csv, lengths);
216 }
217 {
218 options_.escaping = true;
219 Chunker chunker(options_);
220 AssertChunking(chunker, csv, lengths);
221 }
222 }
223 {
224 auto csv = MakeCSVData({"a\\,b,c\n", "d\n"});
225 auto lengths = {7, 2};
226 {
227 options_.escaping = false;
228 Chunker chunker(options_);
229 AssertChunking(chunker, csv, lengths);
230 }
231 {
232 options_.escaping = true;
233 Chunker chunker(options_);
234 AssertChunking(chunker, csv, lengths);
235 }
236 }
237}
238
239TEST_P(BaseChunkerTest, EscapingNewline) {
240 if (options_.newlines_in_values) {
241 auto csv = MakeCSVData({"a\\\nb\n", "c\n"});
242 {
243 auto lengths = {3, 2, 2};
244 Chunker chunker(options_);
245 AssertChunking(chunker, csv, lengths);
246 }
247 options_.escaping = true;
248 {
249 auto lengths = {5, 2};
250 Chunker chunker(options_);
251 AssertChunking(chunker, csv, lengths);
252 }
253 }
254}
255
256} // namespace csv
257} // namespace arrow
258