1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include <cstdint> |
19 | #include <numeric> |
20 | #include <string> |
21 | |
22 | #include <gtest/gtest.h> |
23 | |
24 | #include "arrow/csv/chunker.h" |
25 | #include "arrow/csv/options.h" |
26 | #include "arrow/csv/test-common.h" |
27 | #include "arrow/test-util.h" |
28 | |
29 | namespace arrow { |
30 | namespace csv { |
31 | |
32 | void AssertChunkSize(Chunker& chunker, const std::string& str, uint32_t chunk_size) { |
33 | uint32_t actual_chunk_size; |
34 | ASSERT_OK( |
35 | chunker.Process(str.data(), static_cast<uint32_t>(str.size()), &actual_chunk_size)); |
36 | ASSERT_EQ(actual_chunk_size, chunk_size); |
37 | } |
38 | |
39 | template <typename IntContainer> |
40 | void AssertChunking(Chunker& chunker, const std::string& str, |
41 | const IntContainer& lengths) { |
42 | uint32_t expected_chunk_size; |
43 | |
44 | // First chunkize whole CSV block |
45 | expected_chunk_size = |
46 | static_cast<uint32_t>(std::accumulate(lengths.begin(), lengths.end(), 0ULL)); |
47 | AssertChunkSize(chunker, str, expected_chunk_size); |
48 | |
49 | // Then chunkize incomplete substrings of the block |
50 | expected_chunk_size = 0; |
51 | for (const auto length : lengths) { |
52 | AssertChunkSize(chunker, str.substr(0, expected_chunk_size + length - 1), |
53 | expected_chunk_size); |
54 | |
55 | expected_chunk_size += static_cast<uint32_t>(length); |
56 | AssertChunkSize(chunker, str.substr(0, expected_chunk_size), expected_chunk_size); |
57 | } |
58 | } |
59 | |
60 | class BaseChunkerTest : public ::testing::TestWithParam<bool> { |
61 | protected: |
62 | void SetUp() override { |
63 | options_ = ParseOptions::Defaults(); |
64 | options_.newlines_in_values = GetParam(); |
65 | } |
66 | |
67 | ParseOptions options_; |
68 | }; |
69 | |
70 | INSTANTIATE_TEST_CASE_P(ChunkerTest, BaseChunkerTest, ::testing::Values(true)); |
71 | |
72 | INSTANTIATE_TEST_CASE_P(NoNewlineChunkerTest, BaseChunkerTest, ::testing::Values(false)); |
73 | |
74 | TEST_P(BaseChunkerTest, Basics) { |
75 | auto csv = MakeCSVData({"ab,c,\n" , "def,,gh\n" , ",ij,kl\n" }); |
76 | auto lengths = {6, 8, 7}; |
77 | Chunker chunker(options_); |
78 | |
79 | AssertChunking(chunker, csv, lengths); |
80 | } |
81 | |
82 | TEST_P(BaseChunkerTest, Empty) { |
83 | Chunker chunker(options_); |
84 | { |
85 | auto csv = MakeCSVData({"\n" }); |
86 | auto lengths = {1}; |
87 | AssertChunking(chunker, csv, lengths); |
88 | } |
89 | { |
90 | auto csv = MakeCSVData({"\n\n" }); |
91 | auto lengths = {1, 1}; |
92 | AssertChunking(chunker, csv, lengths); |
93 | } |
94 | { |
95 | auto csv = MakeCSVData({",\n" }); |
96 | auto lengths = {2}; |
97 | AssertChunking(chunker, csv, lengths); |
98 | } |
99 | { |
100 | auto csv = MakeCSVData({",\n,\n" }); |
101 | auto lengths = {2, 2}; |
102 | AssertChunking(chunker, csv, lengths); |
103 | } |
104 | } |
105 | |
106 | TEST_P(BaseChunkerTest, Newlines) { |
107 | Chunker chunker(options_); |
108 | { |
109 | auto csv = MakeCSVData({"a\n" , "b\r" , "c,d\r\n" }); |
110 | AssertChunkSize(chunker, csv, static_cast<uint32_t>(csv.size())); |
111 | // Trailing \n after \r is optional |
112 | AssertChunkSize(chunker, csv.substr(0, csv.size() - 1), |
113 | static_cast<uint32_t>(csv.size() - 1)); |
114 | } |
115 | } |
116 | |
117 | TEST_P(BaseChunkerTest, QuotingSimple) { |
118 | auto csv = MakeCSVData({"1,\",3,\",5\n" }); |
119 | { |
120 | Chunker chunker(options_); |
121 | auto lengths = {csv.size()}; |
122 | AssertChunking(chunker, csv, lengths); |
123 | } |
124 | { |
125 | options_.quoting = false; |
126 | Chunker chunker(options_); |
127 | auto lengths = {csv.size()}; |
128 | AssertChunking(chunker, csv, lengths); |
129 | } |
130 | } |
131 | |
132 | TEST_P(BaseChunkerTest, QuotingNewline) { |
133 | auto csv = MakeCSVData({"a,\"c \n d\",e\n" }); |
134 | if (options_.newlines_in_values) { |
135 | Chunker chunker(options_); |
136 | auto lengths = {12}; |
137 | AssertChunking(chunker, csv, lengths); |
138 | } |
139 | { |
140 | options_.quoting = false; |
141 | Chunker chunker(options_); |
142 | auto lengths = {6, 6}; |
143 | AssertChunking(chunker, csv, lengths); |
144 | } |
145 | } |
146 | |
147 | TEST_P(BaseChunkerTest, QuotingUnbalanced) { |
148 | // Quote introduces a quoted field that doesn't end |
149 | auto csv = MakeCSVData({"a,b\n" , "1,\",3,,5\n" , "c,d\n" }); |
150 | if (options_.newlines_in_values) { |
151 | Chunker chunker(options_); |
152 | auto lengths = {4}; |
153 | AssertChunking(chunker, csv, lengths); |
154 | } |
155 | { |
156 | options_.quoting = false; |
157 | Chunker chunker(options_); |
158 | auto lengths = {4, 9, 4}; |
159 | AssertChunking(chunker, csv, lengths); |
160 | } |
161 | } |
162 | |
163 | TEST_P(BaseChunkerTest, QuotingEmpty) { |
164 | Chunker chunker(options_); |
165 | { |
166 | auto csv = MakeCSVData({"\"\"\n" , "a\n" }); |
167 | auto lengths = {3, 2}; |
168 | AssertChunking(chunker, csv, lengths); |
169 | } |
170 | { |
171 | auto csv = MakeCSVData({",\"\"\n" , "a\n" }); |
172 | auto lengths = {4, 2}; |
173 | AssertChunking(chunker, csv, lengths); |
174 | } |
175 | { |
176 | auto csv = MakeCSVData({"\"\",\n" , "a\n" }); |
177 | auto lengths = {4, 2}; |
178 | AssertChunking(chunker, csv, lengths); |
179 | } |
180 | } |
181 | |
182 | TEST_P(BaseChunkerTest, QuotingDouble) { |
183 | { |
184 | Chunker chunker(options_); |
185 | // 4 quotes is a quoted quote |
186 | auto csv = MakeCSVData({"\"\"\"\"\n" , "a\n" }); |
187 | auto lengths = {5, 2}; |
188 | AssertChunking(chunker, csv, lengths); |
189 | } |
190 | } |
191 | |
192 | TEST_P(BaseChunkerTest, QuotesSpecial) { |
193 | // Some non-trivial cases |
194 | { |
195 | Chunker chunker(options_); |
196 | auto csv = MakeCSVData({"a,b\"c,d\n" , "e\n" }); |
197 | auto lengths = {8, 2}; |
198 | AssertChunking(chunker, csv, lengths); |
199 | } |
200 | { |
201 | Chunker chunker(options_); |
202 | auto csv = MakeCSVData({"a,\"b\" \"c\",d\n" , "e\n" }); |
203 | auto lengths = {12, 2}; |
204 | AssertChunking(chunker, csv, lengths); |
205 | } |
206 | } |
207 | |
208 | TEST_P(BaseChunkerTest, Escaping) { |
209 | { |
210 | auto csv = MakeCSVData({"a\\b,c\n" , "d\n" }); |
211 | auto lengths = {6, 2}; |
212 | { |
213 | options_.escaping = false; |
214 | Chunker chunker(options_); |
215 | AssertChunking(chunker, csv, lengths); |
216 | } |
217 | { |
218 | options_.escaping = true; |
219 | Chunker chunker(options_); |
220 | AssertChunking(chunker, csv, lengths); |
221 | } |
222 | } |
223 | { |
224 | auto csv = MakeCSVData({"a\\,b,c\n" , "d\n" }); |
225 | auto lengths = {7, 2}; |
226 | { |
227 | options_.escaping = false; |
228 | Chunker chunker(options_); |
229 | AssertChunking(chunker, csv, lengths); |
230 | } |
231 | { |
232 | options_.escaping = true; |
233 | Chunker chunker(options_); |
234 | AssertChunking(chunker, csv, lengths); |
235 | } |
236 | } |
237 | } |
238 | |
239 | TEST_P(BaseChunkerTest, EscapingNewline) { |
240 | if (options_.newlines_in_values) { |
241 | auto csv = MakeCSVData({"a\\\nb\n" , "c\n" }); |
242 | { |
243 | auto lengths = {3, 2, 2}; |
244 | Chunker chunker(options_); |
245 | AssertChunking(chunker, csv, lengths); |
246 | } |
247 | options_.escaping = true; |
248 | { |
249 | auto lengths = {5, 2}; |
250 | Chunker chunker(options_); |
251 | AssertChunking(chunker, csv, lengths); |
252 | } |
253 | } |
254 | } |
255 | |
256 | } // namespace csv |
257 | } // namespace arrow |
258 | |