1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/csv/converter.h"
19
20#include <cstring>
21#include <sstream>
22#include <string>
23#include <type_traits>
24#include <vector>
25
26#include "arrow/builder.h"
27#include "arrow/csv/parser.h"
28#include "arrow/memory_pool.h"
29#include "arrow/status.h"
30#include "arrow/type.h"
31#include "arrow/type_traits.h"
32#include "arrow/util/parsing.h" // IWYU pragma: keep
33#include "arrow/util/trie.h"
34#include "arrow/util/utf8.h"
35
36namespace arrow {
37namespace csv {
38
39using internal::StringConverter;
40using internal::Trie;
41using internal::TrieBuilder;
42
43namespace {
44
45Status GenericConversionError(const std::shared_ptr<DataType>& type, const uint8_t* data,
46 uint32_t size) {
47 return Status::Invalid("CSV conversion error to ", type->ToString(),
48 ": invalid value '",
49 std::string(reinterpret_cast<const char*>(data), size), "'");
50}
51
// Whether `c` is a blank character for the purpose of trimming numeric cells.
// Only the space and horizontal tab characters qualify.
inline bool IsWhitespace(uint8_t c) {
  // Fast path: every byte above ' ' is a non-blank, and that is the
  // expected case.
  const bool above_space = c > ' ';
  if (ARROW_PREDICT_TRUE(above_space)) {
    return false;
  }
  switch (c) {
    case ' ':
    case '\t':
      return true;
    default:
      return false;
  }
}
58
59class ConcreteConverter : public Converter {
60 public:
61 using Converter::Converter;
62
63 protected:
64 Status Initialize() override;
65 inline bool IsNull(const uint8_t* data, uint32_t size, bool quoted);
66
67 Trie null_trie_;
68};
69
70Status ConcreteConverter::Initialize() {
71 // TODO no need to build a separate Trie for each Converter instance
72 TrieBuilder builder;
73 for (const auto& s : options_.null_values) {
74 RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */));
75 }
76 null_trie_ = builder.Finish();
77 return Status::OK();
78}
79
80bool ConcreteConverter::IsNull(const uint8_t* data, uint32_t size, bool quoted) {
81 if (quoted) {
82 return false;
83 }
84 return null_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >=
85 0;
86}
87
88/////////////////////////////////////////////////////////////////////////
89// Concrete Converter for null values
90
91class NullConverter : public ConcreteConverter {
92 public:
93 using ConcreteConverter::ConcreteConverter;
94
95 Status Convert(const BlockParser& parser, int32_t col_index,
96 std::shared_ptr<Array>* out) override;
97};
98
99Status NullConverter::Convert(const BlockParser& parser, int32_t col_index,
100 std::shared_ptr<Array>* out) {
101 NullBuilder builder(pool_);
102
103 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
104 if (ARROW_PREDICT_TRUE(IsNull(data, size, quoted))) {
105 return builder.AppendNull();
106 } else {
107 return GenericConversionError(type_, data, size);
108 }
109 };
110 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
111 RETURN_NOT_OK(builder.Finish(out));
112
113 return Status::OK();
114}
115
116/////////////////////////////////////////////////////////////////////////
117// Concrete Converter for var-sized binary strings
118
119template <typename T, bool CheckUTF8>
120class VarSizeBinaryConverter : public ConcreteConverter {
121 public:
122 using ConcreteConverter::ConcreteConverter;
123
124 Status Convert(const BlockParser& parser, int32_t col_index,
125 std::shared_ptr<Array>* out) override {
126 using BuilderType = typename TypeTraits<T>::BuilderType;
127 BuilderType builder(pool_);
128
129 // TODO do we accept nulls here?
130
131 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
132 if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) {
133 return Status::Invalid("CSV conversion error to ", type_->ToString(),
134 ": invalid UTF8 data");
135 }
136 builder.UnsafeAppend(data, size);
137 return Status::OK();
138 };
139 RETURN_NOT_OK(builder.Resize(parser.num_rows()));
140 RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
141 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
142 RETURN_NOT_OK(builder.Finish(out));
143
144 return Status::OK();
145 }
146
147 protected:
148 Status Initialize() override {
149 util::InitializeUTF8();
150 return Status::OK();
151 }
152};
153
154/////////////////////////////////////////////////////////////////////////
155// Concrete Converter for fixed-sized binary strings
156
157class FixedSizeBinaryConverter : public ConcreteConverter {
158 public:
159 using ConcreteConverter::ConcreteConverter;
160
161 Status Convert(const BlockParser& parser, int32_t col_index,
162 std::shared_ptr<Array>* out) override;
163};
164
165Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_index,
166 std::shared_ptr<Array>* out) {
167 FixedSizeBinaryBuilder builder(type_, pool_);
168 const uint32_t byte_width = static_cast<uint32_t>(builder.byte_width());
169
170 // TODO do we accept nulls here?
171
172 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
173 if (ARROW_PREDICT_FALSE(size != byte_width)) {
174 return Status::Invalid("CSV conversion error to ", type_->ToString(), ": got a ",
175 size, "-byte long string");
176 }
177 return builder.Append(data);
178 };
179 RETURN_NOT_OK(builder.Resize(parser.num_rows()));
180 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
181 RETURN_NOT_OK(builder.Finish(out));
182
183 return Status::OK();
184}
185
186/////////////////////////////////////////////////////////////////////////
187// Concrete Converter for numbers
188
189template <typename T>
190class NumericConverter : public ConcreteConverter {
191 public:
192 using ConcreteConverter::ConcreteConverter;
193
194 Status Convert(const BlockParser& parser, int32_t col_index,
195 std::shared_ptr<Array>* out) override;
196};
197
198template <typename T>
199Status NumericConverter<T>::Convert(const BlockParser& parser, int32_t col_index,
200 std::shared_ptr<Array>* out) {
201 using BuilderType = typename TypeTraits<T>::BuilderType;
202 using value_type = typename StringConverter<T>::value_type;
203
204 BuilderType builder(type_, pool_);
205 StringConverter<T> converter;
206
207 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
208 // XXX should quoted values be allowed at all?
209 value_type value;
210 if (IsNull(data, size, quoted)) {
211 builder.UnsafeAppendNull();
212 return Status::OK();
213 }
214 if (!std::is_same<BooleanType, T>::value) {
215 // Skip trailing whitespace
216 if (ARROW_PREDICT_TRUE(size > 0) &&
217 ARROW_PREDICT_FALSE(IsWhitespace(data[size - 1]))) {
218 const uint8_t* p = data + size - 1;
219 while (size > 0 && IsWhitespace(*p)) {
220 --size;
221 --p;
222 }
223 }
224 // Skip leading whitespace
225 if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[0]))) {
226 while (size > 0 && IsWhitespace(*data)) {
227 --size;
228 ++data;
229 }
230 }
231 }
232 if (ARROW_PREDICT_FALSE(
233 !converter(reinterpret_cast<const char*>(data), size, &value))) {
234 return GenericConversionError(type_, data, size);
235 }
236 builder.UnsafeAppend(value);
237 return Status::OK();
238 };
239 RETURN_NOT_OK(builder.Resize(parser.num_rows()));
240 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
241 RETURN_NOT_OK(builder.Finish(out));
242
243 return Status::OK();
244}
245
246/////////////////////////////////////////////////////////////////////////
247// Concrete Converter for timestamps
248
249class TimestampConverter : public ConcreteConverter {
250 public:
251 using ConcreteConverter::ConcreteConverter;
252
253 Status Convert(const BlockParser& parser, int32_t col_index,
254 std::shared_ptr<Array>* out) override {
255 using value_type = TimestampType::c_type;
256
257 TimestampBuilder builder(type_, pool_);
258 StringConverter<TimestampType> converter(type_);
259
260 auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
261 value_type value = 0;
262 if (IsNull(data, size, quoted)) {
263 builder.UnsafeAppendNull();
264 return Status::OK();
265 }
266 if (ARROW_PREDICT_FALSE(
267 !converter(reinterpret_cast<const char*>(data), size, &value))) {
268 return GenericConversionError(type_, data, size);
269 }
270 builder.UnsafeAppend(value);
271 return Status::OK();
272 };
273 RETURN_NOT_OK(builder.Resize(parser.num_rows()));
274 RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
275 RETURN_NOT_OK(builder.Finish(out));
276
277 return Status::OK();
278 }
279};
280
281} // namespace
282
283/////////////////////////////////////////////////////////////////////////
284// Base Converter class implementation
285
286Converter::Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
287 MemoryPool* pool)
288 : options_(options), pool_(pool), type_(type) {}
289
290Status Converter::Make(const std::shared_ptr<DataType>& type,
291 const ConvertOptions& options, MemoryPool* pool,
292 std::shared_ptr<Converter>* out) {
293 Converter* result;
294
295 switch (type->id()) {
296#define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE) \
297 case TYPE_ID: \
298 result = new CONVERTER_TYPE(type, options, pool); \
299 break;
300
301 CONVERTER_CASE(Type::NA, NullConverter)
302 CONVERTER_CASE(Type::INT8, NumericConverter<Int8Type>)
303 CONVERTER_CASE(Type::INT16, NumericConverter<Int16Type>)
304 CONVERTER_CASE(Type::INT32, NumericConverter<Int32Type>)
305 CONVERTER_CASE(Type::INT64, NumericConverter<Int64Type>)
306 CONVERTER_CASE(Type::UINT8, NumericConverter<UInt8Type>)
307 CONVERTER_CASE(Type::UINT16, NumericConverter<UInt16Type>)
308 CONVERTER_CASE(Type::UINT32, NumericConverter<UInt32Type>)
309 CONVERTER_CASE(Type::UINT64, NumericConverter<UInt64Type>)
310 CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>)
311 CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>)
312 CONVERTER_CASE(Type::BOOL, NumericConverter<BooleanType>)
313 CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter)
314 CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>))
315 CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
316
317 case Type::STRING:
318 if (options.check_utf8) {
319 result = new VarSizeBinaryConverter<StringType, true>(type, options, pool);
320 } else {
321 result = new VarSizeBinaryConverter<StringType, false>(type, options, pool);
322 }
323 break;
324
325 default: {
326 return Status::NotImplemented("CSV conversion to ", type->ToString(),
327 " is not supported");
328 }
329
330#undef CONVERTER_CASE
331 }
332 out->reset(result);
333 return result->Initialize();
334}
335
336Status Converter::Make(const std::shared_ptr<DataType>& type,
337 const ConvertOptions& options, std::shared_ptr<Converter>* out) {
338 return Make(type, options, default_memory_pool(), out);
339}
340
341} // namespace csv
342} // namespace arrow
343