1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/csv/converter.h" |
19 | |
20 | #include <cstring> |
21 | #include <sstream> |
22 | #include <string> |
23 | #include <type_traits> |
24 | #include <vector> |
25 | |
26 | #include "arrow/builder.h" |
27 | #include "arrow/csv/parser.h" |
28 | #include "arrow/memory_pool.h" |
29 | #include "arrow/status.h" |
30 | #include "arrow/type.h" |
31 | #include "arrow/type_traits.h" |
32 | #include "arrow/util/parsing.h" // IWYU pragma: keep |
33 | #include "arrow/util/trie.h" |
34 | #include "arrow/util/utf8.h" |
35 | |
36 | namespace arrow { |
37 | namespace csv { |
38 | |
39 | using internal::StringConverter; |
40 | using internal::Trie; |
41 | using internal::TrieBuilder; |
42 | |
43 | namespace { |
44 | |
45 | Status GenericConversionError(const std::shared_ptr<DataType>& type, const uint8_t* data, |
46 | uint32_t size) { |
47 | return Status::Invalid("CSV conversion error to " , type->ToString(), |
48 | ": invalid value '" , |
49 | std::string(reinterpret_cast<const char*>(data), size), "'" ); |
50 | } |
51 | |
52 | inline bool IsWhitespace(uint8_t c) { |
53 | if (ARROW_PREDICT_TRUE(c > ' ')) { |
54 | return false; |
55 | } |
56 | return c == ' ' || c == '\t'; |
57 | } |
58 | |
59 | class ConcreteConverter : public Converter { |
60 | public: |
61 | using Converter::Converter; |
62 | |
63 | protected: |
64 | Status Initialize() override; |
65 | inline bool IsNull(const uint8_t* data, uint32_t size, bool quoted); |
66 | |
67 | Trie null_trie_; |
68 | }; |
69 | |
70 | Status ConcreteConverter::Initialize() { |
71 | // TODO no need to build a separate Trie for each Converter instance |
72 | TrieBuilder builder; |
73 | for (const auto& s : options_.null_values) { |
74 | RETURN_NOT_OK(builder.Append(s, true /* allow_duplicates */)); |
75 | } |
76 | null_trie_ = builder.Finish(); |
77 | return Status::OK(); |
78 | } |
79 | |
80 | bool ConcreteConverter::IsNull(const uint8_t* data, uint32_t size, bool quoted) { |
81 | if (quoted) { |
82 | return false; |
83 | } |
84 | return null_trie_.Find(util::string_view(reinterpret_cast<const char*>(data), size)) >= |
85 | 0; |
86 | } |
87 | |
88 | ///////////////////////////////////////////////////////////////////////// |
89 | // Concrete Converter for null values |
90 | |
91 | class NullConverter : public ConcreteConverter { |
92 | public: |
93 | using ConcreteConverter::ConcreteConverter; |
94 | |
95 | Status Convert(const BlockParser& parser, int32_t col_index, |
96 | std::shared_ptr<Array>* out) override; |
97 | }; |
98 | |
99 | Status NullConverter::Convert(const BlockParser& parser, int32_t col_index, |
100 | std::shared_ptr<Array>* out) { |
101 | NullBuilder builder(pool_); |
102 | |
103 | auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { |
104 | if (ARROW_PREDICT_TRUE(IsNull(data, size, quoted))) { |
105 | return builder.AppendNull(); |
106 | } else { |
107 | return GenericConversionError(type_, data, size); |
108 | } |
109 | }; |
110 | RETURN_NOT_OK(parser.VisitColumn(col_index, visit)); |
111 | RETURN_NOT_OK(builder.Finish(out)); |
112 | |
113 | return Status::OK(); |
114 | } |
115 | |
116 | ///////////////////////////////////////////////////////////////////////// |
117 | // Concrete Converter for var-sized binary strings |
118 | |
119 | template <typename T, bool CheckUTF8> |
120 | class VarSizeBinaryConverter : public ConcreteConverter { |
121 | public: |
122 | using ConcreteConverter::ConcreteConverter; |
123 | |
124 | Status Convert(const BlockParser& parser, int32_t col_index, |
125 | std::shared_ptr<Array>* out) override { |
126 | using BuilderType = typename TypeTraits<T>::BuilderType; |
127 | BuilderType builder(pool_); |
128 | |
129 | // TODO do we accept nulls here? |
130 | |
131 | auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { |
132 | if (CheckUTF8 && ARROW_PREDICT_FALSE(!util::ValidateUTF8(data, size))) { |
133 | return Status::Invalid("CSV conversion error to " , type_->ToString(), |
134 | ": invalid UTF8 data" ); |
135 | } |
136 | builder.UnsafeAppend(data, size); |
137 | return Status::OK(); |
138 | }; |
139 | RETURN_NOT_OK(builder.Resize(parser.num_rows())); |
140 | RETURN_NOT_OK(builder.ReserveData(parser.num_bytes())); |
141 | RETURN_NOT_OK(parser.VisitColumn(col_index, visit)); |
142 | RETURN_NOT_OK(builder.Finish(out)); |
143 | |
144 | return Status::OK(); |
145 | } |
146 | |
147 | protected: |
148 | Status Initialize() override { |
149 | util::InitializeUTF8(); |
150 | return Status::OK(); |
151 | } |
152 | }; |
153 | |
154 | ///////////////////////////////////////////////////////////////////////// |
155 | // Concrete Converter for fixed-sized binary strings |
156 | |
157 | class FixedSizeBinaryConverter : public ConcreteConverter { |
158 | public: |
159 | using ConcreteConverter::ConcreteConverter; |
160 | |
161 | Status Convert(const BlockParser& parser, int32_t col_index, |
162 | std::shared_ptr<Array>* out) override; |
163 | }; |
164 | |
165 | Status FixedSizeBinaryConverter::Convert(const BlockParser& parser, int32_t col_index, |
166 | std::shared_ptr<Array>* out) { |
167 | FixedSizeBinaryBuilder builder(type_, pool_); |
168 | const uint32_t byte_width = static_cast<uint32_t>(builder.byte_width()); |
169 | |
170 | // TODO do we accept nulls here? |
171 | |
172 | auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { |
173 | if (ARROW_PREDICT_FALSE(size != byte_width)) { |
174 | return Status::Invalid("CSV conversion error to " , type_->ToString(), ": got a " , |
175 | size, "-byte long string" ); |
176 | } |
177 | return builder.Append(data); |
178 | }; |
179 | RETURN_NOT_OK(builder.Resize(parser.num_rows())); |
180 | RETURN_NOT_OK(parser.VisitColumn(col_index, visit)); |
181 | RETURN_NOT_OK(builder.Finish(out)); |
182 | |
183 | return Status::OK(); |
184 | } |
185 | |
186 | ///////////////////////////////////////////////////////////////////////// |
187 | // Concrete Converter for numbers |
188 | |
189 | template <typename T> |
190 | class NumericConverter : public ConcreteConverter { |
191 | public: |
192 | using ConcreteConverter::ConcreteConverter; |
193 | |
194 | Status Convert(const BlockParser& parser, int32_t col_index, |
195 | std::shared_ptr<Array>* out) override; |
196 | }; |
197 | |
198 | template <typename T> |
199 | Status NumericConverter<T>::Convert(const BlockParser& parser, int32_t col_index, |
200 | std::shared_ptr<Array>* out) { |
201 | using BuilderType = typename TypeTraits<T>::BuilderType; |
202 | using value_type = typename StringConverter<T>::value_type; |
203 | |
204 | BuilderType builder(type_, pool_); |
205 | StringConverter<T> converter; |
206 | |
207 | auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { |
208 | // XXX should quoted values be allowed at all? |
209 | value_type value; |
210 | if (IsNull(data, size, quoted)) { |
211 | builder.UnsafeAppendNull(); |
212 | return Status::OK(); |
213 | } |
214 | if (!std::is_same<BooleanType, T>::value) { |
215 | // Skip trailing whitespace |
216 | if (ARROW_PREDICT_TRUE(size > 0) && |
217 | ARROW_PREDICT_FALSE(IsWhitespace(data[size - 1]))) { |
218 | const uint8_t* p = data + size - 1; |
219 | while (size > 0 && IsWhitespace(*p)) { |
220 | --size; |
221 | --p; |
222 | } |
223 | } |
224 | // Skip leading whitespace |
225 | if (ARROW_PREDICT_TRUE(size > 0) && ARROW_PREDICT_FALSE(IsWhitespace(data[0]))) { |
226 | while (size > 0 && IsWhitespace(*data)) { |
227 | --size; |
228 | ++data; |
229 | } |
230 | } |
231 | } |
232 | if (ARROW_PREDICT_FALSE( |
233 | !converter(reinterpret_cast<const char*>(data), size, &value))) { |
234 | return GenericConversionError(type_, data, size); |
235 | } |
236 | builder.UnsafeAppend(value); |
237 | return Status::OK(); |
238 | }; |
239 | RETURN_NOT_OK(builder.Resize(parser.num_rows())); |
240 | RETURN_NOT_OK(parser.VisitColumn(col_index, visit)); |
241 | RETURN_NOT_OK(builder.Finish(out)); |
242 | |
243 | return Status::OK(); |
244 | } |
245 | |
246 | ///////////////////////////////////////////////////////////////////////// |
247 | // Concrete Converter for timestamps |
248 | |
249 | class TimestampConverter : public ConcreteConverter { |
250 | public: |
251 | using ConcreteConverter::ConcreteConverter; |
252 | |
253 | Status Convert(const BlockParser& parser, int32_t col_index, |
254 | std::shared_ptr<Array>* out) override { |
255 | using value_type = TimestampType::c_type; |
256 | |
257 | TimestampBuilder builder(type_, pool_); |
258 | StringConverter<TimestampType> converter(type_); |
259 | |
260 | auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status { |
261 | value_type value = 0; |
262 | if (IsNull(data, size, quoted)) { |
263 | builder.UnsafeAppendNull(); |
264 | return Status::OK(); |
265 | } |
266 | if (ARROW_PREDICT_FALSE( |
267 | !converter(reinterpret_cast<const char*>(data), size, &value))) { |
268 | return GenericConversionError(type_, data, size); |
269 | } |
270 | builder.UnsafeAppend(value); |
271 | return Status::OK(); |
272 | }; |
273 | RETURN_NOT_OK(builder.Resize(parser.num_rows())); |
274 | RETURN_NOT_OK(parser.VisitColumn(col_index, visit)); |
275 | RETURN_NOT_OK(builder.Finish(out)); |
276 | |
277 | return Status::OK(); |
278 | } |
279 | }; |
280 | |
281 | } // namespace |
282 | |
283 | ///////////////////////////////////////////////////////////////////////// |
284 | // Base Converter class implementation |
285 | |
286 | Converter::Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options, |
287 | MemoryPool* pool) |
288 | : options_(options), pool_(pool), type_(type) {} |
289 | |
290 | Status Converter::Make(const std::shared_ptr<DataType>& type, |
291 | const ConvertOptions& options, MemoryPool* pool, |
292 | std::shared_ptr<Converter>* out) { |
293 | Converter* result; |
294 | |
295 | switch (type->id()) { |
296 | #define CONVERTER_CASE(TYPE_ID, CONVERTER_TYPE) \ |
297 | case TYPE_ID: \ |
298 | result = new CONVERTER_TYPE(type, options, pool); \ |
299 | break; |
300 | |
301 | CONVERTER_CASE(Type::NA, NullConverter) |
302 | CONVERTER_CASE(Type::INT8, NumericConverter<Int8Type>) |
303 | CONVERTER_CASE(Type::INT16, NumericConverter<Int16Type>) |
304 | CONVERTER_CASE(Type::INT32, NumericConverter<Int32Type>) |
305 | CONVERTER_CASE(Type::INT64, NumericConverter<Int64Type>) |
306 | CONVERTER_CASE(Type::UINT8, NumericConverter<UInt8Type>) |
307 | CONVERTER_CASE(Type::UINT16, NumericConverter<UInt16Type>) |
308 | CONVERTER_CASE(Type::UINT32, NumericConverter<UInt32Type>) |
309 | CONVERTER_CASE(Type::UINT64, NumericConverter<UInt64Type>) |
310 | CONVERTER_CASE(Type::FLOAT, NumericConverter<FloatType>) |
311 | CONVERTER_CASE(Type::DOUBLE, NumericConverter<DoubleType>) |
312 | CONVERTER_CASE(Type::BOOL, NumericConverter<BooleanType>) |
313 | CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) |
314 | CONVERTER_CASE(Type::BINARY, (VarSizeBinaryConverter<BinaryType, false>)) |
315 | CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) |
316 | |
317 | case Type::STRING: |
318 | if (options.check_utf8) { |
319 | result = new VarSizeBinaryConverter<StringType, true>(type, options, pool); |
320 | } else { |
321 | result = new VarSizeBinaryConverter<StringType, false>(type, options, pool); |
322 | } |
323 | break; |
324 | |
325 | default: { |
326 | return Status::NotImplemented("CSV conversion to " , type->ToString(), |
327 | " is not supported" ); |
328 | } |
329 | |
330 | #undef CONVERTER_CASE |
331 | } |
332 | out->reset(result); |
333 | return result->Initialize(); |
334 | } |
335 | |
336 | Status Converter::Make(const std::shared_ptr<DataType>& type, |
337 | const ConvertOptions& options, std::shared_ptr<Converter>* out) { |
338 | return Make(type, options, default_memory_pool(), out); |
339 | } |
340 | |
341 | } // namespace csv |
342 | } // namespace arrow |
343 | |