1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef ARROW_CSV_PARSER_H
19#define ARROW_CSV_PARSER_H
20
21#include <cstddef>
22#include <cstdint>
23#include <memory>
24#include <vector>
25
26#include "arrow/buffer.h"
27#include "arrow/csv/options.h"
28#include "arrow/status.h"
29#include "arrow/util/macros.h"
30#include "arrow/util/visibility.h"
31
32namespace arrow {
33
34class MemoryPool;
35
36namespace csv {
37
/// Default value of the `max_num_rows` argument of the BlockParser constructors.
constexpr int32_t kMaxParserNumRows{100000};
39
40/// \class BlockParser
41/// \brief A reusable block-based parser for CSV data
42///
43/// The parser takes a block of CSV data and delimits rows and fields,
44/// unquoting and unescaping them on the fly. Parsed data is own by the
45/// parser, so the original buffer can be discarded after Parse() returns.
46///
47/// If the block is truncated (i.e. not all data can be parsed), it is up
48/// to the caller to arrange the next block to start with the trailing data.
49/// Also, if the previous block ends with CR (0x0d) and a new block starts
50/// with LF (0x0a), the parser will consider the leading newline as an empty
51/// line; the caller should therefore strip it.
52class ARROW_EXPORT BlockParser {
53 public:
54 explicit BlockParser(ParseOptions options, int32_t num_cols = -1,
55 int32_t max_num_rows = kMaxParserNumRows);
56 explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1,
57 int32_t max_num_rows = kMaxParserNumRows);
58
59 /// \brief Parse a block of data
60 ///
61 /// Parse a block of CSV data, ingesting up to max_num_rows rows.
62 /// The number of bytes actually parsed is returned in out_size.
63 Status Parse(const char* data, uint32_t size, uint32_t* out_size);
64
65 /// \brief Parse the final block of data
66 ///
67 /// Like Parse(), but called with the final block in a file.
68 /// The last row may lack a trailing line separator.
69 Status ParseFinal(const char* data, uint32_t size, uint32_t* out_size);
70
71 /// \brief Return the number of parsed rows
72 int32_t num_rows() const { return num_rows_; }
73 /// \brief Return the number of parsed columns
74 int32_t num_cols() const { return num_cols_; }
75 /// \brief Return the total size in bytes of parsed data
76 uint32_t num_bytes() const { return parsed_size_; }
77
78 /// \brief Visit parsed values in a column
79 ///
80 /// The signature of the visitor is
81 /// Status(const uint8_t* data, uint32_t size, bool quoted)
82 template <typename Visitor>
83 Status VisitColumn(int32_t col_index, Visitor&& visit) const {
84 for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) {
85 const auto& values_buffer = values_buffers_[buf_index];
86 const auto values = reinterpret_cast<const ValueDesc*>(values_buffer->data());
87 const auto max_pos =
88 static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) - 1;
89 for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) {
90 auto start = values[pos].offset;
91 auto stop = values[pos + 1].offset;
92 auto quoted = values[pos + 1].quoted;
93 ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
94 }
95 }
96 return Status::OK();
97 }
98
99 protected:
100 ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
101
102 Status DoParse(const char* data, uint32_t size, bool is_final, uint32_t* out_size);
103 template <typename SpecializedOptions>
104 Status DoParseSpecialized(const char* data, uint32_t size, bool is_final,
105 uint32_t* out_size);
106
107 template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter>
108 Status ParseChunk(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
109 const char* data, const char* data_end, bool is_final,
110 int32_t rows_in_chunk, const char** out_data, bool* finished_parsing);
111
112 // Parse a single line from the data pointer
113 template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter>
114 Status ParseLine(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
115 const char* data, const char* data_end, bool is_final,
116 const char** out_data);
117
118 MemoryPool* pool_;
119 const ParseOptions options_;
120 // The number of rows parsed from the block
121 int32_t num_rows_;
122 // The number of columns (can be -1 at start)
123 int32_t num_cols_;
124 // The maximum number of rows to parse from this block
125 int32_t max_num_rows_;
126
127 // Linear scratchpad for parsed values
128 struct ValueDesc {
129 uint32_t offset : 31;
130 bool quoted : 1;
131 };
132
133 // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
134 // It may help with null parsing...
135 std::vector<std::shared_ptr<Buffer>> values_buffers_;
136 std::shared_ptr<Buffer> parsed_buffer_;
137 const uint8_t* parsed_;
138 int32_t values_size_;
139 int32_t parsed_size_;
140
141 class ResizableValuesWriter;
142 class PresizedValuesWriter;
143 class PresizedParsedWriter;
144};
145
146} // namespace csv
147} // namespace arrow
148
149#endif // ARROW_CSV_PARSER_H
150