1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef ARROW_CSV_PARSER_H |
19 | #define ARROW_CSV_PARSER_H |
20 | |
21 | #include <cstddef> |
22 | #include <cstdint> |
23 | #include <memory> |
24 | #include <vector> |
25 | |
26 | #include "arrow/buffer.h" |
27 | #include "arrow/csv/options.h" |
28 | #include "arrow/status.h" |
29 | #include "arrow/util/macros.h" |
30 | #include "arrow/util/visibility.h" |
31 | |
32 | namespace arrow { |
33 | |
34 | class MemoryPool; |
35 | |
36 | namespace csv { |
37 | |
/// Default cap on the number of rows a BlockParser ingests from one block
/// (used as the default `max_num_rows` constructor argument).
constexpr int32_t kMaxParserNumRows{100000};
39 | |
40 | /// \class BlockParser |
41 | /// \brief A reusable block-based parser for CSV data |
42 | /// |
43 | /// The parser takes a block of CSV data and delimits rows and fields, |
44 | /// unquoting and unescaping them on the fly. Parsed data is own by the |
45 | /// parser, so the original buffer can be discarded after Parse() returns. |
46 | /// |
47 | /// If the block is truncated (i.e. not all data can be parsed), it is up |
48 | /// to the caller to arrange the next block to start with the trailing data. |
49 | /// Also, if the previous block ends with CR (0x0d) and a new block starts |
50 | /// with LF (0x0a), the parser will consider the leading newline as an empty |
51 | /// line; the caller should therefore strip it. |
52 | class ARROW_EXPORT BlockParser { |
53 | public: |
54 | explicit BlockParser(ParseOptions options, int32_t num_cols = -1, |
55 | int32_t max_num_rows = kMaxParserNumRows); |
56 | explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1, |
57 | int32_t max_num_rows = kMaxParserNumRows); |
58 | |
59 | /// \brief Parse a block of data |
60 | /// |
61 | /// Parse a block of CSV data, ingesting up to max_num_rows rows. |
62 | /// The number of bytes actually parsed is returned in out_size. |
63 | Status Parse(const char* data, uint32_t size, uint32_t* out_size); |
64 | |
65 | /// \brief Parse the final block of data |
66 | /// |
67 | /// Like Parse(), but called with the final block in a file. |
68 | /// The last row may lack a trailing line separator. |
69 | Status ParseFinal(const char* data, uint32_t size, uint32_t* out_size); |
70 | |
71 | /// \brief Return the number of parsed rows |
72 | int32_t num_rows() const { return num_rows_; } |
73 | /// \brief Return the number of parsed columns |
74 | int32_t num_cols() const { return num_cols_; } |
75 | /// \brief Return the total size in bytes of parsed data |
76 | uint32_t num_bytes() const { return parsed_size_; } |
77 | |
78 | /// \brief Visit parsed values in a column |
79 | /// |
80 | /// The signature of the visitor is |
81 | /// Status(const uint8_t* data, uint32_t size, bool quoted) |
82 | template <typename Visitor> |
83 | Status VisitColumn(int32_t col_index, Visitor&& visit) const { |
84 | for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) { |
85 | const auto& values_buffer = values_buffers_[buf_index]; |
86 | const auto values = reinterpret_cast<const ValueDesc*>(values_buffer->data()); |
87 | const auto max_pos = |
88 | static_cast<int32_t>(values_buffer->size() / sizeof(ValueDesc)) - 1; |
89 | for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) { |
90 | auto start = values[pos].offset; |
91 | auto stop = values[pos + 1].offset; |
92 | auto quoted = values[pos + 1].quoted; |
93 | ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted)); |
94 | } |
95 | } |
96 | return Status::OK(); |
97 | } |
98 | |
99 | protected: |
100 | ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser); |
101 | |
102 | Status DoParse(const char* data, uint32_t size, bool is_final, uint32_t* out_size); |
103 | template <typename SpecializedOptions> |
104 | Status DoParseSpecialized(const char* data, uint32_t size, bool is_final, |
105 | uint32_t* out_size); |
106 | |
107 | template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter> |
108 | Status ParseChunk(ValuesWriter* values_writer, ParsedWriter* parsed_writer, |
109 | const char* data, const char* data_end, bool is_final, |
110 | int32_t rows_in_chunk, const char** out_data, bool* finished_parsing); |
111 | |
112 | // Parse a single line from the data pointer |
113 | template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter> |
114 | Status ParseLine(ValuesWriter* values_writer, ParsedWriter* parsed_writer, |
115 | const char* data, const char* data_end, bool is_final, |
116 | const char** out_data); |
117 | |
118 | MemoryPool* pool_; |
119 | const ParseOptions options_; |
120 | // The number of rows parsed from the block |
121 | int32_t num_rows_; |
122 | // The number of columns (can be -1 at start) |
123 | int32_t num_cols_; |
124 | // The maximum number of rows to parse from this block |
125 | int32_t max_num_rows_; |
126 | |
127 | // Linear scratchpad for parsed values |
128 | struct ValueDesc { |
129 | uint32_t offset : 31; |
130 | bool quoted : 1; |
131 | }; |
132 | |
133 | // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes? |
134 | // It may help with null parsing... |
135 | std::vector<std::shared_ptr<Buffer>> values_buffers_; |
136 | std::shared_ptr<Buffer> parsed_buffer_; |
137 | const uint8_t* parsed_; |
138 | int32_t values_size_; |
139 | int32_t parsed_size_; |
140 | |
141 | class ResizableValuesWriter; |
142 | class PresizedValuesWriter; |
143 | class PresizedParsedWriter; |
144 | }; |
145 | |
146 | } // namespace csv |
147 | } // namespace arrow |
148 | |
149 | #endif // ARROW_CSV_PARSER_H |
150 | |