parser.h source code [arrow/arrow/csv/parser.h]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	#ifndef ARROW_CSV_PARSER_H
19	#define ARROW_CSV_PARSER_H
20
21	#include <cstddef>
22	#include <cstdint>
23	#include <memory>
24	#include <vector>
25
26	#include "arrow/buffer.h"
27	#include "arrow/csv/options.h"
28	#include "arrow/status.h"
29	#include "arrow/util/macros.h"
30	#include "arrow/util/visibility.h"
31
32	namespace arrow {
33
34	class MemoryPool;
35
36	namespace csv {
37
/// Default value of the `max_num_rows` constructor argument of BlockParser,
/// i.e. the default cap on the number of rows ingested from a single block.
constexpr int32_t kMaxParserNumRows = 100000;
39
40	/// \class BlockParser
41	/// \brief A reusable block-based parser for CSV data
42	///
43	/// The parser takes a block of CSV data and delimits rows and fields,
44	/// unquoting and unescaping them on the fly. Parsed data is own by the
45	/// parser, so the original buffer can be discarded after Parse() returns.
46	///
47	/// If the block is truncated (i.e. not all data can be parsed), it is up
48	/// to the caller to arrange the next block to start with the trailing data.
49	/// Also, if the previous block ends with CR (0x0d) and a new block starts
50	/// with LF (0x0a), the parser will consider the leading newline as an empty
51	/// line; the caller should therefore strip it.
52	class ARROW_EXPORT BlockParser {
53	public:
54	explicit BlockParser(ParseOptions options, int32_t num_cols = -`1`,
55	int32_t max_num_rows = kMaxParserNumRows);
56	explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -`1`,
57	int32_t max_num_rows = kMaxParserNumRows);
58
59	/// \brief Parse a block of data
60	///
61	/// Parse a block of CSV data, ingesting up to max_num_rows rows.
62	/// The number of bytes actually parsed is returned in out_size.
63	Status Parse(const char* data, uint32_t size, uint32_t* out_size);
64
65	/// \brief Parse the final block of data
66	///
67	/// Like Parse(), but called with the final block in a file.
68	/// The last row may lack a trailing line separator.
69	Status ParseFinal(const char* data, uint32_t size, uint32_t* out_size);
70
71	/// \brief Return the number of parsed rows
72	int32_t num_rows() const { return num_rows_; }
73	/// \brief Return the number of parsed columns
74	int32_t num_cols() const { return num_cols_; }
75	/// \brief Return the total size in bytes of parsed data
76	uint32_t num_bytes() const { return parsed_size_; }
77
78	/// \brief Visit parsed values in a column
79	///
80	/// The signature of the visitor is
81	/// Status(const uint8_t data, uint32_t size, bool quoted)*
82	template <typename Visitor>
83	Status VisitColumn(int32_t col_index, Visitor&& visit) const {
84	for (size_t buf_index = `0`; buf_index < values_buffers_.size(); ++buf_index) {
85	const auto& values_buffer = values_buffers_[buf_index];
86	const auto values = reinterpret_cast<const ValueDesc*>(values_buffer ->data());
87	const auto max_pos =
88	static_cast<int32_t>(values_buffer ->size() / sizeof(ValueDesc)) - `1`;
89	for (int32_t pos = col_index; pos < max_pos; pos += num_cols_) {
90	auto start = values[pos].offset;
91	auto stop = values[pos + `1`].offset;
92	auto quoted = values[pos + `1`].quoted;
93	ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
94	}
95	}
96	return Status::OK();
97	}
98
99	protected:
100	ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
101
102	Status DoParse(const char* data, uint32_t size, bool is_final, uint32_t* out_size);
103	template <typename SpecializedOptions>
104	Status DoParseSpecialized(const char* data, uint32_t size, bool is_final,
105	uint32_t* out_size);
106
107	template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter>
108	Status ParseChunk(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
109	const char* data, const char* data_end, bool is_final,
110	int32_t rows_in_chunk, const char** out_data, bool* finished_parsing);
111
112	// Parse a single line from the data pointer
113	template <typename SpecializedOptions, typename ValuesWriter, typename ParsedWriter>
114	Status ParseLine(ValuesWriter* values_writer, ParsedWriter* parsed_writer,
115	const char* data, const char* data_end, bool is_final,
116	const char** out_data);
117
118	MemoryPool* pool_;
119	const ParseOptions options_;
120	// The number of rows parsed from the block
121	int32_t num_rows_;
122	// The number of columns (can be -1 at start)
123	int32_t num_cols_;
124	// The maximum number of rows to parse from this block
125	int32_t max_num_rows_;
126
127	// Linear scratchpad for parsed values
128	struct ValueDesc {
129	uint32_t offset : `31`;
130	bool quoted : `1`;
131	};
132
133	// XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
134	// It may help with null parsing...
135	std::vector<std::shared_ptr<Buffer>> values_buffers_;
136	std::shared_ptr<Buffer> parsed_buffer_;
137	const uint8_t* parsed_;
138	int32_t values_size_;
139	int32_t parsed_size_;
140
141	class ResizableValuesWriter;
142	class PresizedValuesWriter;
143	class PresizedParsedWriter;
144	};
145
146	} // namespace csv
147	} // namespace arrow
148
149	#endif // ARROW_CSV_PARSER_H
150

Browse the source code of arrow/arrow/csv/parser.h