chunker.cc source code [arrow/arrow/csv/chunker.cc]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	#include "arrow/csv/chunker.h"
19
20	#include <cstdint>
21
22	#include "arrow/status.h"
23	#include "arrow/util/logging.h"
24
25	namespace arrow {
26	namespace csv {
27
28	namespace {
29
// Find the last newline character ('\r' or '\n') in the given data block.
//
// `data` points at the first byte of the block and `size` is its length in
// bytes.  Returns a pointer to the last '\r' or '\n' byte found in
// [data, data + size), or nullptr if there is none (like memchr()).
// An empty block (size == 0) always yields nullptr.
const char* FindNewlineReverse(const char* data, uint32_t size) {
  // Walk backwards from one-past-the-end.  Decrementing *before* looking at
  // the byte means the cursor never moves below `data`.
  const char* s = data + size;
  while (s != data) {
    --s;
    // Test the byte under the cursor, not the cursor itself.
    if (*s == '\r' || *s == '\n') {
      return s;
    }
  }
  return nullptr;
}
46
47	} // namespace
48
49	Chunker::Chunker(ParseOptions options) : options_(options) {}
50
51	// NOTE: csvmonkey (https://github.com/dw/csvmonkey) has optimization ideas
52
53	template <bool quoting, bool escaping>
54	inline const char* Chunker::ReadLine(const char* data, const char* data_end) {
55	DCHECK_EQ(quoting, options_.quoting);
56	DCHECK_EQ(escaping, options_.escaping);
57
58	// The parsing state machine
59	char c;
60
61	FieldStart:
62	// At the start of a field
63	// Quoting is only recognized at start of field
64	if (quoting && ARROW_PREDICT_TRUE(data != data_end) && *data == options_.quote_char) {
65	data++;
66	goto InQuotedField;
67	} else {
68	goto InField;
69	}
70
71	InField:
72	// Inside a non-quoted part of a field
73	if (ARROW_PREDICT_FALSE(data == data_end)) {
74	goto AbortLine;
75	}
76	c = *data++;
77	if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
78	if (ARROW_PREDICT_FALSE(data == data_end)) {
79	goto AbortLine;
80	}
81	data++;
82	goto InField;
83	}
84	if (ARROW_PREDICT_FALSE(c == `'\r'`)) {
85	if (ARROW_PREDICT_TRUE(data != data_end) && *data == `'\n'`) {
86	data++;
87	}
88	goto LineEnd;
89	}
90	if (ARROW_PREDICT_FALSE(c == `'\n'`)) {
91	goto LineEnd;
92	}
93	if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
94	goto FieldEnd;
95	}
96	goto InField;
97
98	InQuotedField:
99	// Inside a quoted part of a field
100	if (ARROW_PREDICT_FALSE(data == data_end)) {
101	goto AbortLine;
102	}
103	c = *data++;
104	if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
105	if (data == data_end) {
106	goto AbortLine;
107	}
108	data++;
109	goto InQuotedField;
110	}
111	if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
112	if (options_.double_quote && data != data_end && *data == options_.quote_char) {
113	// Double-quoting
114	data++;
115	} else {
116	// End of single-quoting
117	goto InField;
118	}
119	}
120	goto InQuotedField;
121
122	FieldEnd:
123	// At the end of a field
124	goto FieldStart;
125
126	LineEnd:
127	return data;
128
129	AbortLine:
130	// Truncated line at end of block
131	return nullptr;
132	}
133
134	template <bool quoting, bool escaping>
135	Status Chunker::ProcessSpecialized(const char* start, uint32_t size, uint32_t* out_size) {
136	DCHECK_EQ(quoting, options_.quoting);
137	DCHECK_EQ(escaping, options_.escaping);
138
139	const char* data = start;
140	const char* data_end = start + size;
141
142	while (data < data_end) {
143	const char* line_end = ReadLine<quoting, escaping>(data, data_end);
144	if (line_end == nullptr) {
145	// Cannot read any further
146	break;
147	}
148	data = line_end;
149	}
150	out_size = static_cast*<uint32_t>(data - start);
151	return Status::OK();
152	}
153
154	Status Chunker::Process(const char* start, uint32_t size, uint32_t* out_size) {
155	if (!options_.newlines_in_values) {
156	// In newlines are not accepted in CSV values, we can simply search for
157	// the last newline character.
158	// For common block sizes and CSV row sizes, this avoids reading
159	// most of the data block, making the chunker extremely fast compared
160	// to the rest of the CSV reading pipeline.
161	const char* nl = FindNewlineReverse(start, size);
162	if (nl == nullptr) {
163	*out_size = `0`;
164	} else {
165	out_size = static_cast*<uint32_t>(nl - start + `1`);
166	}
167	return Status::OK();
168	}
169
170	if (options_.quoting) {
171	if (options_.escaping) {
172	return ProcessSpecialized<true, true>(start, size, out_size);
173	} else {
174	return ProcessSpecialized<true, false>(start, size, out_size);
175	}
176	} else {
177	if (options_.escaping) {
178	return ProcessSpecialized<false, true>(start, size, out_size);
179	} else {
180	return ProcessSpecialized<false, false>(start, size, out_size);
181	}
182	}
183	}
184
185	} // namespace csv
186	} // namespace arrow
187

Browse the source code of arrow/arrow/csv/chunker.cc