1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#include "arrow/csv/chunker.h"
19
20#include <cstdint>
21
22#include "arrow/status.h"
23#include "arrow/util/logging.h"
24
25namespace arrow {
26namespace csv {
27
28namespace {
29
// Find the last newline character ('\r' or '\n') in the given data block.
//
// Returns a pointer to that character, or nullptr if the block is empty
// or contains no newline character (like memchr()).
const char* FindNewlineReverse(const char* data, uint32_t size) {
  // Walk backwards by offset rather than by decrementing a pointer, so that
  // a pointer before the start of the block (`data - 1`) is never formed,
  // even when no newline is found.  An empty block never touches `data`.
  for (uint32_t remaining = size; remaining > 0; --remaining) {
    const char* s = data + (remaining - 1);
    if (*s == '\r' || *s == '\n') {
      return s;
    }
  }
  return nullptr;
}
46
47} // namespace
48
49Chunker::Chunker(ParseOptions options) : options_(options) {}
50
// NOTE: csvmonkey (https://github.com/dw/csvmonkey) has optimization ideas
52
// Scan a single CSV line starting at `data`, never reading at or past
// `data_end`.
//
// The `quoting` and `escaping` template parameters must be equal to
// `options_.quoting` and `options_.escaping` respectively (checked with
// DCHECK_EQ below); they let the compiler drop the corresponding branches.
//
// Returns a pointer just past the line terminator ("\r", "\n" or "\r\n")
// that ends the line, or nullptr if `data_end` is reached before such a
// terminator is found.  Newline characters inside a quoted section, or
// skipped by an escape character, do not terminate the line.
template <bool quoting, bool escaping>
inline const char* Chunker::ReadLine(const char* data, const char* data_end) {
  DCHECK_EQ(quoting, options_.quoting);
  DCHECK_EQ(escaping, options_.escaping);

  // The parsing state machine
  // Each label below is a state; transitions are explicit gotos.
  char c;

FieldStart:
  // At the start of a field
  // Quoting is only recognized at start of field
  if (quoting && ARROW_PREDICT_TRUE(data != data_end) && *data == options_.quote_char) {
    // Consume the opening quote
    data++;
    goto InQuotedField;
  } else {
    goto InField;
  }

InField:
  // Inside a non-quoted part of a field
  if (ARROW_PREDICT_FALSE(data == data_end)) {
    // Ran out of data before seeing a line terminator
    goto AbortLine;
  }
  c = *data++;
  if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
    // The escape character needs a following character to apply to
    if (ARROW_PREDICT_FALSE(data == data_end)) {
      goto AbortLine;
    }
    // Skip the escaped character, whatever it is
    data++;
    goto InField;
  }
  if (ARROW_PREDICT_FALSE(c == '\r')) {
    // Treat "\r\n" as a single terminator.  If '\r' is the last available
    // character, the line ends here without looking for a following '\n'.
    if (ARROW_PREDICT_TRUE(data != data_end) && *data == '\n') {
      data++;
    }
    goto LineEnd;
  }
  if (ARROW_PREDICT_FALSE(c == '\n')) {
    goto LineEnd;
  }
  if (ARROW_PREDICT_FALSE(c == options_.delimiter)) {
    goto FieldEnd;
  }
  goto InField;

InQuotedField:
  // Inside a quoted part of a field
  // Newlines and delimiters are not special here: only the escape and
  // quote characters are examined.
  if (ARROW_PREDICT_FALSE(data == data_end)) {
    // Ran out of data before the closing quote
    goto AbortLine;
  }
  c = *data++;
  if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) {
    // The escape character needs a following character to apply to
    if (data == data_end) {
      goto AbortLine;
    }
    // Skip the escaped character, whatever it is
    data++;
    goto InQuotedField;
  }
  if (ARROW_PREDICT_FALSE(c == options_.quote_char)) {
    if (options_.double_quote && data != data_end && *data == options_.quote_char) {
      // Double-quoting
      // Consume the second quote and stay inside the quoted section
      data++;
    } else {
      // End of single-quoting
      // The remainder of the field is scanned as non-quoted text
      goto InField;
    }
  }
  goto InQuotedField;

FieldEnd:
  // At the end of a field
  // The delimiter has already been consumed; move on to the next field
  goto FieldStart;

LineEnd:
  // `data` points just past the line terminator
  return data;

AbortLine:
  // Truncated line at end of block
  return nullptr;
}
133
134template <bool quoting, bool escaping>
135Status Chunker::ProcessSpecialized(const char* start, uint32_t size, uint32_t* out_size) {
136 DCHECK_EQ(quoting, options_.quoting);
137 DCHECK_EQ(escaping, options_.escaping);
138
139 const char* data = start;
140 const char* data_end = start + size;
141
142 while (data < data_end) {
143 const char* line_end = ReadLine<quoting, escaping>(data, data_end);
144 if (line_end == nullptr) {
145 // Cannot read any further
146 break;
147 }
148 data = line_end;
149 }
150 *out_size = static_cast<uint32_t>(data - start);
151 return Status::OK();
152}
153
154Status Chunker::Process(const char* start, uint32_t size, uint32_t* out_size) {
155 if (!options_.newlines_in_values) {
156 // In newlines are not accepted in CSV values, we can simply search for
157 // the last newline character.
158 // For common block sizes and CSV row sizes, this avoids reading
159 // most of the data block, making the chunker extremely fast compared
160 // to the rest of the CSV reading pipeline.
161 const char* nl = FindNewlineReverse(start, size);
162 if (nl == nullptr) {
163 *out_size = 0;
164 } else {
165 *out_size = static_cast<uint32_t>(nl - start + 1);
166 }
167 return Status::OK();
168 }
169
170 if (options_.quoting) {
171 if (options_.escaping) {
172 return ProcessSpecialized<true, true>(start, size, out_size);
173 } else {
174 return ProcessSpecialized<true, false>(start, size, out_size);
175 }
176 } else {
177 if (options_.escaping) {
178 return ProcessSpecialized<false, true>(start, size, out_size);
179 } else {
180 return ProcessSpecialized<false, false>(start, size, out_size);
181 }
182 }
183}
184
185} // namespace csv
186} // namespace arrow
187