1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #include "arrow/csv/chunker.h" |
19 | |
20 | #include <cstdint> |
21 | |
22 | #include "arrow/status.h" |
23 | #include "arrow/util/logging.h" |
24 | |
25 | namespace arrow { |
26 | namespace csv { |
27 | |
28 | namespace { |
29 | |
// Find the last newline character ('\r' or '\n') in the given data block.
//
// `data` points to the first byte of the block and `size` is its length in
// bytes.  Returns a pointer to the last '\r' or '\n' in [data, data + size),
// or nullptr if there is none (like memchr()).
const char* FindNewlineReverse(const char* data, uint32_t size) {
  // Start one past the end and step back *before* each read, so the cursor
  // always stays within [data, data + size].  Forming a pointer that lies
  // before `data` would be undefined behavior even if never dereferenced.
  const char* s = data + size;
  while (s != data) {
    --s;
    if (*s == '\r' || *s == '\n') {
      return s;
    }
  }
  return nullptr;
}
46 | |
47 | } // namespace |
48 | |
49 | Chunker::Chunker(ParseOptions options) : options_(options) {} |
50 | |
// NOTE: csvmonkey (https://github.com/dw/csvmonkey) has optimization ideas
52 | |
53 | template <bool quoting, bool escaping> |
54 | inline const char* Chunker::ReadLine(const char* data, const char* data_end) { |
55 | DCHECK_EQ(quoting, options_.quoting); |
56 | DCHECK_EQ(escaping, options_.escaping); |
57 | |
58 | // The parsing state machine |
59 | char c; |
60 | |
61 | FieldStart: |
62 | // At the start of a field |
63 | // Quoting is only recognized at start of field |
64 | if (quoting && ARROW_PREDICT_TRUE(data != data_end) && *data == options_.quote_char) { |
65 | data++; |
66 | goto InQuotedField; |
67 | } else { |
68 | goto InField; |
69 | } |
70 | |
71 | InField: |
72 | // Inside a non-quoted part of a field |
73 | if (ARROW_PREDICT_FALSE(data == data_end)) { |
74 | goto AbortLine; |
75 | } |
76 | c = *data++; |
77 | if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) { |
78 | if (ARROW_PREDICT_FALSE(data == data_end)) { |
79 | goto AbortLine; |
80 | } |
81 | data++; |
82 | goto InField; |
83 | } |
84 | if (ARROW_PREDICT_FALSE(c == '\r')) { |
85 | if (ARROW_PREDICT_TRUE(data != data_end) && *data == '\n') { |
86 | data++; |
87 | } |
88 | goto LineEnd; |
89 | } |
90 | if (ARROW_PREDICT_FALSE(c == '\n')) { |
91 | goto LineEnd; |
92 | } |
93 | if (ARROW_PREDICT_FALSE(c == options_.delimiter)) { |
94 | goto FieldEnd; |
95 | } |
96 | goto InField; |
97 | |
98 | InQuotedField: |
99 | // Inside a quoted part of a field |
100 | if (ARROW_PREDICT_FALSE(data == data_end)) { |
101 | goto AbortLine; |
102 | } |
103 | c = *data++; |
104 | if (escaping && ARROW_PREDICT_FALSE(c == options_.escape_char)) { |
105 | if (data == data_end) { |
106 | goto AbortLine; |
107 | } |
108 | data++; |
109 | goto InQuotedField; |
110 | } |
111 | if (ARROW_PREDICT_FALSE(c == options_.quote_char)) { |
112 | if (options_.double_quote && data != data_end && *data == options_.quote_char) { |
113 | // Double-quoting |
114 | data++; |
115 | } else { |
116 | // End of single-quoting |
117 | goto InField; |
118 | } |
119 | } |
120 | goto InQuotedField; |
121 | |
122 | FieldEnd: |
123 | // At the end of a field |
124 | goto FieldStart; |
125 | |
126 | LineEnd: |
127 | return data; |
128 | |
129 | AbortLine: |
130 | // Truncated line at end of block |
131 | return nullptr; |
132 | } |
133 | |
134 | template <bool quoting, bool escaping> |
135 | Status Chunker::ProcessSpecialized(const char* start, uint32_t size, uint32_t* out_size) { |
136 | DCHECK_EQ(quoting, options_.quoting); |
137 | DCHECK_EQ(escaping, options_.escaping); |
138 | |
139 | const char* data = start; |
140 | const char* data_end = start + size; |
141 | |
142 | while (data < data_end) { |
143 | const char* line_end = ReadLine<quoting, escaping>(data, data_end); |
144 | if (line_end == nullptr) { |
145 | // Cannot read any further |
146 | break; |
147 | } |
148 | data = line_end; |
149 | } |
150 | *out_size = static_cast<uint32_t>(data - start); |
151 | return Status::OK(); |
152 | } |
153 | |
154 | Status Chunker::Process(const char* start, uint32_t size, uint32_t* out_size) { |
155 | if (!options_.newlines_in_values) { |
156 | // In newlines are not accepted in CSV values, we can simply search for |
157 | // the last newline character. |
158 | // For common block sizes and CSV row sizes, this avoids reading |
159 | // most of the data block, making the chunker extremely fast compared |
160 | // to the rest of the CSV reading pipeline. |
161 | const char* nl = FindNewlineReverse(start, size); |
162 | if (nl == nullptr) { |
163 | *out_size = 0; |
164 | } else { |
165 | *out_size = static_cast<uint32_t>(nl - start + 1); |
166 | } |
167 | return Status::OK(); |
168 | } |
169 | |
170 | if (options_.quoting) { |
171 | if (options_.escaping) { |
172 | return ProcessSpecialized<true, true>(start, size, out_size); |
173 | } else { |
174 | return ProcessSpecialized<true, false>(start, size, out_size); |
175 | } |
176 | } else { |
177 | if (options_.escaping) { |
178 | return ProcessSpecialized<false, true>(start, size, out_size); |
179 | } else { |
180 | return ProcessSpecialized<false, false>(start, size, out_size); |
181 | } |
182 | } |
183 | } |
184 | |
185 | } // namespace csv |
186 | } // namespace arrow |
187 | |