1 | /* |
2 | * Copyright 2014-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #ifndef FOLLY_GEN_STRING_H_ |
18 | #error This file may only be included from folly/gen/String.h |
19 | #endif |
20 | |
21 | #include <folly/Conv.h> |
22 | #include <folly/Portability.h> |
23 | #include <folly/String.h> |
24 | |
25 | namespace folly { |
26 | namespace gen { |
27 | namespace detail { |
28 | |
29 | /** |
30 | * Finds the first occurrence of delimiter in "in", advances "in" past the |
31 | * delimiter. Populates "prefix" with the consumed bytes, including the |
32 | * delimiter. |
33 | * |
34 | * Returns the number of trailing bytes of "prefix" that make up the |
35 | * delimiter, or 0 if the delimiter was not found. |
36 | */ |
37 | inline size_t |
38 | splitPrefix(StringPiece& in, StringPiece& prefix, char delimiter) { |
39 | size_t found = in.find(delimiter); |
40 | if (found != StringPiece::npos) { |
41 | ++found; |
42 | prefix.assign(in.data(), in.data() + found); |
43 | in.advance(found); |
44 | return 1; |
45 | } |
46 | prefix.clear(); |
47 | return 0; |
48 | } |
49 | |
50 | /** |
51 | * As above, but supports multibyte delimiters. |
52 | */ |
53 | inline size_t |
54 | splitPrefix(StringPiece& in, StringPiece& prefix, StringPiece delimiter) { |
55 | auto found = in.find(delimiter); |
56 | if (found != StringPiece::npos) { |
57 | found += delimiter.size(); |
58 | prefix.assign(in.data(), in.data() + found); |
59 | in.advance(found); |
60 | return delimiter.size(); |
61 | } |
62 | prefix.clear(); |
63 | return 0; |
64 | } |
65 | |
66 | /** |
67 | * As above, but splits by any of the EOL terms: \r, \n, or \r\n. |
68 | */ |
69 | inline size_t splitPrefix(StringPiece& in, StringPiece& prefix, MixedNewlines) { |
70 | const auto kCRLF = "\r\n" ; |
71 | const size_t kLenCRLF = 2; |
72 | |
73 | auto p = in.find_first_of(kCRLF); |
74 | if (p != std::string::npos) { |
75 | const auto in_start = in.data(); |
76 | size_t delim_len = 1; |
77 | in.advance(p); |
78 | // Either remove an MS-DOS CR-LF 2-byte newline, or eat 1 byte at a time. |
79 | if (in.removePrefix(kCRLF)) { |
80 | delim_len = kLenCRLF; |
81 | } else { |
82 | in.advance(delim_len); |
83 | } |
84 | prefix.assign(in_start, in.data()); |
85 | return delim_len; |
86 | } |
87 | prefix.clear(); |
88 | return 0; |
89 | } |
90 | |
// Reinterprets a pointer to unsigned bytes as a pointer to chars.  The
// address is unchanged; only the pointee type differs.
inline const char* ch(const unsigned char* p) {
  return static_cast<const char*>(static_cast<const void*>(p));
}
94 | |
95 | // Chop s into pieces of at most maxLength, feed them to cb |
96 | template <class Callback> |
97 | bool consumeFixedSizeChunks(Callback& cb, StringPiece& s, uint64_t maxLength) { |
98 | while (!s.empty()) { |
99 | auto num_to_add = s.size(); |
100 | if (maxLength) { |
101 | num_to_add = std::min<uint64_t>(num_to_add, maxLength); |
102 | } |
103 | if (!cb(StringPiece(s.begin(), num_to_add))) { |
104 | return false; |
105 | } |
106 | s.advance(num_to_add); |
107 | } |
108 | return true; |
109 | } |
110 | |
111 | // Consumes all of buffer, plus n chars from s. |
112 | template <class Callback> |
113 | bool consumeBufferPlus(Callback& cb, IOBuf& buf, StringPiece& s, uint64_t n) { |
114 | buf.reserve(0, n); |
115 | memcpy(buf.writableTail(), s.data(), n); |
116 | buf.append(n); |
117 | s.advance(n); |
118 | if (!cb(StringPiece(detail::ch(buf.data()), buf.length()))) { |
119 | return false; |
120 | } |
121 | buf.clear(); |
122 | return true; |
123 | } |
124 | |
125 | } // namespace detail |
126 | |
127 | template <class Callback> |
128 | bool StreamSplitter<Callback>::flush() { |
129 | CHECK(maxLength_ == 0 || buffer_.length() < maxLength_); |
130 | if (!pieceCb_(StringPiece(detail::ch(buffer_.data()), buffer_.length()))) { |
131 | return false; |
132 | } |
133 | // We are ready to handle another stream now. |
134 | buffer_.clear(); |
135 | return true; |
136 | } |
137 | |
/**
 * Feeds the next chunk of the stream, "in", to the splitter.
 *
 * Every complete piece found in buffer_ + in is passed to pieceCb_.  A piece
 * ends right after a delimiter (the delimiter is part of the piece, because
 * splitPrefix() includes it in the prefix).  When maxLength_ is nonzero, a
 * piece also ends as soon as it reaches maxLength_ bytes.  Trailing bytes
 * that do not yet form a complete piece are appended to buffer_ and kept
 * for the next call or for flush().
 *
 * Returns false as soon as pieceCb_ returns false (the rest of "in" is then
 * not processed), and true otherwise.
 */
template <class Callback>
bool StreamSplitter<Callback>::operator()(StringPiece in) {
  StringPiece prefix;
  // NB This code assumes a 1-byte delimiter. It's not too hard to support
  // multibyte delimiters, just remember that maxLength_ chunks can end up
  // falling in the middle of a delimiter.
  // splitPrefix() returns the delimiter length (0 if none), which is
  // collapsed to a plain found / not-found flag here.
  bool found = detail::splitPrefix(in, prefix, delimiter_);
  // Phase 1: if a previous call left bytes in buffer_, complete that piece
  // first, since it has to be sent together with the start of "in".
  if (buffer_.length() != 0) {
    if (found) {
      uint64_t num_to_add = prefix.size();
      if (maxLength_) {
        CHECK(buffer_.length() < maxLength_);
        // Consume as much of prefix as possible without exceeding maxLength_
        num_to_add = std::min(maxLength_ - buffer_.length(), num_to_add);
      }

      // Append part of the prefix to the buffer, and send it to the callback
      if (!detail::consumeBufferPlus(pieceCb_, buffer_, prefix, num_to_add)) {
        return false;
      }

      // Whatever remains of the first prefix (only possible when maxLength_
      // cut it short) is sent straight from the input, in maxLength_ chunks.
      if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
        return false;
      }

      found = detail::splitPrefix(in, prefix, delimiter_);
      // Post-conditions:
      // - we consumed all of buffer_ and all of the first prefix.
      // - found, in, and prefix reflect the second delimiter_ search
    } else if (maxLength_ && buffer_.length() + in.size() >= maxLength_) {
      // Send all of buffer_, plus a bit of in, to the callback
      if (!detail::consumeBufferPlus(
              pieceCb_, buffer_, in, maxLength_ - buffer_.length())) {
        return false;
      }
      // Post-conditions:
      // - we consumed all of buffer, and the minimal # of bytes from in
      // - found is false
    } // Otherwise: found is false & we cannot invoke the callback this turn
  }
  // Post-condition: buffer_ is nonempty only if found is false **and**
  // len(buffer + in) < maxLength_.

  // Phase 2: every remaining delimiter-terminated piece lies entirely within
  // "in", so it can be handed over without copying.
  // Send lines to callback directly from input (no buffer)
  while (found) { // Buffer guaranteed to be empty
    if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
      return false;
    }
    found = detail::splitPrefix(in, prefix, delimiter_);
  }

  // Phase 3: the tail of "in" has no delimiter, but may still be long enough
  // to yield full maxLength_-sized pieces.
  // No more delimiters left; consume 'in' until it is shorter than maxLength_
  if (maxLength_) {
    while (in.size() >= maxLength_) { // Buffer is guaranteed to be empty
      if (!pieceCb_(StringPiece(in.begin(), maxLength_))) {
        return false;
      }
      in.advance(maxLength_);
    }
  }

  // Phase 4: stash the incomplete remainder until more input (or flush())
  // arrives.
  if (!in.empty()) { // Buffer may be nonempty
    // Incomplete line left, append to buffer
    buffer_.reserve(0, in.size());
    memcpy(buffer_.writableTail(), in.data(), in.size());
    buffer_.append(in.size());
  }
  // Invariant on exit: the buffered remainder is always shorter than a full
  // piece (flush() checks the same condition).
  CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
  return true;
}
208 | |
209 | namespace detail { |
210 | |
211 | class StringResplitter : public Operator<StringResplitter> { |
212 | char delimiter_; |
213 | bool keepDelimiter_; |
214 | |
215 | public: |
216 | explicit StringResplitter(char delimiter, bool keepDelimiter = false) |
217 | : delimiter_(delimiter), keepDelimiter_(keepDelimiter) {} |
218 | |
219 | template <class Source> |
220 | class Generator : public GenImpl<StringPiece, Generator<Source>> { |
221 | Source source_; |
222 | char delimiter_; |
223 | bool keepDelimiter_; |
224 | |
225 | public: |
226 | Generator(Source source, char delimiter, bool keepDelimiter) |
227 | : source_(std::move(source)), |
228 | delimiter_(delimiter), |
229 | keepDelimiter_(keepDelimiter) {} |
230 | |
231 | template <class Body> |
232 | bool apply(Body&& body) const { |
233 | auto splitter = |
234 | streamSplitter(this->delimiter_, [this, &body](StringPiece s) { |
235 | // The stream ended with a delimiter; our contract is to swallow |
236 | // the final empty piece. |
237 | if (s.empty()) { |
238 | return true; |
239 | } |
240 | if (s.back() != this->delimiter_) { |
241 | return body(s); |
242 | } |
243 | if (!keepDelimiter_) { |
244 | s.pop_back(); // Remove the 1-character delimiter |
245 | } |
246 | return body(s); |
247 | }); |
248 | if (!source_.apply(splitter)) { |
249 | return false; |
250 | } |
251 | return splitter.flush(); |
252 | } |
253 | |
254 | static constexpr bool infinite = Source::infinite; |
255 | }; |
256 | |
257 | template <class Source, class Value, class Gen = Generator<Source>> |
258 | Gen compose(GenImpl<Value, Source>&& source) const { |
259 | return Gen(std::move(source.self()), delimiter_, keepDelimiter_); |
260 | } |
261 | |
262 | template <class Source, class Value, class Gen = Generator<Source>> |
263 | Gen compose(const GenImpl<Value, Source>& source) const { |
264 | return Gen(source.self(), delimiter_, keepDelimiter_); |
265 | } |
266 | }; |
267 | |
268 | template <class DelimiterType = char> |
269 | class SplitStringSource |
270 | : public GenImpl<StringPiece, SplitStringSource<DelimiterType>> { |
271 | StringPiece source_; |
272 | DelimiterType delimiter_; |
273 | |
274 | public: |
275 | SplitStringSource(const StringPiece source, DelimiterType delimiter) |
276 | : source_(source), delimiter_(std::move(delimiter)) {} |
277 | |
278 | template <class Body> |
279 | bool apply(Body&& body) const { |
280 | StringPiece rest(source_); |
281 | StringPiece prefix; |
282 | while (size_t delim_len = splitPrefix(rest, prefix, this->delimiter_)) { |
283 | prefix.subtract(delim_len); // Remove the delimiter |
284 | if (!body(prefix)) { |
285 | return false; |
286 | } |
287 | } |
288 | if (!rest.empty()) { |
289 | if (!body(rest)) { |
290 | return false; |
291 | } |
292 | } |
293 | return true; |
294 | } |
295 | }; |
296 | |
297 | /** |
298 | * Unsplit - For joining tokens from a generator into a string. This is |
299 | * the inverse of `split` above. |
300 | * |
301 | * This type is primarily used through the 'unsplit' function. |
302 | */ |
303 | template <class Delimiter, class Output> |
304 | class Unsplit : public Operator<Unsplit<Delimiter, Output>> { |
305 | Delimiter delimiter_; |
306 | |
307 | public: |
308 | explicit Unsplit(const Delimiter& delimiter) : delimiter_(delimiter) {} |
309 | |
310 | template <class Source, class Value> |
311 | Output compose(const GenImpl<Value, Source>& source) const { |
312 | Output outputBuffer; |
313 | UnsplitBuffer<Delimiter, Output> unsplitter(delimiter_, &outputBuffer); |
314 | unsplitter.compose(source); |
315 | return outputBuffer; |
316 | } |
317 | }; |
318 | |
319 | /** |
320 | * UnsplitBuffer - For joining tokens from a generator into a string, |
321 | * and inserting them into a custom buffer. |
322 | * |
323 | * This type is primarily used through the 'unsplit' function. |
324 | */ |
325 | template <class Delimiter, class OutputBuffer> |
326 | class UnsplitBuffer : public Operator<UnsplitBuffer<Delimiter, OutputBuffer>> { |
327 | Delimiter delimiter_; |
328 | OutputBuffer* outputBuffer_; |
329 | |
330 | public: |
331 | UnsplitBuffer(const Delimiter& delimiter, OutputBuffer* outputBuffer) |
332 | : delimiter_(delimiter), outputBuffer_(outputBuffer) { |
333 | CHECK(outputBuffer); |
334 | } |
335 | |
336 | template <class Source, class Value> |
337 | void compose(const GenImpl<Value, Source>& source) const { |
338 | // If the output buffer is empty, we skip inserting the delimiter for the |
339 | // first element. |
340 | bool skipDelim = outputBuffer_->empty(); |
341 | source | [&](Value v) { |
342 | if (skipDelim) { |
343 | skipDelim = false; |
344 | toAppend(std::forward<Value>(v), outputBuffer_); |
345 | } else { |
346 | toAppend(delimiter_, std::forward<Value>(v), outputBuffer_); |
347 | } |
348 | }; |
349 | } |
350 | }; |
351 | |
/**
 * Hack for static for-like constructs.
 *
 * Returns its argument unchanged.  The second, unused template parameter
 * exists only so that a parameter pack can be named in the call, which lets
 * the call expression be repeated once per pack element.
 */
template <class Target, class = void>
inline auto passthrough(Target target) -> Target {
  return target;
}
359 | |
360 | FOLLY_PUSH_WARNING |
361 | #ifdef __clang__ |
362 | // Clang isn't happy with eatField() hack below. |
363 | #pragma GCC diagnostic ignored "-Wreturn-stack-address" |
364 | #endif // __clang__ |
365 | |
/**
 * SplitTo - For splitting a record and immediately converting it to a
 * target container type (e.g. a tuple), one field per entry in Targets.
 * Primarily used through the 'eachToTuple' helper, like so:
 *
 *   auto config
 *     = split("1:a 2:b", ' ')
 *     | eachToTuple<int, string>()
 *     | as<vector<tuple<int, string>>>();
 *
 * Throws std::runtime_error("field count mismatch") when split() reports
 * failure for the given line.
 */
template <class TargetContainer, class Delimiter, class... Targets>
class SplitTo {
  Delimiter delimiter_;

 public:
  explicit SplitTo(Delimiter delimiter) : delimiter_(delimiter) {}

  TargetContainer operator()(StringPiece line) const {
    // Index of the next element of fields[] that eatField() hands out.
    int i = 0;
    // One raw (unconverted) field per target type.
    StringPiece fields[sizeof...(Targets)];
    // HACK(tjackson): Used for referencing fields[] corresponding to variadic
    // template parameters.
    // Each call yields a reference to the next element of fields[].
    // NOTE(review): both pack expansions below are ordinary argument lists,
    // so this relies on the compiler evaluating the eatField() calls in the
    // same order in both places -- confirm on any new toolchain.
    auto eatField = [&]() -> StringPiece& { return fields[i++]; };
    // First pass: split the line, storing one piece into each fields[] slot.
    if (!split(
            delimiter_,
            line,
            detail::passthrough<StringPiece&, Targets>(eatField())...)) {
      throw std::runtime_error("field count mismatch" );
    }
    // Second pass: rewind and convert each raw field to its target type.
    i = 0;
    return TargetContainer(To<Targets>()(eatField())...);
  }
};
399 | |
400 | FOLLY_POP_WARNING |
401 | |
402 | } // namespace detail |
403 | |
404 | } // namespace gen |
405 | } // namespace folly |
406 | |