1/*
2 * Copyright 2014-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#ifndef FOLLY_GEN_STRING_H_
18#error This file may only be included from folly/gen/String.h
19#endif
20
#include <cstddef>
#include <utility>

#include <folly/Conv.h>
#include <folly/Portability.h>
#include <folly/String.h>
24
25namespace folly {
26namespace gen {
27namespace detail {
28
29/**
30 * Finds the first occurrence of delimiter in "in", advances "in" past the
31 * delimiter. Populates "prefix" with the consumed bytes, including the
32 * delimiter.
33 *
34 * Returns the number of trailing bytes of "prefix" that make up the
35 * delimiter, or 0 if the delimiter was not found.
36 */
37inline size_t
38splitPrefix(StringPiece& in, StringPiece& prefix, char delimiter) {
39 size_t found = in.find(delimiter);
40 if (found != StringPiece::npos) {
41 ++found;
42 prefix.assign(in.data(), in.data() + found);
43 in.advance(found);
44 return 1;
45 }
46 prefix.clear();
47 return 0;
48}
49
50/**
51 * As above, but supports multibyte delimiters.
52 */
53inline size_t
54splitPrefix(StringPiece& in, StringPiece& prefix, StringPiece delimiter) {
55 auto found = in.find(delimiter);
56 if (found != StringPiece::npos) {
57 found += delimiter.size();
58 prefix.assign(in.data(), in.data() + found);
59 in.advance(found);
60 return delimiter.size();
61 }
62 prefix.clear();
63 return 0;
64}
65
66/**
67 * As above, but splits by any of the EOL terms: \r, \n, or \r\n.
68 */
69inline size_t splitPrefix(StringPiece& in, StringPiece& prefix, MixedNewlines) {
70 const auto kCRLF = "\r\n";
71 const size_t kLenCRLF = 2;
72
73 auto p = in.find_first_of(kCRLF);
74 if (p != std::string::npos) {
75 const auto in_start = in.data();
76 size_t delim_len = 1;
77 in.advance(p);
78 // Either remove an MS-DOS CR-LF 2-byte newline, or eat 1 byte at a time.
79 if (in.removePrefix(kCRLF)) {
80 delim_len = kLenCRLF;
81 } else {
82 in.advance(delim_len);
83 }
84 prefix.assign(in_start, in.data());
85 return delim_len;
86 }
87 prefix.clear();
88 return 0;
89}
90
// Reinterprets a pointer to unsigned bytes as a pointer to plain chars.  The
// address is unchanged; only the pointee type differs.
inline const char* ch(const unsigned char* p) {
  const void* untyped = p;
  return static_cast<const char*>(untyped);
}
94
95// Chop s into pieces of at most maxLength, feed them to cb
96template <class Callback>
97bool consumeFixedSizeChunks(Callback& cb, StringPiece& s, uint64_t maxLength) {
98 while (!s.empty()) {
99 auto num_to_add = s.size();
100 if (maxLength) {
101 num_to_add = std::min<uint64_t>(num_to_add, maxLength);
102 }
103 if (!cb(StringPiece(s.begin(), num_to_add))) {
104 return false;
105 }
106 s.advance(num_to_add);
107 }
108 return true;
109}
110
111// Consumes all of buffer, plus n chars from s.
112template <class Callback>
113bool consumeBufferPlus(Callback& cb, IOBuf& buf, StringPiece& s, uint64_t n) {
114 buf.reserve(0, n);
115 memcpy(buf.writableTail(), s.data(), n);
116 buf.append(n);
117 s.advance(n);
118 if (!cb(StringPiece(detail::ch(buf.data()), buf.length()))) {
119 return false;
120 }
121 buf.clear();
122 return true;
123}
124
125} // namespace detail
126
127template <class Callback>
128bool StreamSplitter<Callback>::flush() {
129 CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
130 if (!pieceCb_(StringPiece(detail::ch(buffer_.data()), buffer_.length()))) {
131 return false;
132 }
133 // We are ready to handle another stream now.
134 buffer_.clear();
135 return true;
136}
137
138template <class Callback>
139bool StreamSplitter<Callback>::operator()(StringPiece in) {
140 StringPiece prefix;
141 // NB This code assumes a 1-byte delimiter. It's not too hard to support
142 // multibyte delimiters, just remember that maxLength_ chunks can end up
143 // falling in the middle of a delimiter.
144 bool found = detail::splitPrefix(in, prefix, delimiter_);
145 if (buffer_.length() != 0) {
146 if (found) {
147 uint64_t num_to_add = prefix.size();
148 if (maxLength_) {
149 CHECK(buffer_.length() < maxLength_);
150 // Consume as much of prefix as possible without exceeding maxLength_
151 num_to_add = std::min(maxLength_ - buffer_.length(), num_to_add);
152 }
153
154 // Append part of the prefix to the buffer, and send it to the callback
155 if (!detail::consumeBufferPlus(pieceCb_, buffer_, prefix, num_to_add)) {
156 return false;
157 }
158
159 if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
160 return false;
161 }
162
163 found = detail::splitPrefix(in, prefix, delimiter_);
164 // Post-conditions:
165 // - we consumed all of buffer_ and all of the first prefix.
166 // - found, in, and prefix reflect the second delimiter_ search
167 } else if (maxLength_ && buffer_.length() + in.size() >= maxLength_) {
168 // Send all of buffer_, plus a bit of in, to the callback
169 if (!detail::consumeBufferPlus(
170 pieceCb_, buffer_, in, maxLength_ - buffer_.length())) {
171 return false;
172 }
173 // Post-conditions:
174 // - we consumed all of buffer, and the minimal # of bytes from in
175 // - found is false
176 } // Otherwise: found is false & we cannot invoke the callback this turn
177 }
178 // Post-condition: buffer_ is nonempty only if found is false **and**
179 // len(buffer + in) < maxLength_.
180
181 // Send lines to callback directly from input (no buffer)
182 while (found) { // Buffer guaranteed to be empty
183 if (!detail::consumeFixedSizeChunks(pieceCb_, prefix, maxLength_)) {
184 return false;
185 }
186 found = detail::splitPrefix(in, prefix, delimiter_);
187 }
188
189 // No more delimiters left; consume 'in' until it is shorter than maxLength_
190 if (maxLength_) {
191 while (in.size() >= maxLength_) { // Buffer is guaranteed to be empty
192 if (!pieceCb_(StringPiece(in.begin(), maxLength_))) {
193 return false;
194 }
195 in.advance(maxLength_);
196 }
197 }
198
199 if (!in.empty()) { // Buffer may be nonempty
200 // Incomplete line left, append to buffer
201 buffer_.reserve(0, in.size());
202 memcpy(buffer_.writableTail(), in.data(), in.size());
203 buffer_.append(in.size());
204 }
205 CHECK(maxLength_ == 0 || buffer_.length() < maxLength_);
206 return true;
207}
208
209namespace detail {
210
211class StringResplitter : public Operator<StringResplitter> {
212 char delimiter_;
213 bool keepDelimiter_;
214
215 public:
216 explicit StringResplitter(char delimiter, bool keepDelimiter = false)
217 : delimiter_(delimiter), keepDelimiter_(keepDelimiter) {}
218
219 template <class Source>
220 class Generator : public GenImpl<StringPiece, Generator<Source>> {
221 Source source_;
222 char delimiter_;
223 bool keepDelimiter_;
224
225 public:
226 Generator(Source source, char delimiter, bool keepDelimiter)
227 : source_(std::move(source)),
228 delimiter_(delimiter),
229 keepDelimiter_(keepDelimiter) {}
230
231 template <class Body>
232 bool apply(Body&& body) const {
233 auto splitter =
234 streamSplitter(this->delimiter_, [this, &body](StringPiece s) {
235 // The stream ended with a delimiter; our contract is to swallow
236 // the final empty piece.
237 if (s.empty()) {
238 return true;
239 }
240 if (s.back() != this->delimiter_) {
241 return body(s);
242 }
243 if (!keepDelimiter_) {
244 s.pop_back(); // Remove the 1-character delimiter
245 }
246 return body(s);
247 });
248 if (!source_.apply(splitter)) {
249 return false;
250 }
251 return splitter.flush();
252 }
253
254 static constexpr bool infinite = Source::infinite;
255 };
256
257 template <class Source, class Value, class Gen = Generator<Source>>
258 Gen compose(GenImpl<Value, Source>&& source) const {
259 return Gen(std::move(source.self()), delimiter_, keepDelimiter_);
260 }
261
262 template <class Source, class Value, class Gen = Generator<Source>>
263 Gen compose(const GenImpl<Value, Source>& source) const {
264 return Gen(source.self(), delimiter_, keepDelimiter_);
265 }
266};
267
268template <class DelimiterType = char>
269class SplitStringSource
270 : public GenImpl<StringPiece, SplitStringSource<DelimiterType>> {
271 StringPiece source_;
272 DelimiterType delimiter_;
273
274 public:
275 SplitStringSource(const StringPiece source, DelimiterType delimiter)
276 : source_(source), delimiter_(std::move(delimiter)) {}
277
278 template <class Body>
279 bool apply(Body&& body) const {
280 StringPiece rest(source_);
281 StringPiece prefix;
282 while (size_t delim_len = splitPrefix(rest, prefix, this->delimiter_)) {
283 prefix.subtract(delim_len); // Remove the delimiter
284 if (!body(prefix)) {
285 return false;
286 }
287 }
288 if (!rest.empty()) {
289 if (!body(rest)) {
290 return false;
291 }
292 }
293 return true;
294 }
295};
296
297/**
298 * Unsplit - For joining tokens from a generator into a string. This is
299 * the inverse of `split` above.
300 *
301 * This type is primarily used through the 'unsplit' function.
302 */
303template <class Delimiter, class Output>
304class Unsplit : public Operator<Unsplit<Delimiter, Output>> {
305 Delimiter delimiter_;
306
307 public:
308 explicit Unsplit(const Delimiter& delimiter) : delimiter_(delimiter) {}
309
310 template <class Source, class Value>
311 Output compose(const GenImpl<Value, Source>& source) const {
312 Output outputBuffer;
313 UnsplitBuffer<Delimiter, Output> unsplitter(delimiter_, &outputBuffer);
314 unsplitter.compose(source);
315 return outputBuffer;
316 }
317};
318
319/**
320 * UnsplitBuffer - For joining tokens from a generator into a string,
321 * and inserting them into a custom buffer.
322 *
323 * This type is primarily used through the 'unsplit' function.
324 */
325template <class Delimiter, class OutputBuffer>
326class UnsplitBuffer : public Operator<UnsplitBuffer<Delimiter, OutputBuffer>> {
327 Delimiter delimiter_;
328 OutputBuffer* outputBuffer_;
329
330 public:
331 UnsplitBuffer(const Delimiter& delimiter, OutputBuffer* outputBuffer)
332 : delimiter_(delimiter), outputBuffer_(outputBuffer) {
333 CHECK(outputBuffer);
334 }
335
336 template <class Source, class Value>
337 void compose(const GenImpl<Value, Source>& source) const {
338 // If the output buffer is empty, we skip inserting the delimiter for the
339 // first element.
340 bool skipDelim = outputBuffer_->empty();
341 source | [&](Value v) {
342 if (skipDelim) {
343 skipDelim = false;
344 toAppend(std::forward<Value>(v), outputBuffer_);
345 } else {
346 toAppend(delimiter_, std::forward<Value>(v), outputBuffer_);
347 }
348 };
349 }
350};
351
/**
 * Returns its argument unchanged.  The second, unused template parameter
 * exists so that a call can be made to depend on an extra type, which allows
 * the call to be repeated once per element of a parameter pack (a static
 * for-like construct).
 */
template <class Target, class = void>
inline auto passthrough(Target target) -> Target {
  return target;
}
359
360FOLLY_PUSH_WARNING
361#ifdef __clang__
362// Clang isn't happy with eatField() hack below.
363#pragma GCC diagnostic ignored "-Wreturn-stack-address"
364#endif // __clang__
365
366/**
367 * ParseToTuple - For splitting a record and immediatlely converting it to a
368 * target tuple type. Primary used through the 'eachToTuple' helper, like so:
369 *
370 * auto config
371 * = split("1:a 2:b", ' ')
372 * | eachToTuple<int, string>()
373 * | as<vector<tuple<int, string>>>();
374 *
375 */
376template <class TargetContainer, class Delimiter, class... Targets>
377class SplitTo {
378 Delimiter delimiter_;
379
380 public:
381 explicit SplitTo(Delimiter delimiter) : delimiter_(delimiter) {}
382
383 TargetContainer operator()(StringPiece line) const {
384 int i = 0;
385 StringPiece fields[sizeof...(Targets)];
386 // HACK(tjackson): Used for referencing fields[] corresponding to variadic
387 // template parameters.
388 auto eatField = [&]() -> StringPiece& { return fields[i++]; };
389 if (!split(
390 delimiter_,
391 line,
392 detail::passthrough<StringPiece&, Targets>(eatField())...)) {
393 throw std::runtime_error("field count mismatch");
394 }
395 i = 0;
396 return TargetContainer(To<Targets>()(eatField())...);
397 }
398};
399
400FOLLY_POP_WARNING
401
402} // namespace detail
403
404} // namespace gen
405} // namespace folly
406