1 | // |
2 | // Copyright 2017 The Abseil Authors. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 (the "License"); |
5 | // you may not use this file except in compliance with the License. |
6 | // You may obtain a copy of the License at |
7 | // |
8 | // https://www.apache.org/licenses/LICENSE-2.0 |
9 | // |
10 | // Unless required by applicable law or agreed to in writing, software |
11 | // distributed under the License is distributed on an "AS IS" BASIS, |
12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | // See the License for the specific language governing permissions and |
14 | // limitations under the License. |
15 | // |
16 | // ----------------------------------------------------------------------------- |
17 | // File: str_split.h |
18 | // ----------------------------------------------------------------------------- |
19 | // |
20 | // This file contains functions for splitting strings. It defines the main |
21 | // `StrSplit()` function, several delimiters for determining the boundaries on |
22 | // which to split the string, and predicates for filtering delimited results. |
23 | // `StrSplit()` adapts the returned collection to the type specified by the |
24 | // caller. |
25 | // |
26 | // Example: |
27 | // |
28 | // // Splits the given string on commas. Returns the results in a |
29 | // // vector of strings. |
30 | // std::vector<std::string> v = absl::StrSplit("a,b,c", ','); |
31 | // // Can also use "," |
32 | // // v[0] == "a", v[1] == "b", v[2] == "c" |
33 | // |
34 | // See StrSplit() below for more information. |
35 | #ifndef ABSL_STRINGS_STR_SPLIT_H_ |
36 | #define ABSL_STRINGS_STR_SPLIT_H_ |
37 | |
38 | #include <algorithm> |
39 | #include <cstddef> |
40 | #include <map> |
41 | #include <set> |
42 | #include <string> |
43 | #include <utility> |
44 | #include <vector> |
45 | |
46 | #include "absl/base/internal/raw_logging.h" |
47 | #include "absl/strings/internal/str_split_internal.h" |
48 | #include "absl/strings/string_view.h" |
49 | #include "absl/strings/strip.h" |
50 | |
51 | namespace absl { |
52 | |
53 | //------------------------------------------------------------------------------ |
54 | // Delimiters |
55 | //------------------------------------------------------------------------------ |
56 | // |
57 | // `StrSplit()` uses delimiters to define the boundaries between elements in the |
58 | // provided input. Several `Delimiter` types are defined below. If a string |
59 | // (`const char*`, `std::string`, or `absl::string_view`) is passed in place of |
60 | // an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it |
61 | // were passed a `ByString` delimiter. |
62 | // |
63 | // A `Delimiter` is an object with a `Find()` function that knows how to find |
64 | // the first occurrence of itself in a given `absl::string_view`. |
65 | // |
66 | // The following `Delimiter` types are available for use within `StrSplit()`: |
67 | // |
68 | // - `ByString` (default for string arguments) |
69 | // - `ByChar` (default for a char argument) |
70 | // - `ByAnyChar` |
71 | // - `ByLength` |
72 | // - `MaxSplits` |
73 | // |
74 | // A Delimiter's `Find()` member function will be passed an input `text` that is |
75 | // to be split and a position (`pos`) to begin searching for the next delimiter |
76 | // in `text`. The returned absl::string_view should refer to the next occurrence |
77 | // (after `pos`) of the represented delimiter; this returned absl::string_view |
78 | // represents the next location where the input `text` should be broken. |
79 | // |
80 | // The returned absl::string_view may be zero-length if the Delimiter does not |
81 | // represent a part of the string (e.g., a fixed-length delimiter). If no |
82 | // delimiter is found in the input `text`, a zero-length absl::string_view |
83 | // referring to `text.end()` should be returned (e.g., |
84 | // `text.substr(text.size())`). It is important that the returned |
85 | // absl::string_view always be within the bounds of the input `text` given as an |
86 | // argument--it must not refer to a string that is physically located outside of |
87 | // the given string. |
88 | // |
89 | // The following example is a simple Delimiter object that is created with a |
90 | // single char and will look for that char in the text passed to the `Find()` |
91 | // function: |
92 | // |
93 | // struct SimpleDelimiter { |
94 | // const char c_; |
95 | // explicit SimpleDelimiter(char c) : c_(c) {} |
96 | // absl::string_view Find(absl::string_view text, size_t pos) { |
97 | // auto found = text.find(c_, pos); |
98 | // if (found == absl::string_view::npos) |
99 | // return text.substr(text.size()); |
100 | // |
101 | // return text.substr(found, 1); |
102 | // } |
103 | // }; |
104 | |
105 | // ByString |
106 | // |
107 | // A sub-string delimiter. If `StrSplit()` is passed a string in place of a |
108 | // `Delimiter` object, the string will be implicitly converted into a |
109 | // `ByString` delimiter. |
110 | // |
111 | // Example: |
112 | // |
113 | // // Because a string literal is converted to an `absl::ByString`, |
114 | // // the following two splits are equivalent. |
115 | // |
116 | // std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", "); |
117 | // |
118 | // using absl::ByString; |
119 | // std::vector<std::string> v2 = absl::StrSplit("a, b, c", |
120 | // ByString(", ")); |
// // v1[0] == "a", v1[1] == "b", v1[2] == "c" (and likewise for v2)
122 | class ByString { |
123 | public: |
124 | explicit ByString(absl::string_view sp); |
125 | absl::string_view Find(absl::string_view text, size_t pos) const; |
126 | |
127 | private: |
128 | const std::string delimiter_; |
129 | }; |
130 | |
131 | // ByChar |
132 | // |
133 | // A single character delimiter. `ByChar` is functionally equivalent to a |
134 | // 1-char string within a `ByString` delimiter, but slightly more efficient. |
135 | // |
136 | // Example: |
137 | // |
// // Because a char literal is converted to an `absl::ByChar`,
// // the following two splits are equivalent.
// std::vector<std::string> v1 = absl::StrSplit("a,b,c", ',');
// using absl::ByChar;
// std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(','));
// // v1[0] == "a", v1[1] == "b", v1[2] == "c" (and likewise for v2)
144 | // |
145 | // `ByChar` is also the default delimiter if a single character is given |
146 | // as the delimiter to `StrSplit()`. For example, the following calls are |
147 | // equivalent: |
148 | // |
149 | // std::vector<std::string> v = absl::StrSplit("a-b", '-'); |
150 | // |
151 | // using absl::ByChar; |
152 | // std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-')); |
153 | // |
154 | class ByChar { |
155 | public: |
156 | explicit ByChar(char c) : c_(c) {} |
157 | absl::string_view Find(absl::string_view text, size_t pos) const; |
158 | |
159 | private: |
160 | char c_; |
161 | }; |
162 | |
163 | // ByAnyChar |
164 | // |
165 | // A delimiter that will match any of the given byte-sized characters within |
166 | // its provided string. |
167 | // |
168 | // Note: this delimiter works with single-byte string data, but does not work |
169 | // with variable-width encodings, such as UTF-8. |
170 | // |
171 | // Example: |
172 | // |
173 | // using absl::ByAnyChar; |
174 | // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); |
175 | // // v[0] == "a", v[1] == "b", v[2] == "c" |
176 | // |
177 | // If `ByAnyChar` is given the empty string, it behaves exactly like |
178 | // `ByString` and matches each individual character in the input string. |
179 | // |
180 | class ByAnyChar { |
181 | public: |
182 | explicit ByAnyChar(absl::string_view sp); |
183 | absl::string_view Find(absl::string_view text, size_t pos) const; |
184 | |
185 | private: |
186 | const std::string delimiters_; |
187 | }; |
188 | |
189 | // ByLength |
190 | // |
191 | // A delimiter for splitting into equal-length strings. The length argument to |
192 | // the constructor must be greater than 0. |
193 | // |
194 | // Note: this delimiter works with single-byte string data, but does not work |
195 | // with variable-width encodings, such as UTF-8. |
196 | // |
197 | // Example: |
198 | // |
199 | // using absl::ByLength; |
// std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3));
//
// // v[0] == "123", v[1] == "456", v[2] == "789"
203 | // |
204 | // Note that the string does not have to be a multiple of the fixed split |
205 | // length. In such a case, the last substring will be shorter. |
206 | // |
207 | // using absl::ByLength; |
208 | // std::vector<std::string> v = absl::StrSplit("12345", ByLength(2)); |
209 | // |
210 | // // v[0] == "12", v[1] == "34", v[2] == "5" |
211 | class ByLength { |
212 | public: |
213 | explicit ByLength(ptrdiff_t length); |
214 | absl::string_view Find(absl::string_view text, size_t pos) const; |
215 | |
216 | private: |
217 | const ptrdiff_t length_; |
218 | }; |
219 | |
220 | namespace strings_internal { |
221 | |
// A traits-like metafunction for selecting the default Delimiter object type
// for a particular Delimiter type. The base case simply exposes type Delimiter
// itself as the delimiter's Type. However, there are specializations that map
// `char` to the ByChar delimiter object and string-like objects (`char*`,
// `const char*`, `absl::string_view`, `std::string`) to the ByString delimiter
// object. This allows functions like absl::StrSplit() and absl::MaxSplits() to
// accept a character or a string-like object (e.g., ',' or ", ") as the
// delimiter argument and treat it as if a ByChar or ByString delimiter,
// respectively, was given.
229 | template <typename Delimiter> |
230 | struct SelectDelimiter { |
231 | using type = Delimiter; |
232 | }; |
233 | |
234 | template <> |
235 | struct SelectDelimiter<char> { |
236 | using type = ByChar; |
237 | }; |
238 | template <> |
239 | struct SelectDelimiter<char*> { |
240 | using type = ByString; |
241 | }; |
242 | template <> |
243 | struct SelectDelimiter<const char*> { |
244 | using type = ByString; |
245 | }; |
246 | template <> |
247 | struct SelectDelimiter<absl::string_view> { |
248 | using type = ByString; |
249 | }; |
250 | template <> |
251 | struct SelectDelimiter<std::string> { |
252 | using type = ByString; |
253 | }; |
254 | |
255 | // Wraps another delimiter and sets a max number of matches for that delimiter. |
256 | template <typename Delimiter> |
257 | class MaxSplitsImpl { |
258 | public: |
259 | MaxSplitsImpl(Delimiter delimiter, int limit) |
260 | : delimiter_(delimiter), limit_(limit), count_(0) {} |
261 | absl::string_view Find(absl::string_view text, size_t pos) { |
262 | if (count_++ == limit_) { |
263 | return absl::string_view(text.data() + text.size(), |
264 | 0); // No more matches. |
265 | } |
266 | return delimiter_.Find(text, pos); |
267 | } |
268 | |
269 | private: |
270 | Delimiter delimiter_; |
271 | const int limit_; |
272 | int count_; |
273 | }; |
274 | |
275 | } // namespace strings_internal |
276 | |
277 | // MaxSplits() |
278 | // |
279 | // A delimiter that limits the number of matches which can occur to the passed |
280 | // `limit`. The last element in the returned collection will contain all |
281 | // remaining unsplit pieces, which may contain instances of the delimiter. |
// The collection will contain at most `limit` + 1 elements.
//
// Example:
284 | // |
285 | // using absl::MaxSplits; |
286 | // std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1)); |
287 | // |
288 | // // v[0] == "a", v[1] == "b,c" |
289 | template <typename Delimiter> |
290 | inline strings_internal::MaxSplitsImpl< |
291 | typename strings_internal::SelectDelimiter<Delimiter>::type> |
292 | MaxSplits(Delimiter delimiter, int limit) { |
293 | typedef |
294 | typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType; |
295 | return strings_internal::MaxSplitsImpl<DelimiterType>( |
296 | DelimiterType(delimiter), limit); |
297 | } |
298 | |
299 | //------------------------------------------------------------------------------ |
300 | // Predicates |
301 | //------------------------------------------------------------------------------ |
302 | // |
303 | // Predicates filter the results of a `StrSplit()` by determining whether or not |
304 | // a resultant element is included in the result set. A predicate may be passed |
305 | // as an optional third argument to the `StrSplit()` function. |
306 | // |
307 | // Predicates are unary functions (or functors) that take a single |
308 | // `absl::string_view` argument and return a bool indicating whether the |
309 | // argument should be included (`true`) or excluded (`false`). |
310 | // |
311 | // Predicates are useful when filtering out empty substrings. By default, empty |
312 | // substrings may be returned by `StrSplit()`, which is similar to the way split |
313 | // functions work in other programming languages. |
314 | |
315 | // AllowEmpty() |
316 | // |
317 | // Always returns `true`, indicating that all strings--including empty |
318 | // strings--should be included in the split output. This predicate is not |
319 | // strictly needed because this is the default behavior of `StrSplit()`; |
320 | // however, it might be useful at some call sites to make the intent explicit. |
321 | // |
322 | // Example: |
323 | // |
324 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty()); |
325 | // |
// // v[0] == " a ", v[1] == " ", v[2] == "", v[3] == "b", v[4] == ""
327 | struct AllowEmpty { |
328 | bool operator()(absl::string_view) const { return true; } |
329 | }; |
330 | |
331 | // SkipEmpty() |
332 | // |
333 | // Returns `false` if the given `absl::string_view` is empty, indicating that |
334 | // `StrSplit()` should omit the empty string. |
335 | // |
336 | // Example: |
337 | // |
338 | // std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty()); |
339 | // |
340 | // // v[0] == "a", v[1] == "b" |
341 | // |
342 | // Note: `SkipEmpty()` does not consider a string containing only whitespace |
343 | // to be empty. To skip such whitespace as well, use the `SkipWhitespace()` |
344 | // predicate. |
345 | struct SkipEmpty { |
346 | bool operator()(absl::string_view sp) const { return !sp.empty(); } |
347 | }; |
348 | |
349 | // SkipWhitespace() |
350 | // |
351 | // Returns `false` if the given `absl::string_view` is empty *or* contains only |
352 | // whitespace, indicating that `StrSplit()` should omit the string. |
353 | // |
354 | // Example: |
355 | // |
356 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", |
357 | // ',', SkipWhitespace()); |
358 | // // v[0] == " a ", v[1] == "b" |
359 | // |
360 | // // SkipEmpty() would return whitespace elements |
361 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty()); |
362 | // // v[0] == " a ", v[1] == " ", v[2] == "b" |
363 | struct SkipWhitespace { |
364 | bool operator()(absl::string_view sp) const { |
365 | sp = absl::StripAsciiWhitespace(sp); |
366 | return !sp.empty(); |
367 | } |
368 | }; |
369 | |
370 | //------------------------------------------------------------------------------ |
371 | // StrSplit() |
372 | //------------------------------------------------------------------------------ |
373 | |
374 | // StrSplit() |
375 | // |
376 | // Splits a given string based on the provided `Delimiter` object, returning the |
377 | // elements within the type specified by the caller. Optionally, you may pass a |
378 | // `Predicate` to `StrSplit()` indicating whether to include or exclude the |
379 | // resulting element within the final result set. (See the overviews for |
380 | // Delimiters and Predicates above.) |
381 | // |
382 | // Example: |
383 | // |
384 | // std::vector<std::string> v = absl::StrSplit("a,b,c,d", ','); |
385 | // // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d" |
386 | // |
387 | // You can also provide an explicit `Delimiter` object: |
388 | // |
389 | // Example: |
390 | // |
391 | // using absl::ByAnyChar; |
392 | // std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",=")); |
393 | // // v[0] == "a", v[1] == "b", v[2] == "c" |
394 | // |
395 | // See above for more information on delimiters. |
396 | // |
397 | // By default, empty strings are included in the result set. You can optionally |
398 | // include a third `Predicate` argument to apply a test for whether the |
399 | // resultant element should be included in the result set: |
400 | // |
401 | // Example: |
402 | // |
403 | // std::vector<std::string> v = absl::StrSplit(" a , ,,b,", |
404 | // ',', SkipWhitespace()); |
405 | // // v[0] == " a ", v[1] == "b" |
406 | // |
407 | // See above for more information on predicates. |
408 | // |
409 | //------------------------------------------------------------------------------ |
410 | // StrSplit() Return Types |
411 | //------------------------------------------------------------------------------ |
412 | // |
413 | // The `StrSplit()` function adapts the returned collection to the collection |
414 | // specified by the caller (e.g. `std::vector` above). The returned collections |
415 | // may contain `std::string`, `absl::string_view` (in which case the original |
416 | // string being split must ensure that it outlives the collection), or any |
417 | // object that can be explicitly created from an `absl::string_view`. This |
418 | // behavior works for: |
419 | // |
// 1) All standard STL containers including `std::vector`, `std::list`,
// `std::deque`, `std::set`, `std::multiset`, `std::map`, and `std::multimap`
422 | // 2) `std::pair` (which is not actually a container). See below. |
423 | // |
424 | // Example: |
425 | // |
426 | // // The results are returned as `absl::string_view` objects. Note that we |
427 | // // have to ensure that the input string outlives any results. |
428 | // std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ','); |
429 | // |
430 | // // Stores results in a std::set<std::string>, which also performs |
431 | // // de-duplication and orders the elements in ascending order. |
432 | // std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ','); |
// // a contains exactly "a", "b", and "c", iterated in that order
434 | // |
435 | // // `StrSplit()` can be used within a range-based for loop, in which case |
436 | // // each element will be of type `absl::string_view`. |
437 | // std::vector<std::string> v; |
438 | // for (const auto sv : absl::StrSplit("a,b,c", ',')) { |
439 | // if (sv != "b") v.emplace_back(sv); |
440 | // } |
441 | // // v[0] == "a", v[1] == "c" |
442 | // |
443 | // // Stores results in a map. The map implementation assumes that the input |
444 | // // is provided as a series of key/value pairs. For example, the 0th element |
445 | // // resulting from the split will be stored as a key to the 1st element. If |
446 | // // an odd number of elements are resolved, the last element is paired with |
447 | // // a default-constructed value (e.g., empty string). |
448 | // std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ','); |
449 | // // m["a"] == "b", m["c"] == "" // last component value equals "" |
450 | // |
451 | // Splitting to `std::pair` is an interesting case because it can hold only two |
452 | // elements and is not a collection type. When splitting to a `std::pair` the |
453 | // first two split strings become the `std::pair` `.first` and `.second` |
// members, respectively. The remaining split substrings are discarded. If there
// are fewer than two split substrings, the empty string is used for the
// corresponding `std::pair` member.
458 | // |
459 | // Example: |
460 | // |
461 | // // Stores first two split strings as the members in a std::pair. |
462 | // std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ','); |
463 | // // p.first == "a", p.second == "b" // "c" is omitted. |
464 | // |
465 | // The `StrSplit()` function can be used multiple times to perform more |
466 | // complicated splitting logic, such as intelligently parsing key-value pairs. |
467 | // |
468 | // Example: |
469 | // |
470 | // // The input string "a=b=c,d=e,f=,g" becomes |
471 | // // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } |
472 | // std::map<std::string, std::string> m; |
473 | // for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) { |
474 | // m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1))); |
475 | // } |
476 | // EXPECT_EQ("b=c", m.find("a")->second); |
477 | // EXPECT_EQ("e", m.find("d")->second); |
478 | // EXPECT_EQ("", m.find("f")->second); |
479 | // EXPECT_EQ("", m.find("g")->second); |
480 | // |
481 | // WARNING: Due to a legacy bug that is maintained for backward compatibility, |
482 | // splitting the following empty string_views produces different results: |
483 | // |
484 | // absl::StrSplit(absl::string_view(""), '-'); // {""} |
485 | // absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""} |
486 | // |
487 | // Try not to depend on this distinction because the bug may one day be fixed. |
488 | template <typename Delimiter> |
489 | strings_internal::Splitter< |
490 | typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty> |
491 | StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) { |
492 | using DelimiterType = |
493 | typename strings_internal::SelectDelimiter<Delimiter>::type; |
494 | return strings_internal::Splitter<DelimiterType, AllowEmpty>( |
495 | std::move(text), DelimiterType(d), AllowEmpty()); |
496 | } |
497 | |
498 | template <typename Delimiter, typename Predicate> |
499 | strings_internal::Splitter< |
500 | typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate> |
501 | StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d, |
502 | Predicate p) { |
503 | using DelimiterType = |
504 | typename strings_internal::SelectDelimiter<Delimiter>::type; |
505 | return strings_internal::Splitter<DelimiterType, Predicate>( |
506 | std::move(text), DelimiterType(d), std::move(p)); |
507 | } |
508 | |
509 | } // namespace absl |
510 | |
511 | #endif // ABSL_STRINGS_STR_SPLIT_H_ |
512 | |