1//
2// Copyright 2017 The Abseil Authors.
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// https://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16// -----------------------------------------------------------------------------
17// File: str_split.h
18// -----------------------------------------------------------------------------
19//
20// This file contains functions for splitting strings. It defines the main
21// `StrSplit()` function, several delimiters for determining the boundaries on
22// which to split the string, and predicates for filtering delimited results.
23// `StrSplit()` adapts the returned collection to the type specified by the
24// caller.
25//
26// Example:
27//
28// // Splits the given string on commas. Returns the results in a
29// // vector of strings.
30// std::vector<std::string> v = absl::StrSplit("a,b,c", ',');
31// // Can also use ","
32// // v[0] == "a", v[1] == "b", v[2] == "c"
33//
34// See StrSplit() below for more information.
35#ifndef ABSL_STRINGS_STR_SPLIT_H_
36#define ABSL_STRINGS_STR_SPLIT_H_
37
38#include <algorithm>
39#include <cstddef>
40#include <map>
41#include <set>
42#include <string>
43#include <utility>
44#include <vector>
45
46#include "absl/base/internal/raw_logging.h"
47#include "absl/strings/internal/str_split_internal.h"
48#include "absl/strings/string_view.h"
49#include "absl/strings/strip.h"
50
51namespace absl {
52
53//------------------------------------------------------------------------------
54// Delimiters
55//------------------------------------------------------------------------------
56//
57// `StrSplit()` uses delimiters to define the boundaries between elements in the
58// provided input. Several `Delimiter` types are defined below. If a string
59// (`const char*`, `std::string`, or `absl::string_view`) is passed in place of
60// an explicit `Delimiter` object, `StrSplit()` treats it the same way as if it
61// were passed a `ByString` delimiter.
62//
63// A `Delimiter` is an object with a `Find()` function that knows how to find
64// the first occurrence of itself in a given `absl::string_view`.
65//
66// The following `Delimiter` types are available for use within `StrSplit()`:
67//
68// - `ByString` (default for string arguments)
69// - `ByChar` (default for a char argument)
70// - `ByAnyChar`
71// - `ByLength`
72// - `MaxSplits`
73//
74// A Delimiter's `Find()` member function will be passed an input `text` that is
75// to be split and a position (`pos`) to begin searching for the next delimiter
76// in `text`. The returned absl::string_view should refer to the next occurrence
77// (after `pos`) of the represented delimiter; this returned absl::string_view
78// represents the next location where the input `text` should be broken.
79//
80// The returned absl::string_view may be zero-length if the Delimiter does not
81// represent a part of the string (e.g., a fixed-length delimiter). If no
82// delimiter is found in the input `text`, a zero-length absl::string_view
83// referring to `text.end()` should be returned (e.g.,
84// `text.substr(text.size())`). It is important that the returned
85// absl::string_view always be within the bounds of the input `text` given as an
86// argument--it must not refer to a string that is physically located outside of
87// the given string.
88//
89// The following example is a simple Delimiter object that is created with a
90// single char and will look for that char in the text passed to the `Find()`
91// function:
92//
93// struct SimpleDelimiter {
94// const char c_;
95// explicit SimpleDelimiter(char c) : c_(c) {}
96// absl::string_view Find(absl::string_view text, size_t pos) {
97// auto found = text.find(c_, pos);
98// if (found == absl::string_view::npos)
99// return text.substr(text.size());
100//
101// return text.substr(found, 1);
102// }
103// };
104
105// ByString
106//
107// A sub-string delimiter. If `StrSplit()` is passed a string in place of a
108// `Delimiter` object, the string will be implicitly converted into a
109// `ByString` delimiter.
110//
111// Example:
112//
113// // Because a string literal is converted to an `absl::ByString`,
114// // the following two splits are equivalent.
115//
116// std::vector<std::string> v1 = absl::StrSplit("a, b, c", ", ");
117//
118// using absl::ByString;
119// std::vector<std::string> v2 = absl::StrSplit("a, b, c",
120// ByString(", "));
121// // v[0] == "a", v[1] == "b", v[2] == "c"
122class ByString {
123 public:
124 explicit ByString(absl::string_view sp);
125 absl::string_view Find(absl::string_view text, size_t pos) const;
126
127 private:
128 const std::string delimiter_;
129};
130
131// ByChar
132//
133// A single character delimiter. `ByChar` is functionally equivalent to a
134// 1-char string within a `ByString` delimiter, but slightly more efficient.
135//
136// Example:
137//
138// // Because a char literal is converted to a absl::ByChar,
139// // the following two splits are equivalent.
140// std::vector<std::string> v1 = absl::StrSplit("a,b,c", ',');
141// using absl::ByChar;
142// std::vector<std::string> v2 = absl::StrSplit("a,b,c", ByChar(','));
143// // v[0] == "a", v[1] == "b", v[2] == "c"
144//
145// `ByChar` is also the default delimiter if a single character is given
146// as the delimiter to `StrSplit()`. For example, the following calls are
147// equivalent:
148//
149// std::vector<std::string> v = absl::StrSplit("a-b", '-');
150//
151// using absl::ByChar;
152// std::vector<std::string> v = absl::StrSplit("a-b", ByChar('-'));
153//
154class ByChar {
155 public:
156 explicit ByChar(char c) : c_(c) {}
157 absl::string_view Find(absl::string_view text, size_t pos) const;
158
159 private:
160 char c_;
161};
162
163// ByAnyChar
164//
165// A delimiter that will match any of the given byte-sized characters within
166// its provided string.
167//
168// Note: this delimiter works with single-byte string data, but does not work
169// with variable-width encodings, such as UTF-8.
170//
171// Example:
172//
173// using absl::ByAnyChar;
174// std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
175// // v[0] == "a", v[1] == "b", v[2] == "c"
176//
177// If `ByAnyChar` is given the empty string, it behaves exactly like
178// `ByString` and matches each individual character in the input string.
179//
180class ByAnyChar {
181 public:
182 explicit ByAnyChar(absl::string_view sp);
183 absl::string_view Find(absl::string_view text, size_t pos) const;
184
185 private:
186 const std::string delimiters_;
187};
188
189// ByLength
190//
191// A delimiter for splitting into equal-length strings. The length argument to
192// the constructor must be greater than 0.
193//
194// Note: this delimiter works with single-byte string data, but does not work
195// with variable-width encodings, such as UTF-8.
196//
197// Example:
198//
199// using absl::ByLength;
200// std::vector<std::string> v = absl::StrSplit("123456789", ByLength(3));
201
202// // v[0] == "123", v[1] == "456", v[2] == "789"
203//
204// Note that the string does not have to be a multiple of the fixed split
205// length. In such a case, the last substring will be shorter.
206//
207// using absl::ByLength;
208// std::vector<std::string> v = absl::StrSplit("12345", ByLength(2));
209//
210// // v[0] == "12", v[1] == "34", v[2] == "5"
211class ByLength {
212 public:
213 explicit ByLength(ptrdiff_t length);
214 absl::string_view Find(absl::string_view text, size_t pos) const;
215
216 private:
217 const ptrdiff_t length_;
218};
219
220namespace strings_internal {
221
222// A traits-like metafunction for selecting the default Delimiter object type
223// for a particular Delimiter type. The base case simply exposes type Delimiter
224// itself as the delimiter's Type. However, there are specializations for
225// string-like objects that map them to the ByString delimiter object.
226// This allows functions like absl::StrSplit() and absl::MaxSplits() to accept
227// string-like objects (e.g., ',') as delimiter arguments but they will be
228// treated as if a ByString delimiter was given.
229template <typename Delimiter>
230struct SelectDelimiter {
231 using type = Delimiter;
232};
233
234template <>
235struct SelectDelimiter<char> {
236 using type = ByChar;
237};
238template <>
239struct SelectDelimiter<char*> {
240 using type = ByString;
241};
242template <>
243struct SelectDelimiter<const char*> {
244 using type = ByString;
245};
246template <>
247struct SelectDelimiter<absl::string_view> {
248 using type = ByString;
249};
250template <>
251struct SelectDelimiter<std::string> {
252 using type = ByString;
253};
254
255// Wraps another delimiter and sets a max number of matches for that delimiter.
256template <typename Delimiter>
257class MaxSplitsImpl {
258 public:
259 MaxSplitsImpl(Delimiter delimiter, int limit)
260 : delimiter_(delimiter), limit_(limit), count_(0) {}
261 absl::string_view Find(absl::string_view text, size_t pos) {
262 if (count_++ == limit_) {
263 return absl::string_view(text.data() + text.size(),
264 0); // No more matches.
265 }
266 return delimiter_.Find(text, pos);
267 }
268
269 private:
270 Delimiter delimiter_;
271 const int limit_;
272 int count_;
273};
274
275} // namespace strings_internal
276
277// MaxSplits()
278//
279// A delimiter that limits the number of matches which can occur to the passed
280// `limit`. The last element in the returned collection will contain all
281// remaining unsplit pieces, which may contain instances of the delimiter.
282// The collection will contain at most `limit` + 1 elements.
283// Example:
284//
285// using absl::MaxSplits;
286// std::vector<std::string> v = absl::StrSplit("a,b,c", MaxSplits(',', 1));
287//
288// // v[0] == "a", v[1] == "b,c"
289template <typename Delimiter>
290inline strings_internal::MaxSplitsImpl<
291 typename strings_internal::SelectDelimiter<Delimiter>::type>
292MaxSplits(Delimiter delimiter, int limit) {
293 typedef
294 typename strings_internal::SelectDelimiter<Delimiter>::type DelimiterType;
295 return strings_internal::MaxSplitsImpl<DelimiterType>(
296 DelimiterType(delimiter), limit);
297}
298
299//------------------------------------------------------------------------------
300// Predicates
301//------------------------------------------------------------------------------
302//
303// Predicates filter the results of a `StrSplit()` by determining whether or not
304// a resultant element is included in the result set. A predicate may be passed
305// as an optional third argument to the `StrSplit()` function.
306//
307// Predicates are unary functions (or functors) that take a single
308// `absl::string_view` argument and return a bool indicating whether the
309// argument should be included (`true`) or excluded (`false`).
310//
311// Predicates are useful when filtering out empty substrings. By default, empty
312// substrings may be returned by `StrSplit()`, which is similar to the way split
313// functions work in other programming languages.
314
315// AllowEmpty()
316//
317// Always returns `true`, indicating that all strings--including empty
318// strings--should be included in the split output. This predicate is not
319// strictly needed because this is the default behavior of `StrSplit()`;
320// however, it might be useful at some call sites to make the intent explicit.
321//
322// Example:
323//
324// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', AllowEmpty());
325//
326// // v[0] == " a ", v[1] == " ", v[2] == "", v[3] = "b", v[4] == ""
327struct AllowEmpty {
328 bool operator()(absl::string_view) const { return true; }
329};
330
331// SkipEmpty()
332//
333// Returns `false` if the given `absl::string_view` is empty, indicating that
334// `StrSplit()` should omit the empty string.
335//
336// Example:
337//
338// std::vector<std::string> v = absl::StrSplit(",a,,b,", ',', SkipEmpty());
339//
340// // v[0] == "a", v[1] == "b"
341//
342// Note: `SkipEmpty()` does not consider a string containing only whitespace
343// to be empty. To skip such whitespace as well, use the `SkipWhitespace()`
344// predicate.
345struct SkipEmpty {
346 bool operator()(absl::string_view sp) const { return !sp.empty(); }
347};
348
349// SkipWhitespace()
350//
351// Returns `false` if the given `absl::string_view` is empty *or* contains only
352// whitespace, indicating that `StrSplit()` should omit the string.
353//
354// Example:
355//
356// std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
357// ',', SkipWhitespace());
358// // v[0] == " a ", v[1] == "b"
359//
360// // SkipEmpty() would return whitespace elements
361// std::vector<std::string> v = absl::StrSplit(" a , ,,b,", ',', SkipEmpty());
362// // v[0] == " a ", v[1] == " ", v[2] == "b"
363struct SkipWhitespace {
364 bool operator()(absl::string_view sp) const {
365 sp = absl::StripAsciiWhitespace(sp);
366 return !sp.empty();
367 }
368};
369
370//------------------------------------------------------------------------------
371// StrSplit()
372//------------------------------------------------------------------------------
373
374// StrSplit()
375//
376// Splits a given string based on the provided `Delimiter` object, returning the
377// elements within the type specified by the caller. Optionally, you may pass a
378// `Predicate` to `StrSplit()` indicating whether to include or exclude the
379// resulting element within the final result set. (See the overviews for
380// Delimiters and Predicates above.)
381//
382// Example:
383//
384// std::vector<std::string> v = absl::StrSplit("a,b,c,d", ',');
385// // v[0] == "a", v[1] == "b", v[2] == "c", v[3] == "d"
386//
387// You can also provide an explicit `Delimiter` object:
388//
389// Example:
390//
391// using absl::ByAnyChar;
392// std::vector<std::string> v = absl::StrSplit("a,b=c", ByAnyChar(",="));
393// // v[0] == "a", v[1] == "b", v[2] == "c"
394//
395// See above for more information on delimiters.
396//
397// By default, empty strings are included in the result set. You can optionally
398// include a third `Predicate` argument to apply a test for whether the
399// resultant element should be included in the result set:
400//
401// Example:
402//
403// std::vector<std::string> v = absl::StrSplit(" a , ,,b,",
404// ',', SkipWhitespace());
405// // v[0] == " a ", v[1] == "b"
406//
407// See above for more information on predicates.
408//
409//------------------------------------------------------------------------------
410// StrSplit() Return Types
411//------------------------------------------------------------------------------
412//
413// The `StrSplit()` function adapts the returned collection to the collection
414// specified by the caller (e.g. `std::vector` above). The returned collections
415// may contain `std::string`, `absl::string_view` (in which case the original
416// string being split must ensure that it outlives the collection), or any
417// object that can be explicitly created from an `absl::string_view`. This
418// behavior works for:
419//
// 1) All standard STL containers including `std::vector`, `std::list`,
// `std::deque`, `std::set`, `std::multiset`, `std::map`, and `std::multimap`
422// 2) `std::pair` (which is not actually a container). See below.
423//
424// Example:
425//
426// // The results are returned as `absl::string_view` objects. Note that we
427// // have to ensure that the input string outlives any results.
428// std::vector<absl::string_view> v = absl::StrSplit("a,b,c", ',');
429//
// // Stores results in a std::set<std::string>, which also performs
// // de-duplication and orders the elements in ascending order.
// std::set<std::string> a = absl::StrSplit("b,a,c,a,b", ',');
// // a contains exactly "a", "b", and "c", iterated in that order
434//
435// // `StrSplit()` can be used within a range-based for loop, in which case
436// // each element will be of type `absl::string_view`.
437// std::vector<std::string> v;
438// for (const auto sv : absl::StrSplit("a,b,c", ',')) {
439// if (sv != "b") v.emplace_back(sv);
440// }
441// // v[0] == "a", v[1] == "c"
442//
443// // Stores results in a map. The map implementation assumes that the input
444// // is provided as a series of key/value pairs. For example, the 0th element
445// // resulting from the split will be stored as a key to the 1st element. If
446// // an odd number of elements are resolved, the last element is paired with
447// // a default-constructed value (e.g., empty string).
448// std::map<std::string, std::string> m = absl::StrSplit("a,b,c", ',');
449// // m["a"] == "b", m["c"] == "" // last component value equals ""
450//
// Splitting to `std::pair` is an interesting case because it can hold only two
// elements and is not a collection type. When splitting to a `std::pair` the
// first two split strings become the `std::pair` `.first` and `.second`
// members, respectively. The remaining split substrings are discarded. If there
// are fewer than two split substrings, the empty string is used for the
// corresponding `std::pair` member.
458//
459// Example:
460//
461// // Stores first two split strings as the members in a std::pair.
462// std::pair<std::string, std::string> p = absl::StrSplit("a,b,c", ',');
463// // p.first == "a", p.second == "b" // "c" is omitted.
464//
465// The `StrSplit()` function can be used multiple times to perform more
466// complicated splitting logic, such as intelligently parsing key-value pairs.
467//
468// Example:
469//
470// // The input string "a=b=c,d=e,f=,g" becomes
471// // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" }
472// std::map<std::string, std::string> m;
473// for (absl::string_view sp : absl::StrSplit("a=b=c,d=e,f=,g", ',')) {
474// m.insert(absl::StrSplit(sp, absl::MaxSplits('=', 1)));
475// }
476// EXPECT_EQ("b=c", m.find("a")->second);
477// EXPECT_EQ("e", m.find("d")->second);
478// EXPECT_EQ("", m.find("f")->second);
479// EXPECT_EQ("", m.find("g")->second);
480//
481// WARNING: Due to a legacy bug that is maintained for backward compatibility,
482// splitting the following empty string_views produces different results:
483//
484// absl::StrSplit(absl::string_view(""), '-'); // {""}
485// absl::StrSplit(absl::string_view(), '-'); // {}, but should be {""}
486//
487// Try not to depend on this distinction because the bug may one day be fixed.
488template <typename Delimiter>
489strings_internal::Splitter<
490 typename strings_internal::SelectDelimiter<Delimiter>::type, AllowEmpty>
491StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d) {
492 using DelimiterType =
493 typename strings_internal::SelectDelimiter<Delimiter>::type;
494 return strings_internal::Splitter<DelimiterType, AllowEmpty>(
495 std::move(text), DelimiterType(d), AllowEmpty());
496}
497
498template <typename Delimiter, typename Predicate>
499strings_internal::Splitter<
500 typename strings_internal::SelectDelimiter<Delimiter>::type, Predicate>
501StrSplit(strings_internal::ConvertibleToStringView text, Delimiter d,
502 Predicate p) {
503 using DelimiterType =
504 typename strings_internal::SelectDelimiter<Delimiter>::type;
505 return strings_internal::Splitter<DelimiterType, Predicate>(
506 std::move(text), DelimiterType(d), std::move(p));
507}
508
509} // namespace absl
510
511#endif // ABSL_STRINGS_STR_SPLIT_H_
512