1/*
2 * Copyright (c) 2015-2019, Intel Corporation
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
6 *
7 * * Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of Intel Corporation nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/** \file
30 * \brief Tools for string manipulation, ue2_literal definition.
31 */
32
33#ifndef UE2STRING_H
34#define UE2STRING_H
35
36#include "ue2common.h"
37#include "util/charreach.h"
38#include "util/compare.h"
39#include "util/hash.h"
40#include "util/operators.h"
41
42#include <iterator>
43#include <string>
44#include <vector>
45
46#include <boost/dynamic_bitset.hpp>
47#include <boost/iterator/iterator_facade.hpp>
48
49namespace ue2 {
50
51/// Force the given string to upper-case.
52void upperString(std::string &s);
53
54size_t maxStringOverlap(const std::string &a, const std::string &b,
55 bool nocase);
56
57size_t maxStringSelfOverlap(const std::string &a, bool nocase);
58
59/// Compares two strings, returns non-zero if they're different.
60u32 cmp(const char *a, const char *b, size_t len, bool nocase);
61
62/**
63 * \brief String type that also records whether the whole string is caseful or
64 * caseless.
65 *
66 * You should use \ref ue2_literal if you need to represent a mixed-case
67 * literal.
68 */
69struct ue2_case_string {
70 ue2_case_string(std::string s_in, bool nocase_in)
71 : s(std::move(s_in)), nocase(nocase_in) {
72 if (nocase) {
73 upperString(s);
74 }
75 }
76
77 bool operator==(const ue2_case_string &other) const {
78 return s == other.s && nocase == other.nocase;
79 }
80
81 std::string s;
82 bool nocase;
83};
84
85struct ue2_literal : totally_ordered<ue2_literal> {
86public:
87 /// Single element proxy, pointed to by our const_iterator.
88 struct elem {
89 elem() : c(0), nocase(false) {}
90 elem(char c_in, bool nc_in) : c(c_in), nocase(nc_in) {}
91 bool operator==(const elem &o) const {
92 return c == o.c && nocase == o.nocase;
93 }
94 bool operator!=(const elem &o) const {
95 return c != o.c || nocase != o.nocase;
96 }
97 operator CharReach() const;
98 char c;
99 bool nocase;
100 };
101
102 /// Boost iterator_facade lets us synthesize an iterator simply.
103 class const_iterator : public boost::iterator_facade<
104 const_iterator,
105 elem const,
106 boost::random_access_traversal_tag,
107 elem const> {
108 public:
109 const_iterator() {}
110 private:
111 friend class boost::iterator_core_access;
112 void increment() {
113 ++idx;
114 }
115 void decrement() {
116 --idx;
117 }
118 void advance(size_t n) {
119 idx += n;
120 }
121 difference_type distance_to(const const_iterator &other) const {
122 return other.idx - idx;
123 }
124 bool equal(const const_iterator &other) const {
125 return idx == other.idx && lit == other.lit;
126 }
127 const elem dereference() const {
128 return elem(lit->s[idx], lit->nocase[idx]);
129 }
130
131 friend struct ue2_literal;
132 const_iterator(const ue2_literal &lit_in, size_t idx_in)
133 : lit(&lit_in), idx(idx_in) {}
134
135 const ue2_literal *lit = nullptr;
136 size_t idx;
137 };
138
139 using const_reverse_iterator = std::reverse_iterator<const_iterator>;
140 using size_type = std::string::size_type;
141
142 static const size_type npos;
143
144 ue2_literal() = default;
145 ue2_literal(const std::string &s_in, bool nc_in);
146 ue2_literal(char c, bool nc_in);
147 ue2_literal(const ue2_literal &) = default;
148 ue2_literal(ue2_literal &&) = default;
149 ue2_literal &operator=(const ue2_literal &) = default;
150 ue2_literal &operator=(ue2_literal &&) = default;
151
152 template<typename InputIt>
153 ue2_literal(InputIt b, InputIt e) {
154 for (; b != e; ++b) {
155 push_back(*b);
156 }
157 }
158
159 size_type length() const { return s.length(); }
160 bool empty() const { return s.empty(); }
161 ue2_literal substr(size_type pos, size_type n = npos) const;
162 const char *c_str() const { return s.c_str(); }
163 bool any_nocase() const;
164
165 const_iterator begin() const {
166 return const_iterator(*this, 0);
167 }
168
169 const_iterator end() const {
170 return const_iterator(*this, s.size());
171 }
172
173 const_reverse_iterator rbegin() const {
174 return const_reverse_iterator(end());
175 }
176
177 const_reverse_iterator rend() const {
178 return const_reverse_iterator(begin());
179 }
180
181 ue2_literal &erase(size_type pos = 0, size_type n = npos);
182 void push_back(const elem &e) {
183 push_back(e.c, e.nocase);
184 }
185
186 void push_back(char c, bool nc);
187 const elem back() const { return *rbegin(); }
188
189 friend ue2_literal operator+(ue2_literal a, const ue2_literal &b) {
190 a += b;
191 return a;
192 }
193
194 /// Reverse this literal in-place.
195 void reverse();
196
197 void operator+=(const ue2_literal &b);
198 bool operator==(const ue2_literal &b) const {
199 return s == b.s && nocase == b.nocase;
200 }
201 bool operator<(const ue2_literal &b) const;
202
203 void clear(void) { s.clear(); nocase.clear(); }
204
205 const std::string &get_string() const { return s; }
206
207 void swap(ue2_literal &other) {
208 s.swap(other.s);
209 nocase.swap(other.nocase);
210 }
211
212 size_t hash() const;
213
214 void set_pure() { pure = true; }
215 void unset_pure() { pure = false; }
216 bool get_pure() const { return pure; }
217
218 /* TODO: consider existing member functions possibly related with pure. */
219
220private:
221 friend const_iterator;
222 std::string s;
223 boost::dynamic_bitset<> nocase;
224 bool pure = false; /**< born from cutting or not (pure literal). */
225};
226
227/// Return a reversed copy of this literal.
228ue2_literal reverse_literal(const ue2_literal &in);
229
230// Escape any meta characters in a string
231std::string escapeStringMeta(const std::string &s);
232
233/** Note: may be overly conservative if only partially nocase */
234size_t maxStringSelfOverlap(const ue2_literal &a);
235size_t minStringPeriod(const ue2_literal &a);
236size_t maxStringOverlap(const ue2_literal &a, const ue2_literal &b);
237
238/**
239 * \brief True iff the range of a literal given cannot be considered entirely
240 * case-sensitive nor entirely case-insensitive.
241 */
242template<class Iter>
243bool mixed_sensitivity_in(Iter begin, Iter end) {
244 bool cs = false;
245 bool nc = false;
246 for (auto it = begin; it != end; ++it) {
247 if (!ourisalpha(it->c)) {
248 continue;
249 }
250 if (it->nocase) {
251 nc = true;
252 } else {
253 cs = true;
254 }
255 }
256
257 return cs && nc;
258}
259
260/**
261 * \brief True iff the literal cannot be considered entirely case-sensitive
262 * nor entirely case-insensitive.
263 */
264inline
265bool mixed_sensitivity(const ue2_literal &s) {
266 return mixed_sensitivity_in(s.begin(), s.end());
267}
268
269void make_nocase(ue2_literal *lit);
270
271struct case_iter {
272 explicit case_iter(const ue2_literal &ss);
273 const std::string &operator*() const { return s; } /* limited lifetime */
274 case_iter &operator++ ();
275 bool operator!=(const case_iter &b) const { return s != b.s; }
276private:
277 std::string s;
278 std::string s_orig;
279 std::vector<bool> nocase;
280};
281
282case_iter caseIterateBegin(const ue2_literal &lit);
283case_iter caseIterateEnd();
284
285/** \brief True if there is any overlap between the characters in \a s and the
286 * set characters in \a cr.
287 *
288 * Note: this means that if \a s is nocase, then \a cr only needs to have
289 * either the lower-case or upper-case version of a letter set. */
290bool contains(const ue2_literal &s, const CharReach &cr);
291
292/// Returns true if \a a is a suffix of (or equal to) \a b.
293bool isSuffix(const ue2_literal &a, const ue2_literal &b);
294
295static inline
296std::vector<CharReach> as_cr_seq(const ue2_literal &s) {
297 std::vector<CharReach> rv;
298 rv.reserve(s.length());
299 rv.insert(rv.end(), s.begin(), s.end());
300 return rv;
301}
302
303/** \brief True if the given literal consists entirely of a flood of the same
304 * character. */
305bool is_flood(const ue2_literal &s);
306
307#if defined(DUMP_SUPPORT) || defined(DEBUG)
308/* Utility functions for debugging/dumping */
309
310/// Escape a string so it's dot-printable.
311std::string dotEscapeString(const std::string &s);
312
313std::string dumpString(const ue2_literal &lit);
314
315/// Escape a string so that it's screen-printable.
316std::string escapeString(const std::string &s);
317
318/// Escape a ue2_literal so that it's screen-printable.
319std::string escapeString(const ue2_literal &lit);
320
321#endif
322
323} // namespace ue2
324
325namespace std {
326
327template<>
328struct hash<ue2::ue2_literal::elem> {
329 size_t operator()(const ue2::ue2_literal::elem &elem) const {
330 return ue2::hash_all(elem.c, elem.nocase);
331 }
332};
333
334template<>
335struct hash<ue2::ue2_literal> {
336 size_t operator()(const ue2::ue2_literal &lit) const {
337 return lit.hash();
338 }
339};
340
341} // namespace std
342
343#endif
344