1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 The Qt Company Ltd. |
4 | ** Contact: https://www.qt.io/licensing/ |
5 | ** |
6 | ** This file is part of the QtCore module of the Qt Toolkit. |
7 | ** |
8 | ** $QT_BEGIN_LICENSE:LGPL$ |
9 | ** Commercial License Usage |
10 | ** Licensees holding valid commercial Qt licenses may use this file in |
11 | ** accordance with the commercial license agreement provided with the |
12 | ** Software or, alternatively, in accordance with the terms contained in |
13 | ** a written agreement between you and The Qt Company. For licensing terms |
14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
15 | ** information use the contact form at https://www.qt.io/contact-us. |
16 | ** |
17 | ** GNU Lesser General Public License Usage |
18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
19 | ** General Public License version 3 as published by the Free Software |
20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
21 | ** packaging of this file. Please review the following information to |
22 | ** ensure the GNU Lesser General Public License version 3 requirements |
23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
24 | ** |
25 | ** GNU General Public License Usage |
26 | ** Alternatively, this file may be used under the terms of the GNU |
27 | ** General Public License version 2.0 or (at your option) the GNU General |
28 | ** Public license version 3 or any later version approved by the KDE Free |
29 | ** Qt Foundation. The licenses are as published by the Free Software |
30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
31 | ** included in the packaging of this file. Please review the following |
32 | ** information to ensure the GNU General Public License requirements will |
33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
34 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
35 | ** |
36 | ** $QT_END_LICENSE$ |
37 | ** |
38 | ****************************************************************************/ |
39 | |
40 | #include "qbytearraymatcher.h" |
41 | |
42 | #include <limits.h> |
43 | |
44 | QT_BEGIN_NAMESPACE |
45 | |
46 | static inline void bm_init_skiptable(const uchar *cc, qsizetype len, uchar *skiptable) |
47 | { |
48 | int l = int(qMin(len, qsizetype(255))); |
49 | memset(skiptable, l, 256*sizeof(uchar)); |
50 | cc += len - l; |
51 | while (l--) |
52 | skiptable[*cc++] = l; |
53 | } |
54 | |
55 | static inline qsizetype bm_find(const uchar *cc, qsizetype l, qsizetype index, const uchar *puc, |
56 | qsizetype pl, const uchar *skiptable) |
57 | { |
58 | if (pl == 0) |
59 | return index > l ? -1 : index; |
60 | const qsizetype pl_minus_one = pl - 1; |
61 | |
62 | const uchar *current = cc + index + pl_minus_one; |
63 | const uchar *end = cc + l; |
64 | while (current < end) { |
65 | qsizetype skip = skiptable[*current]; |
66 | if (!skip) { |
67 | // possible match |
68 | while (skip < pl) { |
69 | if (*(current - skip) != puc[pl_minus_one - skip]) |
70 | break; |
71 | skip++; |
72 | } |
73 | if (skip > pl_minus_one) // we have a match |
74 | return (current - cc) - skip + 1; |
75 | |
76 | // in case we don't have a match we are a bit inefficient as we only skip by one |
77 | // when we have the non matching char in the string. |
78 | if (skiptable[*(current - skip)] == pl) |
79 | skip = pl - skip; |
80 | else |
81 | skip = 1; |
82 | } |
83 | if (current > end - skip) |
84 | break; |
85 | current += skip; |
86 | } |
87 | return -1; // not found |
88 | } |
89 | |
90 | /*! \class QByteArrayMatcher |
91 | \inmodule QtCore |
92 | \brief The QByteArrayMatcher class holds a sequence of bytes that |
93 | can be quickly matched in a byte array. |
94 | |
95 | \ingroup tools |
96 | \ingroup string-processing |
97 | |
98 | This class is useful when you have a sequence of bytes that you |
99 | want to repeatedly match against some byte arrays (perhaps in a |
100 | loop), or when you want to search for the same sequence of bytes |
101 | multiple times in the same byte array. Using a matcher object and |
102 | indexIn() is faster than matching a plain QByteArray with |
103 | QByteArray::indexOf() if repeated matching takes place. This |
104 | class offers no benefit if you are doing one-off byte array |
105 | matches. |
106 | |
107 | Create the QByteArrayMatcher with the QByteArray you want to |
108 | search for. Then call indexIn() on the QByteArray that you want to |
109 | search. |
110 | |
111 | \sa QByteArray, QStringMatcher |
112 | */ |
113 | |
114 | /*! |
115 | Constructs an empty byte array matcher that won't match anything. |
116 | Call setPattern() to give it a pattern to match. |
117 | */ |
118 | QByteArrayMatcher::QByteArrayMatcher() |
119 | : d(nullptr) |
120 | { |
121 | p.p = nullptr; |
122 | p.l = 0; |
123 | memset(p.q_skiptable, 0, sizeof(p.q_skiptable)); |
124 | } |
125 | |
126 | /*! |
127 | Constructs a byte array matcher from \a pattern. \a pattern |
128 | has the given \a length. \a pattern must remain in scope, but |
129 | the destructor does not delete \a pattern. |
130 | */ |
131 | QByteArrayMatcher::QByteArrayMatcher(const char *pattern, qsizetype length) : d(nullptr) |
132 | { |
133 | p.p = reinterpret_cast<const uchar *>(pattern); |
134 | p.l = length; |
135 | bm_init_skiptable(p.p, p.l, p.q_skiptable); |
136 | } |
137 | |
138 | /*! |
139 | Constructs a byte array matcher that will search for \a pattern. |
140 | Call indexIn() to perform a search. |
141 | */ |
142 | QByteArrayMatcher::QByteArrayMatcher(const QByteArray &pattern) |
143 | : d(nullptr), q_pattern(pattern) |
144 | { |
145 | p.p = reinterpret_cast<const uchar *>(pattern.constData()); |
146 | p.l = pattern.size(); |
147 | bm_init_skiptable(p.p, p.l, p.q_skiptable); |
148 | } |
149 | |
150 | /*! |
151 | Copies the \a other byte array matcher to this byte array matcher. |
152 | */ |
153 | QByteArrayMatcher::QByteArrayMatcher(const QByteArrayMatcher &other) |
154 | : d(nullptr) |
155 | { |
156 | operator=(other); |
157 | } |
158 | |
159 | /*! |
160 | Destroys the byte array matcher. |
161 | */ |
162 | QByteArrayMatcher::~QByteArrayMatcher() |
163 | { |
164 | Q_UNUSED(d); |
165 | } |
166 | |
167 | /*! |
168 | Assigns the \a other byte array matcher to this byte array matcher. |
169 | */ |
170 | QByteArrayMatcher &QByteArrayMatcher::operator=(const QByteArrayMatcher &other) |
171 | { |
172 | q_pattern = other.q_pattern; |
173 | memcpy(&p, &other.p, sizeof(p)); |
174 | return *this; |
175 | } |
176 | |
177 | /*! |
178 | Sets the byte array that this byte array matcher will search for |
179 | to \a pattern. |
180 | |
181 | \sa pattern(), indexIn() |
182 | */ |
183 | void QByteArrayMatcher::setPattern(const QByteArray &pattern) |
184 | { |
185 | q_pattern = pattern; |
186 | p.p = reinterpret_cast<const uchar *>(pattern.constData()); |
187 | p.l = pattern.size(); |
188 | bm_init_skiptable(p.p, p.l, p.q_skiptable); |
189 | } |
190 | |
191 | /*! |
192 | Searches the byte array \a ba, from byte position \a from (default |
193 | 0, i.e. from the first byte), for the byte array pattern() that |
194 | was set in the constructor or in the most recent call to |
195 | setPattern(). Returns the position where the pattern() matched in |
196 | \a ba, or -1 if no match was found. |
197 | */ |
198 | qsizetype QByteArrayMatcher::indexIn(const QByteArray &ba, qsizetype from) const |
199 | { |
200 | if (from < 0) |
201 | from = 0; |
202 | return bm_find(reinterpret_cast<const uchar *>(ba.constData()), ba.size(), from, |
203 | p.p, p.l, p.q_skiptable); |
204 | } |
205 | |
206 | /*! |
207 | Searches the char string \a str, which has length \a len, from |
208 | byte position \a from (default 0, i.e. from the first byte), for |
209 | the byte array pattern() that was set in the constructor or in the |
210 | most recent call to setPattern(). Returns the position where the |
211 | pattern() matched in \a str, or -1 if no match was found. |
212 | */ |
213 | qsizetype QByteArrayMatcher::indexIn(const char *str, qsizetype len, qsizetype from) const |
214 | { |
215 | if (from < 0) |
216 | from = 0; |
217 | return bm_find(reinterpret_cast<const uchar *>(str), len, from, |
218 | p.p, p.l, p.q_skiptable); |
219 | } |
220 | |
221 | /*! |
222 | \fn QByteArray QByteArrayMatcher::pattern() const |
223 | |
224 | Returns the byte array pattern that this byte array matcher will |
225 | search for. |
226 | |
227 | \sa setPattern() |
228 | */ |
229 | |
230 | |
231 | static qsizetype findChar(const char *str, qsizetype len, char ch, qsizetype from) |
232 | { |
233 | const uchar *s = (const uchar *)str; |
234 | uchar c = (uchar)ch; |
235 | if (from < 0) |
236 | from = qMax(from + len, qsizetype(0)); |
237 | if (from < len) { |
238 | const uchar *n = s + from - 1; |
239 | const uchar *e = s + len; |
240 | while (++n != e) |
241 | if (*n == c) |
242 | return n - s; |
243 | } |
244 | return -1; |
245 | } |
246 | |
247 | /*! |
248 | \internal |
249 | */ |
250 | static qsizetype qFindByteArrayBoyerMoore( |
251 | const char *haystack, qsizetype haystackLen, qsizetype haystackOffset, |
252 | const char *needle, qsizetype needleLen) |
253 | { |
254 | uchar skiptable[256]; |
255 | bm_init_skiptable((const uchar *)needle, needleLen, skiptable); |
256 | if (haystackOffset < 0) |
257 | haystackOffset = 0; |
258 | return bm_find((const uchar *)haystack, haystackLen, haystackOffset, |
259 | (const uchar *)needle, needleLen, skiptable); |
260 | } |
261 | |
262 | #define REHASH(a) \ |
263 | if (sl_minus_1 < sizeof(std::size_t) * CHAR_BIT) \ |
264 | hashHaystack -= std::size_t(a) << sl_minus_1; \ |
265 | hashHaystack <<= 1 |
266 | |
267 | /*! |
268 | \internal |
269 | */ |
270 | qsizetype qFindByteArray( |
271 | const char *haystack0, qsizetype haystackLen, qsizetype from, |
272 | const char *needle, qsizetype needleLen) |
273 | { |
274 | const auto l = haystackLen; |
275 | const auto sl = needleLen; |
276 | if (from < 0) |
277 | from += l; |
278 | if (std::size_t(sl + from) > std::size_t(l)) |
279 | return -1; |
280 | if (!sl) |
281 | return from; |
282 | if (!l) |
283 | return -1; |
284 | |
285 | if (sl == 1) |
286 | return findChar(haystack0, haystackLen, needle[0], from); |
287 | |
288 | /* |
289 | We use the Boyer-Moore algorithm in cases where the overhead |
290 | for the skip table should pay off, otherwise we use a simple |
291 | hash function. |
292 | */ |
293 | if (l > 500 && sl > 5) |
294 | return qFindByteArrayBoyerMoore(haystack0, haystackLen, from, |
295 | needle, needleLen); |
296 | |
297 | /* |
298 | We use some hashing for efficiency's sake. Instead of |
299 | comparing strings, we compare the hash value of str with that |
300 | of a part of this QString. Only if that matches, we call memcmp(). |
301 | */ |
302 | const char *haystack = haystack0 + from; |
303 | const char *end = haystack0 + (l - sl); |
304 | const auto sl_minus_1 = std::size_t(sl - 1); |
305 | std::size_t hashNeedle = 0, hashHaystack = 0; |
306 | qsizetype idx; |
307 | for (idx = 0; idx < sl; ++idx) { |
308 | hashNeedle = ((hashNeedle<<1) + needle[idx]); |
309 | hashHaystack = ((hashHaystack<<1) + haystack[idx]); |
310 | } |
311 | hashHaystack -= *(haystack + sl_minus_1); |
312 | |
313 | while (haystack <= end) { |
314 | hashHaystack += *(haystack + sl_minus_1); |
315 | if (hashHaystack == hashNeedle && *needle == *haystack |
316 | && memcmp(needle, haystack, sl) == 0) |
317 | return haystack - haystack0; |
318 | |
319 | REHASH(*haystack); |
320 | ++haystack; |
321 | } |
322 | return -1; |
323 | } |
324 | |
325 | /*! |
326 | \class QStaticByteArrayMatcherBase |
327 | \since 5.9 |
328 | \internal |
329 | \brief Non-template base class of QStaticByteArrayMatcher. |
330 | */ |
331 | |
332 | /*! |
333 | \class QStaticByteArrayMatcher |
334 | \since 5.9 |
335 | \inmodule QtCore |
336 | \brief The QStaticByteArrayMatcher class is a compile-time version of QByteArrayMatcher. |
337 | |
338 | \ingroup tools |
339 | \ingroup string-processing |
340 | |
341 | This class is useful when you have a sequence of bytes that you |
342 | want to repeatedly match against some byte arrays (perhaps in a |
343 | loop), or when you want to search for the same sequence of bytes |
344 | multiple times in the same byte array. Using a matcher object and |
345 | indexIn() is faster than matching a plain QByteArray with |
346 | QByteArray::indexOf(), in particular if repeated matching takes place. |
347 | |
348 | Unlike QByteArrayMatcher, this class calculates the internal |
349 | representation at \e{compile-time}, if your compiler supports |
350 | C++14-level \c{constexpr} (C++11 is not sufficient), so it can |
351 | even benefit if you are doing one-off byte array matches. |
352 | |
353 | Create the QStaticByteArrayMatcher by calling qMakeStaticByteArrayMatcher(), |
354 | passing it the C string literal you want to search for. Store the return |
355 | value of that function in a \c{static const auto} variable, so you don't need |
356 | to pass the \c{N} template parameter explicitly: |
357 | |
358 | \snippet code/src_corelib_text_qbytearraymatcher.cpp 0 |
359 | |
360 | Then call indexIn() on the QByteArray in which you want to search, just like |
361 | with QByteArrayMatcher. |
362 | |
363 | Since this class is designed to do all the up-front calculations at compile-time, |
364 | it does not offer a setPattern() method. |
365 | |
366 | \note Qt detects the necessary C++14 compiler support by way of the feature |
367 | test recommendations from |
368 | \l{https://isocpp.org/std/standing-documents/sd-6-sg10-feature-test-recommendations} |
369 | {C++ Committee's Standing Document 6}. |
370 | |
371 | \sa QByteArrayMatcher, QStringMatcher |
372 | */ |
373 | |
374 | /*! |
375 | \fn template <uint N> int QStaticByteArrayMatcher<N>::indexIn(const char *haystack, int hlen, int from = 0) const |
376 | |
377 | Searches the char string \a haystack, which has length \a hlen, from |
378 | byte position \a from (default 0, i.e. from the first byte), for |
379 | the byte array pattern() that was set in the constructor. |
380 | |
381 | Returns the position where the pattern() matched in \a haystack, or -1 if no match was found. |
382 | */ |
383 | |
384 | /*! |
385 | \fn template <uint N> int QStaticByteArrayMatcher<N>::indexIn(const QByteArray &haystack, int from = 0) const |
386 | |
387 | Searches the char string \a haystack, from byte position \a from |
388 | (default 0, i.e. from the first byte), for the byte array pattern() |
389 | that was set in the constructor. |
390 | |
391 | Returns the position where the pattern() matched in \a haystack, or -1 if no match was found. |
392 | */ |
393 | |
394 | /*! |
395 | \fn template <uint N> QByteArray QStaticByteArrayMatcher<N>::pattern() const |
396 | |
397 | Returns the byte array pattern that this byte array matcher will |
398 | search for. |
399 | |
400 | \sa QByteArrayMatcher::setPattern() |
401 | */ |
402 | |
403 | /*! |
404 | \internal |
405 | */ |
406 | int QStaticByteArrayMatcherBase::indexOfIn(const char *needle, uint nlen, const char *haystack, int hlen, int from) const noexcept |
407 | { |
408 | if (from < 0) |
409 | from = 0; |
410 | return bm_find(reinterpret_cast<const uchar *>(haystack), hlen, from, |
411 | reinterpret_cast<const uchar *>(needle), nlen, m_skiptable.data); |
412 | } |
413 | |
414 | /*! |
415 | \fn template <uint N> QStaticByteArrayMatcher<N>::QStaticByteArrayMatcher(const char (&pattern)[N]) |
416 | \internal |
417 | */ |
418 | |
419 | /*! |
420 | \fn template <uint N> QStaticByteArrayMatcher qMakeStaticByteArrayMatcher(const char (&pattern)[N]) |
421 | \since 5.9 |
422 | \relates QStaticByteArrayMatcher |
423 | |
424 | Return a QStaticByteArrayMatcher with the correct \c{N} determined |
425 | automatically from the \a pattern passed. |
426 | |
427 | To take full advantage of this function, assign the result to an |
428 | \c{auto} variable: |
429 | |
430 | \snippet code/src_corelib_text_qbytearraymatcher.cpp 1 |
431 | */ |
432 | |
433 | |
434 | QT_END_NAMESPACE |
435 | |