| 1 | /* |
| 2 | * Copyright (c) 2016-2017, Intel Corporation |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions are met: |
| 6 | * |
| 7 | * * Redistributions of source code must retain the above copyright notice, |
| 8 | * this list of conditions and the following disclaimer. |
| 9 | * * Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * * Neither the name of Intel Corporation nor the names of its contributors |
| 13 | * may be used to endorse or promote products derived from this software |
| 14 | * without specific prior written permission. |
| 15 | * |
| 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 20 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 21 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 22 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 23 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 24 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 25 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 26 | * POSSIBILITY OF SUCH DAMAGE. |
| 27 | */ |
| 28 | |
| 29 | #include "rose_build_long_lit.h" |
| 30 | |
| 31 | #include "rose_build_engine_blob.h" |
| 32 | #include "rose_build_impl.h" |
| 33 | #include "stream_long_lit_hash.h" |
| 34 | #include "util/bytecode_ptr.h" |
| 35 | #include "util/bitutils.h" |
| 36 | #include "util/verify_types.h" |
| 37 | #include "util/compile_context.h" |
| 38 | |
| 39 | #include <algorithm> |
| 40 | #include <numeric> |
| 41 | |
| 42 | using namespace std; |
| 43 | |
| 44 | namespace ue2 { |
| 45 | |
/** \brief Minimum size for a non-empty hash table. Must be a power of two. */
static constexpr size_t MIN_HASH_TABLE_SIZE = 128;

/** \brief Maximum load factor (between zero and one) for a hash table. */
static constexpr double MAX_HASH_TABLE_LOAD = 0.7;

/** \brief Minimum size (in bits) for a bloom filter. Must be a power of two. */
static constexpr u32 MIN_BLOOM_FILTER_SIZE = 256;

/** \brief Maximum load factor (between zero and one) for a bloom filter. */
static constexpr double MAX_BLOOM_FILTER_LOAD = 0.25;

/** \brief Statistics for the literals of one case mode (caseful or nocase). */
struct LongLitModeInfo {
    u32 num_literals = 0; //!< Number of strings for this mode.
    u32 hashed_positions = 0; //!< Number of hashable string positions.
};

/** \brief Statistics for the whole long literal set, split by case mode. */
struct LongLitInfo {
    LongLitModeInfo caseful; //!< Stats for case-sensitive literals.
    LongLitModeInfo nocase;  //!< Stats for caseless literals.
};
| 67 | |
| 68 | static |
| 69 | u32 roundUpToPowerOfTwo(u32 x) { |
| 70 | assert(x != 0); |
| 71 | u32 bits = lg2(x - 1) + 1; |
| 72 | assert(bits < 32); |
| 73 | return 1U << bits; |
| 74 | } |
| 75 | |
| 76 | static |
| 77 | LongLitInfo analyzeLongLits(const vector<ue2_case_string> &lits, |
| 78 | size_t max_len) { |
| 79 | LongLitInfo info; |
| 80 | |
| 81 | for (const auto &lit : lits) { |
| 82 | auto &lit_info = lit.nocase ? info.nocase : info.caseful; |
| 83 | assert(lit.s.size() > max_len); |
| 84 | lit_info.num_literals++; |
| 85 | lit_info.hashed_positions += lit.s.size() - max_len; |
| 86 | } |
| 87 | |
| 88 | DEBUG_PRINTF("case: hashed %u positions\n" , info.caseful.hashed_positions); |
| 89 | DEBUG_PRINTF("nocase: hashed %u positions\n" , info.nocase.hashed_positions); |
| 90 | |
| 91 | return info; |
| 92 | } |
| 93 | |
| 94 | static |
| 95 | void addToBloomFilter(vector<u8> &bloom, const u8 *substr, bool nocase) { |
| 96 | const u32 num_keys = verify_u32(bloom.size() * 8); |
| 97 | const u32 key_mask = (1U << lg2(num_keys)) -1; |
| 98 | |
| 99 | const auto hash_functions = { bloomHash_1, bloomHash_2, bloomHash_3 }; |
| 100 | for (const auto &hash_func : hash_functions) { |
| 101 | u32 hash = hash_func(substr, nocase); |
| 102 | u32 key = hash & key_mask; |
| 103 | DEBUG_PRINTF("set key %u (of %zu)\n" , key, bloom.size() * 8); |
| 104 | bloom[key / 8] |= 1U << (key % 8); |
| 105 | } |
| 106 | } |
| 107 | |
| 108 | static |
| 109 | size_t bloomOccupancy(const vector<u8> &bloom) { |
| 110 | return accumulate(begin(bloom), end(bloom), 0, |
| 111 | [](const size_t &sum, const u8 &elem) { |
| 112 | return sum + popcount32(elem); |
| 113 | }); |
| 114 | } |
| 115 | |
| 116 | static |
| 117 | double bloomLoad(const vector<u8> &bloom) { |
| 118 | return (double)bloomOccupancy(bloom) / (double)(bloom.size() * 8); |
| 119 | } |
| 120 | |
/**
 * \brief Build a bloom filter over every hashable substring position of the
 * literals belonging to the given case mode.
 *
 * \param lits        Full long literal set; literals of the other case mode
 *                    are skipped.
 * \param max_len     Length of the hashed substrings.
 * \param num_entries Filter size in bits; must be a multiple of 8 and a power
 *                    of two. May be zero, yielding an empty filter.
 * \param nocase      Case mode to include.
 */
static
vector<u8> buildBloomFilter(const vector<ue2_case_string> &lits, size_t max_len,
                            size_t num_entries, bool nocase) {
    assert(num_entries % 8 == 0);
    assert((num_entries & (num_entries - 1)) == 0); // Must be power of two.

    vector<u8> bloom(num_entries / 8, 0);

    if (!num_entries) {
        return bloom;
    }

    for (const auto &lit : lits) {
        if (nocase != lit.nocase) {
            continue;
        }
        // Add every max_len-byte substring beginning at offsets 1 through
        // (len - max_len). NOTE(review): offset 0 is deliberately skipped,
        // mirroring computeLitHashes() below — presumably that position is
        // covered by another matching mechanism; confirm against the runtime.
        for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) {
            const u8 *substr = (const u8 *)lit.s.c_str() + offset;
            addToBloomFilter(bloom, substr, nocase);
        }
    }

    DEBUG_PRINTF("%s bloom filter occupancy %zu of %zu entries\n",
                 nocase ? "nocase" : "caseful", bloomOccupancy(bloom),
                 num_entries);

    return bloom;
}
| 149 | |
| 150 | |
| 151 | static |
| 152 | vector<u8> makeBloomFilter(const vector<ue2_case_string> &lits, |
| 153 | size_t max_len, bool nocase) { |
| 154 | vector<u8> bloom; |
| 155 | |
| 156 | size_t num_entries = MIN_BLOOM_FILTER_SIZE; |
| 157 | for (;;) { |
| 158 | bloom = buildBloomFilter(lits, max_len, num_entries, nocase); |
| 159 | DEBUG_PRINTF("built %s bloom for %zu entries: load %f\n" , |
| 160 | nocase ? "nocase" : "caseful" , num_entries, |
| 161 | bloomLoad(bloom)); |
| 162 | if (bloomLoad(bloom) < MAX_BLOOM_FILTER_LOAD) { |
| 163 | break; |
| 164 | } |
| 165 | num_entries *= 2; |
| 166 | } |
| 167 | return bloom; |
| 168 | } |
| 169 | |
| 170 | static UNUSED |
| 171 | size_t hashTableOccupancy(const vector<RoseLongLitHashEntry> &tab) { |
| 172 | return count_if(begin(tab), end(tab), [](const RoseLongLitHashEntry &ent) { |
| 173 | return ent.str_offset != 0; |
| 174 | }); |
| 175 | } |
| 176 | |
| 177 | static UNUSED |
| 178 | double hashTableLoad(const vector<RoseLongLitHashEntry> &tab) { |
| 179 | return (double)hashTableOccupancy(tab) / (double)(tab.size()); |
| 180 | } |
| 181 | |
/** \brief (lit_id, offset) pairs for the literal positions that share one
 * hash value; inline capacity of one covers the common no-collision case. */
using LitOffsetVector = small_vector<pair<u32, u32>, 1>;
| 183 | |
/**
 * \brief Lay the hashed literal positions out into an open-addressed hash
 * table, resolving collisions by linear probing.
 *
 * A zero str_offset marks an empty slot: literal string data is laid out at
 * non-zero offsets (see the assert below), so a real entry can never carry
 * str_offset == 0.
 *
 * \param max_len            Length of the hashed substrings.
 * \param litToOffsetVal     Byte offset of each literal's string data.
 * \param hashToLitOffPairs  Map from hash value to its (lit_id, offset) pairs.
 * \param numEntries         Table size in entries; may be zero.
 */
static
vector<RoseLongLitHashEntry> buildHashTable(
        size_t max_len, const vector<u32> &litToOffsetVal,
        const map<u32, LitOffsetVector> &hashToLitOffPairs,
        size_t numEntries) {
    vector<RoseLongLitHashEntry> tab(numEntries, {0,0});

    if (!numEntries) {
        return tab;
    }

    for (const auto &m : hashToLitOffPairs) {
        u32 hash = m.first;
        const LitOffsetVector &d = m.second;

        u32 bucket = hash % numEntries;

        // Placement via linear probing.
        for (const auto &lit_offset : d) {
            // Scan forward from the home bucket for the first free slot.
            while (tab[bucket].str_offset != 0) {
                bucket++;
                if (bucket == numEntries) {
                    bucket = 0; // Wrap around to the start of the table.
                }
            }

            u32 lit_id = lit_offset.first;
            u32 offset = lit_offset.second;

            DEBUG_PRINTF("hash 0x%08x lit_id %u offset %u bucket %u\n", hash,
                         lit_id, offset, bucket);

            auto &entry = tab[bucket];
            entry.str_offset = verify_u32(litToOffsetVal.at(lit_id));
            assert(entry.str_offset != 0);
            // Total prefix length covered by this position: the hashed
            // substring ends max_len bytes past its offset into the literal.
            entry.str_len = offset + max_len;
        }
    }

    DEBUG_PRINTF("hash table occupancy %zu of %zu entries\n",
                 hashTableOccupancy(tab), numEntries);

    return tab;
}
| 228 | |
/**
 * \brief Hash every hashable substring position of the literals in the given
 * case mode, grouping (lit_id, offset) pairs by hash value.
 *
 * Each literal contributes one position per offset in [1, len - max_len]; the
 * hashed substring is the max_len bytes starting at that offset. Colliding
 * groups are deduplicated (identical prefixes) and ordered so that the
 * position furthest from the literal's start comes first.
 */
static
map<u32, LitOffsetVector> computeLitHashes(const vector<ue2_case_string> &lits,
                                           size_t max_len, bool nocase) {
    map<u32, LitOffsetVector> hashToLitOffPairs;

    for (u32 lit_id = 0; lit_id < lits.size(); lit_id++) {
        const ue2_case_string &lit = lits[lit_id];
        if (nocase != lit.nocase) {
            continue;
        }
        for (u32 offset = 1; offset < lit.s.size() - max_len + 1; offset++) {
            const u8 *substr = (const u8 *)lit.s.c_str() + offset;
            u32 hash = hashLongLiteral(substr, max_len, lit.nocase);
            hashToLitOffPairs[hash].emplace_back(lit_id, offset);
        }
    }

    for (auto &m : hashToLitOffPairs) {
        LitOffsetVector &d = m.second;
        if (d.size() == 1) {
            continue; // No collisions: nothing to dedupe or order.
        }

        // Sort by (offset, string) so that we'll be able to remove identical
        // string prefixes.
        stable_sort(begin(d), end(d),
                    [&](const pair<u32, u32> &a, const pair<u32, u32> &b) {
                        const auto &str_a = lits[a.first].s;
                        const auto &str_b = lits[b.first].s;
                        return tie(a.second, str_a) < tie(b.second, str_b);
                    });

        // Remove entries that point to the same literal prefix: two positions
        // at the same offset are interchangeable when the first
        // (max_len + offset) bytes of their literals are identical.
        d.erase(unique(begin(d), end(d),
                       [&](const pair<u32, u32> &a, const pair<u32, u32> &b) {
                           if (a.second != b.second) {
                               return false;
                           }
                           const auto &str_a = lits[a.first].s;
                           const auto &str_b = lits[b.first].s;
                           const size_t len = max_len + a.second;
                           return equal(begin(str_a), begin(str_a) + len,
                                        begin(str_b));
                       }),
                end(d));

        // Sort d by distance of the residual string (len minus our depth into
        // the string). We need to put the 'furthest back' string first.
        stable_sort(begin(d), end(d),
                    [](const pair<u32, u32> &a, const pair<u32, u32> &b) {
                        if (a.second != b.second) {
                            return a.second > b.second; /* longest is first */
                        }
                        return a.first < b.first;
                    });
    }

    return hashToLitOffPairs;
}
| 288 | |
| 289 | static |
| 290 | vector<RoseLongLitHashEntry> makeHashTable(const vector<ue2_case_string> &lits, |
| 291 | size_t max_len, |
| 292 | const vector<u32> &litToOffsetVal, |
| 293 | u32 numPositions, bool nocase) { |
| 294 | // Compute lit substring hashes. |
| 295 | const auto hashToLitOffPairs = computeLitHashes(lits, max_len, nocase); |
| 296 | |
| 297 | // Compute the size of the hash table: we need enough entries to satisfy |
| 298 | // our max load constraint, and it must be a power of two. |
| 299 | size_t num_entries = (double)numPositions / MAX_HASH_TABLE_LOAD + 1; |
| 300 | num_entries = roundUpToPowerOfTwo(max(MIN_HASH_TABLE_SIZE, num_entries)); |
| 301 | |
| 302 | auto tab = buildHashTable(max_len, litToOffsetVal, hashToLitOffPairs, |
| 303 | num_entries); |
| 304 | DEBUG_PRINTF("built %s hash table for %zu entries: load %f\n" , |
| 305 | nocase ? "nocase" : "caseful" , num_entries, |
| 306 | hashTableLoad(tab)); |
| 307 | assert(hashTableLoad(tab) < MAX_HASH_TABLE_LOAD); |
| 308 | |
| 309 | return tab; |
| 310 | } |
| 311 | |
| 312 | static |
| 313 | vector<u8> buildLits(const vector<ue2_case_string> &lits, u32 baseOffset, |
| 314 | vector<u32> &litToOffsetVal) { |
| 315 | vector<u8> blob; |
| 316 | litToOffsetVal.resize(lits.size(), 0); |
| 317 | |
| 318 | u32 lit_id = 0; |
| 319 | for (const auto &lit : lits) { |
| 320 | u32 offset = baseOffset + verify_u32(blob.size()); |
| 321 | blob.insert(blob.end(), begin(lit.s), end(lit.s)); |
| 322 | litToOffsetVal[lit_id] = offset; |
| 323 | lit_id++; |
| 324 | } |
| 325 | |
| 326 | DEBUG_PRINTF("built %zu bytes of strings\n" , blob.size()); |
| 327 | return blob; |
| 328 | } |
| 329 | |
| 330 | u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob, |
| 331 | vector<ue2_case_string> &lits, |
| 332 | size_t longLitLengthThreshold, |
| 333 | size_t *historyRequired, |
| 334 | size_t *longLitStreamStateRequired) { |
| 335 | // Work in terms of history requirement (i.e. literal len - 1). |
| 336 | const size_t max_len = longLitLengthThreshold - 1; |
| 337 | |
| 338 | // We should only be building the long literal hash table in streaming mode. |
| 339 | if (!build.cc.streaming) { |
| 340 | return 0; |
| 341 | } |
| 342 | |
| 343 | if (lits.empty()) { |
| 344 | DEBUG_PRINTF("no long literals\n" ); |
| 345 | return 0; |
| 346 | } |
| 347 | |
| 348 | // The last char of each literal is trimmed as we're not interested in full |
| 349 | // matches, only partial matches. |
| 350 | for (auto &lit : lits) { |
| 351 | assert(!lit.s.empty()); |
| 352 | lit.s.pop_back(); |
| 353 | } |
| 354 | |
| 355 | // Sort by caseful/caseless and in lexicographical order. |
| 356 | stable_sort(begin(lits), end(lits), [](const ue2_case_string &a, |
| 357 | const ue2_case_string &b) { |
| 358 | if (a.nocase != b.nocase) { |
| 359 | return a.nocase < b.nocase; |
| 360 | } |
| 361 | return a.s < b.s; |
| 362 | }); |
| 363 | |
| 364 | // Find literals that are prefixes of other literals (including |
| 365 | // duplicates). Note that we iterate in reverse, since we want to retain |
| 366 | // only the longest string from a set of prefixes. |
| 367 | auto it = unique(lits.rbegin(), lits.rend(), [](const ue2_case_string &a, |
| 368 | const ue2_case_string &b) { |
| 369 | return a.nocase == b.nocase && a.s.size() >= b.s.size() && |
| 370 | equal(b.s.begin(), b.s.end(), a.s.begin()); |
| 371 | }); |
| 372 | |
| 373 | // Erase dupes found by unique(). |
| 374 | lits.erase(lits.begin(), it.base()); |
| 375 | |
| 376 | LongLitInfo info = analyzeLongLits(lits, max_len); |
| 377 | |
| 378 | vector<u32> litToOffsetVal; |
| 379 | const size_t = ROUNDUP_16(sizeof(RoseLongLitTable)); |
| 380 | vector<u8> lit_blob = buildLits(lits, headerSize, litToOffsetVal); |
| 381 | |
| 382 | // Build caseful bloom filter and hash table. |
| 383 | vector<u8> bloom_case; |
| 384 | vector<RoseLongLitHashEntry> tab_case; |
| 385 | if (info.caseful.num_literals) { |
| 386 | bloom_case = makeBloomFilter(lits, max_len, false); |
| 387 | tab_case = makeHashTable(lits, max_len, litToOffsetVal, |
| 388 | info.caseful.hashed_positions, false); |
| 389 | } |
| 390 | |
| 391 | // Build nocase bloom filter and hash table. |
| 392 | vector<u8> bloom_nocase; |
| 393 | vector<RoseLongLitHashEntry> tab_nocase; |
| 394 | if (info.nocase.num_literals) { |
| 395 | bloom_nocase = makeBloomFilter(lits, max_len, true); |
| 396 | tab_nocase = makeHashTable(lits, max_len, litToOffsetVal, |
| 397 | info.nocase.hashed_positions, true); |
| 398 | } |
| 399 | |
| 400 | size_t wholeLitTabSize = ROUNDUP_16(byte_length(lit_blob)); |
| 401 | size_t htOffsetCase = headerSize + wholeLitTabSize; |
| 402 | size_t htOffsetNocase = htOffsetCase + byte_length(tab_case); |
| 403 | size_t bloomOffsetCase = htOffsetNocase + byte_length(tab_nocase); |
| 404 | size_t bloomOffsetNocase = bloomOffsetCase + byte_length(bloom_case); |
| 405 | |
| 406 | size_t tabSize = ROUNDUP_16(bloomOffsetNocase + byte_length(bloom_nocase)); |
| 407 | |
| 408 | // need to add +2 to both of these to allow space for the actual largest |
| 409 | // value as well as handling the fact that we add one to the space when |
| 410 | // storing out a position to allow zero to mean "no stream state value" |
| 411 | u8 streamBitsCase = lg2(roundUpToPowerOfTwo(tab_case.size() + 2)); |
| 412 | u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(tab_nocase.size() + 2)); |
| 413 | u32 tot_state_bytes = ROUNDUP_N(streamBitsCase + streamBitsNocase, 8) / 8; |
| 414 | |
| 415 | auto table = make_zeroed_bytecode_ptr<char>(tabSize, 16); |
| 416 | assert(table); // otherwise would have thrown std::bad_alloc |
| 417 | |
| 418 | // Fill in the RoseLongLitTable header structure. |
| 419 | RoseLongLitTable * = (RoseLongLitTable *)(table.get()); |
| 420 | header->size = verify_u32(tabSize); |
| 421 | header->maxLen = verify_u8(max_len); // u8 so doesn't matter; won't go > 255 |
| 422 | header->caseful.hashOffset = verify_u32(htOffsetCase); |
| 423 | header->caseful.hashBits = lg2(tab_case.size()); |
| 424 | header->caseful.streamStateBits = streamBitsCase; |
| 425 | header->caseful.bloomOffset = verify_u32(bloomOffsetCase); |
| 426 | header->caseful.bloomBits = lg2(bloom_case.size() * 8); |
| 427 | header->nocase.hashOffset = verify_u32(htOffsetNocase); |
| 428 | header->nocase.hashBits = lg2(tab_nocase.size()); |
| 429 | header->nocase.streamStateBits = streamBitsNocase; |
| 430 | header->nocase.bloomOffset = verify_u32(bloomOffsetNocase); |
| 431 | header->nocase.bloomBits = lg2(bloom_nocase.size() * 8); |
| 432 | assert(tot_state_bytes < sizeof(u64a)); |
| 433 | header->streamStateBytes = verify_u8(tot_state_bytes); // u8 |
| 434 | |
| 435 | // Copy in the literal strings, hash tables and bloom filters, |
| 436 | copy_bytes(table.get() + headerSize, lit_blob); |
| 437 | copy_bytes(table.get() + htOffsetCase, tab_case); |
| 438 | copy_bytes(table.get() + bloomOffsetCase, bloom_case); |
| 439 | copy_bytes(table.get() + htOffsetNocase, tab_nocase); |
| 440 | copy_bytes(table.get() + bloomOffsetNocase, bloom_nocase); |
| 441 | |
| 442 | DEBUG_PRINTF("built streaming table, size=%zu\n" , tabSize); |
| 443 | DEBUG_PRINTF("requires %zu bytes of history\n" , max_len); |
| 444 | DEBUG_PRINTF("requires %u bytes of stream state\n" , tot_state_bytes); |
| 445 | |
| 446 | *historyRequired = max(*historyRequired, max_len); |
| 447 | *longLitStreamStateRequired = tot_state_bytes; |
| 448 | |
| 449 | return blob.add(table); |
| 450 | } |
| 451 | |
| 452 | } // namespace ue2 |
| 453 | |