1#pragma once
2
3#include <Interpreters/BloomFilter.h>
4#include <Storages/MergeTree/MergeTreeIndices.h>
5#include <Storages/MergeTree/KeyCondition.h>
6
7#include <memory>
8
9
10namespace DB
11{
12
13class MergeTreeIndexFullText;
14
15
16struct MergeTreeIndexGranuleFullText : public IMergeTreeIndexGranule
17{
18 explicit MergeTreeIndexGranuleFullText(
19 const MergeTreeIndexFullText & index_);
20
21 ~MergeTreeIndexGranuleFullText() override = default;
22
23 void serializeBinary(WriteBuffer & ostr) const override;
24 void deserializeBinary(ReadBuffer & istr) override;
25
26 bool empty() const override { return !has_elems; }
27
28 const MergeTreeIndexFullText & index;
29 std::vector<BloomFilter> bloom_filters;
30 bool has_elems;
31};
32
33using MergeTreeIndexGranuleFullTextPtr = std::shared_ptr<MergeTreeIndexGranuleFullText>;
34
35
36struct MergeTreeIndexAggregatorFullText : IMergeTreeIndexAggregator
37{
38 explicit MergeTreeIndexAggregatorFullText(const MergeTreeIndexFullText & index);
39
40 ~MergeTreeIndexAggregatorFullText() override = default;
41
42 bool empty() const override { return !granule || granule->empty(); }
43 MergeTreeIndexGranulePtr getGranuleAndReset() override;
44
45 void update(const Block & block, size_t * pos, size_t limit) override;
46
47 const MergeTreeIndexFullText & index;
48 MergeTreeIndexGranuleFullTextPtr granule;
49};
50
51
52class MergeTreeConditionFullText : public IMergeTreeIndexCondition
53{
54public:
55 MergeTreeConditionFullText(
56 const SelectQueryInfo & query_info,
57 const Context & context,
58 const MergeTreeIndexFullText & index_);
59
60 ~MergeTreeConditionFullText() override = default;
61
62 bool alwaysUnknownOrTrue() const override;
63
64 bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override;
65private:
66 struct KeyTuplePositionMapping
67 {
68 KeyTuplePositionMapping(size_t tuple_index_, size_t key_index_) : tuple_index(tuple_index_), key_index(key_index_) {}
69
70 size_t tuple_index;
71 size_t key_index;
72 };
73 /// Uses RPN like KeyCondition
74 struct RPNElement
75 {
76 enum Function
77 {
78 /// Atoms of a Boolean expression.
79 FUNCTION_EQUALS,
80 FUNCTION_NOT_EQUALS,
81 FUNCTION_IN,
82 FUNCTION_NOT_IN,
83 FUNCTION_MULTI_SEARCH,
84 FUNCTION_UNKNOWN, /// Can take any value.
85 /// Operators of the logical expression.
86 FUNCTION_NOT,
87 FUNCTION_AND,
88 FUNCTION_OR,
89 /// Constants
90 ALWAYS_FALSE,
91 ALWAYS_TRUE,
92 };
93
94 RPNElement(
95 Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr<BloomFilter> && const_bloom_filter_ = nullptr)
96 : function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}
97
98 Function function = FUNCTION_UNKNOWN;
99 /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS and FUNCTION_MULTI_SEARCH
100 size_t key_column;
101
102 /// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
103 std::unique_ptr<BloomFilter> bloom_filter;
104
105 /// For FUNCTION_IN, FUNCTION_NOT_IN and FUNCTION_MULTI_SEARCH
106 std::vector<std::vector<BloomFilter>> set_bloom_filters;
107
108 /// For FUNCTION_IN and FUNCTION_NOT_IN
109 std::vector<size_t> set_key_position;
110 };
111
112 using AtomMap = std::unordered_map<std::string, bool(*)(RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)>;
113 using RPN = std::vector<RPNElement>;
114
115 bool atomFromAST(const ASTPtr & node, Block & block_with_constants, RPNElement & out);
116
117 bool getKey(const ASTPtr & node, size_t & key_column_num);
118 bool tryPrepareSetBloomFilter(const ASTs & args, RPNElement & out);
119
120 static bool createFunctionEqualsCondition(RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx);
121
122 static const AtomMap atom_map;
123
124 const MergeTreeIndexFullText & index;
125 RPN rpn;
126 /// Sets from syntax analyzer.
127 PreparedSets prepared_sets;
128};
129
130
131/// Interface for string parsers.
132struct ITokenExtractor
133{
134 virtual ~ITokenExtractor() = default;
135 /// Fast inplace implementation for regular use.
136 /// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
137 /// Returns false if parsing is finished, otherwise returns true.
138 virtual bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const = 0;
139 /// Special implementation for creating bloom filter for LIKE function.
140 /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
141 virtual bool nextLike(const String & str, size_t * pos, String & out) const = 0;
142
143 virtual bool supportLike() const = 0;
144};
145
146/// Parser extracting all ngrams from string.
147struct NgramTokenExtractor : public ITokenExtractor
148{
149 NgramTokenExtractor(size_t n_) : n(n_) {}
150
151 static String getName() { return "ngrambf_v1"; }
152
153 bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override;
154 bool nextLike(const String & str, size_t * pos, String & token) const override;
155
156 bool supportLike() const override { return true; }
157
158 size_t n;
159};
160
161/// Parser extracting tokens (sequences of numbers and ascii letters).
162struct SplitTokenExtractor : public ITokenExtractor
163{
164 static String getName() { return "tokenbf_v1"; }
165
166 bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const override;
167 bool nextLike(const String & str, size_t * pos, String & token) const override;
168
169 bool supportLike() const override { return true; }
170};
171
172
173class MergeTreeIndexFullText : public IMergeTreeIndex
174{
175public:
176 MergeTreeIndexFullText(
177 String name_,
178 ExpressionActionsPtr expr_,
179 const Names & columns_,
180 const DataTypes & data_types_,
181 const Block & header_,
182 size_t granularity_,
183 size_t bloom_filter_size_,
184 size_t bloom_filter_hashes_,
185 size_t seed_,
186 std::unique_ptr<ITokenExtractor> && token_extractor_func_)
187 : IMergeTreeIndex(name_, expr_, columns_, data_types_, header_, granularity_)
188 , bloom_filter_size(bloom_filter_size_)
189 , bloom_filter_hashes(bloom_filter_hashes_)
190 , seed(seed_)
191 , token_extractor_func(std::move(token_extractor_func_)) {}
192
193 ~MergeTreeIndexFullText() override = default;
194
195 MergeTreeIndexGranulePtr createIndexGranule() const override;
196 MergeTreeIndexAggregatorPtr createIndexAggregator() const override;
197
198 MergeTreeIndexConditionPtr createIndexCondition(
199 const SelectQueryInfo & query, const Context & context) const override;
200
201 bool mayBenefitFromIndexForIn(const ASTPtr & node) const override;
202
203 /// Bloom filter size in bytes.
204 size_t bloom_filter_size;
205 /// Number of bloom filter hash functions.
206 size_t bloom_filter_hashes;
207 /// Bloom filter seed.
208 size_t seed;
209 /// Function for selecting next token.
210 std::unique_ptr<ITokenExtractor> token_extractor_func;
211};
212
213}
214