1 | #pragma once |
2 | #include <Columns/ColumnString.h> |
3 | #include <Columns/ColumnVector.h> |
4 | #include <Columns/IColumn.h> |
5 | #include <DataStreams/IBlockInputStream.h> |
6 | #include <DataTypes/DataTypeDate.h> |
7 | #include <DataTypes/DataTypesNumber.h> |
8 | #include <ext/range.h> |
9 | #include "DictionaryBlockInputStreamBase.h" |
10 | #include "DictionaryStructure.h" |
11 | #include "IDictionary.h" |
12 | #include "RangeHashedDictionary.h" |
13 | |
14 | namespace DB |
15 | { |
16 | /* |
17 | * BlockInputStream implementation for external dictionaries |
18 | * read() returns single block consisting of the in-memory contents of the dictionaries |
19 | */ |
20 | template <typename DictionaryType, typename RangeType, typename Key> |
21 | class RangeDictionaryBlockInputStream : public DictionaryBlockInputStreamBase |
22 | { |
23 | public: |
24 | using DictionaryPtr = std::shared_ptr<DictionaryType const>; |
25 | |
26 | RangeDictionaryBlockInputStream( |
27 | DictionaryPtr dictionary, |
28 | size_t max_block_size, |
29 | const Names & column_names, |
30 | PaddedPODArray<Key> && ids_to_fill, |
31 | PaddedPODArray<RangeType> && start_dates, |
32 | PaddedPODArray<RangeType> && end_dates); |
33 | |
34 | String getName() const override { return "RangeDictionary" ; } |
35 | |
36 | protected: |
37 | Block getBlock(size_t start, size_t length) const override; |
38 | |
39 | private: |
40 | template <typename Type> |
41 | using DictionaryGetter = void (DictionaryType::*)( |
42 | const std::string &, const PaddedPODArray<Key> &, const PaddedPODArray<Int64> &, PaddedPODArray<Type> &) const; |
43 | |
44 | template <typename Type> |
45 | using DictionaryDecimalGetter = void (DictionaryType::*)( |
46 | const std::string &, const PaddedPODArray<Key> &, const PaddedPODArray<Int64> &, DecimalPaddedPODArray<Type> &) const; |
47 | |
48 | template <typename AttributeType, typename Getter> |
49 | ColumnPtr getColumnFromAttribute( |
50 | Getter getter, |
51 | const PaddedPODArray<Key> & ids_to_fill, |
52 | const PaddedPODArray<Int64> & dates, |
53 | const DictionaryAttribute & attribute, |
54 | const DictionaryType & concrete_dictionary) const; |
55 | ColumnPtr getColumnFromAttributeString( |
56 | const PaddedPODArray<Key> & ids_to_fill, |
57 | const PaddedPODArray<Int64> & dates, |
58 | const DictionaryAttribute & attribute, |
59 | const DictionaryType & concrete_dictionary) const; |
60 | template <typename T> |
61 | ColumnPtr getColumnFromPODArray(const PaddedPODArray<T> & array) const; |
62 | |
63 | template <typename DictionarySpecialAttributeType, typename T> |
64 | void addSpecialColumn( |
65 | const std::optional<DictionarySpecialAttributeType> & attribute, |
66 | DataTypePtr type, |
67 | const std::string & default_name, |
68 | const std::unordered_set<std::string> & column_names_set, |
69 | const PaddedPODArray<T> & values, |
70 | ColumnsWithTypeAndName & columns) const; |
71 | |
72 | Block fillBlock( |
73 | const PaddedPODArray<Key> & ids_to_fill, |
74 | const PaddedPODArray<RangeType> & block_start_dates, |
75 | const PaddedPODArray<RangeType> & block_end_dates) const; |
76 | |
77 | PaddedPODArray<Int64> |
78 | makeDateKey(const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const; |
79 | |
80 | DictionaryPtr dictionary; |
81 | Names column_names; |
82 | PaddedPODArray<Key> ids; |
83 | PaddedPODArray<RangeType> start_dates; |
84 | PaddedPODArray<RangeType> end_dates; |
85 | }; |
86 | |
87 | |
88 | template <typename DictionaryType, typename RangeType, typename Key> |
89 | RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::RangeDictionaryBlockInputStream( |
90 | DictionaryPtr dictionary_, |
91 | size_t max_block_size_, |
92 | const Names & column_names_, |
93 | PaddedPODArray<Key> && ids_, |
94 | PaddedPODArray<RangeType> && block_start_dates, |
95 | PaddedPODArray<RangeType> && block_end_dates) |
96 | : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) |
97 | , dictionary(dictionary_) |
98 | , column_names(column_names_) |
99 | , ids(std::move(ids_)) |
100 | , start_dates(std::move(block_start_dates)) |
101 | , end_dates(std::move(block_end_dates)) |
102 | { |
103 | } |
104 | |
105 | template <typename DictionaryType, typename RangeType, typename Key> |
106 | Block RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getBlock(size_t start, size_t length) const |
107 | { |
108 | PaddedPODArray<Key> block_ids; |
109 | PaddedPODArray<RangeType> block_start_dates; |
110 | PaddedPODArray<RangeType> block_end_dates; |
111 | block_ids.reserve(length); |
112 | block_start_dates.reserve(length); |
113 | block_end_dates.reserve(length); |
114 | |
115 | for (auto idx : ext::range(start, start + length)) |
116 | { |
117 | block_ids.push_back(ids[idx]); |
118 | block_start_dates.push_back(start_dates[idx]); |
119 | block_end_dates.push_back(end_dates[idx]); |
120 | } |
121 | |
122 | return fillBlock(block_ids, block_start_dates, block_end_dates); |
123 | } |
124 | |
125 | template <typename DictionaryType, typename RangeType, typename Key> |
126 | template <typename AttributeType, typename Getter> |
127 | ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromAttribute( |
128 | Getter getter, |
129 | const PaddedPODArray<Key> & ids_to_fill, |
130 | const PaddedPODArray<Int64> & dates, |
131 | const DictionaryAttribute & attribute, |
132 | const DictionaryType & concrete_dictionary) const |
133 | { |
134 | if constexpr (IsDecimalNumber<AttributeType>) |
135 | { |
136 | auto column = ColumnDecimal<AttributeType>::create(ids_to_fill.size(), 0); /// NOTE: There's wrong scale here, but it's unused. |
137 | (concrete_dictionary.*getter)(attribute.name, ids_to_fill, dates, column->getData()); |
138 | return column; |
139 | } |
140 | else |
141 | { |
142 | auto column_vector = ColumnVector<AttributeType>::create(ids_to_fill.size()); |
143 | (concrete_dictionary.*getter)(attribute.name, ids_to_fill, dates, column_vector->getData()); |
144 | return column_vector; |
145 | } |
146 | } |
147 | |
148 | template <typename DictionaryType, typename RangeType, typename Key> |
149 | ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromAttributeString( |
150 | const PaddedPODArray<Key> & ids_to_fill, |
151 | const PaddedPODArray<Int64> & dates, |
152 | const DictionaryAttribute & attribute, |
153 | const DictionaryType & concrete_dictionary) const |
154 | { |
155 | auto column_string = ColumnString::create(); |
156 | concrete_dictionary.getString(attribute.name, ids_to_fill, dates, column_string.get()); |
157 | return column_string; |
158 | } |
159 | |
160 | template <typename DictionaryType, typename RangeType, typename Key> |
161 | template <typename T> |
162 | ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromPODArray(const PaddedPODArray<T> & array) const |
163 | { |
164 | auto column_vector = ColumnVector<T>::create(); |
165 | column_vector->getData().reserve(array.size()); |
166 | for (T value : array) |
167 | column_vector->insertValue(value); |
168 | return column_vector; |
169 | } |
170 | |
171 | |
172 | template <typename DictionaryType, typename RangeType, typename Key> |
173 | template <typename DictionarySpecialAttributeType, typename T> |
174 | void RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::addSpecialColumn( |
175 | const std::optional<DictionarySpecialAttributeType> & attribute, |
176 | DataTypePtr type, |
177 | const std::string & default_name, |
178 | const std::unordered_set<std::string> & column_names_set, |
179 | const PaddedPODArray<T> & values, |
180 | ColumnsWithTypeAndName & columns) const |
181 | { |
182 | std::string name = default_name; |
183 | if (attribute) |
184 | name = attribute->name; |
185 | |
186 | if (column_names_set.find(name) != column_names_set.end()) |
187 | columns.emplace_back(getColumnFromPODArray(values), type, name); |
188 | } |
189 | |
190 | template <typename DictionaryType, typename RangeType, typename Key> |
191 | PaddedPODArray<Int64> RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::makeDateKey( |
192 | const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const |
193 | { |
194 | PaddedPODArray<Int64> key(block_start_dates.size()); |
195 | for (size_t i = 0; i < key.size(); ++i) |
196 | { |
197 | if (RangeHashedDictionary::Range::isCorrectDate(block_start_dates[i])) |
198 | key[i] = block_start_dates[i]; |
199 | else |
200 | key[i] = block_end_dates[i]; |
201 | } |
202 | |
203 | return key; |
204 | } |
205 | |
206 | |
207 | template <typename DictionaryType, typename RangeType, typename Key> |
208 | Block RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::fillBlock( |
209 | const PaddedPODArray<Key> & ids_to_fill, |
210 | const PaddedPODArray<RangeType> & block_start_dates, |
211 | const PaddedPODArray<RangeType> & block_end_dates) const |
212 | { |
213 | ColumnsWithTypeAndName columns; |
214 | const DictionaryStructure & structure = dictionary->getStructure(); |
215 | |
216 | std::unordered_set<std::string> names(column_names.begin(), column_names.end()); |
217 | |
218 | addSpecialColumn(structure.id, std::make_shared<DataTypeUInt64>(), "ID" , names, ids_to_fill, columns); |
219 | addSpecialColumn(structure.range_min, structure.range_max->type, "Range Start" , names, block_start_dates, columns); |
220 | addSpecialColumn(structure.range_max, structure.range_max->type, "Range End" , names, block_end_dates, columns); |
221 | |
222 | auto date_key = makeDateKey(block_start_dates, block_end_dates); |
223 | |
224 | for (const auto idx : ext::range(0, structure.attributes.size())) |
225 | { |
226 | const DictionaryAttribute & attribute = structure.attributes[idx]; |
227 | if (names.find(attribute.name) != names.end()) |
228 | { |
229 | ColumnPtr column; |
230 | #define GET_COLUMN_FORM_ATTRIBUTE(TYPE) \ |
231 | column = getColumnFromAttribute<TYPE>(&DictionaryType::get##TYPE, ids_to_fill, date_key, attribute, *dictionary) |
232 | switch (attribute.underlying_type) |
233 | { |
234 | case AttributeUnderlyingType::utUInt8: |
235 | GET_COLUMN_FORM_ATTRIBUTE(UInt8); |
236 | break; |
237 | case AttributeUnderlyingType::utUInt16: |
238 | GET_COLUMN_FORM_ATTRIBUTE(UInt16); |
239 | break; |
240 | case AttributeUnderlyingType::utUInt32: |
241 | GET_COLUMN_FORM_ATTRIBUTE(UInt32); |
242 | break; |
243 | case AttributeUnderlyingType::utUInt64: |
244 | GET_COLUMN_FORM_ATTRIBUTE(UInt64); |
245 | break; |
246 | case AttributeUnderlyingType::utUInt128: |
247 | GET_COLUMN_FORM_ATTRIBUTE(UInt128); |
248 | break; |
249 | case AttributeUnderlyingType::utInt8: |
250 | GET_COLUMN_FORM_ATTRIBUTE(Int8); |
251 | break; |
252 | case AttributeUnderlyingType::utInt16: |
253 | GET_COLUMN_FORM_ATTRIBUTE(Int16); |
254 | break; |
255 | case AttributeUnderlyingType::utInt32: |
256 | GET_COLUMN_FORM_ATTRIBUTE(Int32); |
257 | break; |
258 | case AttributeUnderlyingType::utInt64: |
259 | GET_COLUMN_FORM_ATTRIBUTE(Int64); |
260 | break; |
261 | case AttributeUnderlyingType::utFloat32: |
262 | GET_COLUMN_FORM_ATTRIBUTE(Float32); |
263 | break; |
264 | case AttributeUnderlyingType::utFloat64: |
265 | GET_COLUMN_FORM_ATTRIBUTE(Float64); |
266 | break; |
267 | case AttributeUnderlyingType::utDecimal32: |
268 | GET_COLUMN_FORM_ATTRIBUTE(Decimal32); |
269 | break; |
270 | case AttributeUnderlyingType::utDecimal64: |
271 | GET_COLUMN_FORM_ATTRIBUTE(Decimal64); |
272 | break; |
273 | case AttributeUnderlyingType::utDecimal128: |
274 | GET_COLUMN_FORM_ATTRIBUTE(Decimal128); |
275 | break; |
276 | case AttributeUnderlyingType::utString: |
277 | column = getColumnFromAttributeString(ids_to_fill, date_key, attribute, *dictionary); |
278 | break; |
279 | } |
280 | #undef GET_COLUMN_FORM_ATTRIBUTE |
281 | columns.emplace_back(column, attribute.type, attribute.name); |
282 | } |
283 | } |
284 | return Block(columns); |
285 | } |
286 | |
287 | } |
288 | |