| 1 | #pragma once |
| 2 | #include <Columns/ColumnString.h> |
| 3 | #include <Columns/ColumnVector.h> |
| 4 | #include <Columns/IColumn.h> |
| 5 | #include <DataStreams/IBlockInputStream.h> |
| 6 | #include <DataTypes/DataTypeDate.h> |
| 7 | #include <DataTypes/DataTypesNumber.h> |
| 8 | #include <ext/range.h> |
| 9 | #include "DictionaryBlockInputStreamBase.h" |
| 10 | #include "DictionaryStructure.h" |
| 11 | #include "IDictionary.h" |
| 12 | #include "RangeHashedDictionary.h" |
| 13 | |
| 14 | namespace DB |
| 15 | { |
| 16 | /* |
| 17 | * BlockInputStream implementation for external dictionaries |
| 18 | * read() returns single block consisting of the in-memory contents of the dictionaries |
| 19 | */ |
| 20 | template <typename DictionaryType, typename RangeType, typename Key> |
| 21 | class RangeDictionaryBlockInputStream : public DictionaryBlockInputStreamBase |
| 22 | { |
| 23 | public: |
| 24 | using DictionaryPtr = std::shared_ptr<DictionaryType const>; |
| 25 | |
| 26 | RangeDictionaryBlockInputStream( |
| 27 | DictionaryPtr dictionary, |
| 28 | size_t max_block_size, |
| 29 | const Names & column_names, |
| 30 | PaddedPODArray<Key> && ids_to_fill, |
| 31 | PaddedPODArray<RangeType> && start_dates, |
| 32 | PaddedPODArray<RangeType> && end_dates); |
| 33 | |
| 34 | String getName() const override { return "RangeDictionary" ; } |
| 35 | |
| 36 | protected: |
| 37 | Block getBlock(size_t start, size_t length) const override; |
| 38 | |
| 39 | private: |
| 40 | template <typename Type> |
| 41 | using DictionaryGetter = void (DictionaryType::*)( |
| 42 | const std::string &, const PaddedPODArray<Key> &, const PaddedPODArray<Int64> &, PaddedPODArray<Type> &) const; |
| 43 | |
| 44 | template <typename Type> |
| 45 | using DictionaryDecimalGetter = void (DictionaryType::*)( |
| 46 | const std::string &, const PaddedPODArray<Key> &, const PaddedPODArray<Int64> &, DecimalPaddedPODArray<Type> &) const; |
| 47 | |
| 48 | template <typename AttributeType, typename Getter> |
| 49 | ColumnPtr getColumnFromAttribute( |
| 50 | Getter getter, |
| 51 | const PaddedPODArray<Key> & ids_to_fill, |
| 52 | const PaddedPODArray<Int64> & dates, |
| 53 | const DictionaryAttribute & attribute, |
| 54 | const DictionaryType & concrete_dictionary) const; |
| 55 | ColumnPtr getColumnFromAttributeString( |
| 56 | const PaddedPODArray<Key> & ids_to_fill, |
| 57 | const PaddedPODArray<Int64> & dates, |
| 58 | const DictionaryAttribute & attribute, |
| 59 | const DictionaryType & concrete_dictionary) const; |
| 60 | template <typename T> |
| 61 | ColumnPtr getColumnFromPODArray(const PaddedPODArray<T> & array) const; |
| 62 | |
| 63 | template <typename DictionarySpecialAttributeType, typename T> |
| 64 | void addSpecialColumn( |
| 65 | const std::optional<DictionarySpecialAttributeType> & attribute, |
| 66 | DataTypePtr type, |
| 67 | const std::string & default_name, |
| 68 | const std::unordered_set<std::string> & column_names_set, |
| 69 | const PaddedPODArray<T> & values, |
| 70 | ColumnsWithTypeAndName & columns) const; |
| 71 | |
| 72 | Block fillBlock( |
| 73 | const PaddedPODArray<Key> & ids_to_fill, |
| 74 | const PaddedPODArray<RangeType> & block_start_dates, |
| 75 | const PaddedPODArray<RangeType> & block_end_dates) const; |
| 76 | |
| 77 | PaddedPODArray<Int64> |
| 78 | makeDateKey(const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const; |
| 79 | |
| 80 | DictionaryPtr dictionary; |
| 81 | Names column_names; |
| 82 | PaddedPODArray<Key> ids; |
| 83 | PaddedPODArray<RangeType> start_dates; |
| 84 | PaddedPODArray<RangeType> end_dates; |
| 85 | }; |
| 86 | |
| 87 | |
| 88 | template <typename DictionaryType, typename RangeType, typename Key> |
| 89 | RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::RangeDictionaryBlockInputStream( |
| 90 | DictionaryPtr dictionary_, |
| 91 | size_t max_block_size_, |
| 92 | const Names & column_names_, |
| 93 | PaddedPODArray<Key> && ids_, |
| 94 | PaddedPODArray<RangeType> && block_start_dates, |
| 95 | PaddedPODArray<RangeType> && block_end_dates) |
| 96 | : DictionaryBlockInputStreamBase(ids_.size(), max_block_size_) |
| 97 | , dictionary(dictionary_) |
| 98 | , column_names(column_names_) |
| 99 | , ids(std::move(ids_)) |
| 100 | , start_dates(std::move(block_start_dates)) |
| 101 | , end_dates(std::move(block_end_dates)) |
| 102 | { |
| 103 | } |
| 104 | |
| 105 | template <typename DictionaryType, typename RangeType, typename Key> |
| 106 | Block RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getBlock(size_t start, size_t length) const |
| 107 | { |
| 108 | PaddedPODArray<Key> block_ids; |
| 109 | PaddedPODArray<RangeType> block_start_dates; |
| 110 | PaddedPODArray<RangeType> block_end_dates; |
| 111 | block_ids.reserve(length); |
| 112 | block_start_dates.reserve(length); |
| 113 | block_end_dates.reserve(length); |
| 114 | |
| 115 | for (auto idx : ext::range(start, start + length)) |
| 116 | { |
| 117 | block_ids.push_back(ids[idx]); |
| 118 | block_start_dates.push_back(start_dates[idx]); |
| 119 | block_end_dates.push_back(end_dates[idx]); |
| 120 | } |
| 121 | |
| 122 | return fillBlock(block_ids, block_start_dates, block_end_dates); |
| 123 | } |
| 124 | |
| 125 | template <typename DictionaryType, typename RangeType, typename Key> |
| 126 | template <typename AttributeType, typename Getter> |
| 127 | ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromAttribute( |
| 128 | Getter getter, |
| 129 | const PaddedPODArray<Key> & ids_to_fill, |
| 130 | const PaddedPODArray<Int64> & dates, |
| 131 | const DictionaryAttribute & attribute, |
| 132 | const DictionaryType & concrete_dictionary) const |
| 133 | { |
| 134 | if constexpr (IsDecimalNumber<AttributeType>) |
| 135 | { |
| 136 | auto column = ColumnDecimal<AttributeType>::create(ids_to_fill.size(), 0); /// NOTE: There's wrong scale here, but it's unused. |
| 137 | (concrete_dictionary.*getter)(attribute.name, ids_to_fill, dates, column->getData()); |
| 138 | return column; |
| 139 | } |
| 140 | else |
| 141 | { |
| 142 | auto column_vector = ColumnVector<AttributeType>::create(ids_to_fill.size()); |
| 143 | (concrete_dictionary.*getter)(attribute.name, ids_to_fill, dates, column_vector->getData()); |
| 144 | return column_vector; |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | template <typename DictionaryType, typename RangeType, typename Key> |
| 149 | ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromAttributeString( |
| 150 | const PaddedPODArray<Key> & ids_to_fill, |
| 151 | const PaddedPODArray<Int64> & dates, |
| 152 | const DictionaryAttribute & attribute, |
| 153 | const DictionaryType & concrete_dictionary) const |
| 154 | { |
| 155 | auto column_string = ColumnString::create(); |
| 156 | concrete_dictionary.getString(attribute.name, ids_to_fill, dates, column_string.get()); |
| 157 | return column_string; |
| 158 | } |
| 159 | |
| 160 | template <typename DictionaryType, typename RangeType, typename Key> |
| 161 | template <typename T> |
| 162 | ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromPODArray(const PaddedPODArray<T> & array) const |
| 163 | { |
| 164 | auto column_vector = ColumnVector<T>::create(); |
| 165 | column_vector->getData().reserve(array.size()); |
| 166 | for (T value : array) |
| 167 | column_vector->insertValue(value); |
| 168 | return column_vector; |
| 169 | } |
| 170 | |
| 171 | |
| 172 | template <typename DictionaryType, typename RangeType, typename Key> |
| 173 | template <typename DictionarySpecialAttributeType, typename T> |
| 174 | void RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::addSpecialColumn( |
| 175 | const std::optional<DictionarySpecialAttributeType> & attribute, |
| 176 | DataTypePtr type, |
| 177 | const std::string & default_name, |
| 178 | const std::unordered_set<std::string> & column_names_set, |
| 179 | const PaddedPODArray<T> & values, |
| 180 | ColumnsWithTypeAndName & columns) const |
| 181 | { |
| 182 | std::string name = default_name; |
| 183 | if (attribute) |
| 184 | name = attribute->name; |
| 185 | |
| 186 | if (column_names_set.find(name) != column_names_set.end()) |
| 187 | columns.emplace_back(getColumnFromPODArray(values), type, name); |
| 188 | } |
| 189 | |
| 190 | template <typename DictionaryType, typename RangeType, typename Key> |
| 191 | PaddedPODArray<Int64> RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::makeDateKey( |
| 192 | const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const |
| 193 | { |
| 194 | PaddedPODArray<Int64> key(block_start_dates.size()); |
| 195 | for (size_t i = 0; i < key.size(); ++i) |
| 196 | { |
| 197 | if (RangeHashedDictionary::Range::isCorrectDate(block_start_dates[i])) |
| 198 | key[i] = block_start_dates[i]; |
| 199 | else |
| 200 | key[i] = block_end_dates[i]; |
| 201 | } |
| 202 | |
| 203 | return key; |
| 204 | } |
| 205 | |
| 206 | |
| 207 | template <typename DictionaryType, typename RangeType, typename Key> |
| 208 | Block RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::fillBlock( |
| 209 | const PaddedPODArray<Key> & ids_to_fill, |
| 210 | const PaddedPODArray<RangeType> & block_start_dates, |
| 211 | const PaddedPODArray<RangeType> & block_end_dates) const |
| 212 | { |
| 213 | ColumnsWithTypeAndName columns; |
| 214 | const DictionaryStructure & structure = dictionary->getStructure(); |
| 215 | |
| 216 | std::unordered_set<std::string> names(column_names.begin(), column_names.end()); |
| 217 | |
| 218 | addSpecialColumn(structure.id, std::make_shared<DataTypeUInt64>(), "ID" , names, ids_to_fill, columns); |
| 219 | addSpecialColumn(structure.range_min, structure.range_max->type, "Range Start" , names, block_start_dates, columns); |
| 220 | addSpecialColumn(structure.range_max, structure.range_max->type, "Range End" , names, block_end_dates, columns); |
| 221 | |
| 222 | auto date_key = makeDateKey(block_start_dates, block_end_dates); |
| 223 | |
| 224 | for (const auto idx : ext::range(0, structure.attributes.size())) |
| 225 | { |
| 226 | const DictionaryAttribute & attribute = structure.attributes[idx]; |
| 227 | if (names.find(attribute.name) != names.end()) |
| 228 | { |
| 229 | ColumnPtr column; |
| 230 | #define GET_COLUMN_FORM_ATTRIBUTE(TYPE) \ |
| 231 | column = getColumnFromAttribute<TYPE>(&DictionaryType::get##TYPE, ids_to_fill, date_key, attribute, *dictionary) |
| 232 | switch (attribute.underlying_type) |
| 233 | { |
| 234 | case AttributeUnderlyingType::utUInt8: |
| 235 | GET_COLUMN_FORM_ATTRIBUTE(UInt8); |
| 236 | break; |
| 237 | case AttributeUnderlyingType::utUInt16: |
| 238 | GET_COLUMN_FORM_ATTRIBUTE(UInt16); |
| 239 | break; |
| 240 | case AttributeUnderlyingType::utUInt32: |
| 241 | GET_COLUMN_FORM_ATTRIBUTE(UInt32); |
| 242 | break; |
| 243 | case AttributeUnderlyingType::utUInt64: |
| 244 | GET_COLUMN_FORM_ATTRIBUTE(UInt64); |
| 245 | break; |
| 246 | case AttributeUnderlyingType::utUInt128: |
| 247 | GET_COLUMN_FORM_ATTRIBUTE(UInt128); |
| 248 | break; |
| 249 | case AttributeUnderlyingType::utInt8: |
| 250 | GET_COLUMN_FORM_ATTRIBUTE(Int8); |
| 251 | break; |
| 252 | case AttributeUnderlyingType::utInt16: |
| 253 | GET_COLUMN_FORM_ATTRIBUTE(Int16); |
| 254 | break; |
| 255 | case AttributeUnderlyingType::utInt32: |
| 256 | GET_COLUMN_FORM_ATTRIBUTE(Int32); |
| 257 | break; |
| 258 | case AttributeUnderlyingType::utInt64: |
| 259 | GET_COLUMN_FORM_ATTRIBUTE(Int64); |
| 260 | break; |
| 261 | case AttributeUnderlyingType::utFloat32: |
| 262 | GET_COLUMN_FORM_ATTRIBUTE(Float32); |
| 263 | break; |
| 264 | case AttributeUnderlyingType::utFloat64: |
| 265 | GET_COLUMN_FORM_ATTRIBUTE(Float64); |
| 266 | break; |
| 267 | case AttributeUnderlyingType::utDecimal32: |
| 268 | GET_COLUMN_FORM_ATTRIBUTE(Decimal32); |
| 269 | break; |
| 270 | case AttributeUnderlyingType::utDecimal64: |
| 271 | GET_COLUMN_FORM_ATTRIBUTE(Decimal64); |
| 272 | break; |
| 273 | case AttributeUnderlyingType::utDecimal128: |
| 274 | GET_COLUMN_FORM_ATTRIBUTE(Decimal128); |
| 275 | break; |
| 276 | case AttributeUnderlyingType::utString: |
| 277 | column = getColumnFromAttributeString(ids_to_fill, date_key, attribute, *dictionary); |
| 278 | break; |
| 279 | } |
| 280 | #undef GET_COLUMN_FORM_ATTRIBUTE |
| 281 | columns.emplace_back(column, attribute.type, attribute.name); |
| 282 | } |
| 283 | } |
| 284 | return Block(columns); |
| 285 | } |
| 286 | |
| 287 | } |
| 288 | |