| 1 | #pragma once | 
|---|
| 2 |  | 
|---|
| 3 | #include <Interpreters/AggregationCommon.h> | 
|---|
| 4 | #include <Common/ColumnsHashing.h> | 
|---|
| 5 | #include <Common/assert_cast.h> | 
|---|
| 6 | #include <Common/Arena.h> | 
|---|
| 7 | #include <Common/HashTable/HashSet.h> | 
|---|
| 8 | #include <Common/HashTable/ClearableHashSet.h> | 
|---|
| 9 | #include <Common/HashTable/FixedClearableHashSet.h> | 
|---|
| 10 | #include <Common/HashTable/FixedHashSet.h> | 
|---|
| 11 | #include <Common/UInt128.h> | 
|---|
| 12 |  | 
|---|
| 13 |  | 
|---|
| 14 | namespace DB | 
|---|
| 15 | { | 
|---|
| 16 |  | 
|---|
| 17 | /** Methods for different implementations of sets (used in right hand side of IN or for DISTINCT). | 
|---|
| 18 | * To use as template parameter. | 
|---|
| 19 | */ | 
|---|
| 20 |  | 
|---|
| 21 |  | 
|---|
| 22 | /// For the case where there is one numeric key. | 
|---|
| 23 | template <typename FieldType, typename TData, bool use_cache = true>    /// UInt8/16/32/64 for any types with corresponding bit width. | 
|---|
| 24 | struct SetMethodOneNumber | 
|---|
| 25 | { | 
|---|
| 26 | using Data = TData; | 
|---|
| 27 | using Key = typename Data::key_type; | 
|---|
| 28 |  | 
|---|
| 29 | Data data; | 
|---|
| 30 |  | 
|---|
| 31 | using State = ColumnsHashing::HashMethodOneNumber<typename Data::value_type, | 
|---|
| 32 | void, FieldType, use_cache>; | 
|---|
| 33 | }; | 
|---|
| 34 |  | 
|---|
| 35 | /// For the case where there is one string key. | 
|---|
| 36 | template <typename TData> | 
|---|
| 37 | struct SetMethodString | 
|---|
| 38 | { | 
|---|
| 39 | using Data = TData; | 
|---|
| 40 | using Key = typename Data::key_type; | 
|---|
| 41 |  | 
|---|
| 42 | Data data; | 
|---|
| 43 |  | 
|---|
| 44 | using State = ColumnsHashing::HashMethodString<typename Data::value_type, void, true, false>; | 
|---|
| 45 | }; | 
|---|
| 46 |  | 
|---|
| 47 | /// For the case when there is one fixed-length string key. | 
|---|
| 48 | template <typename TData> | 
|---|
| 49 | struct SetMethodFixedString | 
|---|
| 50 | { | 
|---|
| 51 | using Data = TData; | 
|---|
| 52 | using Key = typename Data::key_type; | 
|---|
| 53 |  | 
|---|
| 54 | Data data; | 
|---|
| 55 |  | 
|---|
| 56 | using State = ColumnsHashing::HashMethodFixedString<typename Data::value_type, void, true, false>; | 
|---|
| 57 | }; | 
|---|
| 58 |  | 
|---|
| 59 | namespace set_impl | 
|---|
| 60 | { | 
|---|
| 61 |  | 
|---|
| 62 | /// This class is designed to provide the functionality that is required for | 
|---|
| 63 | /// supporting nullable keys in SetMethodKeysFixed. If there are | 
|---|
| 64 | /// no nullable keys, this class is merely implemented as an empty shell. | 
|---|
| 65 | template <typename Key, bool has_nullable_keys> | 
|---|
| 66 | class BaseStateKeysFixed; | 
|---|
| 67 |  | 
|---|
| 68 | /// Case where nullable keys are supported. | 
|---|
| 69 | template <typename Key> | 
|---|
| 70 | class BaseStateKeysFixed<Key, true> | 
|---|
| 71 | { | 
|---|
| 72 | protected: | 
|---|
| 73 | void init(const ColumnRawPtrs & key_columns) | 
|---|
| 74 | { | 
|---|
| 75 | null_maps.reserve(key_columns.size()); | 
|---|
| 76 | actual_columns.reserve(key_columns.size()); | 
|---|
| 77 |  | 
|---|
| 78 | for (const auto & col : key_columns) | 
|---|
| 79 | { | 
|---|
| 80 | if (auto * nullable = checkAndGetColumn<ColumnNullable>(*col)) | 
|---|
| 81 | { | 
|---|
| 82 | actual_columns.push_back(&nullable->getNestedColumn()); | 
|---|
| 83 | null_maps.push_back(&nullable->getNullMapColumn()); | 
|---|
| 84 | } | 
|---|
| 85 | else | 
|---|
| 86 | { | 
|---|
| 87 | actual_columns.push_back(col); | 
|---|
| 88 | null_maps.push_back(nullptr); | 
|---|
| 89 | } | 
|---|
| 90 | } | 
|---|
| 91 | } | 
|---|
| 92 |  | 
|---|
| 93 | /// Return the columns which actually contain the values of the keys. | 
|---|
| 94 | /// For a given key column, if it is nullable, we return its nested | 
|---|
| 95 | /// column. Otherwise we return the key column itself. | 
|---|
| 96 | inline const ColumnRawPtrs & getActualColumns() const | 
|---|
| 97 | { | 
|---|
| 98 | return actual_columns; | 
|---|
| 99 | } | 
|---|
| 100 |  | 
|---|
| 101 | /// Create a bitmap that indicates whether, for a particular row, | 
|---|
| 102 | /// a key column bears a null value or not. | 
|---|
| 103 | KeysNullMap<Key> createBitmap(size_t row) const | 
|---|
| 104 | { | 
|---|
| 105 | KeysNullMap<Key> bitmap{}; | 
|---|
| 106 |  | 
|---|
| 107 | for (size_t k = 0; k < null_maps.size(); ++k) | 
|---|
| 108 | { | 
|---|
| 109 | if (null_maps[k] != nullptr) | 
|---|
| 110 | { | 
|---|
| 111 | const auto & null_map = assert_cast<const ColumnUInt8 &>(*null_maps[k]).getData(); | 
|---|
| 112 | if (null_map[row] == 1) | 
|---|
| 113 | { | 
|---|
| 114 | size_t bucket = k / 8; | 
|---|
| 115 | size_t offset = k % 8; | 
|---|
| 116 | bitmap[bucket] |= UInt8(1) << offset; | 
|---|
| 117 | } | 
|---|
| 118 | } | 
|---|
| 119 | } | 
|---|
| 120 |  | 
|---|
| 121 | return bitmap; | 
|---|
| 122 | } | 
|---|
| 123 |  | 
|---|
| 124 | private: | 
|---|
| 125 | ColumnRawPtrs actual_columns; | 
|---|
| 126 | ColumnRawPtrs null_maps; | 
|---|
| 127 | }; | 
|---|
| 128 |  | 
|---|
| 129 | /// Case where nullable keys are not supported. | 
|---|
| 130 | template <typename Key> | 
|---|
| 131 | class BaseStateKeysFixed<Key, false> | 
|---|
| 132 | { | 
|---|
| 133 | protected: | 
|---|
| 134 | void init(const ColumnRawPtrs &) | 
|---|
| 135 | { | 
|---|
| 136 | throw Exception{ "Internal error: calling init() for non-nullable" | 
|---|
| 137 | " keys is forbidden", ErrorCodes::LOGICAL_ERROR}; | 
|---|
| 138 | } | 
|---|
| 139 |  | 
|---|
| 140 | const ColumnRawPtrs & getActualColumns() const | 
|---|
| 141 | { | 
|---|
| 142 | throw Exception{ "Internal error: calling getActualColumns() for non-nullable" | 
|---|
| 143 | " keys is forbidden", ErrorCodes::LOGICAL_ERROR}; | 
|---|
| 144 | } | 
|---|
| 145 |  | 
|---|
| 146 | KeysNullMap<Key> createBitmap(size_t) const | 
|---|
| 147 | { | 
|---|
| 148 | throw Exception{ "Internal error: calling createBitmap() for non-nullable keys" | 
|---|
| 149 | " is forbidden", ErrorCodes::LOGICAL_ERROR}; | 
|---|
| 150 | } | 
|---|
| 151 | }; | 
|---|
| 152 |  | 
|---|
| 153 | } | 
|---|
| 154 |  | 
|---|
| 155 | /// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits. | 
|---|
| 156 | template <typename TData, bool has_nullable_keys_ = false> | 
|---|
| 157 | struct SetMethodKeysFixed | 
|---|
| 158 | { | 
|---|
| 159 | using Data = TData; | 
|---|
| 160 | using Key = typename Data::key_type; | 
|---|
| 161 | static constexpr bool has_nullable_keys = has_nullable_keys_; | 
|---|
| 162 |  | 
|---|
| 163 | Data data; | 
|---|
| 164 |  | 
|---|
| 165 | using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, void, has_nullable_keys, false>; | 
|---|
| 166 | }; | 
|---|
| 167 |  | 
|---|
| 168 | /// For other cases. 128 bit hash from the key. | 
|---|
| 169 | template <typename TData> | 
|---|
| 170 | struct SetMethodHashed | 
|---|
| 171 | { | 
|---|
| 172 | using Data = TData; | 
|---|
| 173 | using Key = typename Data::key_type; | 
|---|
| 174 |  | 
|---|
| 175 | Data data; | 
|---|
| 176 |  | 
|---|
| 177 | using State = ColumnsHashing::HashMethodHashed<typename Data::value_type, void>; | 
|---|
| 178 | }; | 
|---|
| 179 |  | 
|---|
| 180 |  | 
|---|
| 181 | /** Different implementations of the set. | 
|---|
| 182 | */ | 
|---|
| 183 | struct NonClearableSet | 
|---|
| 184 | { | 
|---|
| 185 | /* | 
|---|
| 186 | * As in Aggregator, using consecutive keys cache doesn't improve performance | 
|---|
| 187 | * for FixedHashTables. | 
|---|
| 188 | */ | 
|---|
| 189 | std::unique_ptr<SetMethodOneNumber<UInt8, FixedHashSet<UInt8>, false /* use_cache */>>                                             key8; | 
|---|
| 190 | std::unique_ptr<SetMethodOneNumber<UInt16, FixedHashSet<UInt16>, false /* use_cache */>>                                           key16; | 
|---|
| 191 |  | 
|---|
| 192 | /** Also for the experiment was tested the ability to use SmallSet, | 
|---|
| 193 | *  as long as the number of elements in the set is small (and, if necessary, converted to a full-fledged HashSet). | 
|---|
| 194 | * But this experiment showed that there is an advantage only in rare cases. | 
|---|
| 195 | */ | 
|---|
| 196 | std::unique_ptr<SetMethodOneNumber<UInt32, HashSet<UInt32, HashCRC32<UInt32>>>>                             key32; | 
|---|
| 197 | std::unique_ptr<SetMethodOneNumber<UInt64, HashSet<UInt64, HashCRC32<UInt64>>>>                             key64; | 
|---|
| 198 | std::unique_ptr<SetMethodString<HashSetWithSavedHash<StringRef>>>                                           key_string; | 
|---|
| 199 | std::unique_ptr<SetMethodFixedString<HashSetWithSavedHash<StringRef>>>                                      key_fixed_string; | 
|---|
| 200 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>>>                                     keys128; | 
|---|
| 201 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>>>                                     keys256; | 
|---|
| 202 | std::unique_ptr<SetMethodHashed<HashSet<UInt128, UInt128TrivialHash>>>                                      hashed; | 
|---|
| 203 |  | 
|---|
| 204 | /// Support for nullable keys (for DISTINCT implementation). | 
|---|
| 205 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>, true>>                               nullable_keys128; | 
|---|
| 206 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>, true>>                               nullable_keys256; | 
|---|
| 207 | /** Unlike Aggregator, `concat` method is not used here. | 
|---|
| 208 | * This is done because `hashed` method, although slower, but in this case, uses less RAM. | 
|---|
| 209 | *  since when you use it, the key values themselves are not stored. | 
|---|
| 210 | */ | 
|---|
| 211 | }; | 
|---|
| 212 |  | 
|---|
| 213 | struct ClearableSet | 
|---|
| 214 | { | 
|---|
| 215 | std::unique_ptr<SetMethodOneNumber<UInt8, FixedClearableHashSet<UInt8>, false /* use_cache */>>                                        key8; | 
|---|
| 216 | std::unique_ptr<SetMethodOneNumber<UInt16, FixedClearableHashSet<UInt16>, false /*use_cache */>>                                      key16; | 
|---|
| 217 |  | 
|---|
| 218 | std::unique_ptr<SetMethodOneNumber<UInt32, ClearableHashSet<UInt32, HashCRC32<UInt32>>>>                        key32; | 
|---|
| 219 | std::unique_ptr<SetMethodOneNumber<UInt64, ClearableHashSet<UInt64, HashCRC32<UInt64>>>>                        key64; | 
|---|
| 220 | std::unique_ptr<SetMethodString<ClearableHashSetWithSavedHash<StringRef>>>                                      key_string; | 
|---|
| 221 | std::unique_ptr<SetMethodFixedString<ClearableHashSetWithSavedHash<StringRef>>>                                 key_fixed_string; | 
|---|
| 222 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>>>                                keys128; | 
|---|
| 223 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>>>                                keys256; | 
|---|
| 224 | std::unique_ptr<SetMethodHashed<ClearableHashSet<UInt128, UInt128TrivialHash>>>                                 hashed; | 
|---|
| 225 |  | 
|---|
| 226 | /// Support for nullable keys (for DISTINCT implementation). | 
|---|
| 227 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>, true>>                          nullable_keys128; | 
|---|
| 228 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>, true>>                          nullable_keys256; | 
|---|
| 229 | /** Unlike Aggregator, `concat` method is not used here. | 
|---|
| 230 | * This is done because `hashed` method, although slower, but in this case, uses less RAM. | 
|---|
| 231 | *  since when you use it, the key values themselves are not stored. | 
|---|
| 232 | */ | 
|---|
| 233 | }; | 
|---|
| 234 |  | 
|---|
| 235 | template <typename Variant> | 
|---|
| 236 | struct SetVariantsTemplate: public Variant | 
|---|
| 237 | { | 
|---|
| 238 | Arena string_pool; | 
|---|
| 239 |  | 
|---|
| 240 | #define APPLY_FOR_SET_VARIANTS(M) \ | 
|---|
| 241 | M(key8)                 \ | 
|---|
| 242 | M(key16)                \ | 
|---|
| 243 | M(key32)                \ | 
|---|
| 244 | M(key64)                \ | 
|---|
| 245 | M(key_string)           \ | 
|---|
| 246 | M(key_fixed_string)     \ | 
|---|
| 247 | M(keys128)              \ | 
|---|
| 248 | M(keys256)              \ | 
|---|
| 249 | M(nullable_keys128)     \ | 
|---|
| 250 | M(nullable_keys256)     \ | 
|---|
| 251 | M(hashed) | 
|---|
| 252 |  | 
|---|
| 253 | #define M(NAME) using Variant::NAME; | 
|---|
| 254 | APPLY_FOR_SET_VARIANTS(M) | 
|---|
| 255 | #undef M | 
|---|
| 256 |  | 
|---|
| 257 | enum class Type | 
|---|
| 258 | { | 
|---|
| 259 | EMPTY, | 
|---|
| 260 |  | 
|---|
| 261 | #define M(NAME) NAME, | 
|---|
| 262 | APPLY_FOR_SET_VARIANTS(M) | 
|---|
| 263 | #undef M | 
|---|
| 264 | }; | 
|---|
| 265 |  | 
|---|
| 266 | Type type = Type::EMPTY; | 
|---|
| 267 |  | 
|---|
| 268 | bool empty() const { return type == Type::EMPTY; } | 
|---|
| 269 |  | 
|---|
| 270 | static Type chooseMethod(const ColumnRawPtrs & key_columns, Sizes & key_sizes); | 
|---|
| 271 |  | 
|---|
| 272 | void init(Type type_); | 
|---|
| 273 |  | 
|---|
| 274 | size_t getTotalRowCount() const; | 
|---|
| 275 | /// Counts the size in bytes of the Set buffer and the size of the `string_pool` | 
|---|
| 276 | size_t getTotalByteCount() const; | 
|---|
| 277 | }; | 
|---|
| 278 |  | 
|---|
| 279 | using SetVariants = SetVariantsTemplate<NonClearableSet>; | 
|---|
| 280 | using ClearableSetVariants = SetVariantsTemplate<ClearableSet>; | 
|---|
| 281 |  | 
|---|
| 282 | } | 
|---|
| 283 |  | 
|---|