| 1 | #pragma once |
| 2 | |
| 3 | #include <Interpreters/AggregationCommon.h> |
| 4 | #include <Common/ColumnsHashing.h> |
| 5 | #include <Common/assert_cast.h> |
| 6 | #include <Common/Arena.h> |
| 7 | #include <Common/HashTable/HashSet.h> |
| 8 | #include <Common/HashTable/ClearableHashSet.h> |
| 9 | #include <Common/HashTable/FixedClearableHashSet.h> |
| 10 | #include <Common/HashTable/FixedHashSet.h> |
| 11 | #include <Common/UInt128.h> |
| 12 | |
| 13 | |
| 14 | namespace DB |
| 15 | { |
| 16 | |
| 17 | /** Methods for different implementations of sets (used in right hand side of IN or for DISTINCT). |
| 18 | * To use as template parameter. |
| 19 | */ |
| 20 | |
| 21 | |
| 22 | /// For the case where there is one numeric key. |
| 23 | template <typename FieldType, typename TData, bool use_cache = true> /// UInt8/16/32/64 for any types with corresponding bit width. |
| 24 | struct SetMethodOneNumber |
| 25 | { |
| 26 | using Data = TData; |
| 27 | using Key = typename Data::key_type; |
| 28 | |
| 29 | Data data; |
| 30 | |
| 31 | using State = ColumnsHashing::HashMethodOneNumber<typename Data::value_type, |
| 32 | void, FieldType, use_cache>; |
| 33 | }; |
| 34 | |
| 35 | /// For the case where there is one string key. |
| 36 | template <typename TData> |
| 37 | struct SetMethodString |
| 38 | { |
| 39 | using Data = TData; |
| 40 | using Key = typename Data::key_type; |
| 41 | |
| 42 | Data data; |
| 43 | |
| 44 | using State = ColumnsHashing::HashMethodString<typename Data::value_type, void, true, false>; |
| 45 | }; |
| 46 | |
| 47 | /// For the case when there is one fixed-length string key. |
| 48 | template <typename TData> |
| 49 | struct SetMethodFixedString |
| 50 | { |
| 51 | using Data = TData; |
| 52 | using Key = typename Data::key_type; |
| 53 | |
| 54 | Data data; |
| 55 | |
| 56 | using State = ColumnsHashing::HashMethodFixedString<typename Data::value_type, void, true, false>; |
| 57 | }; |
| 58 | |
| 59 | namespace set_impl |
| 60 | { |
| 61 | |
/// Helper base that provides the functionality required for supporting
/// nullable keys in SetMethodKeysFixed.
/// Only the primary template is declared here; it is defined through its
/// specializations on `has_nullable_keys`. Without nullable keys the class
/// is merely a shell whose methods must not be called.
template <typename Key, bool has_nullable_keys>
class BaseStateKeysFixed;
| 67 | |
| 68 | /// Case where nullable keys are supported. |
| 69 | template <typename Key> |
| 70 | class BaseStateKeysFixed<Key, true> |
| 71 | { |
| 72 | protected: |
| 73 | void init(const ColumnRawPtrs & key_columns) |
| 74 | { |
| 75 | null_maps.reserve(key_columns.size()); |
| 76 | actual_columns.reserve(key_columns.size()); |
| 77 | |
| 78 | for (const auto & col : key_columns) |
| 79 | { |
| 80 | if (auto * nullable = checkAndGetColumn<ColumnNullable>(*col)) |
| 81 | { |
| 82 | actual_columns.push_back(&nullable->getNestedColumn()); |
| 83 | null_maps.push_back(&nullable->getNullMapColumn()); |
| 84 | } |
| 85 | else |
| 86 | { |
| 87 | actual_columns.push_back(col); |
| 88 | null_maps.push_back(nullptr); |
| 89 | } |
| 90 | } |
| 91 | } |
| 92 | |
| 93 | /// Return the columns which actually contain the values of the keys. |
| 94 | /// For a given key column, if it is nullable, we return its nested |
| 95 | /// column. Otherwise we return the key column itself. |
| 96 | inline const ColumnRawPtrs & getActualColumns() const |
| 97 | { |
| 98 | return actual_columns; |
| 99 | } |
| 100 | |
| 101 | /// Create a bitmap that indicates whether, for a particular row, |
| 102 | /// a key column bears a null value or not. |
| 103 | KeysNullMap<Key> createBitmap(size_t row) const |
| 104 | { |
| 105 | KeysNullMap<Key> bitmap{}; |
| 106 | |
| 107 | for (size_t k = 0; k < null_maps.size(); ++k) |
| 108 | { |
| 109 | if (null_maps[k] != nullptr) |
| 110 | { |
| 111 | const auto & null_map = assert_cast<const ColumnUInt8 &>(*null_maps[k]).getData(); |
| 112 | if (null_map[row] == 1) |
| 113 | { |
| 114 | size_t bucket = k / 8; |
| 115 | size_t offset = k % 8; |
| 116 | bitmap[bucket] |= UInt8(1) << offset; |
| 117 | } |
| 118 | } |
| 119 | } |
| 120 | |
| 121 | return bitmap; |
| 122 | } |
| 123 | |
| 124 | private: |
| 125 | ColumnRawPtrs actual_columns; |
| 126 | ColumnRawPtrs null_maps; |
| 127 | }; |
| 128 | |
| 129 | /// Case where nullable keys are not supported. |
| 130 | template <typename Key> |
| 131 | class BaseStateKeysFixed<Key, false> |
| 132 | { |
| 133 | protected: |
| 134 | void init(const ColumnRawPtrs &) |
| 135 | { |
| 136 | throw Exception{"Internal error: calling init() for non-nullable" |
| 137 | " keys is forbidden" , ErrorCodes::LOGICAL_ERROR}; |
| 138 | } |
| 139 | |
| 140 | const ColumnRawPtrs & getActualColumns() const |
| 141 | { |
| 142 | throw Exception{"Internal error: calling getActualColumns() for non-nullable" |
| 143 | " keys is forbidden" , ErrorCodes::LOGICAL_ERROR}; |
| 144 | } |
| 145 | |
| 146 | KeysNullMap<Key> createBitmap(size_t) const |
| 147 | { |
| 148 | throw Exception{"Internal error: calling createBitmap() for non-nullable keys" |
| 149 | " is forbidden" , ErrorCodes::LOGICAL_ERROR}; |
| 150 | } |
| 151 | }; |
| 152 | |
| 153 | } |
| 154 | |
| 155 | /// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits. |
| 156 | template <typename TData, bool has_nullable_keys_ = false> |
| 157 | struct SetMethodKeysFixed |
| 158 | { |
| 159 | using Data = TData; |
| 160 | using Key = typename Data::key_type; |
| 161 | static constexpr bool has_nullable_keys = has_nullable_keys_; |
| 162 | |
| 163 | Data data; |
| 164 | |
| 165 | using State = ColumnsHashing::HashMethodKeysFixed<typename Data::value_type, Key, void, has_nullable_keys, false>; |
| 166 | }; |
| 167 | |
| 168 | /// For other cases. 128 bit hash from the key. |
| 169 | template <typename TData> |
| 170 | struct SetMethodHashed |
| 171 | { |
| 172 | using Data = TData; |
| 173 | using Key = typename Data::key_type; |
| 174 | |
| 175 | Data data; |
| 176 | |
| 177 | using State = ColumnsHashing::HashMethodHashed<typename Data::value_type, void>; |
| 178 | }; |
| 179 | |
| 180 | |
| 181 | /** Different implementations of the set. |
| 182 | */ |
| 183 | struct NonClearableSet |
| 184 | { |
| 185 | /* |
| 186 | * As in Aggregator, using consecutive keys cache doesn't improve performance |
| 187 | * for FixedHashTables. |
| 188 | */ |
| 189 | std::unique_ptr<SetMethodOneNumber<UInt8, FixedHashSet<UInt8>, false /* use_cache */>> key8; |
| 190 | std::unique_ptr<SetMethodOneNumber<UInt16, FixedHashSet<UInt16>, false /* use_cache */>> key16; |
| 191 | |
| 192 | /** Also for the experiment was tested the ability to use SmallSet, |
| 193 | * as long as the number of elements in the set is small (and, if necessary, converted to a full-fledged HashSet). |
| 194 | * But this experiment showed that there is an advantage only in rare cases. |
| 195 | */ |
| 196 | std::unique_ptr<SetMethodOneNumber<UInt32, HashSet<UInt32, HashCRC32<UInt32>>>> key32; |
| 197 | std::unique_ptr<SetMethodOneNumber<UInt64, HashSet<UInt64, HashCRC32<UInt64>>>> key64; |
| 198 | std::unique_ptr<SetMethodString<HashSetWithSavedHash<StringRef>>> key_string; |
| 199 | std::unique_ptr<SetMethodFixedString<HashSetWithSavedHash<StringRef>>> key_fixed_string; |
| 200 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>>> keys128; |
| 201 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>>> keys256; |
| 202 | std::unique_ptr<SetMethodHashed<HashSet<UInt128, UInt128TrivialHash>>> hashed; |
| 203 | |
| 204 | /// Support for nullable keys (for DISTINCT implementation). |
| 205 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>, true>> nullable_keys128; |
| 206 | std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>, true>> nullable_keys256; |
| 207 | /** Unlike Aggregator, `concat` method is not used here. |
| 208 | * This is done because `hashed` method, although slower, but in this case, uses less RAM. |
| 209 | * since when you use it, the key values themselves are not stored. |
| 210 | */ |
| 211 | }; |
| 212 | |
| 213 | struct ClearableSet |
| 214 | { |
| 215 | std::unique_ptr<SetMethodOneNumber<UInt8, FixedClearableHashSet<UInt8>, false /* use_cache */>> key8; |
| 216 | std::unique_ptr<SetMethodOneNumber<UInt16, FixedClearableHashSet<UInt16>, false /*use_cache */>> key16; |
| 217 | |
| 218 | std::unique_ptr<SetMethodOneNumber<UInt32, ClearableHashSet<UInt32, HashCRC32<UInt32>>>> key32; |
| 219 | std::unique_ptr<SetMethodOneNumber<UInt64, ClearableHashSet<UInt64, HashCRC32<UInt64>>>> key64; |
| 220 | std::unique_ptr<SetMethodString<ClearableHashSetWithSavedHash<StringRef>>> key_string; |
| 221 | std::unique_ptr<SetMethodFixedString<ClearableHashSetWithSavedHash<StringRef>>> key_fixed_string; |
| 222 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>>> keys128; |
| 223 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>>> keys256; |
| 224 | std::unique_ptr<SetMethodHashed<ClearableHashSet<UInt128, UInt128TrivialHash>>> hashed; |
| 225 | |
| 226 | /// Support for nullable keys (for DISTINCT implementation). |
| 227 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt128, UInt128HashCRC32>, true>> nullable_keys128; |
| 228 | std::unique_ptr<SetMethodKeysFixed<ClearableHashSet<UInt256, UInt256HashCRC32>, true>> nullable_keys256; |
| 229 | /** Unlike Aggregator, `concat` method is not used here. |
| 230 | * This is done because `hashed` method, although slower, but in this case, uses less RAM. |
| 231 | * since when you use it, the key values themselves are not stored. |
| 232 | */ |
| 233 | }; |
| 234 | |
| 235 | template <typename Variant> |
| 236 | struct SetVariantsTemplate: public Variant |
| 237 | { |
| 238 | Arena string_pool; |
| 239 | |
| 240 | #define APPLY_FOR_SET_VARIANTS(M) \ |
| 241 | M(key8) \ |
| 242 | M(key16) \ |
| 243 | M(key32) \ |
| 244 | M(key64) \ |
| 245 | M(key_string) \ |
| 246 | M(key_fixed_string) \ |
| 247 | M(keys128) \ |
| 248 | M(keys256) \ |
| 249 | M(nullable_keys128) \ |
| 250 | M(nullable_keys256) \ |
| 251 | M(hashed) |
| 252 | |
| 253 | #define M(NAME) using Variant::NAME; |
| 254 | APPLY_FOR_SET_VARIANTS(M) |
| 255 | #undef M |
| 256 | |
| 257 | enum class Type |
| 258 | { |
| 259 | EMPTY, |
| 260 | |
| 261 | #define M(NAME) NAME, |
| 262 | APPLY_FOR_SET_VARIANTS(M) |
| 263 | #undef M |
| 264 | }; |
| 265 | |
| 266 | Type type = Type::EMPTY; |
| 267 | |
| 268 | bool empty() const { return type == Type::EMPTY; } |
| 269 | |
| 270 | static Type chooseMethod(const ColumnRawPtrs & key_columns, Sizes & key_sizes); |
| 271 | |
| 272 | void init(Type type_); |
| 273 | |
| 274 | size_t getTotalRowCount() const; |
| 275 | /// Counts the size in bytes of the Set buffer and the size of the `string_pool` |
| 276 | size_t getTotalByteCount() const; |
| 277 | }; |
| 278 | |
| 279 | using SetVariants = SetVariantsTemplate<NonClearableSet>; |
| 280 | using ClearableSetVariants = SetVariantsTemplate<ClearableSet>; |
| 281 | |
| 282 | } |
| 283 | |