| 1 | #include <Functions/IFunctionImpl.h> |
| 2 | #include <Functions/FunctionFactory.h> |
| 3 | #include <Functions/FunctionHelpers.h> |
| 4 | #include <DataTypes/DataTypeArray.h> |
| 5 | #include <DataTypes/DataTypeNullable.h> |
| 6 | #include <Columns/ColumnArray.h> |
| 7 | #include <Columns/ColumnNullable.h> |
| 8 | #include <Columns/ColumnString.h> |
| 9 | #include <Common/HashTable/ClearableHashSet.h> |
| 10 | #include <Common/SipHash.h> |
| 11 | #include <Common/assert_cast.h> |
| 12 | |
| 13 | |
| 14 | namespace DB |
| 15 | { |
| 16 | |
| 17 | namespace ErrorCodes |
| 18 | { |
| 19 | extern const int ILLEGAL_TYPE_OF_ARGUMENT; |
| 20 | } |
| 21 | |
| 22 | |
| 23 | /// Find different elements in an array. |
| 24 | class FunctionArrayDistinct : public IFunction |
| 25 | { |
| 26 | public: |
| 27 | static constexpr auto name = "arrayDistinct" ; |
| 28 | |
| 29 | static FunctionPtr create(const Context &) |
| 30 | { |
| 31 | return std::make_shared<FunctionArrayDistinct>(); |
| 32 | } |
| 33 | |
| 34 | String getName() const override |
| 35 | { |
| 36 | return name; |
| 37 | } |
| 38 | |
| 39 | bool isVariadic() const override { return false; } |
| 40 | |
| 41 | size_t getNumberOfArguments() const override { return 1; } |
| 42 | |
| 43 | bool useDefaultImplementationForConstants() const override { return true; } |
| 44 | |
| 45 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
| 46 | { |
| 47 | const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[0].get()); |
| 48 | if (!array_type) |
| 49 | throw Exception("Argument for function " + getName() + " must be array but it " |
| 50 | " has type " + arguments[0]->getName() + "." , |
| 51 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 52 | |
| 53 | auto nested_type = removeNullable(array_type->getNestedType()); |
| 54 | |
| 55 | return std::make_shared<DataTypeArray>(nested_type); |
| 56 | } |
| 57 | |
| 58 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override; |
| 59 | |
| 60 | private: |
| 61 | /// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess. |
| 62 | static constexpr size_t INITIAL_SIZE_DEGREE = 9; |
| 63 | |
| 64 | template <typename T> |
| 65 | bool executeNumber( |
| 66 | const IColumn & src_data, |
| 67 | const ColumnArray::Offsets & src_offsets, |
| 68 | IColumn & res_data_col, |
| 69 | ColumnArray::Offsets & res_offsets, |
| 70 | const ColumnNullable * nullable_col); |
| 71 | |
| 72 | bool executeString( |
| 73 | const IColumn & src_data, |
| 74 | const ColumnArray::Offsets & src_offsets, |
| 75 | IColumn & res_data_col, |
| 76 | ColumnArray::Offsets & res_offsets, |
| 77 | const ColumnNullable * nullable_col); |
| 78 | |
| 79 | void executeHashed( |
| 80 | const IColumn & src_data, |
| 81 | const ColumnArray::Offsets & src_offsets, |
| 82 | IColumn & res_data_col, |
| 83 | ColumnArray::Offsets & res_offsets, |
| 84 | const ColumnNullable * nullable_col); |
| 85 | }; |
| 86 | |
| 87 | |
| 88 | void FunctionArrayDistinct::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) |
| 89 | { |
| 90 | ColumnPtr array_ptr = block.getByPosition(arguments[0]).column; |
| 91 | const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get()); |
| 92 | |
| 93 | const auto & return_type = block.getByPosition(result).type; |
| 94 | |
| 95 | auto res_ptr = return_type->createColumn(); |
| 96 | ColumnArray & res = assert_cast<ColumnArray &>(*res_ptr); |
| 97 | |
| 98 | const IColumn & src_data = array->getData(); |
| 99 | const ColumnArray::Offsets & offsets = array->getOffsets(); |
| 100 | |
| 101 | IColumn & res_data = res.getData(); |
| 102 | ColumnArray::Offsets & res_offsets = res.getOffsets(); |
| 103 | |
| 104 | const ColumnNullable * nullable_col = checkAndGetColumn<ColumnNullable>(src_data); |
| 105 | |
| 106 | const IColumn * inner_col; |
| 107 | |
| 108 | if (nullable_col) |
| 109 | { |
| 110 | inner_col = &nullable_col->getNestedColumn(); |
| 111 | } |
| 112 | else |
| 113 | { |
| 114 | inner_col = &src_data; |
| 115 | } |
| 116 | |
| 117 | if (!(executeNumber<UInt8>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 118 | || executeNumber<UInt16>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 119 | || executeNumber<UInt32>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 120 | || executeNumber<UInt64>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 121 | || executeNumber<Int8>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 122 | || executeNumber<Int16>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 123 | || executeNumber<Int32>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 124 | || executeNumber<Int64>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 125 | || executeNumber<Float32>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 126 | || executeNumber<Float64>(*inner_col, offsets, res_data, res_offsets, nullable_col) |
| 127 | || executeString(*inner_col, offsets, res_data, res_offsets, nullable_col))) |
| 128 | executeHashed(*inner_col, offsets, res_data, res_offsets, nullable_col); |
| 129 | |
| 130 | block.getByPosition(result).column = std::move(res_ptr); |
| 131 | } |
| 132 | |
| 133 | template <typename T> |
| 134 | bool FunctionArrayDistinct::executeNumber( |
| 135 | const IColumn & src_data, |
| 136 | const ColumnArray::Offsets & src_offsets, |
| 137 | IColumn & res_data_col, |
| 138 | ColumnArray::Offsets & res_offsets, |
| 139 | const ColumnNullable * nullable_col) |
| 140 | { |
| 141 | const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&src_data); |
| 142 | |
| 143 | if (!src_data_concrete) |
| 144 | { |
| 145 | return false; |
| 146 | } |
| 147 | |
| 148 | const PaddedPODArray<T> & values = src_data_concrete->getData(); |
| 149 | PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData(); |
| 150 | |
| 151 | const PaddedPODArray<UInt8> * src_null_map = nullptr; |
| 152 | |
| 153 | if (nullable_col) |
| 154 | src_null_map = &nullable_col->getNullMapData(); |
| 155 | |
| 156 | using Set = ClearableHashSet<T, |
| 157 | DefaultHash<T>, |
| 158 | HashTableGrower<INITIAL_SIZE_DEGREE>, |
| 159 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(T)>>; |
| 160 | |
| 161 | Set set; |
| 162 | |
| 163 | ColumnArray::Offset prev_src_offset = 0; |
| 164 | ColumnArray::Offset res_offset = 0; |
| 165 | |
| 166 | for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i) |
| 167 | { |
| 168 | set.clear(); |
| 169 | |
| 170 | ColumnArray::Offset curr_src_offset = src_offsets[i]; |
| 171 | for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) |
| 172 | { |
| 173 | if (nullable_col && (*src_null_map)[j]) |
| 174 | continue; |
| 175 | |
| 176 | if (!set.find(values[j])) |
| 177 | { |
| 178 | res_data.emplace_back(values[j]); |
| 179 | set.insert(values[j]); |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | res_offset += set.size(); |
| 184 | res_offsets.emplace_back(res_offset); |
| 185 | |
| 186 | prev_src_offset = curr_src_offset; |
| 187 | } |
| 188 | return true; |
| 189 | } |
| 190 | |
| 191 | bool FunctionArrayDistinct::executeString( |
| 192 | const IColumn & src_data, |
| 193 | const ColumnArray::Offsets & src_offsets, |
| 194 | IColumn & res_data_col, |
| 195 | ColumnArray::Offsets & res_offsets, |
| 196 | const ColumnNullable * nullable_col) |
| 197 | { |
| 198 | const ColumnString * src_data_concrete = checkAndGetColumn<ColumnString>(&src_data); |
| 199 | |
| 200 | if (!src_data_concrete) |
| 201 | return false; |
| 202 | |
| 203 | ColumnString & res_data_column_string = typeid_cast<ColumnString &>(res_data_col); |
| 204 | |
| 205 | using Set = ClearableHashSet<StringRef, |
| 206 | StringRefHash, |
| 207 | HashTableGrower<INITIAL_SIZE_DEGREE>, |
| 208 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>; |
| 209 | |
| 210 | const PaddedPODArray<UInt8> * src_null_map = nullptr; |
| 211 | |
| 212 | if (nullable_col) |
| 213 | src_null_map = &nullable_col->getNullMapData(); |
| 214 | |
| 215 | Set set; |
| 216 | |
| 217 | ColumnArray::Offset prev_src_offset = 0; |
| 218 | ColumnArray::Offset res_offset = 0; |
| 219 | |
| 220 | for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i) |
| 221 | { |
| 222 | set.clear(); |
| 223 | |
| 224 | ColumnArray::Offset curr_src_offset = src_offsets[i]; |
| 225 | for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) |
| 226 | { |
| 227 | if (nullable_col && (*src_null_map)[j]) |
| 228 | continue; |
| 229 | |
| 230 | StringRef str_ref = src_data_concrete->getDataAt(j); |
| 231 | |
| 232 | if (!set.find(str_ref)) |
| 233 | { |
| 234 | set.insert(str_ref); |
| 235 | res_data_column_string.insertData(str_ref.data, str_ref.size); |
| 236 | } |
| 237 | } |
| 238 | |
| 239 | res_offset += set.size(); |
| 240 | res_offsets.emplace_back(res_offset); |
| 241 | |
| 242 | prev_src_offset = curr_src_offset; |
| 243 | } |
| 244 | return true; |
| 245 | } |
| 246 | |
| 247 | void FunctionArrayDistinct::executeHashed( |
| 248 | const IColumn & src_data, |
| 249 | const ColumnArray::Offsets & src_offsets, |
| 250 | IColumn & res_data_col, |
| 251 | ColumnArray::Offsets & res_offsets, |
| 252 | const ColumnNullable * nullable_col) |
| 253 | { |
| 254 | using Set = ClearableHashSet<UInt128, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>, |
| 255 | HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>; |
| 256 | |
| 257 | const PaddedPODArray<UInt8> * src_null_map = nullptr; |
| 258 | |
| 259 | if (nullable_col) |
| 260 | src_null_map = &nullable_col->getNullMapData(); |
| 261 | |
| 262 | Set set; |
| 263 | |
| 264 | ColumnArray::Offset prev_src_offset = 0; |
| 265 | ColumnArray::Offset res_offset = 0; |
| 266 | |
| 267 | for (ColumnArray::Offset i = 0; i < src_offsets.size(); ++i) |
| 268 | { |
| 269 | set.clear(); |
| 270 | |
| 271 | ColumnArray::Offset curr_src_offset = src_offsets[i]; |
| 272 | for (ColumnArray::Offset j = prev_src_offset; j < curr_src_offset; ++j) |
| 273 | { |
| 274 | if (nullable_col && (*src_null_map)[j]) |
| 275 | continue; |
| 276 | |
| 277 | UInt128 hash; |
| 278 | SipHash hash_function; |
| 279 | src_data.updateHashWithValue(j, hash_function); |
| 280 | hash_function.get128(reinterpret_cast<char *>(&hash)); |
| 281 | |
| 282 | if (!set.find(hash)) |
| 283 | { |
| 284 | set.insert(hash); |
| 285 | res_data_col.insertFrom(src_data, j); |
| 286 | } |
| 287 | } |
| 288 | |
| 289 | res_offset += set.size(); |
| 290 | res_offsets.emplace_back(res_offset); |
| 291 | |
| 292 | prev_src_offset = curr_src_offset; |
| 293 | } |
| 294 | } |
| 295 | |
| 296 | |
| 297 | void registerFunctionArrayDistinct(FunctionFactory & factory) |
| 298 | { |
| 299 | factory.registerFunction<FunctionArrayDistinct>(); |
| 300 | } |
| 301 | |
| 302 | } |
| 303 | |