#pragma once

#include <Common/Arena.h>
#include <Common/NaNUtils.h>

#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Common/assert_cast.h>

#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>

#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/VarInt.h>

#include <AggregateFunctions/IAggregateFunction.h>

#include <algorithm>
#include <functional>
#include <limits>
#include <math.h>
#include <queue>
#include <stddef.h>
#include <utility>

namespace DB
{

namespace ErrorCodes
{
    extern const int TOO_LARGE_ARRAY_SIZE;
    extern const int INCORRECT_DATA;
}

/**
 * Distance compression algorithm implementation ("streaming histogram").
 * Points are stored as (mean, weight) pairs; when too many of them accumulate,
 * the closest pairs of points are repeatedly fused (see compress()).
 * http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf
 */
class AggregateFunctionHistogramData
{
public:
    using Mean = Float64;
    using Weight = Float64;

    constexpr static size_t bins_count_limit = 250;

private:
    struct WeightedValue
    {
        Mean mean;
        Weight weight;

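        // Fuse two points: the resulting mean is the weighted average of the two means
        // and the resulting weight is the sum of the weights.
        // E.g. {mean = 1, weight = 1} + {mean = 3, weight = 3} -> {mean = 2.5, weight = 4}.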
        WeightedValue operator+(const WeightedValue & other) const
        {
            return {mean + other.weight * (other.mean - mean) / (other.weight + weight), other.weight + weight};
        }
    };

private:
    // Number of stored weighted values
    UInt32 size;

    // Calculated lower and upper bounds of the values seen so far
    Mean lower_bound;
    Mean upper_bound;

    // Weighted values representation of the histogram.
    WeightedValue points[0];

private:
    void sort()
    {
        std::sort(points, points + size,
            [](const WeightedValue & first, const WeightedValue & second)
            {
                return first.mean < second.mean;
            });
    }

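    // Minimal container interface required by std::priority_queue,
    // backed by a caller-provided buffer instead of an internally allocated one.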
    template <typename T>
    struct PriorityQueueStorage
    {
        size_t size = 0;
        T * data_ptr;

        explicit PriorityQueueStorage(T * value)
            : data_ptr(value)
        {
        }

        void push_back(T val)
        {
            data_ptr[size] = std::move(val);
            ++size;
        }

        void pop_back() { --size; }
        T * begin() { return data_ptr; }
        T * end() const { return data_ptr + size; }
        bool empty() const { return size == 0; }
        T & front() { return *data_ptr; }
        const T & front() const { return *data_ptr; }

        using value_type = T;
        using reference = T &;
        using const_reference = const T &;
        using size_type = size_t;
    };

    /**
     * Repeatedly fuse the closest pairs of points until at most max_bins bins are left.
     */
    void compress(UInt32 max_bins)
    {
        sort();
        auto new_size = size;
        if (size <= max_bins)
            return;

        // Maintain doubly-linked list of "active" points
        // and store neighbour pairs in priority queue by distance
        UInt32 previous[size + 1];
        UInt32 next[size + 1];
        bool active[size + 1];
        std::fill(active, active + size, true);
        active[size] = false;

        auto delete_node = [&](UInt32 i)
        {
            previous[next[i]] = previous[i];
            next[previous[i]] = next[i];
            active[i] = false;
        };

        for (size_t i = 0; i <= size; ++i)
        {
            previous[i] = i - 1;
            next[i] = i + 1;
        }

        next[size] = 0;
        previous[0] = size;

        using QueueItem = std::pair<Mean, UInt32>;

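        // Capacity bound: size - 1 initial neighbour pairs, plus at most one extra
        // live pair per merge, with at most size - max_bins merges in total.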
        QueueItem storage[2 * size - max_bins];

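        // Min-heap of (distance to the next active point, left point index);
        // the closest pair of neighbours is always on top.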
        std::priority_queue<
            QueueItem,
            PriorityQueueStorage<QueueItem>,
            std::greater<QueueItem>>
            queue{std::greater<QueueItem>(),
                  PriorityQueueStorage<QueueItem>(storage)};

        auto quality = [&](UInt32 i) { return points[next[i]].mean - points[i].mean; };

        for (size_t i = 0; i + 1 < size; ++i)
            queue.push({quality(i), i});

        while (new_size > max_bins && !queue.empty())
        {
            auto min_item = queue.top();
            queue.pop();
            auto left = min_item.second;
            auto right = next[left];

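            // Skip stale entries: an endpoint may already have been merged away,
            // or the distance may have grown since the pair was pushed.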
            if (!active[left] || !active[right] || quality(left) > min_item.first)
                continue;

            points[left] = points[left] + points[right];

            delete_node(right);
            if (active[next[left]])
                queue.push({quality(left), left});
            if (active[previous[left]])
                queue.push({quality(previous[left]), previous[left]});

            --new_size;
        }

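        // Compact the surviving (active) points to the front of the array.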
        size_t left = 0;
        for (size_t right = 0; right < size; ++right)
        {
            if (active[right])
            {
                points[left] = points[right];
                ++left;
            }
        }
        size = new_size;
    }

    /**
     * Fuse (de-duplicate) points that are too close to each other.
     * Assumes that points are sorted.
     */
    void unique()
    {
        if (size == 0)
            return;

        size_t left = 0;

        for (auto right = left + 1; right < size; ++right)
        {
            // Fuse points if their text representations differ only in the last digit.
            // Take the absolute value of the threshold: it would be negative for negative means.
            auto min_diff = 10 * (points[left].mean + points[right].mean) * std::numeric_limits<Mean>::epsilon();
            if (points[left].mean + fabs(min_diff) >= points[right].mean)
            {
                points[left] = points[left] + points[right];
            }
            else
            {
                ++left;
                points[left] = points[right];
            }
        }
        size = left + 1;
    }

public:
    AggregateFunctionHistogramData()
        : size(0)
        , lower_bound(std::numeric_limits<Mean>::max())
        , upper_bound(std::numeric_limits<Mean>::lowest())
    {
        static_assert(offsetof(AggregateFunctionHistogramData, points) == sizeof(AggregateFunctionHistogramData), "points should be last member");
    }

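    // The flexible `points` array must hold up to max_bins * 2 values:
    // add() triggers compression only once that many points have accumulated.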
    static size_t structSize(size_t max_bins)
    {
        return sizeof(AggregateFunctionHistogramData) + max_bins * 2 * sizeof(WeightedValue);
    }

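    // Emit the final bins: each bin's bounds are the midpoints to the neighbouring
    // points (the global lower/upper bounds at the edges), and its weight is
    // interpolated between neighbours ("linear density approximation").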
    void insertResultInto(ColumnVector<Mean> & to_lower, ColumnVector<Mean> & to_upper, ColumnVector<Weight> & to_weights, UInt32 max_bins)
    {
        compress(max_bins);
        unique();

        for (size_t i = 0; i < size; ++i)
        {
            to_lower.insertValue((i == 0) ? lower_bound : (points[i].mean + points[i - 1].mean) / 2);
            to_upper.insertValue((i + 1 == size) ? upper_bound : (points[i].mean + points[i + 1].mean) / 2);

            // linear density approximation
            Weight lower_weight = (i == 0) ? points[i].weight : ((points[i - 1].weight) + points[i].weight * 3) / 4;
            Weight upper_weight = (i + 1 == size) ? points[i].weight : (points[i + 1].weight + points[i].weight * 3) / 4;
            to_weights.insertValue((lower_weight + upper_weight) / 2);
        }
    }

    void add(Mean value, Weight weight, UInt32 max_bins)
    {
        // NaNs break sorting and compression;
        // infinities do not fit into the bin partitioning method.
        if (!isFinite(value))
            throw Exception("Invalid value (inf or nan) for aggregation by 'histogram' function", ErrorCodes::INCORRECT_DATA);

        points[size] = {value, weight};
        ++size;
        lower_bound = std::min(lower_bound, value);
        upper_bound = std::max(upper_bound, value);

        if (size >= max_bins * 2)
            compress(max_bins);
    }

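    // Merge another state by re-adding its points one by one; add() compresses along the way if needed.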
    void merge(const AggregateFunctionHistogramData & other, UInt32 max_bins)
    {
        lower_bound = std::min(lower_bound, other.lower_bound);
        upper_bound = std::max(upper_bound, other.upper_bound);
        for (size_t i = 0; i < other.size; i++)
            add(other.points[i].mean, other.points[i].weight, max_bins);
    }

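    // Serialized layout: lower_bound, upper_bound, varint-encoded point count,
    // followed by the raw array of (mean, weight) pairs.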
    void write(WriteBuffer & buf) const
    {
        writeBinary(lower_bound, buf);
        writeBinary(upper_bound, buf);

        writeVarUInt(size, buf);
        buf.write(reinterpret_cast<const char *>(points), size * sizeof(WeightedValue));
    }

    void read(ReadBuffer & buf, UInt32 max_bins)
    {
        readBinary(lower_bound, buf);
        readBinary(upper_bound, buf);

        readVarUInt(size, buf);
        if (size > max_bins * 2)
            throw Exception("Too many bins", ErrorCodes::TOO_LARGE_ARRAY_SIZE);

        buf.read(reinterpret_cast<char *>(points), size * sizeof(WeightedValue));
    }
};

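/**
 * Aggregate function histogram(max_bins)(x).
 * The result is an Array of (lower_bound, upper_bound, weight) tuples describing
 * an adaptive histogram with at most max_bins bins, e.g. (illustrative usage)
 * SELECT histogram(10)(some_column) FROM some_table.
 */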
template <typename T>
class AggregateFunctionHistogram final: public IAggregateFunctionDataHelper<AggregateFunctionHistogramData, AggregateFunctionHistogram<T>>
{
private:
    using Data = AggregateFunctionHistogramData;

    const UInt32 max_bins;

public:
    AggregateFunctionHistogram(UInt32 max_bins_, const DataTypes & arguments, const Array & params)
        : IAggregateFunctionDataHelper<AggregateFunctionHistogramData, AggregateFunctionHistogram<T>>(arguments, params)
        , max_bins(max_bins_)
    {
    }

    size_t sizeOfData() const override
    {
        return Data::structSize(max_bins);
    }

    DataTypePtr getReturnType() const override
    {
        DataTypes types;
        auto mean = std::make_shared<DataTypeNumber<Data::Mean>>();
        auto weight = std::make_shared<DataTypeNumber<Data::Weight>>();

        // lower bound
        types.emplace_back(mean);
        // upper bound
        types.emplace_back(mean);
        // weight
        types.emplace_back(weight);

        auto tuple = std::make_shared<DataTypeTuple>(types);
        return std::make_shared<DataTypeArray>(tuple);
    }

    void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
    {
        auto val = assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num];
        this->data(place).add(static_cast<Data::Mean>(val), 1, max_bins);
    }

    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
    {
        this->data(place).merge(this->data(rhs), max_bins);
    }

    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
    {
        this->data(place).write(buf);
    }

    void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
    {
        this->data(place).read(buf, max_bins);
    }

    void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
    {
        auto & data = this->data(const_cast<AggregateDataPtr>(place));

        auto & to_array = assert_cast<ColumnArray &>(to);
        ColumnArray::Offsets & offsets_to = to_array.getOffsets();
        auto & to_tuple = assert_cast<ColumnTuple &>(to_array.getData());

        auto & to_lower = assert_cast<ColumnVector<Data::Mean> &>(to_tuple.getColumn(0));
        auto & to_upper = assert_cast<ColumnVector<Data::Mean> &>(to_tuple.getColumn(1));
        auto & to_weights = assert_cast<ColumnVector<Data::Weight> &>(to_tuple.getColumn(2));
        data.insertResultInto(to_lower, to_upper, to_weights, max_bins);

        offsets_to.push_back(to_tuple.size());
    }

    String getName() const override { return "histogram"; }
};

}