1 | #pragma once |
2 | |
3 | #include <DataTypes/DataTypesNumber.h> |
4 | #include <DataTypes/DataTypeArray.h> |
5 | #include <Columns/ColumnArray.h> |
6 | #include <Columns/ColumnConst.h> |
7 | #include <Columns/ColumnsNumber.h> |
8 | |
9 | #include <Functions/IFunctionImpl.h> |
10 | #include <Functions/FunctionHelpers.h> |
11 | |
12 | #include <IO/WriteHelpers.h> |
13 | |
14 | #include <Common/typeid_cast.h> |
15 | |
16 | |
17 | namespace DB |
18 | { |
19 | |
20 | namespace ErrorCodes |
21 | { |
22 | extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; |
23 | extern const int ILLEGAL_COLUMN; |
24 | } |
25 | |
26 | enum ClusterOperation |
27 | { |
28 | FindClusterIndex = 0, |
29 | FindCentroidValue = 1 |
30 | }; |
31 | |
32 | /// The centroid values are converted to Float64 for easier coding of |
33 | /// distance calculations. |
34 | /// |
35 | /// We assume to have 10th to 100th centroids, usually of type Float64, as a typical use case. |
36 | /// While it is possible to sort centroids and use a modification of a binary search to find the |
37 | /// nearest centroid, we think for arrays of 10th to 100th this might be an overkill. |
38 | /// |
39 | /// Also, even though centroids of other types are feasible, this first implementation |
40 | /// lacks support of them for simplicity. Date, DateTime and Strings (eg. with the |
41 | /// Levenshtein distance) could be theoretically supported, as well as custom distance |
42 | /// functions (eg. Hamming distance) using Clickhouse lambdas. |
43 | |
44 | // Centroids array has the same size as number of clusters. |
45 | inline size_t find_centroid(Float64 x, std::vector<Float64> & centroids) |
46 | { |
47 | // Centroids array has to have at least one element, and if it has only one element, |
48 | // it is also the result of this Function. |
49 | Float64 distance = std::abs(centroids[0] - x); |
50 | size_t index = 0; |
51 | |
52 | // Check if we have more clusters and if we have, whether some is closer to src[i] |
53 | for (size_t j = 1; j < centroids.size(); ++j) |
54 | { |
55 | Float64 next_distance = std::abs(centroids[j] - x); |
56 | |
57 | if (next_distance < distance) |
58 | { |
59 | distance = next_distance; |
60 | index = j; |
61 | } |
62 | } |
63 | |
64 | // Index of the closest cluster, or 0 in case of just one cluster |
65 | return index; |
66 | } |
67 | |
68 | /** findClusterIndex(x, centroids_array) - find index of element in centroids_array with the value nearest to x |
69 | * findClusterValue(x, centroids_array) - find value of element in centroids_array with the value nearest to x |
70 | * |
71 | * Types: |
72 | * findClusterIndex(T, Array(T)) -> UInt64 |
73 | * findClusterValue(T, Array(T)) -> T |
74 | * |
75 | * T can be any numeric type. |
76 | * centroids_array must be constant |
77 | */ |
78 | class FunctionFindClusterIndex : public IFunction |
79 | { |
80 | public: |
81 | static constexpr auto name = "findClusterIndex" ; |
82 | static FunctionPtr create(const Context &) |
83 | { |
84 | return std::make_shared<FunctionFindClusterIndex>(); |
85 | } |
86 | |
87 | String getName() const override |
88 | { |
89 | return FunctionFindClusterIndex::name; |
90 | } |
91 | |
92 | bool isVariadic() const override |
93 | { |
94 | return true; |
95 | } |
96 | |
97 | size_t getNumberOfArguments() const override |
98 | { |
99 | return 0; |
100 | } |
101 | |
102 | bool useDefaultImplementationForConstants() const override { return true; } |
103 | ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } |
104 | |
105 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
106 | { |
107 | const auto args_size = arguments.size(); |
108 | if (args_size != 2) |
109 | throw Exception{"Number of arguments for function " + getName() + " doesn't match: passed " + toString(args_size) + ", should be 2" , |
110 | ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH}; |
111 | |
112 | const auto type_x = arguments[0]; |
113 | |
114 | if (!isNativeNumber(type_x)) |
115 | throw Exception{"Unsupported type " + type_x->getName() + " of first argument of function " + getName() + " must be a numeric type" , |
116 | ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; |
117 | |
118 | const DataTypeArray * type_arr_from = checkAndGetDataType<DataTypeArray>(arguments[1].get()); |
119 | |
120 | if (!type_arr_from) |
121 | throw Exception{"Second argument of function " + getName() + " must be literal array" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; |
122 | |
123 | return std::make_shared<DataTypeUInt64>(); |
124 | } |
125 | |
126 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
127 | { |
128 | const auto in_untyped = block.getByPosition(arguments[0]).column.get(); |
129 | const auto centroids_array_untyped = block.getByPosition(arguments[1]).column.get(); |
130 | auto column_result = block.getByPosition(result).type->createColumn(); |
131 | auto out_untyped = column_result.get(); |
132 | |
133 | if (!isColumnConst(*centroids_array_untyped)) |
134 | throw Exception{"Second argument of function " + getName() + " must be literal array" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; |
135 | |
136 | executeImplTyped(in_untyped, out_untyped, centroids_array_untyped); |
137 | |
138 | block.getByPosition(result).column = std::move(column_result); |
139 | } |
140 | |
141 | protected: |
142 | virtual ClusterOperation getOperation() |
143 | { |
144 | return ClusterOperation::FindClusterIndex; |
145 | } |
146 | |
147 | virtual void executeImplTyped(const IColumn* in_untyped, IColumn* out_untyped, const IColumn* centroids_array_untyped) |
148 | { |
149 | if (!executeOperation<UInt8, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
150 | && !executeOperation<UInt16, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
151 | && !executeOperation<UInt32, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
152 | && !executeOperation<UInt64, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
153 | && !executeOperation<Int8, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
154 | && !executeOperation<Int16, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
155 | && !executeOperation<Int32, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
156 | && !executeOperation<Int64, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
157 | && !executeOperation<Float32, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
158 | && !executeOperation<Float64, UInt64>(in_untyped, out_untyped, centroids_array_untyped)) |
159 | { |
160 | throw Exception{"Function " + getName() + " expects both x and centroids_array of a numeric type." |
161 | " Passed arguments are " + in_untyped->getName() + " and " + centroids_array_untyped->getName(), ErrorCodes::ILLEGAL_COLUMN}; |
162 | |
163 | } |
164 | } |
165 | |
166 | // Match the type of the centrods array and convert them to Float64, because we |
167 | // don't want to have problems calculating negative distances of UInts |
168 | template <typename CentroidsType> |
169 | bool fillCentroids(const IColumn * centroids_array_untyped, std::vector<Float64> & centroids) |
170 | { |
171 | const ColumnConst * const_centroids_array = checkAndGetColumnConst<ColumnVector<Array>>(centroids_array_untyped); |
172 | |
173 | if (!const_centroids_array) |
174 | return false; |
175 | |
176 | Array array = const_centroids_array->getValue<Array>(); |
177 | if (array.empty()) |
178 | throw Exception{"Centroids array must be not empty" , ErrorCodes::ILLEGAL_COLUMN}; |
179 | |
180 | for (size_t k = 0; k < array.size(); ++k) |
181 | { |
182 | const Field & tmp_field = array[k]; |
183 | NearestFieldType<CentroidsType> value; |
184 | if (!tmp_field.tryGet(value)) |
185 | return false; |
186 | |
187 | centroids.push_back(Float64(value)); |
188 | } |
189 | return true; |
190 | } |
191 | |
192 | template <typename CentroidsType, typename OutputType> |
193 | bool executeOperation(const IColumn * in_untyped, IColumn * out_untyped, const IColumn * centroids_array_untyped) |
194 | { |
195 | // Match the type of the output |
196 | auto out = typeid_cast<ColumnVector<OutputType> *>(out_untyped); |
197 | |
198 | if (!out) |
199 | return false; |
200 | |
201 | PaddedPODArray<OutputType> & dst = out->getData(); |
202 | |
203 | // try to match the type of the input column |
204 | if (!executeOperationTyped<UInt8, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
205 | && !executeOperationTyped<UInt16, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
206 | && !executeOperationTyped<UInt32, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
207 | && !executeOperationTyped<UInt64, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
208 | && !executeOperationTyped<Int8, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
209 | && !executeOperationTyped<Int16, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
210 | && !executeOperationTyped<Int32, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
211 | && !executeOperationTyped<Int64, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
212 | && !executeOperationTyped<Float32, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped) |
213 | && !executeOperationTyped<Float64, OutputType, CentroidsType>(in_untyped, dst, centroids_array_untyped)) |
214 | { |
215 | return false; |
216 | } |
217 | |
218 | return true; |
219 | } |
220 | |
221 | template <typename InputType, typename OutputType, typename CentroidsType> |
222 | bool executeOperationTyped(const IColumn * in_untyped, PaddedPODArray<OutputType> & dst, const IColumn * centroids_array_untyped) |
223 | { |
224 | const auto maybe_const = in_untyped->convertToFullColumnIfConst(); |
225 | in_untyped = maybe_const.get(); |
226 | |
227 | const auto in_vector = checkAndGetColumn<ColumnVector<InputType>>(in_untyped); |
228 | if (in_vector) |
229 | { |
230 | const PaddedPODArray<InputType> & src = in_vector->getData(); |
231 | |
232 | std::vector<Float64> centroids; |
233 | if (!fillCentroids<CentroidsType>(centroids_array_untyped, centroids)) |
234 | return false; |
235 | |
236 | for (size_t i = 0; i < src.size(); ++i) |
237 | { |
238 | size_t index = find_centroid(Float64(src[i]), centroids); |
239 | if (getOperation() == ClusterOperation::FindClusterIndex) |
240 | // Note that array indexes start with 1 in Clickhouse |
241 | dst.push_back(UInt64(index + 1)); |
242 | else if (getOperation() == ClusterOperation::FindCentroidValue) |
243 | dst.push_back(centroids[index]); |
244 | else |
245 | throw Exception{"Unexpected error in findCluster* function" , ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; |
246 | } |
247 | |
248 | return true; |
249 | } |
250 | return false; |
251 | } |
252 | |
253 | }; |
254 | |
255 | class FunctionFindClusterValue : public FunctionFindClusterIndex |
256 | { |
257 | public: |
258 | static constexpr auto name = "findClusterValue" ; |
259 | static FunctionPtr create(const Context &) |
260 | { |
261 | return std::make_shared<FunctionFindClusterValue>(); |
262 | } |
263 | |
264 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
265 | { |
266 | FunctionFindClusterIndex::getReturnTypeImpl(arguments); |
267 | const DataTypeArray * type_arr_from = checkAndGetDataType<DataTypeArray>(arguments[1].get()); |
268 | return type_arr_from->getNestedType(); |
269 | } |
270 | |
271 | String getName() const override |
272 | { |
273 | return FunctionFindClusterValue::name; |
274 | } |
275 | |
276 | protected: |
277 | ClusterOperation getOperation() override |
278 | { |
279 | return ClusterOperation::FindCentroidValue; |
280 | } |
281 | |
282 | void executeImplTyped(const IColumn* in_untyped, IColumn* out_untyped, const IColumn* centroids_array_untyped) override |
283 | { |
284 | if (!executeOperation<UInt8, UInt8>(in_untyped, out_untyped, centroids_array_untyped) |
285 | && !executeOperation<UInt16, UInt16>(in_untyped, out_untyped, centroids_array_untyped) |
286 | && !executeOperation<UInt32, UInt32>(in_untyped, out_untyped, centroids_array_untyped) |
287 | && !executeOperation<UInt64, UInt64>(in_untyped, out_untyped, centroids_array_untyped) |
288 | && !executeOperation<Int8, Int8>(in_untyped, out_untyped, centroids_array_untyped) |
289 | && !executeOperation<Int16, Int16>(in_untyped, out_untyped, centroids_array_untyped) |
290 | && !executeOperation<Int32, Int32>(in_untyped, out_untyped, centroids_array_untyped) |
291 | && !executeOperation<Int64, Int64>(in_untyped, out_untyped, centroids_array_untyped) |
292 | && !executeOperation<Float32, Float32>(in_untyped, out_untyped, centroids_array_untyped) |
293 | && !executeOperation<Float64, Float64>(in_untyped, out_untyped, centroids_array_untyped)) |
294 | { |
295 | throw Exception{"Function " + getName() + " expects both x and centroids_array of a numeric type." |
296 | "Passed arguments are " + in_untyped->getName() + " and " + centroids_array_untyped->getName(), ErrorCodes::ILLEGAL_COLUMN}; |
297 | } |
298 | } |
299 | }; |
300 | |
301 | } |
302 | |