1 | #pragma once |
2 | |
3 | #include <Columns/ColumnConst.h> |
4 | #include <Columns/ColumnString.h> |
5 | #include <Columns/ColumnVector.h> |
6 | #include <DataTypes/DataTypesNumber.h> |
7 | #include <Functions/FunctionHelpers.h> |
8 | #include <Functions/IFunctionImpl.h> |
9 | |
10 | namespace DB |
11 | { |
/** Calculate similarity metrics:
  *
  * ngramDistance(haystack, needle) - calculates the n-gram distance between haystack and needle.
  * Returns a floating-point number from 0 to 1 - the closer to zero, the more similar the strings are to each other.
  * Case-insensitive and UTF-8 variants are also supported:
  * ngramDistanceCaseInsensitive(haystack, needle)
  * ngramDistanceUTF8(haystack, needle)
  * ngramDistanceCaseInsensitiveUTF8(haystack, needle)
  */
21 | |
/// Error codes used by the string similarity functions.
/// They are only declared here (extern, no initializer); the definitions are not part of the visible code.
namespace ErrorCodes
{
    /// Thrown by executeImpl when the pair of argument columns is not a supported const/vector combination.
    extern const int ILLEGAL_COLUMN;
    /// Thrown by getReturnTypeImpl when an argument is not of type String.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Declared for completeness; not referenced by the code visible here.
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    /// Thrown by executeImpl when a checked constant argument is longer than Impl::max_string_size.
    extern const int TOO_LARGE_STRING_SIZE;
}
29 | |
30 | template <typename Impl, typename Name> |
31 | class FunctionsStringSimilarity : public IFunction |
32 | { |
33 | public: |
34 | static constexpr auto name = Name::name; |
35 | |
36 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSimilarity>(); } |
37 | |
38 | String getName() const override { return name; } |
39 | |
40 | size_t getNumberOfArguments() const override { return 2; } |
41 | |
42 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
43 | { |
44 | if (!isString(arguments[0])) |
45 | throw Exception( |
46 | "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
47 | |
48 | if (!isString(arguments[1])) |
49 | throw Exception( |
50 | "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
51 | |
52 | return std::make_shared<DataTypeNumber<typename Impl::ResultType>>(); |
53 | } |
54 | |
55 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
56 | { |
57 | using ResultType = typename Impl::ResultType; |
58 | |
59 | const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; |
60 | const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; |
61 | |
62 | const ColumnConst * col_haystack_const = typeid_cast<const ColumnConst *>(&*column_haystack); |
63 | const ColumnConst * col_needle_const = typeid_cast<const ColumnConst *>(&*column_needle); |
64 | |
65 | if (col_haystack_const && col_needle_const) |
66 | { |
67 | ResultType res{}; |
68 | const String & needle = col_needle_const->getValue<String>(); |
69 | if (needle.size() > Impl::max_string_size) |
70 | { |
71 | throw Exception( |
72 | "String size of needle is too big for function " + getName() + ". Should be at most " |
73 | + std::to_string(Impl::max_string_size), |
74 | ErrorCodes::TOO_LARGE_STRING_SIZE); |
75 | } |
76 | Impl::constant_constant(col_haystack_const->getValue<String>(), needle, res); |
77 | block.getByPosition(result).column |
78 | = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); |
79 | return; |
80 | } |
81 | |
82 | auto col_res = ColumnVector<ResultType>::create(); |
83 | |
84 | typename ColumnVector<ResultType>::Container & vec_res = col_res->getData(); |
85 | vec_res.resize(column_haystack->size()); |
86 | |
87 | const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack); |
88 | const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle); |
89 | |
90 | if (col_haystack_vector && col_needle_const) |
91 | { |
92 | const String & needle = col_needle_const->getValue<String>(); |
93 | if (needle.size() > Impl::max_string_size) |
94 | { |
95 | throw Exception( |
96 | "String size of needle is too big for function " + getName() + ". Should be at most " |
97 | + std::to_string(Impl::max_string_size), |
98 | ErrorCodes::TOO_LARGE_STRING_SIZE); |
99 | } |
100 | Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needle, vec_res); |
101 | } |
102 | else if (col_haystack_vector && col_needle_vector) |
103 | { |
104 | Impl::vector_vector( |
105 | col_haystack_vector->getChars(), |
106 | col_haystack_vector->getOffsets(), |
107 | col_needle_vector->getChars(), |
108 | col_needle_vector->getOffsets(), |
109 | vec_res); |
110 | } |
111 | else if (col_haystack_const && col_needle_vector) |
112 | { |
113 | const String & haystack = col_haystack_const->getValue<String>(); |
114 | if (haystack.size() > Impl::max_string_size) |
115 | { |
116 | throw Exception( |
117 | "String size of haystack is too big for function " + getName() + ". Should be at most " |
118 | + std::to_string(Impl::max_string_size), |
119 | ErrorCodes::TOO_LARGE_STRING_SIZE); |
120 | } |
121 | Impl::constant_vector(haystack, col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res); |
122 | } |
123 | else |
124 | { |
125 | throw Exception( |
126 | "Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and " |
127 | + block.getByPosition(arguments[1]).column->getName() + " of arguments of function " + getName(), |
128 | ErrorCodes::ILLEGAL_COLUMN); |
129 | } |
130 | |
131 | block.getByPosition(result).column = std::move(col_res); |
132 | } |
133 | }; |
134 | |
135 | } |
136 | |