| 1 | #pragma once |
| 2 | |
| 3 | #include <Columns/ColumnConst.h> |
| 4 | #include <Columns/ColumnString.h> |
| 5 | #include <Columns/ColumnVector.h> |
| 6 | #include <DataTypes/DataTypesNumber.h> |
| 7 | #include <Functions/FunctionHelpers.h> |
| 8 | #include <Functions/IFunctionImpl.h> |
| 9 | |
| 10 | namespace DB |
| 11 | { |
| 12 | /** Calculate similarity metrics: |
| 13 | * |
| 14 | * ngramDistance(haystack, needle) - calculates the n-gram distance between haystack and needle. |
| 15 | * Returns a floating-point number from 0 to 1: the closer to zero, the more similar the strings are. |
| 16 | * Case-insensitive and UTF-8 variants are also supported: |
| 17 | * ngramDistanceCaseInsensitive(haystack, needle) |
| 18 | * ngramDistanceUTF8(haystack, needle) |
| 19 | * ngramDistanceCaseInsensitiveUTF8(haystack, needle) |
| 20 | */ |
| 21 | |
/// Error codes referenced from this header. They are only declared here (extern, no initializer);
/// the definitions live in another translation unit.
namespace ErrorCodes
{
    /// Used when the argument columns are not a supported const/vector combination.
    extern const int ILLEGAL_COLUMN;
    /// Used when an argument is not of type String.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    /// Used when a constant argument exceeds the implementation's size limit.
    extern const int TOO_LARGE_STRING_SIZE;
}
| 29 | |
| 30 | template <typename Impl, typename Name> |
| 31 | class FunctionsStringSimilarity : public IFunction |
| 32 | { |
| 33 | public: |
| 34 | static constexpr auto name = Name::name; |
| 35 | |
| 36 | static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSimilarity>(); } |
| 37 | |
| 38 | String getName() const override { return name; } |
| 39 | |
| 40 | size_t getNumberOfArguments() const override { return 2; } |
| 41 | |
| 42 | DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override |
| 43 | { |
| 44 | if (!isString(arguments[0])) |
| 45 | throw Exception( |
| 46 | "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 47 | |
| 48 | if (!isString(arguments[1])) |
| 49 | throw Exception( |
| 50 | "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); |
| 51 | |
| 52 | return std::make_shared<DataTypeNumber<typename Impl::ResultType>>(); |
| 53 | } |
| 54 | |
| 55 | void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override |
| 56 | { |
| 57 | using ResultType = typename Impl::ResultType; |
| 58 | |
| 59 | const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column; |
| 60 | const ColumnPtr & column_needle = block.getByPosition(arguments[1]).column; |
| 61 | |
| 62 | const ColumnConst * col_haystack_const = typeid_cast<const ColumnConst *>(&*column_haystack); |
| 63 | const ColumnConst * col_needle_const = typeid_cast<const ColumnConst *>(&*column_needle); |
| 64 | |
| 65 | if (col_haystack_const && col_needle_const) |
| 66 | { |
| 67 | ResultType res{}; |
| 68 | const String & needle = col_needle_const->getValue<String>(); |
| 69 | if (needle.size() > Impl::max_string_size) |
| 70 | { |
| 71 | throw Exception( |
| 72 | "String size of needle is too big for function " + getName() + ". Should be at most " |
| 73 | + std::to_string(Impl::max_string_size), |
| 74 | ErrorCodes::TOO_LARGE_STRING_SIZE); |
| 75 | } |
| 76 | Impl::constant_constant(col_haystack_const->getValue<String>(), needle, res); |
| 77 | block.getByPosition(result).column |
| 78 | = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); |
| 79 | return; |
| 80 | } |
| 81 | |
| 82 | auto col_res = ColumnVector<ResultType>::create(); |
| 83 | |
| 84 | typename ColumnVector<ResultType>::Container & vec_res = col_res->getData(); |
| 85 | vec_res.resize(column_haystack->size()); |
| 86 | |
| 87 | const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack); |
| 88 | const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle); |
| 89 | |
| 90 | if (col_haystack_vector && col_needle_const) |
| 91 | { |
| 92 | const String & needle = col_needle_const->getValue<String>(); |
| 93 | if (needle.size() > Impl::max_string_size) |
| 94 | { |
| 95 | throw Exception( |
| 96 | "String size of needle is too big for function " + getName() + ". Should be at most " |
| 97 | + std::to_string(Impl::max_string_size), |
| 98 | ErrorCodes::TOO_LARGE_STRING_SIZE); |
| 99 | } |
| 100 | Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needle, vec_res); |
| 101 | } |
| 102 | else if (col_haystack_vector && col_needle_vector) |
| 103 | { |
| 104 | Impl::vector_vector( |
| 105 | col_haystack_vector->getChars(), |
| 106 | col_haystack_vector->getOffsets(), |
| 107 | col_needle_vector->getChars(), |
| 108 | col_needle_vector->getOffsets(), |
| 109 | vec_res); |
| 110 | } |
| 111 | else if (col_haystack_const && col_needle_vector) |
| 112 | { |
| 113 | const String & haystack = col_haystack_const->getValue<String>(); |
| 114 | if (haystack.size() > Impl::max_string_size) |
| 115 | { |
| 116 | throw Exception( |
| 117 | "String size of haystack is too big for function " + getName() + ". Should be at most " |
| 118 | + std::to_string(Impl::max_string_size), |
| 119 | ErrorCodes::TOO_LARGE_STRING_SIZE); |
| 120 | } |
| 121 | Impl::constant_vector(haystack, col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res); |
| 122 | } |
| 123 | else |
| 124 | { |
| 125 | throw Exception( |
| 126 | "Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and " |
| 127 | + block.getByPosition(arguments[1]).column->getName() + " of arguments of function " + getName(), |
| 128 | ErrorCodes::ILLEGAL_COLUMN); |
| 129 | } |
| 130 | |
| 131 | block.getByPosition(result).column = std::move(col_res); |
| 132 | } |
| 133 | }; |
| 134 | |
| 135 | } |
| 136 | |