1 | /* SPDX-License-Identifier: MIT */ |
2 | /* Copyright © 2022 Max Bachmann */ |
3 | |
4 | #pragma once |
5 | #include "details/common.hpp" |
6 | #include "details/jaro_impl.hpp" |
7 | |
8 | #include <stdexcept> |
9 | |
10 | namespace duckdb_jaro_winkler { |
11 | |
12 | /** |
13 | * @defgroup jaro_winkler jaro_winkler |
14 | * @{ |
15 | */ |
16 | |
17 | /** |
18 | * @brief Calculates the jaro winkler similarity |
19 | * |
20 | * @tparam Sentence1 This is a string that can be converted to |
21 | * basic_string_view<char_type> |
22 | * @tparam Sentence2 This is a string that can be converted to |
23 | * basic_string_view<char_type> |
24 | * |
25 | * @param s1 |
26 | * string to compare with s2 (for type info check Template parameters above) |
27 | * @param s2 |
28 | * string to compare with s1 (for type info check Template parameters above) |
29 | * @param prefix_weight |
30 | * Weight used for the common prefix of the two strings. |
31 | * Has to be between 0 and 0.25. Default is 0.1. |
32 | * @param score_cutoff |
33 | * Optional argument for a score threshold as a float between 0 and 100. |
34 | * For similarity < score_cutoff 0 is returned instead. Default is 0, |
35 | * which deactivates this behaviour. |
36 | * |
37 | * @return jaro winkler similarity between s1 and s2 |
38 | * as a float between 0 and 100 |
39 | */ |
40 | template <typename InputIt1, typename InputIt2> |
41 | typename std::enable_if< |
42 | common::is_iterator<InputIt1>::value && common::is_iterator<InputIt2>::value, double>::type |
43 | jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, |
44 | double prefix_weight = 0.1, double score_cutoff = 0.0) |
45 | { |
46 | if (prefix_weight < 0.0 || prefix_weight > 0.25) { |
47 | throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25" ); |
48 | } |
49 | |
50 | return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight, |
51 | score_cutoff); |
52 | } |
53 | |
/**
 * @brief Calculates the Jaro-Winkler similarity of two ranges.
 *
 * Convenience overload: obtains the bounds of both arguments through
 * std::begin / std::end and forwards them, together with the remaining
 * arguments, to the iterator based jaro_winkler_similarity overload.
 *
 * @tparam S1 Type of the first range; must be usable with std::begin/std::end
 * @tparam S2 Type of the second range; must be usable with std::begin/std::end
 *
 * @param s1 range to compare with s2
 * @param s2 range to compare with s1
 * @param prefix_weight weight of the common prefix, passed through unchanged.
 *   Default is 0.1.
 * @param score_cutoff score threshold, passed through unchanged. Default is 0.
 *
 * @return whatever the iterator based overload returns for the two ranges
 */
template <typename S1, typename S2>
double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1,
                               double score_cutoff = 0.0)
{
    auto lhs_first = std::begin(s1);
    auto lhs_last = std::end(s1);
    auto rhs_first = std::begin(s2);
    auto rhs_last = std::end(s2);

    return jaro_winkler_similarity(lhs_first, lhs_last, rhs_first, rhs_last, prefix_weight,
                                   score_cutoff);
}
61 | |
62 | template <typename CharT1> |
63 | struct CachedJaroWinklerSimilarity { |
64 | template <typename InputIt1> |
65 | CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1) |
66 | : s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_) |
67 | { |
68 | if (prefix_weight < 0.0 || prefix_weight > 0.25) { |
69 | throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25" ); |
70 | } |
71 | } |
72 | |
73 | template <typename S1> |
74 | CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1) |
75 | : CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_) |
76 | {} |
77 | |
78 | template <typename InputIt2> |
79 | double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
80 | { |
81 | return detail::jaro_winkler_similarity(PM, std::begin(s1), std::end(s1), first2, last2, |
82 | prefix_weight, score_cutoff); |
83 | } |
84 | |
85 | template <typename S2> |
86 | double similarity(const S2& s2, double score_cutoff = 0) const |
87 | { |
88 | return similarity(std::begin(s2), std::end(s2), score_cutoff); |
89 | } |
90 | |
91 | template <typename InputIt2> |
92 | double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
93 | { |
94 | return similarity(first2, last2, score_cutoff); |
95 | } |
96 | |
97 | template <typename S2> |
98 | double normalized_similarity(const S2& s2, double score_cutoff = 0) const |
99 | { |
100 | return similarity(s2, score_cutoff); |
101 | } |
102 | |
103 | private: |
104 | std::basic_string<CharT1> s1; |
105 | common::BlockPatternMatchVector PM; |
106 | |
107 | double prefix_weight; |
108 | }; |
109 | |
110 | /** |
111 | * @brief Calculates the jaro similarity |
112 | * |
113 | * @tparam Sentence1 This is a string that can be converted to |
114 | * basic_string_view<char_type> |
115 | * @tparam Sentence2 This is a string that can be converted to |
116 | * basic_string_view<char_type> |
117 | * |
118 | * @param s1 |
119 | * string to compare with s2 (for type info check Template parameters above) |
120 | * @param s2 |
121 | * string to compare with s1 (for type info check Template parameters above) |
122 | * @param score_cutoff |
123 | * Optional argument for a score threshold as a float between 0 and 100. |
124 | * For similarity < score_cutoff 0 is returned instead. Default is 0, |
125 | * which deactivates this behaviour. |
126 | * |
127 | * @return jaro similarity between s1 and s2 |
128 | * as a float between 0 and 100 |
129 | */ |
130 | template <typename InputIt1, typename InputIt2> |
131 | double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2, |
132 | double score_cutoff = 0.0) |
133 | { |
134 | return detail::jaro_similarity(first1, last1, first2, last2, score_cutoff); |
135 | } |
136 | |
/**
 * @brief Calculates the Jaro similarity of two ranges.
 *
 * Convenience overload: obtains the bounds of both arguments through
 * std::begin / std::end and forwards them to the iterator based
 * jaro_similarity overload.
 *
 * @tparam S1 Type of the first range; must be usable with std::begin/std::end
 * @tparam S2 Type of the second range; must be usable with std::begin/std::end
 *
 * @param s1 range to compare with s2
 * @param s2 range to compare with s1
 * @param score_cutoff score threshold, passed through unchanged. Default is 0.
 *
 * @return whatever the iterator based overload returns for the two ranges
 */
template <typename S1, typename S2>
double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0)
{
    auto lhs_first = std::begin(s1);
    auto lhs_last = std::end(s1);
    auto rhs_first = std::begin(s2);
    auto rhs_last = std::end(s2);

    return jaro_similarity(lhs_first, lhs_last, rhs_first, rhs_last, score_cutoff);
}
143 | |
144 | template <typename CharT1> |
145 | struct CachedJaroSimilarity { |
146 | template <typename InputIt1> |
147 | CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1) |
148 | {} |
149 | |
150 | template <typename S1> |
151 | CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_)) |
152 | {} |
153 | |
154 | template <typename InputIt2> |
155 | double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
156 | { |
157 | return detail::jaro_similarity(PM, std::begin(s1), std::end(s1), first2, last2, |
158 | score_cutoff); |
159 | } |
160 | |
161 | template <typename S2> |
162 | double similarity(const S2& s2, double score_cutoff = 0) const |
163 | { |
164 | return similarity(std::begin(s2), std::end(s2), score_cutoff); |
165 | } |
166 | |
167 | template <typename InputIt2> |
168 | double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const |
169 | { |
170 | return similarity(first2, last2, score_cutoff); |
171 | } |
172 | |
173 | template <typename S2> |
174 | double normalized_similarity(const S2& s2, double score_cutoff = 0) const |
175 | { |
176 | return similarity(s2, score_cutoff); |
177 | } |
178 | |
179 | private: |
180 | std::basic_string<CharT1> s1; |
181 | common::BlockPatternMatchVector PM; |
182 | }; |
183 | |
184 | /**@}*/ |
185 | |
186 | } // namespace duckdb_jaro_winkler |
187 | |