1/* SPDX-License-Identifier: MIT */
2/* Copyright © 2022 Max Bachmann */
3
4#pragma once
5#include "details/common.hpp"
6#include "details/jaro_impl.hpp"
7
8#include <stdexcept>
9
10namespace duckdb_jaro_winkler {
11
12/**
13 * @defgroup jaro_winkler jaro_winkler
14 * @{
15 */
16
17/**
18 * @brief Calculates the jaro winkler similarity
19 *
20 * @tparam Sentence1 This is a string that can be converted to
21 * basic_string_view<char_type>
22 * @tparam Sentence2 This is a string that can be converted to
23 * basic_string_view<char_type>
24 *
25 * @param s1
26 * string to compare with s2 (for type info check Template parameters above)
27 * @param s2
28 * string to compare with s1 (for type info check Template parameters above)
29 * @param prefix_weight
30 * Weight used for the common prefix of the two strings.
31 * Has to be between 0 and 0.25. Default is 0.1.
32 * @param score_cutoff
33 * Optional argument for a score threshold as a float between 0 and 100.
34 * For similarity < score_cutoff 0 is returned instead. Default is 0,
35 * which deactivates this behaviour.
36 *
37 * @return jaro winkler similarity between s1 and s2
38 * as a float between 0 and 100
39 */
40template <typename InputIt1, typename InputIt2>
41typename std::enable_if<
42 common::is_iterator<InputIt1>::value && common::is_iterator<InputIt2>::value, double>::type
43jaro_winkler_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
44 double prefix_weight = 0.1, double score_cutoff = 0.0)
45{
46 if (prefix_weight < 0.0 || prefix_weight > 0.25) {
47 throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
48 }
49
50 return detail::jaro_winkler_similarity(first1, last1, first2, last2, prefix_weight,
51 score_cutoff);
52}
53
/**
 * @brief Range overload of jaro_winkler_similarity
 *
 * Takes the iterator pairs of s1 and s2 via std::begin / std::end and passes
 * them, together with the unchanged prefix_weight and score_cutoff, to the
 * iterator based jaro_winkler_similarity overload.
 *
 * @param s1 first sequence; must be usable with std::begin and std::end
 * @param s2 second sequence; must be usable with std::begin and std::end
 * @param prefix_weight weight of the common prefix. Default is 0.1.
 * @param score_cutoff score threshold. Default is 0.
 *
 * @return the value returned by the iterator based overload
 */
template <typename S1, typename S2>
double jaro_winkler_similarity(const S1& s1, const S2& s2, double prefix_weight = 0.1,
                               double score_cutoff = 0.0)
{
    auto begin1 = std::begin(s1);
    auto end1 = std::end(s1);
    auto begin2 = std::begin(s2);
    auto end2 = std::end(s2);

    return jaro_winkler_similarity(begin1, end1, begin2, end2, prefix_weight, score_cutoff);
}
61
62template <typename CharT1>
63struct CachedJaroWinklerSimilarity {
64 template <typename InputIt1>
65 CachedJaroWinklerSimilarity(InputIt1 first1, InputIt1 last1, double prefix_weight_ = 0.1)
66 : s1(first1, last1), PM(first1, last1), prefix_weight(prefix_weight_)
67 {
68 if (prefix_weight < 0.0 || prefix_weight > 0.25) {
69 throw std::invalid_argument("prefix_weight has to be between 0.0 and 0.25");
70 }
71 }
72
73 template <typename S1>
74 CachedJaroWinklerSimilarity(const S1& s1_, double prefix_weight_ = 0.1)
75 : CachedJaroWinklerSimilarity(std::begin(s1_), std::end(s1_), prefix_weight_)
76 {}
77
78 template <typename InputIt2>
79 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
80 {
81 return detail::jaro_winkler_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
82 prefix_weight, score_cutoff);
83 }
84
85 template <typename S2>
86 double similarity(const S2& s2, double score_cutoff = 0) const
87 {
88 return similarity(std::begin(s2), std::end(s2), score_cutoff);
89 }
90
91 template <typename InputIt2>
92 double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
93 {
94 return similarity(first2, last2, score_cutoff);
95 }
96
97 template <typename S2>
98 double normalized_similarity(const S2& s2, double score_cutoff = 0) const
99 {
100 return similarity(s2, score_cutoff);
101 }
102
103private:
104 std::basic_string<CharT1> s1;
105 common::BlockPatternMatchVector PM;
106
107 double prefix_weight;
108};
109
110/**
111 * @brief Calculates the jaro similarity
112 *
113 * @tparam Sentence1 This is a string that can be converted to
114 * basic_string_view<char_type>
115 * @tparam Sentence2 This is a string that can be converted to
116 * basic_string_view<char_type>
117 *
118 * @param s1
119 * string to compare with s2 (for type info check Template parameters above)
120 * @param s2
121 * string to compare with s1 (for type info check Template parameters above)
122 * @param score_cutoff
123 * Optional argument for a score threshold as a float between 0 and 100.
124 * For similarity < score_cutoff 0 is returned instead. Default is 0,
125 * which deactivates this behaviour.
126 *
127 * @return jaro similarity between s1 and s2
128 * as a float between 0 and 100
129 */
130template <typename InputIt1, typename InputIt2>
131double jaro_similarity(InputIt1 first1, InputIt1 last1, InputIt2 first2, InputIt2 last2,
132 double score_cutoff = 0.0)
133{
134 return detail::jaro_similarity(first1, last1, first2, last2, score_cutoff);
135}
136
/**
 * @brief Range overload of jaro_similarity
 *
 * Takes the iterator pairs of s1 and s2 via std::begin / std::end and passes
 * them, together with the unchanged score_cutoff, to the iterator based
 * jaro_similarity overload.
 *
 * @param s1 first sequence; must be usable with std::begin and std::end
 * @param s2 second sequence; must be usable with std::begin and std::end
 * @param score_cutoff score threshold. Default is 0.
 *
 * @return the value returned by the iterator based overload
 */
template <typename S1, typename S2>
double jaro_similarity(const S1& s1, const S2& s2, double score_cutoff = 0.0)
{
    auto begin1 = std::begin(s1);
    auto end1 = std::end(s1);
    auto begin2 = std::begin(s2);
    auto end2 = std::end(s2);

    return jaro_similarity(begin1, end1, begin2, end2, score_cutoff);
}
143
144template <typename CharT1>
145struct CachedJaroSimilarity {
146 template <typename InputIt1>
147 CachedJaroSimilarity(InputIt1 first1, InputIt1 last1) : s1(first1, last1), PM(first1, last1)
148 {}
149
150 template <typename S1>
151 CachedJaroSimilarity(const S1& s1_) : CachedJaroSimilarity(std::begin(s1_), std::end(s1_))
152 {}
153
154 template <typename InputIt2>
155 double similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
156 {
157 return detail::jaro_similarity(PM, std::begin(s1), std::end(s1), first2, last2,
158 score_cutoff);
159 }
160
161 template <typename S2>
162 double similarity(const S2& s2, double score_cutoff = 0) const
163 {
164 return similarity(std::begin(s2), std::end(s2), score_cutoff);
165 }
166
167 template <typename InputIt2>
168 double normalized_similarity(InputIt2 first2, InputIt2 last2, double score_cutoff = 0) const
169 {
170 return similarity(first2, last2, score_cutoff);
171 }
172
173 template <typename S2>
174 double normalized_similarity(const S2& s2, double score_cutoff = 0) const
175 {
176 return similarity(s2, score_cutoff);
177 }
178
179private:
180 std::basic_string<CharT1> s1;
181 common::BlockPatternMatchVector PM;
182};
183
184/**@}*/
185
186} // namespace duckdb_jaro_winkler
187