1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2015 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | |
19 | #include "grn_db.h" |
20 | |
21 | #include <groonga/scorer.h> |
22 | |
23 | #include <math.h> |
24 | |
25 | static double |
26 | scorer_tf_idf(grn_ctx *ctx, grn_scorer_matched_record *record) |
27 | { |
28 | double min_score = 1.0; |
29 | double tf; |
30 | double n_all_documents; |
31 | double n_candidates; |
32 | double n_tokens; |
33 | double n_estimated_match_documents; |
34 | |
35 | tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) + |
36 | grn_scorer_matched_record_get_total_term_weights(ctx, record); |
37 | n_all_documents = grn_scorer_matched_record_get_n_documents(ctx, record); |
38 | n_candidates = grn_scorer_matched_record_get_n_candidates(ctx, record); |
39 | n_tokens = grn_scorer_matched_record_get_n_tokens(ctx, record); |
40 | n_estimated_match_documents = n_candidates / n_tokens; |
41 | |
42 | if (n_estimated_match_documents >= n_all_documents) { |
43 | return min_score; |
44 | } else { |
45 | double idf; |
46 | double tf_idf; |
47 | |
48 | idf = log(n_all_documents / n_estimated_match_documents); |
49 | tf_idf = tf * idf; |
50 | return fmax(tf_idf, min_score); |
51 | } |
52 | } |
53 | |
54 | static double |
55 | scorer_tf_at_most(grn_ctx *ctx, grn_scorer_matched_record *record) |
56 | { |
57 | double tf; |
58 | double max; |
59 | grn_obj *max_raw; |
60 | |
61 | tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) + |
62 | grn_scorer_matched_record_get_total_term_weights(ctx, record); |
63 | max_raw = grn_scorer_matched_record_get_arg(ctx, record, 0); |
64 | |
65 | if (!max_raw) { |
66 | return tf; |
67 | } |
68 | |
69 | if (max_raw->header.type != GRN_BULK) { |
70 | return tf; |
71 | } |
72 | |
73 | if (max_raw->header.domain == GRN_DB_FLOAT) { |
74 | max = GRN_FLOAT_VALUE(max_raw); |
75 | } else { |
76 | grn_obj casted_max_raw; |
77 | GRN_FLOAT_INIT(&casted_max_raw, 0); |
78 | if (grn_obj_cast(ctx, max_raw, &casted_max_raw, GRN_FALSE) != GRN_SUCCESS) { |
79 | GRN_OBJ_FIN(ctx, &casted_max_raw); |
80 | return tf; |
81 | } else { |
82 | max = GRN_FLOAT_VALUE(&casted_max_raw); |
83 | } |
84 | GRN_OBJ_FIN(ctx, &casted_max_raw); |
85 | } |
86 | |
87 | return fmin(tf, max); |
88 | } |
89 | |
90 | grn_rc |
91 | grn_db_init_builtin_scorers(grn_ctx *ctx) |
92 | { |
93 | grn_scorer_register(ctx, "scorer_tf_idf" , -1, scorer_tf_idf); |
94 | grn_scorer_register(ctx, "scorer_tf_at_most" , -1, scorer_tf_at_most); |
95 | return GRN_SUCCESS; |
96 | } |
97 | |