1/* -*- c-basic-offset: 2 -*- */
2/*
3 Copyright(C) 2015 Brazil
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License version 2.1 as published by the Free Software Foundation.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18
19#include "grn_db.h"
20
21#include <groonga/scorer.h>
22
23#include <math.h>
24
25static double
26scorer_tf_idf(grn_ctx *ctx, grn_scorer_matched_record *record)
27{
28 double min_score = 1.0;
29 double tf;
30 double n_all_documents;
31 double n_candidates;
32 double n_tokens;
33 double n_estimated_match_documents;
34
35 tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) +
36 grn_scorer_matched_record_get_total_term_weights(ctx, record);
37 n_all_documents = grn_scorer_matched_record_get_n_documents(ctx, record);
38 n_candidates = grn_scorer_matched_record_get_n_candidates(ctx, record);
39 n_tokens = grn_scorer_matched_record_get_n_tokens(ctx, record);
40 n_estimated_match_documents = n_candidates / n_tokens;
41
42 if (n_estimated_match_documents >= n_all_documents) {
43 return min_score;
44 } else {
45 double idf;
46 double tf_idf;
47
48 idf = log(n_all_documents / n_estimated_match_documents);
49 tf_idf = tf * idf;
50 return fmax(tf_idf, min_score);
51 }
52}
53
54static double
55scorer_tf_at_most(grn_ctx *ctx, grn_scorer_matched_record *record)
56{
57 double tf;
58 double max;
59 grn_obj *max_raw;
60
61 tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) +
62 grn_scorer_matched_record_get_total_term_weights(ctx, record);
63 max_raw = grn_scorer_matched_record_get_arg(ctx, record, 0);
64
65 if (!max_raw) {
66 return tf;
67 }
68
69 if (max_raw->header.type != GRN_BULK) {
70 return tf;
71 }
72
73 if (max_raw->header.domain == GRN_DB_FLOAT) {
74 max = GRN_FLOAT_VALUE(max_raw);
75 } else {
76 grn_obj casted_max_raw;
77 GRN_FLOAT_INIT(&casted_max_raw, 0);
78 if (grn_obj_cast(ctx, max_raw, &casted_max_raw, GRN_FALSE) != GRN_SUCCESS) {
79 GRN_OBJ_FIN(ctx, &casted_max_raw);
80 return tf;
81 } else {
82 max = GRN_FLOAT_VALUE(&casted_max_raw);
83 }
84 GRN_OBJ_FIN(ctx, &casted_max_raw);
85 }
86
87 return fmin(tf, max);
88}
89
90grn_rc
91grn_db_init_builtin_scorers(grn_ctx *ctx)
92{
93 grn_scorer_register(ctx, "scorer_tf_idf", -1, scorer_tf_idf);
94 grn_scorer_register(ctx, "scorer_tf_at_most", -1, scorer_tf_at_most);
95 return GRN_SUCCESS;
96}
97