| 1 | /* -*- c-basic-offset: 2 -*- */ |
| 2 | /* |
| 3 | Copyright(C) 2015 Brazil |
| 4 | |
| 5 | This library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License version 2.1 as published by the Free Software Foundation. |
| 8 | |
| 9 | This library is distributed in the hope that it will be useful, |
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | Lesser General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU Lesser General Public |
| 15 | License along with this library; if not, write to the Free Software |
| 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 17 | */ |
| 18 | |
| 19 | #include "grn_db.h" |
| 20 | |
| 21 | #include <groonga/scorer.h> |
| 22 | |
| 23 | #include <math.h> |
| 24 | |
| 25 | static double |
| 26 | scorer_tf_idf(grn_ctx *ctx, grn_scorer_matched_record *record) |
| 27 | { |
| 28 | double min_score = 1.0; |
| 29 | double tf; |
| 30 | double n_all_documents; |
| 31 | double n_candidates; |
| 32 | double n_tokens; |
| 33 | double n_estimated_match_documents; |
| 34 | |
| 35 | tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) + |
| 36 | grn_scorer_matched_record_get_total_term_weights(ctx, record); |
| 37 | n_all_documents = grn_scorer_matched_record_get_n_documents(ctx, record); |
| 38 | n_candidates = grn_scorer_matched_record_get_n_candidates(ctx, record); |
| 39 | n_tokens = grn_scorer_matched_record_get_n_tokens(ctx, record); |
| 40 | n_estimated_match_documents = n_candidates / n_tokens; |
| 41 | |
| 42 | if (n_estimated_match_documents >= n_all_documents) { |
| 43 | return min_score; |
| 44 | } else { |
| 45 | double idf; |
| 46 | double tf_idf; |
| 47 | |
| 48 | idf = log(n_all_documents / n_estimated_match_documents); |
| 49 | tf_idf = tf * idf; |
| 50 | return fmax(tf_idf, min_score); |
| 51 | } |
| 52 | } |
| 53 | |
| 54 | static double |
| 55 | scorer_tf_at_most(grn_ctx *ctx, grn_scorer_matched_record *record) |
| 56 | { |
| 57 | double tf; |
| 58 | double max; |
| 59 | grn_obj *max_raw; |
| 60 | |
| 61 | tf = grn_scorer_matched_record_get_n_occurrences(ctx, record) + |
| 62 | grn_scorer_matched_record_get_total_term_weights(ctx, record); |
| 63 | max_raw = grn_scorer_matched_record_get_arg(ctx, record, 0); |
| 64 | |
| 65 | if (!max_raw) { |
| 66 | return tf; |
| 67 | } |
| 68 | |
| 69 | if (max_raw->header.type != GRN_BULK) { |
| 70 | return tf; |
| 71 | } |
| 72 | |
| 73 | if (max_raw->header.domain == GRN_DB_FLOAT) { |
| 74 | max = GRN_FLOAT_VALUE(max_raw); |
| 75 | } else { |
| 76 | grn_obj casted_max_raw; |
| 77 | GRN_FLOAT_INIT(&casted_max_raw, 0); |
| 78 | if (grn_obj_cast(ctx, max_raw, &casted_max_raw, GRN_FALSE) != GRN_SUCCESS) { |
| 79 | GRN_OBJ_FIN(ctx, &casted_max_raw); |
| 80 | return tf; |
| 81 | } else { |
| 82 | max = GRN_FLOAT_VALUE(&casted_max_raw); |
| 83 | } |
| 84 | GRN_OBJ_FIN(ctx, &casted_max_raw); |
| 85 | } |
| 86 | |
| 87 | return fmin(tf, max); |
| 88 | } |
| 89 | |
| 90 | grn_rc |
| 91 | grn_db_init_builtin_scorers(grn_ctx *ctx) |
| 92 | { |
| 93 | grn_scorer_register(ctx, "scorer_tf_idf" , -1, scorer_tf_idf); |
| 94 | grn_scorer_register(ctx, "scorer_tf_at_most" , -1, scorer_tf_at_most); |
| 95 | return GRN_SUCCESS; |
| 96 | } |
| 97 | |