| 1 | /* -*- c-basic-offset: 2 -*- */ |
| 2 | /* |
| 3 | Copyright(C) 2017 Brazil |
| 4 | |
| 5 | This library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License version 2.1 as published by the Free Software Foundation. |
| 8 | |
| 9 | This library is distributed in the hope that it will be useful, |
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 12 | Lesser General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU Lesser General Public |
| 15 | License along with this library; if not, write to the Free Software |
| 16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 17 | */ |
| 18 | |
| 19 | #ifdef GRN_EMBEDDED |
| 20 | # define GRN_PLUGIN_FUNCTION_TAG functions_time |
| 21 | #endif |
| 22 | |
| 23 | #include <groonga/plugin.h> |
| 24 | |
| 25 | static grn_rc |
| 26 | selector_index_column_df_ratio_between(grn_ctx *ctx, |
| 27 | grn_obj *table, |
| 28 | grn_obj *index, |
| 29 | int n_args, |
| 30 | grn_obj **args, |
| 31 | grn_obj *res, |
| 32 | grn_operator op) |
| 33 | { |
| 34 | grn_rc rc = GRN_SUCCESS; |
| 35 | grn_obj *index_column; |
| 36 | grn_ii *ii; |
| 37 | double min; |
| 38 | double max; |
| 39 | grn_obj *source_table; |
| 40 | unsigned int n_documents; |
| 41 | grn_posting posting; |
| 42 | |
| 43 | if ((n_args - 1) != 3) { |
| 44 | GRN_PLUGIN_ERROR(ctx, |
| 45 | GRN_INVALID_ARGUMENT, |
| 46 | "index_column_df_ratio_between(): " |
| 47 | "wrong number of arguments (%d for 3)" , n_args - 1); |
| 48 | rc = ctx->rc; |
| 49 | goto exit; |
| 50 | } |
| 51 | |
| 52 | index_column = args[1]; |
| 53 | ii = (grn_ii *)index_column; |
| 54 | min = GRN_FLOAT_VALUE(args[2]); |
| 55 | max = GRN_FLOAT_VALUE(args[3]); |
| 56 | |
| 57 | source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); |
| 58 | n_documents = grn_table_size(ctx, source_table); |
| 59 | memset(&posting, 0, sizeof(grn_posting)); |
| 60 | posting.sid = 1; |
| 61 | |
| 62 | if (op == GRN_OP_AND) { |
| 63 | GRN_TABLE_EACH_BEGIN(ctx, res, cursor, record_id) { |
| 64 | void *key; |
| 65 | grn_id term_id; |
| 66 | uint32_t n_match_documents; |
| 67 | double df_ratio; |
| 68 | |
| 69 | grn_table_cursor_get_key(ctx, cursor, &key); |
| 70 | term_id = *(grn_id *)key; |
| 71 | n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); |
| 72 | if (n_match_documents > n_documents) { |
| 73 | n_match_documents = n_documents; |
| 74 | } |
| 75 | df_ratio = (double)n_match_documents / (double)n_documents; |
| 76 | if (min <= df_ratio && df_ratio <= max) { |
| 77 | posting.rid = term_id; |
| 78 | grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); |
| 79 | } |
| 80 | } GRN_TABLE_EACH_END(ctx, cursor); |
| 81 | grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op); |
| 82 | } else { |
| 83 | GRN_TABLE_EACH_BEGIN(ctx, table, cursor, term_id) { |
| 84 | uint32_t n_match_documents; |
| 85 | double df_ratio; |
| 86 | |
| 87 | n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); |
| 88 | if (n_match_documents > n_documents) { |
| 89 | n_match_documents = n_documents; |
| 90 | } |
| 91 | df_ratio = (double)n_match_documents / (double)n_documents; |
| 92 | { |
| 93 | void *key; |
| 94 | int key_size; |
| 95 | key_size = grn_table_cursor_get_key(ctx, cursor, &key); |
| 96 | } |
| 97 | if (min <= df_ratio && df_ratio <= max) { |
| 98 | posting.rid = term_id; |
| 99 | grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); |
| 100 | } |
| 101 | } GRN_TABLE_EACH_END(ctx, cursor); |
| 102 | } |
| 103 | |
| 104 | exit : |
| 105 | return rc; |
| 106 | } |
| 107 | |
| 108 | static grn_obj * |
| 109 | func_index_column_df_ratio(grn_ctx *ctx, |
| 110 | int n_args, |
| 111 | grn_obj **args, |
| 112 | grn_user_data *user_data) |
| 113 | { |
| 114 | grn_obj *term_table; |
| 115 | grn_obj *index_column_name; |
| 116 | grn_obj *index_column; |
| 117 | grn_ii *ii; |
| 118 | grn_id term_id; |
| 119 | |
| 120 | if (n_args != 1) { |
| 121 | GRN_PLUGIN_ERROR(ctx, |
| 122 | GRN_INVALID_ARGUMENT, |
| 123 | "index_column_df_ratio(): " |
| 124 | "wrong number of arguments (%d for 1)" , n_args - 1); |
| 125 | return NULL; |
| 126 | } |
| 127 | |
| 128 | { |
| 129 | grn_obj *expr; |
| 130 | grn_obj *variable; |
| 131 | |
| 132 | expr = grn_plugin_proc_get_caller(ctx, user_data); |
| 133 | if (!expr) { |
| 134 | GRN_PLUGIN_ERROR(ctx, |
| 135 | GRN_INVALID_ARGUMENT, |
| 136 | "index_column_df_ratio(): " |
| 137 | "called directly" ); |
| 138 | return NULL; |
| 139 | } |
| 140 | |
| 141 | variable = grn_expr_get_var_by_offset(ctx, expr, 0); |
| 142 | if (!variable) { |
| 143 | GRN_PLUGIN_ERROR(ctx, |
| 144 | GRN_INVALID_ARGUMENT, |
| 145 | "index_column_df_ratio(): " |
| 146 | "caller expression must have target record information" ); |
| 147 | return NULL; |
| 148 | } |
| 149 | |
| 150 | term_table = grn_ctx_at(ctx, variable->header.domain); |
| 151 | term_id = GRN_RECORD_VALUE(variable); |
| 152 | while (GRN_TRUE) { |
| 153 | grn_obj *key_type; |
| 154 | |
| 155 | key_type = grn_ctx_at(ctx, term_table->header.domain); |
| 156 | if (!grn_obj_is_table(ctx, key_type)) { |
| 157 | break; |
| 158 | } |
| 159 | |
| 160 | grn_table_get_key(ctx, term_table, term_id, &term_id, sizeof(grn_id)); |
| 161 | term_table = key_type; |
| 162 | } |
| 163 | } |
| 164 | |
| 165 | index_column_name = args[0]; |
| 166 | if (!grn_obj_is_text_family_bulk(ctx, index_column_name)) { |
| 167 | grn_obj inspected; |
| 168 | GRN_TEXT_INIT(&inspected, 0); |
| 169 | grn_inspect(ctx, &inspected, index_column_name); |
| 170 | GRN_PLUGIN_ERROR(ctx, |
| 171 | GRN_INVALID_ARGUMENT, |
| 172 | "index_column_df_ratio(): " |
| 173 | "the first argument must be index column name: %.*s" , |
| 174 | (int)GRN_TEXT_LEN(&inspected), |
| 175 | GRN_TEXT_VALUE(&inspected)); |
| 176 | GRN_OBJ_FIN(ctx, &inspected); |
| 177 | return NULL; |
| 178 | } |
| 179 | |
| 180 | index_column = grn_obj_column(ctx, |
| 181 | term_table, |
| 182 | GRN_TEXT_VALUE(index_column_name), |
| 183 | GRN_TEXT_LEN(index_column_name)); |
| 184 | if (!index_column) { |
| 185 | GRN_PLUGIN_ERROR(ctx, |
| 186 | GRN_INVALID_ARGUMENT, |
| 187 | "index_column_df_ratio(): " |
| 188 | "nonexistent object: <%.*s>" , |
| 189 | (int)GRN_TEXT_LEN(index_column_name), |
| 190 | GRN_TEXT_VALUE(index_column_name)); |
| 191 | return NULL; |
| 192 | } |
| 193 | |
| 194 | if (!grn_obj_is_index_column(ctx, index_column)) { |
| 195 | grn_obj inspected; |
| 196 | GRN_TEXT_INIT(&inspected, 0); |
| 197 | grn_inspect(ctx, &inspected, index_column); |
| 198 | GRN_PLUGIN_ERROR(ctx, |
| 199 | GRN_INVALID_ARGUMENT, |
| 200 | "index_column_df_ratio(): " |
| 201 | "the first argument must be index column: %.*s" , |
| 202 | (int)GRN_TEXT_LEN(&inspected), |
| 203 | GRN_TEXT_VALUE(&inspected)); |
| 204 | GRN_OBJ_FIN(ctx, &inspected); |
| 205 | if (grn_obj_is_accessor(ctx, index_column)) { |
| 206 | grn_obj_unlink(ctx, index_column); |
| 207 | } |
| 208 | return NULL; |
| 209 | } |
| 210 | |
| 211 | ii = (grn_ii *)index_column; |
| 212 | |
| 213 | { |
| 214 | grn_obj *source_table; |
| 215 | unsigned int n_documents; |
| 216 | uint32_t n_match_documents; |
| 217 | double df_ratio; |
| 218 | grn_obj *df_ratio_value; |
| 219 | |
| 220 | source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); |
| 221 | n_documents = grn_table_size(ctx, source_table); |
| 222 | n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); |
| 223 | if (n_match_documents > n_documents) { |
| 224 | n_match_documents = n_documents; |
| 225 | } |
| 226 | df_ratio = (double)n_match_documents / (double)n_documents; |
| 227 | |
| 228 | df_ratio_value = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_FLOAT, 0); |
| 229 | if (!df_ratio_value) { |
| 230 | return NULL; |
| 231 | } |
| 232 | GRN_FLOAT_SET(ctx, df_ratio_value, df_ratio); |
| 233 | return df_ratio_value; |
| 234 | } |
| 235 | } |
| 236 | |
| 237 | grn_rc |
| 238 | GRN_PLUGIN_INIT(grn_ctx *ctx) |
| 239 | { |
| 240 | return ctx->rc; |
| 241 | } |
| 242 | |
| 243 | grn_rc |
| 244 | GRN_PLUGIN_REGISTER(grn_ctx *ctx) |
| 245 | { |
| 246 | grn_obj *selector_proc; |
| 247 | |
| 248 | selector_proc = grn_proc_create(ctx, "index_column_df_ratio_between" , -1, |
| 249 | GRN_PROC_FUNCTION, |
| 250 | NULL, NULL, NULL, 0, NULL); |
| 251 | grn_proc_set_selector(ctx, selector_proc, |
| 252 | selector_index_column_df_ratio_between); |
| 253 | grn_proc_set_selector_operator(ctx, selector_proc, GRN_OP_NOP); |
| 254 | |
| 255 | grn_proc_create(ctx, "index_column_df_ratio" , -1, |
| 256 | GRN_PROC_FUNCTION, |
| 257 | func_index_column_df_ratio, NULL, NULL, 0, NULL); |
| 258 | |
| 259 | return ctx->rc; |
| 260 | } |
| 261 | |
| 262 | grn_rc |
| 263 | GRN_PLUGIN_FIN(grn_ctx *ctx) |
| 264 | { |
| 265 | return GRN_SUCCESS; |
| 266 | } |
| 267 | |