1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2017 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | |
19 | #ifdef GRN_EMBEDDED |
20 | # define GRN_PLUGIN_FUNCTION_TAG functions_time |
21 | #endif |
22 | |
23 | #include <groonga/plugin.h> |
24 | |
25 | static grn_rc |
26 | selector_index_column_df_ratio_between(grn_ctx *ctx, |
27 | grn_obj *table, |
28 | grn_obj *index, |
29 | int n_args, |
30 | grn_obj **args, |
31 | grn_obj *res, |
32 | grn_operator op) |
33 | { |
34 | grn_rc rc = GRN_SUCCESS; |
35 | grn_obj *index_column; |
36 | grn_ii *ii; |
37 | double min; |
38 | double max; |
39 | grn_obj *source_table; |
40 | unsigned int n_documents; |
41 | grn_posting posting; |
42 | |
43 | if ((n_args - 1) != 3) { |
44 | GRN_PLUGIN_ERROR(ctx, |
45 | GRN_INVALID_ARGUMENT, |
46 | "index_column_df_ratio_between(): " |
47 | "wrong number of arguments (%d for 3)" , n_args - 1); |
48 | rc = ctx->rc; |
49 | goto exit; |
50 | } |
51 | |
52 | index_column = args[1]; |
53 | ii = (grn_ii *)index_column; |
54 | min = GRN_FLOAT_VALUE(args[2]); |
55 | max = GRN_FLOAT_VALUE(args[3]); |
56 | |
57 | source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); |
58 | n_documents = grn_table_size(ctx, source_table); |
59 | memset(&posting, 0, sizeof(grn_posting)); |
60 | posting.sid = 1; |
61 | |
62 | if (op == GRN_OP_AND) { |
63 | GRN_TABLE_EACH_BEGIN(ctx, res, cursor, record_id) { |
64 | void *key; |
65 | grn_id term_id; |
66 | uint32_t n_match_documents; |
67 | double df_ratio; |
68 | |
69 | grn_table_cursor_get_key(ctx, cursor, &key); |
70 | term_id = *(grn_id *)key; |
71 | n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); |
72 | if (n_match_documents > n_documents) { |
73 | n_match_documents = n_documents; |
74 | } |
75 | df_ratio = (double)n_match_documents / (double)n_documents; |
76 | if (min <= df_ratio && df_ratio <= max) { |
77 | posting.rid = term_id; |
78 | grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); |
79 | } |
80 | } GRN_TABLE_EACH_END(ctx, cursor); |
81 | grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op); |
82 | } else { |
83 | GRN_TABLE_EACH_BEGIN(ctx, table, cursor, term_id) { |
84 | uint32_t n_match_documents; |
85 | double df_ratio; |
86 | |
87 | n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); |
88 | if (n_match_documents > n_documents) { |
89 | n_match_documents = n_documents; |
90 | } |
91 | df_ratio = (double)n_match_documents / (double)n_documents; |
92 | { |
93 | void *key; |
94 | int key_size; |
95 | key_size = grn_table_cursor_get_key(ctx, cursor, &key); |
96 | } |
97 | if (min <= df_ratio && df_ratio <= max) { |
98 | posting.rid = term_id; |
99 | grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); |
100 | } |
101 | } GRN_TABLE_EACH_END(ctx, cursor); |
102 | } |
103 | |
104 | exit : |
105 | return rc; |
106 | } |
107 | |
108 | static grn_obj * |
109 | func_index_column_df_ratio(grn_ctx *ctx, |
110 | int n_args, |
111 | grn_obj **args, |
112 | grn_user_data *user_data) |
113 | { |
114 | grn_obj *term_table; |
115 | grn_obj *index_column_name; |
116 | grn_obj *index_column; |
117 | grn_ii *ii; |
118 | grn_id term_id; |
119 | |
120 | if (n_args != 1) { |
121 | GRN_PLUGIN_ERROR(ctx, |
122 | GRN_INVALID_ARGUMENT, |
123 | "index_column_df_ratio(): " |
124 | "wrong number of arguments (%d for 1)" , n_args - 1); |
125 | return NULL; |
126 | } |
127 | |
128 | { |
129 | grn_obj *expr; |
130 | grn_obj *variable; |
131 | |
132 | expr = grn_plugin_proc_get_caller(ctx, user_data); |
133 | if (!expr) { |
134 | GRN_PLUGIN_ERROR(ctx, |
135 | GRN_INVALID_ARGUMENT, |
136 | "index_column_df_ratio(): " |
137 | "called directly" ); |
138 | return NULL; |
139 | } |
140 | |
141 | variable = grn_expr_get_var_by_offset(ctx, expr, 0); |
142 | if (!variable) { |
143 | GRN_PLUGIN_ERROR(ctx, |
144 | GRN_INVALID_ARGUMENT, |
145 | "index_column_df_ratio(): " |
146 | "caller expression must have target record information" ); |
147 | return NULL; |
148 | } |
149 | |
150 | term_table = grn_ctx_at(ctx, variable->header.domain); |
151 | term_id = GRN_RECORD_VALUE(variable); |
152 | while (GRN_TRUE) { |
153 | grn_obj *key_type; |
154 | |
155 | key_type = grn_ctx_at(ctx, term_table->header.domain); |
156 | if (!grn_obj_is_table(ctx, key_type)) { |
157 | break; |
158 | } |
159 | |
160 | grn_table_get_key(ctx, term_table, term_id, &term_id, sizeof(grn_id)); |
161 | term_table = key_type; |
162 | } |
163 | } |
164 | |
165 | index_column_name = args[0]; |
166 | if (!grn_obj_is_text_family_bulk(ctx, index_column_name)) { |
167 | grn_obj inspected; |
168 | GRN_TEXT_INIT(&inspected, 0); |
169 | grn_inspect(ctx, &inspected, index_column_name); |
170 | GRN_PLUGIN_ERROR(ctx, |
171 | GRN_INVALID_ARGUMENT, |
172 | "index_column_df_ratio(): " |
173 | "the first argument must be index column name: %.*s" , |
174 | (int)GRN_TEXT_LEN(&inspected), |
175 | GRN_TEXT_VALUE(&inspected)); |
176 | GRN_OBJ_FIN(ctx, &inspected); |
177 | return NULL; |
178 | } |
179 | |
180 | index_column = grn_obj_column(ctx, |
181 | term_table, |
182 | GRN_TEXT_VALUE(index_column_name), |
183 | GRN_TEXT_LEN(index_column_name)); |
184 | if (!index_column) { |
185 | GRN_PLUGIN_ERROR(ctx, |
186 | GRN_INVALID_ARGUMENT, |
187 | "index_column_df_ratio(): " |
188 | "nonexistent object: <%.*s>" , |
189 | (int)GRN_TEXT_LEN(index_column_name), |
190 | GRN_TEXT_VALUE(index_column_name)); |
191 | return NULL; |
192 | } |
193 | |
194 | if (!grn_obj_is_index_column(ctx, index_column)) { |
195 | grn_obj inspected; |
196 | GRN_TEXT_INIT(&inspected, 0); |
197 | grn_inspect(ctx, &inspected, index_column); |
198 | GRN_PLUGIN_ERROR(ctx, |
199 | GRN_INVALID_ARGUMENT, |
200 | "index_column_df_ratio(): " |
201 | "the first argument must be index column: %.*s" , |
202 | (int)GRN_TEXT_LEN(&inspected), |
203 | GRN_TEXT_VALUE(&inspected)); |
204 | GRN_OBJ_FIN(ctx, &inspected); |
205 | if (grn_obj_is_accessor(ctx, index_column)) { |
206 | grn_obj_unlink(ctx, index_column); |
207 | } |
208 | return NULL; |
209 | } |
210 | |
211 | ii = (grn_ii *)index_column; |
212 | |
213 | { |
214 | grn_obj *source_table; |
215 | unsigned int n_documents; |
216 | uint32_t n_match_documents; |
217 | double df_ratio; |
218 | grn_obj *df_ratio_value; |
219 | |
220 | source_table = grn_ctx_at(ctx, grn_obj_get_range(ctx, index_column)); |
221 | n_documents = grn_table_size(ctx, source_table); |
222 | n_match_documents = grn_ii_estimate_size(ctx, ii, term_id); |
223 | if (n_match_documents > n_documents) { |
224 | n_match_documents = n_documents; |
225 | } |
226 | df_ratio = (double)n_match_documents / (double)n_documents; |
227 | |
228 | df_ratio_value = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_FLOAT, 0); |
229 | if (!df_ratio_value) { |
230 | return NULL; |
231 | } |
232 | GRN_FLOAT_SET(ctx, df_ratio_value, df_ratio); |
233 | return df_ratio_value; |
234 | } |
235 | } |
236 | |
237 | grn_rc |
238 | GRN_PLUGIN_INIT(grn_ctx *ctx) |
239 | { |
240 | return ctx->rc; |
241 | } |
242 | |
243 | grn_rc |
244 | GRN_PLUGIN_REGISTER(grn_ctx *ctx) |
245 | { |
246 | grn_obj *selector_proc; |
247 | |
248 | selector_proc = grn_proc_create(ctx, "index_column_df_ratio_between" , -1, |
249 | GRN_PROC_FUNCTION, |
250 | NULL, NULL, NULL, 0, NULL); |
251 | grn_proc_set_selector(ctx, selector_proc, |
252 | selector_index_column_df_ratio_between); |
253 | grn_proc_set_selector_operator(ctx, selector_proc, GRN_OP_NOP); |
254 | |
255 | grn_proc_create(ctx, "index_column_df_ratio" , -1, |
256 | GRN_PROC_FUNCTION, |
257 | func_index_column_df_ratio, NULL, NULL, 0, NULL); |
258 | |
259 | return ctx->rc; |
260 | } |
261 | |
262 | grn_rc |
263 | GRN_PLUGIN_FIN(grn_ctx *ctx) |
264 | { |
265 | return GRN_SUCCESS; |
266 | } |
267 | |