| 1 | /* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */ |
| 2 | /* Copyright(C) 2010-2014 Brazil |
| 3 | |
| 4 | This library is free software; you can redistribute it and/or |
| 5 | modify it under the terms of the GNU Lesser General Public |
| 6 | License version 2.1 as published by the Free Software Foundation. |
| 7 | |
| 8 | This library is distributed in the hope that it will be useful, |
| 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 11 | Lesser General Public License for more details. |
| 12 | |
| 13 | You should have received a copy of the GNU Lesser General Public |
| 14 | License along with this library; if not, write to the Free Software |
| 15 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 16 | */ |
| 17 | |
| 18 | #ifdef GRN_EMBEDDED |
| 19 | # define GRN_PLUGIN_FUNCTION_TAG suggest_suggest |
| 20 | #endif |
| 21 | |
| 22 | #include <string.h> |
| 23 | |
| 24 | #include "grn_ctx.h" |
| 25 | #include "grn_db.h" |
| 26 | #include "grn_ii.h" |
| 27 | #include "grn_token_cursor.h" |
| 28 | #include "grn_output.h" |
| 29 | #include <groonga/plugin.h> |
| 30 | |
| 31 | #define VAR GRN_PROC_GET_VAR_BY_OFFSET |
| 32 | #define CONST_STR_LEN(x) x, x ? sizeof(x) - 1 : 0 |
| 33 | #define TEXT_VALUE_LEN(x) GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x) |
| 34 | |
| 35 | #define MIN_LEARN_DISTANCE (60 * GRN_TIME_USEC_PER_SEC) |
| 36 | |
| 37 | #define COMPLETE 1 |
| 38 | #define CORRECT 2 |
| 39 | #define SUGGEST 4 |
| 40 | |
| 41 | typedef enum { |
| 42 | GRN_SUGGEST_SEARCH_YES, |
| 43 | GRN_SUGGEST_SEARCH_NO, |
| 44 | GRN_SUGGEST_SEARCH_AUTO |
| 45 | } grn_suggest_search_mode; |
| 46 | |
| 47 | typedef struct { |
| 48 | grn_obj *post_event; |
| 49 | grn_obj *post_type; |
| 50 | grn_obj *post_item; |
| 51 | grn_obj *seq; |
| 52 | grn_obj *post_time; |
| 53 | grn_obj *pairs; |
| 54 | |
| 55 | int learn_distance_in_seconds; |
| 56 | |
| 57 | grn_id post_event_id; |
| 58 | grn_id post_type_id; |
| 59 | grn_id post_item_id; |
| 60 | grn_id seq_id; |
| 61 | int64_t post_time_value; |
| 62 | |
| 63 | grn_obj *seqs; |
| 64 | grn_obj *seqs_events; |
| 65 | grn_obj *events; |
| 66 | grn_obj *events_item; |
| 67 | grn_obj *events_type; |
| 68 | grn_obj *events_time; |
| 69 | grn_obj *event_types; |
| 70 | grn_obj *items; |
| 71 | grn_obj *items_freq; |
| 72 | grn_obj *items_freq2; |
| 73 | grn_obj *items_last; |
| 74 | grn_obj *pairs_pre; |
| 75 | grn_obj *pairs_post; |
| 76 | grn_obj *pairs_freq0; |
| 77 | grn_obj *pairs_freq1; |
| 78 | grn_obj *pairs_freq2; |
| 79 | |
| 80 | grn_obj dataset_name; |
| 81 | |
| 82 | grn_obj *configuration; |
| 83 | |
| 84 | grn_obj weight; |
| 85 | grn_obj pre_events; |
| 86 | |
| 87 | uint64_t key_prefix; |
| 88 | grn_obj pre_item; |
| 89 | } grn_suggest_learner; |
| 90 | |
| 91 | static int |
| 92 | grn_parse_suggest_types(grn_obj *text) |
| 93 | { |
| 94 | const char *nptr = GRN_TEXT_VALUE(text); |
| 95 | const char *end = GRN_BULK_CURR(text); |
| 96 | int types = 0; |
| 97 | while (nptr < end) { |
| 98 | if (*nptr == '|') { |
| 99 | nptr += 1; |
| 100 | continue; |
| 101 | } |
| 102 | { |
| 103 | const char string[] = "complete" ; |
| 104 | size_t length = sizeof(string) - 1; |
| 105 | if (nptr + length <= end && memcmp(nptr, string, length) == 0) { |
| 106 | types |= COMPLETE; |
| 107 | nptr += length; |
| 108 | continue; |
| 109 | } |
| 110 | } |
| 111 | { |
| 112 | const char string[] = "correct" ; |
| 113 | size_t length = sizeof(string) - 1; |
| 114 | if (nptr + length <= end && memcmp(nptr, string, length) == 0) { |
| 115 | types |= CORRECT; |
| 116 | nptr += length; |
| 117 | continue; |
| 118 | } |
| 119 | } |
| 120 | { |
| 121 | const char string[] = "suggest" ; |
| 122 | size_t length = sizeof(string) - 1; |
| 123 | if (nptr + length <= end && memcmp(nptr, string, length) == 0) { |
| 124 | types |= SUGGEST; |
| 125 | nptr += length; |
| 126 | continue; |
| 127 | } |
| 128 | } |
| 129 | break; |
| 130 | } |
| 131 | return types; |
| 132 | } |
| 133 | |
| 134 | static double |
| 135 | cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id id, |
| 136 | grn_obj *res, int query_type, int frequency_threshold, |
| 137 | double conditional_probability_threshold) |
| 138 | { |
| 139 | double max_score = 0.0; |
| 140 | if (id) { |
| 141 | grn_ii_cursor *c; |
| 142 | grn_obj *co = grn_obj_column(ctx, items, CONST_STR_LEN("co" )); |
| 143 | grn_obj *pairs = grn_ctx_at(ctx, grn_obj_get_range(ctx, co)); |
| 144 | grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq" )); |
| 145 | grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2" )); |
| 146 | grn_obj *pairs_freq, *pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post" )); |
| 147 | switch (query_type) { |
| 148 | case COMPLETE : |
| 149 | pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0" )); |
| 150 | break; |
| 151 | case CORRECT : |
| 152 | pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1" )); |
| 153 | break; |
| 154 | case SUGGEST : |
| 155 | pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2" )); |
| 156 | break; |
| 157 | default : |
| 158 | return max_score; |
| 159 | } |
| 160 | if ((c = grn_ii_cursor_open(ctx, (grn_ii *)co, id, GRN_ID_NIL, GRN_ID_MAX, |
| 161 | ((grn_ii *)co)->n_elements - 1, 0))) { |
| 162 | grn_posting *p; |
| 163 | grn_obj post, pair_freq, item_freq, item_freq2, item_boost; |
| 164 | GRN_RECORD_INIT(&post, 0, grn_obj_id(ctx, items)); |
| 165 | GRN_INT32_INIT(&pair_freq, 0); |
| 166 | GRN_INT32_INIT(&item_freq, 0); |
| 167 | GRN_INT32_INIT(&item_freq2, 0); |
| 168 | GRN_INT32_INIT(&item_boost, 0); |
| 169 | while ((p = grn_ii_cursor_next(ctx, c))) { |
| 170 | grn_id post_id; |
| 171 | int pfreq, ifreq, ifreq2, boost; |
| 172 | double conditional_probability; |
| 173 | GRN_BULK_REWIND(&post); |
| 174 | GRN_BULK_REWIND(&pair_freq); |
| 175 | GRN_BULK_REWIND(&item_freq); |
| 176 | GRN_BULK_REWIND(&item_freq2); |
| 177 | GRN_BULK_REWIND(&item_boost); |
| 178 | grn_obj_get_value(ctx, pairs_post, p->rid, &post); |
| 179 | grn_obj_get_value(ctx, pairs_freq, p->rid, &pair_freq); |
| 180 | post_id = GRN_RECORD_VALUE(&post); |
| 181 | grn_obj_get_value(ctx, items_freq, post_id, &item_freq); |
| 182 | grn_obj_get_value(ctx, items_freq2, post_id, &item_freq2); |
| 183 | grn_obj_get_value(ctx, items_boost, post_id, &item_boost); |
| 184 | pfreq = GRN_INT32_VALUE(&pair_freq); |
| 185 | ifreq = GRN_INT32_VALUE(&item_freq); |
| 186 | ifreq2 = GRN_INT32_VALUE(&item_freq2); |
| 187 | if (ifreq2 > 0) { |
| 188 | conditional_probability = (double)pfreq / (double)ifreq2; |
| 189 | } else { |
| 190 | conditional_probability = 0.0; |
| 191 | } |
| 192 | boost = GRN_INT32_VALUE(&item_boost); |
| 193 | if (pfreq >= frequency_threshold && ifreq >= frequency_threshold && |
| 194 | conditional_probability >= conditional_probability_threshold && |
| 195 | boost >= 0) { |
| 196 | grn_rset_recinfo *ri; |
| 197 | void *value; |
| 198 | double score = pfreq; |
| 199 | int added; |
| 200 | if (max_score < score + boost) { max_score = score + boost; } |
| 201 | /* put any formula if desired */ |
| 202 | if (grn_hash_add(ctx, (grn_hash *)res, |
| 203 | &post_id, sizeof(grn_id), &value, &added)) { |
| 204 | ri = value; |
| 205 | ri->score += score; |
| 206 | if (added) { |
| 207 | ri->score += boost; |
| 208 | } |
| 209 | } |
| 210 | } |
| 211 | } |
| 212 | GRN_OBJ_FIN(ctx, &post); |
| 213 | GRN_OBJ_FIN(ctx, &pair_freq); |
| 214 | GRN_OBJ_FIN(ctx, &item_freq); |
| 215 | GRN_OBJ_FIN(ctx, &item_freq2); |
| 216 | GRN_OBJ_FIN(ctx, &item_boost); |
| 217 | grn_ii_cursor_close(ctx, c); |
| 218 | } |
| 219 | } |
| 220 | return max_score; |
| 221 | } |
| 222 | |
| 223 | #define DEFAULT_LIMIT 10 |
| 224 | #define DEFAULT_SORTBY "-_score" |
| 225 | #define DEFAULT_OUTPUT_COLUMNS "_key,_score" |
| 226 | #define DEFAULT_FREQUENCY_THRESHOLD 100 |
| 227 | #define DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD 0.2 |
| 228 | |
| 229 | static void |
| 230 | output(grn_ctx *ctx, grn_obj *table, grn_obj *res, grn_id tid, |
| 231 | grn_obj *sortby, grn_obj *output_columns, int offset, int limit) |
| 232 | { |
| 233 | grn_obj *sorted; |
| 234 | if ((sorted = grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_NO_KEY, NULL, res))) { |
| 235 | uint32_t nkeys; |
| 236 | grn_obj_format format; |
| 237 | grn_table_sort_key *keys; |
| 238 | const char *sortby_val = GRN_TEXT_VALUE(sortby); |
| 239 | unsigned int sortby_len = GRN_TEXT_LEN(sortby); |
| 240 | const char *oc_val = GRN_TEXT_VALUE(output_columns); |
| 241 | unsigned int oc_len = GRN_TEXT_LEN(output_columns); |
| 242 | if (!sortby_val || !sortby_len) { |
| 243 | sortby_val = DEFAULT_SORTBY; |
| 244 | sortby_len = sizeof(DEFAULT_SORTBY) - 1; |
| 245 | } |
| 246 | if (!oc_val || !oc_len) { |
| 247 | oc_val = DEFAULT_OUTPUT_COLUMNS; |
| 248 | oc_len = sizeof(DEFAULT_OUTPUT_COLUMNS) - 1; |
| 249 | } |
| 250 | if ((keys = grn_table_sort_key_from_str(ctx, sortby_val, sortby_len, res, &nkeys))) { |
| 251 | grn_table_sort(ctx, res, offset, limit, sorted, keys, nkeys); |
| 252 | GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE, |
| 253 | ":" , "sort(%d)" , limit); |
| 254 | GRN_OBJ_FORMAT_INIT(&format, grn_table_size(ctx, res), 0, limit, offset); |
| 255 | format.flags = |
| 256 | GRN_OBJ_FORMAT_WITH_COLUMN_NAMES| |
| 257 | GRN_OBJ_FORMAT_XML_ELEMENT_RESULTSET; |
| 258 | grn_obj_columns(ctx, sorted, oc_val, oc_len, &format.columns); |
| 259 | GRN_OUTPUT_OBJ(sorted, &format); |
| 260 | GRN_OBJ_FORMAT_FIN(ctx, &format); |
| 261 | grn_table_sort_key_close(ctx, keys, nkeys); |
| 262 | } |
| 263 | grn_obj_unlink(ctx, sorted); |
| 264 | } else { |
| 265 | ERR(GRN_UNKNOWN_ERROR, "cannot create temporary sort table." ); |
| 266 | } |
| 267 | } |
| 268 | |
| 269 | static inline void |
| 270 | complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int frequency_threshold, |
| 271 | grn_obj *items_freq, grn_obj *items_boost, |
| 272 | grn_obj *item_freq, grn_obj *item_boost) |
| 273 | { |
| 274 | GRN_BULK_REWIND(item_freq); |
| 275 | GRN_BULK_REWIND(item_boost); |
| 276 | grn_obj_get_value(ctx, items_freq, id, item_freq); |
| 277 | grn_obj_get_value(ctx, items_boost, id, item_boost); |
| 278 | if (GRN_INT32_VALUE(item_boost) >= 0) { |
| 279 | double score; |
| 280 | score = 1 + |
| 281 | GRN_INT32_VALUE(item_freq) + |
| 282 | GRN_INT32_VALUE(item_boost); |
| 283 | if (score >= frequency_threshold) { |
| 284 | void *value; |
| 285 | if (grn_hash_add(ctx, (grn_hash *)res, &id, sizeof(grn_id), |
| 286 | &value, NULL)) { |
| 287 | grn_rset_recinfo *ri; |
| 288 | ri = value; |
| 289 | ri->score += score; |
| 290 | } |
| 291 | } |
| 292 | } |
| 293 | } |
| 294 | |
| 295 | static void |
| 296 | complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col, |
| 297 | grn_obj *query, grn_obj *sortby, |
| 298 | grn_obj *output_columns, int offset, int limit, |
| 299 | int frequency_threshold, double conditional_probability_threshold, |
| 300 | grn_suggest_search_mode prefix_search_mode) |
| 301 | { |
| 302 | grn_obj *res; |
| 303 | grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq" )); |
| 304 | grn_obj item_freq, item_boost; |
| 305 | GRN_INT32_INIT(&item_freq, 0); |
| 306 | GRN_INT32_INIT(&item_boost, 0); |
| 307 | if ((res = grn_table_create(ctx, NULL, 0, NULL, |
| 308 | GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { |
| 309 | grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); |
| 310 | if (GRN_TEXT_LEN(query)) { |
| 311 | grn_table_cursor *cur; |
| 312 | /* RK search + prefix search */ |
| 313 | grn_obj *index; |
| 314 | /* FIXME: support index selection */ |
| 315 | if (grn_column_index(ctx, col, GRN_OP_PREFIX, &index, 1, NULL)) { |
| 316 | if ((cur = grn_table_cursor_open(ctx, grn_ctx_at(ctx, index->header.domain), |
| 317 | GRN_TEXT_VALUE(query), |
| 318 | GRN_TEXT_LEN(query), |
| 319 | NULL, 0, 0, -1, |
| 320 | GRN_CURSOR_PREFIX|GRN_CURSOR_RK))) { |
| 321 | grn_id id; |
| 322 | while ((id = grn_table_cursor_next(ctx, cur))) { |
| 323 | grn_ii_cursor *icur; |
| 324 | if ((icur = grn_ii_cursor_open(ctx, (grn_ii *)index, id, |
| 325 | GRN_ID_NIL, GRN_ID_MAX, 1, 0))) { |
| 326 | grn_posting *p; |
| 327 | while ((p = grn_ii_cursor_next(ctx, icur))) { |
| 328 | complete_add_item(ctx, p->rid, res, frequency_threshold, |
| 329 | items_freq, items_boost, |
| 330 | &item_freq, &item_boost); |
| 331 | } |
| 332 | grn_ii_cursor_close(ctx, icur); |
| 333 | } |
| 334 | } |
| 335 | grn_table_cursor_close(ctx, cur); |
| 336 | } else { |
| 337 | ERR(GRN_UNKNOWN_ERROR, "cannot open cursor for prefix RK search." ); |
| 338 | } |
| 339 | } else { |
| 340 | ERR(GRN_UNKNOWN_ERROR, "cannot find index for prefix RK search." ); |
| 341 | } |
| 342 | cooccurrence_search(ctx, items, items_boost, tid, res, COMPLETE, |
| 343 | frequency_threshold, |
| 344 | conditional_probability_threshold); |
| 345 | if (((prefix_search_mode == GRN_SUGGEST_SEARCH_YES) || |
| 346 | (prefix_search_mode == GRN_SUGGEST_SEARCH_AUTO && |
| 347 | !grn_table_size(ctx, res))) && |
| 348 | (cur = grn_table_cursor_open(ctx, items, |
| 349 | GRN_TEXT_VALUE(query), |
| 350 | GRN_TEXT_LEN(query), |
| 351 | NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) { |
| 352 | grn_id id; |
| 353 | while ((id = grn_table_cursor_next(ctx, cur))) { |
| 354 | complete_add_item(ctx, id, res, frequency_threshold, |
| 355 | items_freq, items_boost, &item_freq, &item_boost); |
| 356 | } |
| 357 | grn_table_cursor_close(ctx, cur); |
| 358 | } |
| 359 | } |
| 360 | output(ctx, items, res, tid, sortby, output_columns, offset, limit); |
| 361 | grn_obj_close(ctx, res); |
| 362 | } else { |
| 363 | ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table." ); |
| 364 | } |
| 365 | GRN_OBJ_FIN(ctx, &item_boost); |
| 366 | GRN_OBJ_FIN(ctx, &item_freq); |
| 367 | } |
| 368 | |
| 369 | static void |
| 370 | correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, |
| 371 | grn_obj *query, grn_obj *sortby, |
| 372 | grn_obj *output_columns, int offset, int limit, |
| 373 | int frequency_threshold, double conditional_probability_threshold, |
| 374 | grn_suggest_search_mode similar_search_mode) |
| 375 | { |
| 376 | grn_obj *res; |
| 377 | grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2" )); |
| 378 | grn_obj item_freq2, item_boost; |
| 379 | GRN_INT32_INIT(&item_freq2, 0); |
| 380 | GRN_INT32_INIT(&item_boost, 0); |
| 381 | if ((res = grn_table_create(ctx, NULL, 0, NULL, |
| 382 | GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { |
| 383 | grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); |
| 384 | double max_score; |
| 385 | max_score = cooccurrence_search(ctx, items, items_boost, tid, res, CORRECT, |
| 386 | frequency_threshold, |
| 387 | conditional_probability_threshold); |
| 388 | GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SCORE, |
| 389 | ":" , "cooccur(%f)" , max_score); |
| 390 | if (GRN_TEXT_LEN(query) && |
| 391 | ((similar_search_mode == GRN_SUGGEST_SEARCH_YES) || |
| 392 | (similar_search_mode == GRN_SUGGEST_SEARCH_AUTO && |
| 393 | max_score < frequency_threshold))) { |
| 394 | grn_obj *key, *index; |
| 395 | if ((key = grn_obj_column(ctx, items, |
| 396 | GRN_COLUMN_NAME_KEY, |
| 397 | GRN_COLUMN_NAME_KEY_LEN))) { |
| 398 | if (grn_column_index(ctx, key, GRN_OP_MATCH, &index, 1, NULL)) { |
| 399 | grn_select_optarg optarg; |
| 400 | memset(&optarg, 0, sizeof(grn_select_optarg)); |
| 401 | optarg.mode = GRN_OP_SIMILAR; |
| 402 | optarg.similarity_threshold = 0; |
| 403 | optarg.max_size = 2; |
| 404 | grn_ii_select(ctx, (grn_ii *)index, TEXT_VALUE_LEN(query), |
| 405 | (grn_hash *)res, GRN_OP_OR, &optarg); |
| 406 | grn_obj_unlink(ctx, index); |
| 407 | GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE, |
| 408 | ":" , "similar(%d)" , grn_table_size(ctx, res)); |
| 409 | { |
| 410 | grn_hash_cursor *hc = grn_hash_cursor_open(ctx, (grn_hash *)res, NULL, |
| 411 | 0, NULL, 0, 0, -1, 0); |
| 412 | if (hc) { |
| 413 | while (grn_hash_cursor_next(ctx, hc)) { |
| 414 | void *key, *value; |
| 415 | if (grn_hash_cursor_get_key_value(ctx, hc, &key, NULL, &value)) { |
| 416 | grn_id *rp; |
| 417 | rp = key; |
| 418 | GRN_BULK_REWIND(&item_freq2); |
| 419 | GRN_BULK_REWIND(&item_boost); |
| 420 | grn_obj_get_value(ctx, items_freq2, *rp, &item_freq2); |
| 421 | grn_obj_get_value(ctx, items_boost, *rp, &item_boost); |
| 422 | if (GRN_INT32_VALUE(&item_boost) >= 0) { |
| 423 | double score; |
| 424 | grn_rset_recinfo *ri; |
| 425 | score = 1 + |
| 426 | (GRN_INT32_VALUE(&item_freq2) >> 4) + |
| 427 | GRN_INT32_VALUE(&item_boost); |
| 428 | ri = value; |
| 429 | ri->score += score; |
| 430 | if (score >= frequency_threshold) { continue; } |
| 431 | } |
| 432 | /* score < frequency_threshold || item_boost < 0 */ |
| 433 | grn_hash_cursor_delete(ctx, hc, NULL); |
| 434 | } |
| 435 | } |
| 436 | grn_hash_cursor_close(ctx, hc); |
| 437 | } |
| 438 | } |
| 439 | GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE, |
| 440 | ":" , "filter(%d)" , grn_table_size(ctx, res)); |
| 441 | { |
| 442 | /* exec _score -= edit_distance(_key, "query string") for all records */ |
| 443 | grn_obj *var; |
| 444 | grn_obj *expr; |
| 445 | |
| 446 | GRN_EXPR_CREATE_FOR_QUERY(ctx, res, expr, var); |
| 447 | if (expr) { |
| 448 | grn_table_cursor *tc; |
| 449 | grn_obj *score = grn_obj_column(ctx, res, |
| 450 | GRN_COLUMN_NAME_SCORE, |
| 451 | GRN_COLUMN_NAME_SCORE_LEN); |
| 452 | grn_obj *key = grn_obj_column(ctx, res, |
| 453 | GRN_COLUMN_NAME_KEY, |
| 454 | GRN_COLUMN_NAME_KEY_LEN); |
| 455 | grn_expr_append_obj(ctx, expr, |
| 456 | score, |
| 457 | GRN_OP_GET_VALUE, 1); |
| 458 | grn_expr_append_obj(ctx, expr, |
| 459 | grn_ctx_get(ctx, CONST_STR_LEN("edit_distance" )), |
| 460 | GRN_OP_PUSH, 1); |
| 461 | grn_expr_append_obj(ctx, expr, |
| 462 | key, |
| 463 | GRN_OP_GET_VALUE, 1); |
| 464 | grn_expr_append_const(ctx, expr, query, GRN_OP_PUSH, 1); |
| 465 | grn_expr_append_op(ctx, expr, GRN_OP_CALL, 2); |
| 466 | grn_expr_append_op(ctx, expr, GRN_OP_MINUS_ASSIGN, 2); |
| 467 | |
| 468 | if ((tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, 0))) { |
| 469 | grn_id id; |
| 470 | grn_obj score_value; |
| 471 | GRN_FLOAT_INIT(&score_value, 0); |
| 472 | while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { |
| 473 | GRN_RECORD_SET(ctx, var, id); |
| 474 | grn_expr_exec(ctx, expr, 0); |
| 475 | GRN_BULK_REWIND(&score_value); |
| 476 | grn_obj_get_value(ctx, score, id, &score_value); |
| 477 | if (GRN_FLOAT_VALUE(&score_value) < frequency_threshold) { |
| 478 | grn_table_cursor_delete(ctx, tc); |
| 479 | } |
| 480 | } |
| 481 | grn_obj_unlink(ctx, &score_value); |
| 482 | grn_table_cursor_close(ctx, tc); |
| 483 | } |
| 484 | grn_obj_unlink(ctx, score); |
| 485 | grn_obj_unlink(ctx, key); |
| 486 | grn_obj_unlink(ctx, expr); |
| 487 | } else { |
| 488 | ERR(GRN_UNKNOWN_ERROR, |
| 489 | "error on building expr. for calicurating edit distance" ); |
| 490 | } |
| 491 | } |
| 492 | } |
| 493 | grn_obj_unlink(ctx, key); |
| 494 | } |
| 495 | } |
| 496 | output(ctx, items, res, tid, sortby, output_columns, offset, limit); |
| 497 | grn_obj_close(ctx, res); |
| 498 | } else { |
| 499 | ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table." ); |
| 500 | } |
| 501 | GRN_OBJ_FIN(ctx, &item_boost); |
| 502 | GRN_OBJ_FIN(ctx, &item_freq2); |
| 503 | } |
| 504 | |
| 505 | static void |
| 506 | suggest(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, |
| 507 | grn_obj *query, grn_obj *sortby, |
| 508 | grn_obj *output_columns, int offset, int limit, |
| 509 | int frequency_threshold, double conditional_probability_threshold) |
| 510 | { |
| 511 | grn_obj *res; |
| 512 | if ((res = grn_table_create(ctx, NULL, 0, NULL, |
| 513 | GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) { |
| 514 | grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query)); |
| 515 | cooccurrence_search(ctx, items, items_boost, tid, res, SUGGEST, |
| 516 | frequency_threshold, conditional_probability_threshold); |
| 517 | output(ctx, items, res, tid, sortby, output_columns, offset, limit); |
| 518 | grn_obj_close(ctx, res); |
| 519 | } else { |
| 520 | ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table." ); |
| 521 | } |
| 522 | } |
| 523 | |
| 524 | static grn_suggest_search_mode |
| 525 | parse_search_mode(grn_ctx *ctx, grn_obj *mode_text) |
| 526 | { |
| 527 | grn_suggest_search_mode mode; |
| 528 | int mode_length; |
| 529 | |
| 530 | mode_length = GRN_TEXT_LEN(mode_text); |
| 531 | if (mode_length == 3 && |
| 532 | grn_strncasecmp("yes" , GRN_TEXT_VALUE(mode_text), 3) == 0) { |
| 533 | mode = GRN_SUGGEST_SEARCH_YES; |
| 534 | } else if (mode_length == 2 && |
| 535 | grn_strncasecmp("no" , GRN_TEXT_VALUE(mode_text), 2) == 0) { |
| 536 | mode = GRN_SUGGEST_SEARCH_NO; |
| 537 | } else { |
| 538 | mode = GRN_SUGGEST_SEARCH_AUTO; |
| 539 | } |
| 540 | |
| 541 | return mode; |
| 542 | } |
| 543 | |
| 544 | static grn_obj * |
| 545 | command_suggest(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
| 546 | { |
| 547 | grn_obj *items, *col, *items_boost; |
| 548 | int types; |
| 549 | int offset = 0; |
| 550 | int limit = DEFAULT_LIMIT; |
| 551 | int frequency_threshold = DEFAULT_FREQUENCY_THRESHOLD; |
| 552 | double conditional_probability_threshold = |
| 553 | DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD; |
| 554 | grn_suggest_search_mode prefix_search_mode; |
| 555 | grn_suggest_search_mode similar_search_mode; |
| 556 | |
| 557 | types = grn_parse_suggest_types(VAR(0)); |
| 558 | if (GRN_TEXT_LEN(VAR(6)) > 0) { |
| 559 | offset = grn_atoi(GRN_TEXT_VALUE(VAR(6)), GRN_BULK_CURR(VAR(6)), NULL); |
| 560 | } |
| 561 | if (GRN_TEXT_LEN(VAR(7)) > 0) { |
| 562 | limit = grn_atoi(GRN_TEXT_VALUE(VAR(7)), GRN_BULK_CURR(VAR(7)), NULL); |
| 563 | } |
| 564 | if (GRN_TEXT_LEN(VAR(8)) > 0) { |
| 565 | frequency_threshold = grn_atoi(GRN_TEXT_VALUE(VAR(8)), GRN_BULK_CURR(VAR(8)), NULL); |
| 566 | } |
| 567 | if (GRN_TEXT_LEN(VAR(9)) > 0) { |
| 568 | GRN_TEXT_PUTC(ctx, VAR(9), '\0'); |
| 569 | conditional_probability_threshold = strtod(GRN_TEXT_VALUE(VAR(9)), NULL); |
| 570 | } |
| 571 | |
| 572 | prefix_search_mode = parse_search_mode(ctx, VAR(10)); |
| 573 | similar_search_mode = parse_search_mode(ctx, VAR(11)); |
| 574 | |
| 575 | if ((items = grn_ctx_get(ctx, TEXT_VALUE_LEN(VAR(1))))) { |
| 576 | if ((items_boost = grn_obj_column(ctx, items, CONST_STR_LEN("boost" )))) { |
| 577 | int n_outputs = 0; |
| 578 | if (types & COMPLETE) { |
| 579 | n_outputs++; |
| 580 | } |
| 581 | if (types & CORRECT) { |
| 582 | n_outputs++; |
| 583 | } |
| 584 | if (types & SUGGEST) { |
| 585 | n_outputs++; |
| 586 | } |
| 587 | GRN_OUTPUT_MAP_OPEN("RESULT_SET" , n_outputs); |
| 588 | |
| 589 | if (types & COMPLETE) { |
| 590 | if ((col = grn_obj_column(ctx, items, TEXT_VALUE_LEN(VAR(2))))) { |
| 591 | GRN_OUTPUT_CSTR("complete" ); |
| 592 | complete(ctx, items, items_boost, col, VAR(3), VAR(4), |
| 593 | VAR(5), offset, limit, |
| 594 | frequency_threshold, conditional_probability_threshold, |
| 595 | prefix_search_mode); |
| 596 | } else { |
| 597 | ERR(GRN_INVALID_ARGUMENT, "invalid column." ); |
| 598 | } |
| 599 | } |
| 600 | if (types & CORRECT) { |
| 601 | GRN_OUTPUT_CSTR("correct" ); |
| 602 | correct(ctx, items, items_boost, VAR(3), VAR(4), |
| 603 | VAR(5), offset, limit, |
| 604 | frequency_threshold, conditional_probability_threshold, |
| 605 | similar_search_mode); |
| 606 | } |
| 607 | if (types & SUGGEST) { |
| 608 | GRN_OUTPUT_CSTR("suggest" ); |
| 609 | suggest(ctx, items, items_boost, VAR(3), VAR(4), |
| 610 | VAR(5), offset, limit, |
| 611 | frequency_threshold, conditional_probability_threshold); |
| 612 | } |
| 613 | GRN_OUTPUT_MAP_CLOSE(); |
| 614 | } else { |
| 615 | ERR(GRN_INVALID_ARGUMENT, "nonexistent column: <%.*s.boost>" , |
| 616 | (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1))); |
| 617 | } |
| 618 | grn_obj_unlink(ctx, items); |
| 619 | } else { |
| 620 | ERR(GRN_INVALID_ARGUMENT, "nonexistent table: <%.*s>" , |
| 621 | (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1))); |
| 622 | } |
| 623 | return NULL; |
| 624 | } |
| 625 | |
| 626 | static void |
| 627 | learner_init_values(grn_ctx *ctx, grn_suggest_learner *learner) |
| 628 | { |
| 629 | learner->post_event_id = GRN_RECORD_VALUE(learner->post_event); |
| 630 | learner->post_type_id = GRN_RECORD_VALUE(learner->post_type); |
| 631 | learner->post_item_id = GRN_RECORD_VALUE(learner->post_item); |
| 632 | learner->seq_id = GRN_RECORD_VALUE(learner->seq); |
| 633 | learner->post_time_value = GRN_TIME_VALUE(learner->post_time); |
| 634 | } |
| 635 | |
| 636 | static void |
| 637 | learner_init(grn_ctx *ctx, grn_suggest_learner *learner, |
| 638 | grn_obj *post_event, grn_obj *post_type, grn_obj *post_item, |
| 639 | grn_obj *seq, grn_obj *post_time, grn_obj *pairs) |
| 640 | { |
| 641 | learner->post_event = post_event; |
| 642 | learner->post_type = post_type; |
| 643 | learner->post_item = post_item; |
| 644 | learner->seq = seq; |
| 645 | learner->post_time = post_time; |
| 646 | learner->pairs = pairs; |
| 647 | |
| 648 | learner->learn_distance_in_seconds = 0; |
| 649 | |
| 650 | learner_init_values(ctx, learner); |
| 651 | } |
| 652 | |
| 653 | static void |
| 654 | learner_init_columns(grn_ctx *ctx, grn_suggest_learner *learner) |
| 655 | { |
| 656 | grn_id events_id, event_types_id; |
| 657 | grn_obj *seqs, *events, *post_item, *items, *pairs; |
| 658 | |
| 659 | learner->seqs = seqs = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(learner->seq)); |
| 660 | learner->seqs_events = grn_obj_column(ctx, seqs, CONST_STR_LEN("events" )); |
| 661 | |
| 662 | events_id = grn_obj_get_range(ctx, learner->seqs_events); |
| 663 | learner->events = events = grn_ctx_at(ctx, events_id); |
| 664 | learner->events_item = grn_obj_column(ctx, events, CONST_STR_LEN("item" )); |
| 665 | learner->events_type = grn_obj_column(ctx, events, CONST_STR_LEN("type" )); |
| 666 | learner->events_time = grn_obj_column(ctx, events, CONST_STR_LEN("time" )); |
| 667 | |
| 668 | event_types_id = grn_obj_get_range(ctx, learner->events_type); |
| 669 | learner->event_types = grn_obj_column(ctx, events, CONST_STR_LEN("time" )); |
| 670 | |
| 671 | post_item = learner->post_item; |
| 672 | learner->items = items = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(post_item)); |
| 673 | learner->items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq" )); |
| 674 | learner->items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2" )); |
| 675 | learner->items_last = grn_obj_column(ctx, items, CONST_STR_LEN("last" )); |
| 676 | |
| 677 | pairs = learner->pairs; |
| 678 | learner->pairs_pre = grn_obj_column(ctx, pairs, CONST_STR_LEN("pre" )); |
| 679 | learner->pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post" )); |
| 680 | learner->pairs_freq0 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0" )); |
| 681 | learner->pairs_freq1 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1" )); |
| 682 | learner->pairs_freq2 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2" )); |
| 683 | } |
| 684 | |
| 685 | static void |
| 686 | learner_fin_columns(grn_ctx *ctx, grn_suggest_learner *learner) |
| 687 | { |
| 688 | grn_obj_unlink(ctx, learner->seqs); |
| 689 | grn_obj_unlink(ctx, learner->seqs_events); |
| 690 | |
| 691 | grn_obj_unlink(ctx, learner->events); |
| 692 | grn_obj_unlink(ctx, learner->events_item); |
| 693 | grn_obj_unlink(ctx, learner->events_type); |
| 694 | grn_obj_unlink(ctx, learner->events_time); |
| 695 | |
| 696 | grn_obj_unlink(ctx, learner->event_types); |
| 697 | |
| 698 | grn_obj_unlink(ctx, learner->items); |
| 699 | grn_obj_unlink(ctx, learner->items_freq); |
| 700 | grn_obj_unlink(ctx, learner->items_freq2); |
| 701 | grn_obj_unlink(ctx, learner->items_last); |
| 702 | |
| 703 | grn_obj_unlink(ctx, learner->pairs_pre); |
| 704 | grn_obj_unlink(ctx, learner->pairs_post); |
| 705 | grn_obj_unlink(ctx, learner->pairs_freq0); |
| 706 | grn_obj_unlink(ctx, learner->pairs_freq1); |
| 707 | grn_obj_unlink(ctx, learner->pairs_freq2); |
| 708 | } |
| 709 | |
| 710 | static void |
| 711 | learner_init_weight(grn_ctx *ctx, grn_suggest_learner *learner) |
| 712 | { |
| 713 | grn_obj *weight_column = NULL; |
| 714 | unsigned int weight = 1; |
| 715 | |
| 716 | if (learner->configuration) { |
| 717 | weight_column = grn_obj_column(ctx, |
| 718 | learner->configuration, |
| 719 | CONST_STR_LEN("weight" )); |
| 720 | } |
| 721 | if (weight_column) { |
| 722 | grn_id id; |
| 723 | id = grn_table_get(ctx, learner->configuration, |
| 724 | GRN_TEXT_VALUE(&(learner->dataset_name)), |
| 725 | GRN_TEXT_LEN(&(learner->dataset_name))); |
| 726 | if (id != GRN_ID_NIL) { |
| 727 | grn_obj weight_value; |
| 728 | GRN_UINT32_INIT(&weight_value, 0); |
| 729 | grn_obj_get_value(ctx, weight_column, id, &weight_value); |
| 730 | weight = GRN_UINT32_VALUE(&weight_value); |
| 731 | GRN_OBJ_FIN(ctx, &weight_value); |
| 732 | } |
| 733 | grn_obj_unlink(ctx, weight_column); |
| 734 | } |
| 735 | |
| 736 | GRN_UINT32_INIT(&(learner->weight), 0); |
| 737 | GRN_UINT32_SET(ctx, &(learner->weight), weight); |
| 738 | } |
| 739 | |
| 740 | static void |
| 741 | learner_init_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner) |
| 742 | { |
| 743 | char events_name[GRN_TABLE_MAX_KEY_SIZE]; |
| 744 | unsigned int events_name_size; |
| 745 | unsigned int events_name_prefix_size; |
| 746 | |
| 747 | events_name_size = grn_obj_name(ctx, learner->events, |
| 748 | events_name, GRN_TABLE_MAX_KEY_SIZE); |
| 749 | GRN_TEXT_INIT(&(learner->dataset_name), 0); |
| 750 | events_name_prefix_size = strlen("event_" ); |
| 751 | if (events_name_size > events_name_prefix_size) { |
| 752 | GRN_TEXT_PUT(ctx, |
| 753 | &(learner->dataset_name), |
| 754 | events_name + events_name_prefix_size, |
| 755 | events_name_size - events_name_prefix_size); |
| 756 | } |
| 757 | } |
| 758 | |
| 759 | static void |
| 760 | learner_fin_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner) |
| 761 | { |
| 762 | GRN_OBJ_FIN(ctx, &(learner->dataset_name)); |
| 763 | } |
| 764 | |
| 765 | static void |
| 766 | learner_init_configuration(grn_ctx *ctx, grn_suggest_learner *learner) |
| 767 | { |
| 768 | learner->configuration = grn_ctx_get(ctx, "configuration" , -1); |
| 769 | } |
| 770 | |
| 771 | static void |
| 772 | learner_fin_configuration(grn_ctx *ctx, grn_suggest_learner *learner) |
| 773 | { |
| 774 | if (learner->configuration) { |
| 775 | grn_obj_unlink(ctx, learner->configuration); |
| 776 | } |
| 777 | } |
| 778 | |
| 779 | static void |
| 780 | learner_init_buffers(grn_ctx *ctx, grn_suggest_learner *learner) |
| 781 | { |
| 782 | learner_init_weight(ctx, learner); |
| 783 | GRN_RECORD_INIT(&(learner->pre_events), 0, grn_obj_id(ctx, learner->events)); |
| 784 | } |
| 785 | |
| 786 | static void |
| 787 | learner_fin_buffers(grn_ctx *ctx, grn_suggest_learner *learner) |
| 788 | { |
| 789 | grn_obj_unlink(ctx, &(learner->weight)); |
| 790 | grn_obj_unlink(ctx, &(learner->pre_events)); |
| 791 | } |
| 792 | |
| 793 | static void |
| 794 | learner_init_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner) |
| 795 | { |
| 796 | grn_id items_id; |
| 797 | |
| 798 | learner->key_prefix = ((uint64_t)learner->post_item_id) << 32; |
| 799 | |
| 800 | items_id = grn_obj_get_range(ctx, learner->events_item); |
| 801 | GRN_RECORD_INIT(&(learner->pre_item), 0, items_id); |
| 802 | |
| 803 | grn_obj_get_value(ctx, learner->seqs_events, learner->seq_id, |
| 804 | &(learner->pre_events)); |
| 805 | } |
| 806 | |
| 807 | static void |
| 808 | learner_fin_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner) |
| 809 | { |
| 810 | grn_obj_unlink(ctx, &(learner->pre_item)); |
| 811 | GRN_BULK_REWIND(&(learner->pre_events)); |
| 812 | } |
| 813 | |
| 814 | static grn_bool |
| 815 | learner_is_valid_input(grn_ctx *ctx, grn_suggest_learner *learner) |
| 816 | { |
| 817 | return learner->post_event_id && learner->post_item_id && learner->seq_id; |
| 818 | } |
| 819 | |
| 820 | static void |
| 821 | learner_increment(grn_ctx *ctx, grn_suggest_learner *learner, |
| 822 | grn_obj *column, grn_id record_id) |
| 823 | { |
| 824 | grn_obj_set_value(ctx, column, record_id, &(learner->weight), GRN_OBJ_INCR); |
| 825 | } |
| 826 | |
| 827 | static void |
| 828 | learner_increment_item_freq(grn_ctx *ctx, grn_suggest_learner *learner, |
| 829 | grn_obj *column) |
| 830 | { |
| 831 | learner_increment(ctx, learner, column, learner->post_item_id); |
| 832 | } |
| 833 | |
| 834 | static void |
| 835 | learner_set_last_post_time(grn_ctx *ctx, grn_suggest_learner *learner) |
| 836 | { |
| 837 | grn_obj_set_value(ctx, learner->items_last, learner->post_item_id, |
| 838 | learner->post_time, GRN_OBJ_SET); |
| 839 | } |
| 840 | |
| 841 | static void |
| 842 | learner_learn_for_complete_and_correcnt(grn_ctx *ctx, |
| 843 | grn_suggest_learner *learner) |
| 844 | { |
| 845 | grn_obj *pre_item, *post_item, *pre_events; |
| 846 | grn_obj pre_type, pre_time; |
| 847 | grn_id *ep, *es; |
| 848 | uint64_t key; |
| 849 | int64_t post_time_value; |
| 850 | |
| 851 | pre_item = &(learner->pre_item); |
| 852 | post_item = learner->post_item; |
| 853 | pre_events = &(learner->pre_events); |
| 854 | post_time_value = learner->post_time_value; |
| 855 | GRN_RECORD_INIT(&pre_type, 0, grn_obj_get_range(ctx, learner->events_type)); |
| 856 | GRN_TIME_INIT(&pre_time, 0); |
| 857 | ep = (grn_id *)GRN_BULK_CURR(pre_events); |
| 858 | es = (grn_id *)GRN_BULK_HEAD(pre_events); |
| 859 | while (es < ep--) { |
| 860 | grn_id pair_id; |
| 861 | int added; |
| 862 | int64_t learn_distance; |
| 863 | |
| 864 | GRN_BULK_REWIND(&pre_type); |
| 865 | GRN_BULK_REWIND(&pre_time); |
| 866 | GRN_BULK_REWIND(pre_item); |
| 867 | grn_obj_get_value(ctx, learner->events_type, *ep, &pre_type); |
| 868 | grn_obj_get_value(ctx, learner->events_time, *ep, &pre_time); |
| 869 | grn_obj_get_value(ctx, learner->events_item, *ep, pre_item); |
| 870 | learn_distance = post_time_value - GRN_TIME_VALUE(&pre_time); |
| 871 | if (learn_distance >= MIN_LEARN_DISTANCE) { |
| 872 | learner->learn_distance_in_seconds = |
| 873 | (int)(learn_distance / GRN_TIME_USEC_PER_SEC); |
| 874 | break; |
| 875 | } |
| 876 | key = learner->key_prefix + GRN_RECORD_VALUE(pre_item); |
| 877 | pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t), |
| 878 | &added); |
| 879 | if (added) { |
| 880 | grn_obj_set_value(ctx, learner->pairs_pre, pair_id, pre_item, |
| 881 | GRN_OBJ_SET); |
| 882 | grn_obj_set_value(ctx, learner->pairs_post, pair_id, post_item, |
| 883 | GRN_OBJ_SET); |
| 884 | } |
| 885 | if (GRN_RECORD_VALUE(&pre_type)) { |
| 886 | learner_increment(ctx, learner, learner->pairs_freq1, pair_id); |
| 887 | break; |
| 888 | } else { |
| 889 | learner_increment(ctx, learner, learner->pairs_freq0, pair_id); |
| 890 | } |
| 891 | } |
| 892 | GRN_OBJ_FIN(ctx, &pre_type); |
| 893 | GRN_OBJ_FIN(ctx, &pre_time); |
| 894 | } |
| 895 | |
| 896 | static void |
| 897 | learner_learn_for_suggest(grn_ctx *ctx, grn_suggest_learner *learner) |
| 898 | { |
| 899 | char keybuf[GRN_TABLE_MAX_KEY_SIZE]; |
| 900 | int keylen = grn_table_get_key(ctx, learner->items, learner->post_item_id, |
| 901 | keybuf, GRN_TABLE_MAX_KEY_SIZE); |
| 902 | unsigned int token_flags = 0; |
| 903 | grn_token_cursor *token_cursor = |
| 904 | grn_token_cursor_open(ctx, learner->items, keybuf, keylen, |
| 905 | GRN_TOKEN_ADD, token_flags); |
| 906 | if (token_cursor) { |
| 907 | grn_id tid; |
| 908 | grn_obj *pre_item = &(learner->pre_item); |
| 909 | grn_obj *post_item = learner->post_item; |
| 910 | grn_hash *token_ids = NULL; |
| 911 | while ((tid = grn_token_cursor_next(ctx, token_cursor)) && tid != learner->post_item_id) { |
| 912 | uint64_t key; |
| 913 | int added; |
| 914 | grn_id pair_id; |
| 915 | key = learner->key_prefix + tid; |
| 916 | pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t), |
| 917 | &added); |
| 918 | if (added) { |
| 919 | GRN_RECORD_SET(ctx, pre_item, tid); |
| 920 | grn_obj_set_value(ctx, learner->pairs_pre, pair_id, |
| 921 | pre_item, GRN_OBJ_SET); |
| 922 | grn_obj_set_value(ctx, learner->pairs_post, pair_id, |
| 923 | post_item, GRN_OBJ_SET); |
| 924 | } |
| 925 | if (!token_ids) { |
| 926 | token_ids = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, |
| 927 | GRN_OBJ_TABLE_HASH_KEY|GRN_HASH_TINY); |
| 928 | } |
| 929 | if (token_ids) { |
| 930 | int token_added; |
| 931 | grn_hash_add(ctx, token_ids, &tid, sizeof(grn_id), NULL, &token_added); |
| 932 | if (token_added) { |
| 933 | learner_increment(ctx, learner, learner->pairs_freq2, pair_id); |
| 934 | } |
| 935 | } |
| 936 | } |
| 937 | if (token_ids) { |
| 938 | grn_hash_close(ctx, token_ids); |
| 939 | } |
| 940 | grn_token_cursor_close(ctx, token_cursor); |
| 941 | } |
| 942 | } |
| 943 | |
| 944 | static void |
| 945 | learner_append_post_event(grn_ctx *ctx, grn_suggest_learner *learner) |
| 946 | { |
| 947 | GRN_RECORD_SET(ctx, &(learner->pre_events), learner->post_event_id); |
| 948 | grn_obj_set_value(ctx, learner->seqs_events, learner->seq_id, |
| 949 | &(learner->pre_events), GRN_OBJ_APPEND); |
| 950 | } |
| 951 | |
| 952 | static void |
| 953 | learner_learn(grn_ctx *ctx, grn_suggest_learner *learner) |
| 954 | { |
| 955 | if (learner_is_valid_input(ctx, learner)) { |
| 956 | learner_init_columns(ctx, learner); |
| 957 | learner_init_dataset_name(ctx, learner); |
| 958 | learner_init_configuration(ctx, learner); |
| 959 | learner_init_buffers(ctx, learner); |
| 960 | learner_increment_item_freq(ctx, learner, learner->items_freq); |
| 961 | learner_set_last_post_time(ctx, learner); |
| 962 | if (learner->post_type_id) { |
| 963 | learner_init_submit_learn(ctx, learner); |
| 964 | learner_increment_item_freq(ctx, learner, learner->items_freq2); |
| 965 | learner_learn_for_complete_and_correcnt(ctx, learner); |
| 966 | learner_learn_for_suggest(ctx, learner); |
| 967 | learner_fin_submit_learn(ctx, learner); |
| 968 | } |
| 969 | learner_append_post_event(ctx, learner); |
| 970 | learner_fin_buffers(ctx, learner); |
| 971 | learner_fin_configuration(ctx, learner); |
| 972 | learner_fin_dataset_name(ctx, learner); |
| 973 | learner_fin_columns(ctx, learner); |
| 974 | } |
| 975 | } |
| 976 | |
| 977 | static grn_obj * |
| 978 | func_suggest_preparer(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
| 979 | { |
| 980 | int learn_distance_in_seconds = 0; |
| 981 | grn_obj *obj; |
| 982 | if (nargs == 6) { |
| 983 | grn_obj *post_event = args[0]; |
| 984 | grn_obj *post_type = args[1]; |
| 985 | grn_obj *post_item = args[2]; |
| 986 | grn_obj *seq = args[3]; |
| 987 | grn_obj *post_time = args[4]; |
| 988 | grn_obj *pairs = args[5]; |
| 989 | grn_suggest_learner learner; |
| 990 | learner_init(ctx, &learner, |
| 991 | post_event, post_type, post_item, seq, post_time, pairs); |
| 992 | learner_learn(ctx, &learner); |
| 993 | learn_distance_in_seconds = learner.learn_distance_in_seconds; |
| 994 | } |
| 995 | if ((obj = GRN_PROC_ALLOC(GRN_DB_UINT32, 0))) { |
| 996 | GRN_UINT32_SET(ctx, obj, learn_distance_in_seconds); |
| 997 | } |
| 998 | return obj; |
| 999 | } |
| 1000 | |
| 1001 | grn_rc |
| 1002 | GRN_PLUGIN_INIT(grn_ctx *ctx) |
| 1003 | { |
| 1004 | return GRN_SUCCESS; |
| 1005 | } |
| 1006 | |
| 1007 | grn_rc |
| 1008 | GRN_PLUGIN_REGISTER(grn_ctx *ctx) |
| 1009 | { |
| 1010 | grn_expr_var vars[12]; |
| 1011 | |
| 1012 | grn_plugin_expr_var_init(ctx, &vars[0], "types" , -1); |
| 1013 | grn_plugin_expr_var_init(ctx, &vars[1], "table" , -1); |
| 1014 | grn_plugin_expr_var_init(ctx, &vars[2], "column" , -1); |
| 1015 | grn_plugin_expr_var_init(ctx, &vars[3], "query" , -1); |
| 1016 | grn_plugin_expr_var_init(ctx, &vars[4], "sortby" , -1); |
| 1017 | grn_plugin_expr_var_init(ctx, &vars[5], "output_columns" , -1); |
| 1018 | grn_plugin_expr_var_init(ctx, &vars[6], "offset" , -1); |
| 1019 | grn_plugin_expr_var_init(ctx, &vars[7], "limit" , -1); |
| 1020 | grn_plugin_expr_var_init(ctx, &vars[8], "frequency_threshold" , -1); |
| 1021 | grn_plugin_expr_var_init(ctx, &vars[9], "conditional_probability_threshold" , -1); |
| 1022 | grn_plugin_expr_var_init(ctx, &vars[10], "prefix_search" , -1); |
| 1023 | grn_plugin_expr_var_init(ctx, &vars[11], "similar_search" , -1); |
| 1024 | grn_plugin_command_create(ctx, "suggest" , -1, command_suggest, 12, vars); |
| 1025 | |
| 1026 | grn_proc_create(ctx, CONST_STR_LEN("suggest_preparer" ), GRN_PROC_FUNCTION, |
| 1027 | func_suggest_preparer, NULL, NULL, 0, NULL); |
| 1028 | return ctx->rc; |
| 1029 | } |
| 1030 | |
| 1031 | grn_rc |
| 1032 | GRN_PLUGIN_FIN(grn_ctx *ctx) |
| 1033 | { |
| 1034 | return GRN_SUCCESS; |
| 1035 | } |
| 1036 | |