1/* -*- c-basic-offset: 2; indent-tabs-mode: nil -*- */
2/* Copyright(C) 2010-2014 Brazil
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License version 2.1 as published by the Free Software Foundation.
7
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
12
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18#ifdef GRN_EMBEDDED
19# define GRN_PLUGIN_FUNCTION_TAG suggest_suggest
20#endif
21
22#include <string.h>
23
24#include "grn_ctx.h"
25#include "grn_db.h"
26#include "grn_ii.h"
27#include "grn_token_cursor.h"
28#include "grn_output.h"
29#include <groonga/plugin.h>
30
31#define VAR GRN_PROC_GET_VAR_BY_OFFSET
32#define CONST_STR_LEN(x) x, x ? sizeof(x) - 1 : 0
33#define TEXT_VALUE_LEN(x) GRN_TEXT_VALUE(x), GRN_TEXT_LEN(x)
34
35#define MIN_LEARN_DISTANCE (60 * GRN_TIME_USEC_PER_SEC)
36
37#define COMPLETE 1
38#define CORRECT 2
39#define SUGGEST 4
40
41typedef enum {
42 GRN_SUGGEST_SEARCH_YES,
43 GRN_SUGGEST_SEARCH_NO,
44 GRN_SUGGEST_SEARCH_AUTO
45} grn_suggest_search_mode;
46
47typedef struct {
48 grn_obj *post_event;
49 grn_obj *post_type;
50 grn_obj *post_item;
51 grn_obj *seq;
52 grn_obj *post_time;
53 grn_obj *pairs;
54
55 int learn_distance_in_seconds;
56
57 grn_id post_event_id;
58 grn_id post_type_id;
59 grn_id post_item_id;
60 grn_id seq_id;
61 int64_t post_time_value;
62
63 grn_obj *seqs;
64 grn_obj *seqs_events;
65 grn_obj *events;
66 grn_obj *events_item;
67 grn_obj *events_type;
68 grn_obj *events_time;
69 grn_obj *event_types;
70 grn_obj *items;
71 grn_obj *items_freq;
72 grn_obj *items_freq2;
73 grn_obj *items_last;
74 grn_obj *pairs_pre;
75 grn_obj *pairs_post;
76 grn_obj *pairs_freq0;
77 grn_obj *pairs_freq1;
78 grn_obj *pairs_freq2;
79
80 grn_obj dataset_name;
81
82 grn_obj *configuration;
83
84 grn_obj weight;
85 grn_obj pre_events;
86
87 uint64_t key_prefix;
88 grn_obj pre_item;
89} grn_suggest_learner;
90
91static int
92grn_parse_suggest_types(grn_obj *text)
93{
94 const char *nptr = GRN_TEXT_VALUE(text);
95 const char *end = GRN_BULK_CURR(text);
96 int types = 0;
97 while (nptr < end) {
98 if (*nptr == '|') {
99 nptr += 1;
100 continue;
101 }
102 {
103 const char string[] = "complete";
104 size_t length = sizeof(string) - 1;
105 if (nptr + length <= end && memcmp(nptr, string, length) == 0) {
106 types |= COMPLETE;
107 nptr += length;
108 continue;
109 }
110 }
111 {
112 const char string[] = "correct";
113 size_t length = sizeof(string) - 1;
114 if (nptr + length <= end && memcmp(nptr, string, length) == 0) {
115 types |= CORRECT;
116 nptr += length;
117 continue;
118 }
119 }
120 {
121 const char string[] = "suggest";
122 size_t length = sizeof(string) - 1;
123 if (nptr + length <= end && memcmp(nptr, string, length) == 0) {
124 types |= SUGGEST;
125 nptr += length;
126 continue;
127 }
128 }
129 break;
130 }
131 return types;
132}
133
134static double
135cooccurrence_search(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_id id,
136 grn_obj *res, int query_type, int frequency_threshold,
137 double conditional_probability_threshold)
138{
139 double max_score = 0.0;
140 if (id) {
141 grn_ii_cursor *c;
142 grn_obj *co = grn_obj_column(ctx, items, CONST_STR_LEN("co"));
143 grn_obj *pairs = grn_ctx_at(ctx, grn_obj_get_range(ctx, co));
144 grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq"));
145 grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
146 grn_obj *pairs_freq, *pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post"));
147 switch (query_type) {
148 case COMPLETE :
149 pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0"));
150 break;
151 case CORRECT :
152 pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1"));
153 break;
154 case SUGGEST :
155 pairs_freq = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2"));
156 break;
157 default :
158 return max_score;
159 }
160 if ((c = grn_ii_cursor_open(ctx, (grn_ii *)co, id, GRN_ID_NIL, GRN_ID_MAX,
161 ((grn_ii *)co)->n_elements - 1, 0))) {
162 grn_posting *p;
163 grn_obj post, pair_freq, item_freq, item_freq2, item_boost;
164 GRN_RECORD_INIT(&post, 0, grn_obj_id(ctx, items));
165 GRN_INT32_INIT(&pair_freq, 0);
166 GRN_INT32_INIT(&item_freq, 0);
167 GRN_INT32_INIT(&item_freq2, 0);
168 GRN_INT32_INIT(&item_boost, 0);
169 while ((p = grn_ii_cursor_next(ctx, c))) {
170 grn_id post_id;
171 int pfreq, ifreq, ifreq2, boost;
172 double conditional_probability;
173 GRN_BULK_REWIND(&post);
174 GRN_BULK_REWIND(&pair_freq);
175 GRN_BULK_REWIND(&item_freq);
176 GRN_BULK_REWIND(&item_freq2);
177 GRN_BULK_REWIND(&item_boost);
178 grn_obj_get_value(ctx, pairs_post, p->rid, &post);
179 grn_obj_get_value(ctx, pairs_freq, p->rid, &pair_freq);
180 post_id = GRN_RECORD_VALUE(&post);
181 grn_obj_get_value(ctx, items_freq, post_id, &item_freq);
182 grn_obj_get_value(ctx, items_freq2, post_id, &item_freq2);
183 grn_obj_get_value(ctx, items_boost, post_id, &item_boost);
184 pfreq = GRN_INT32_VALUE(&pair_freq);
185 ifreq = GRN_INT32_VALUE(&item_freq);
186 ifreq2 = GRN_INT32_VALUE(&item_freq2);
187 if (ifreq2 > 0) {
188 conditional_probability = (double)pfreq / (double)ifreq2;
189 } else {
190 conditional_probability = 0.0;
191 }
192 boost = GRN_INT32_VALUE(&item_boost);
193 if (pfreq >= frequency_threshold && ifreq >= frequency_threshold &&
194 conditional_probability >= conditional_probability_threshold &&
195 boost >= 0) {
196 grn_rset_recinfo *ri;
197 void *value;
198 double score = pfreq;
199 int added;
200 if (max_score < score + boost) { max_score = score + boost; }
201 /* put any formula if desired */
202 if (grn_hash_add(ctx, (grn_hash *)res,
203 &post_id, sizeof(grn_id), &value, &added)) {
204 ri = value;
205 ri->score += score;
206 if (added) {
207 ri->score += boost;
208 }
209 }
210 }
211 }
212 GRN_OBJ_FIN(ctx, &post);
213 GRN_OBJ_FIN(ctx, &pair_freq);
214 GRN_OBJ_FIN(ctx, &item_freq);
215 GRN_OBJ_FIN(ctx, &item_freq2);
216 GRN_OBJ_FIN(ctx, &item_boost);
217 grn_ii_cursor_close(ctx, c);
218 }
219 }
220 return max_score;
221}
222
223#define DEFAULT_LIMIT 10
224#define DEFAULT_SORTBY "-_score"
225#define DEFAULT_OUTPUT_COLUMNS "_key,_score"
226#define DEFAULT_FREQUENCY_THRESHOLD 100
227#define DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD 0.2
228
229static void
230output(grn_ctx *ctx, grn_obj *table, grn_obj *res, grn_id tid,
231 grn_obj *sortby, grn_obj *output_columns, int offset, int limit)
232{
233 grn_obj *sorted;
234 if ((sorted = grn_table_create(ctx, NULL, 0, NULL, GRN_OBJ_TABLE_NO_KEY, NULL, res))) {
235 uint32_t nkeys;
236 grn_obj_format format;
237 grn_table_sort_key *keys;
238 const char *sortby_val = GRN_TEXT_VALUE(sortby);
239 unsigned int sortby_len = GRN_TEXT_LEN(sortby);
240 const char *oc_val = GRN_TEXT_VALUE(output_columns);
241 unsigned int oc_len = GRN_TEXT_LEN(output_columns);
242 if (!sortby_val || !sortby_len) {
243 sortby_val = DEFAULT_SORTBY;
244 sortby_len = sizeof(DEFAULT_SORTBY) - 1;
245 }
246 if (!oc_val || !oc_len) {
247 oc_val = DEFAULT_OUTPUT_COLUMNS;
248 oc_len = sizeof(DEFAULT_OUTPUT_COLUMNS) - 1;
249 }
250 if ((keys = grn_table_sort_key_from_str(ctx, sortby_val, sortby_len, res, &nkeys))) {
251 grn_table_sort(ctx, res, offset, limit, sorted, keys, nkeys);
252 GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE,
253 ":", "sort(%d)", limit);
254 GRN_OBJ_FORMAT_INIT(&format, grn_table_size(ctx, res), 0, limit, offset);
255 format.flags =
256 GRN_OBJ_FORMAT_WITH_COLUMN_NAMES|
257 GRN_OBJ_FORMAT_XML_ELEMENT_RESULTSET;
258 grn_obj_columns(ctx, sorted, oc_val, oc_len, &format.columns);
259 GRN_OUTPUT_OBJ(sorted, &format);
260 GRN_OBJ_FORMAT_FIN(ctx, &format);
261 grn_table_sort_key_close(ctx, keys, nkeys);
262 }
263 grn_obj_unlink(ctx, sorted);
264 } else {
265 ERR(GRN_UNKNOWN_ERROR, "cannot create temporary sort table.");
266 }
267}
268
269static inline void
270complete_add_item(grn_ctx *ctx, grn_id id, grn_obj *res, int frequency_threshold,
271 grn_obj *items_freq, grn_obj *items_boost,
272 grn_obj *item_freq, grn_obj *item_boost)
273{
274 GRN_BULK_REWIND(item_freq);
275 GRN_BULK_REWIND(item_boost);
276 grn_obj_get_value(ctx, items_freq, id, item_freq);
277 grn_obj_get_value(ctx, items_boost, id, item_boost);
278 if (GRN_INT32_VALUE(item_boost) >= 0) {
279 double score;
280 score = 1 +
281 GRN_INT32_VALUE(item_freq) +
282 GRN_INT32_VALUE(item_boost);
283 if (score >= frequency_threshold) {
284 void *value;
285 if (grn_hash_add(ctx, (grn_hash *)res, &id, sizeof(grn_id),
286 &value, NULL)) {
287 grn_rset_recinfo *ri;
288 ri = value;
289 ri->score += score;
290 }
291 }
292 }
293}
294
295static void
296complete(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost, grn_obj *col,
297 grn_obj *query, grn_obj *sortby,
298 grn_obj *output_columns, int offset, int limit,
299 int frequency_threshold, double conditional_probability_threshold,
300 grn_suggest_search_mode prefix_search_mode)
301{
302 grn_obj *res;
303 grn_obj *items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq"));
304 grn_obj item_freq, item_boost;
305 GRN_INT32_INIT(&item_freq, 0);
306 GRN_INT32_INIT(&item_boost, 0);
307 if ((res = grn_table_create(ctx, NULL, 0, NULL,
308 GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
309 grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
310 if (GRN_TEXT_LEN(query)) {
311 grn_table_cursor *cur;
312 /* RK search + prefix search */
313 grn_obj *index;
314 /* FIXME: support index selection */
315 if (grn_column_index(ctx, col, GRN_OP_PREFIX, &index, 1, NULL)) {
316 if ((cur = grn_table_cursor_open(ctx, grn_ctx_at(ctx, index->header.domain),
317 GRN_TEXT_VALUE(query),
318 GRN_TEXT_LEN(query),
319 NULL, 0, 0, -1,
320 GRN_CURSOR_PREFIX|GRN_CURSOR_RK))) {
321 grn_id id;
322 while ((id = grn_table_cursor_next(ctx, cur))) {
323 grn_ii_cursor *icur;
324 if ((icur = grn_ii_cursor_open(ctx, (grn_ii *)index, id,
325 GRN_ID_NIL, GRN_ID_MAX, 1, 0))) {
326 grn_posting *p;
327 while ((p = grn_ii_cursor_next(ctx, icur))) {
328 complete_add_item(ctx, p->rid, res, frequency_threshold,
329 items_freq, items_boost,
330 &item_freq, &item_boost);
331 }
332 grn_ii_cursor_close(ctx, icur);
333 }
334 }
335 grn_table_cursor_close(ctx, cur);
336 } else {
337 ERR(GRN_UNKNOWN_ERROR, "cannot open cursor for prefix RK search.");
338 }
339 } else {
340 ERR(GRN_UNKNOWN_ERROR, "cannot find index for prefix RK search.");
341 }
342 cooccurrence_search(ctx, items, items_boost, tid, res, COMPLETE,
343 frequency_threshold,
344 conditional_probability_threshold);
345 if (((prefix_search_mode == GRN_SUGGEST_SEARCH_YES) ||
346 (prefix_search_mode == GRN_SUGGEST_SEARCH_AUTO &&
347 !grn_table_size(ctx, res))) &&
348 (cur = grn_table_cursor_open(ctx, items,
349 GRN_TEXT_VALUE(query),
350 GRN_TEXT_LEN(query),
351 NULL, 0, 0, -1, GRN_CURSOR_PREFIX))) {
352 grn_id id;
353 while ((id = grn_table_cursor_next(ctx, cur))) {
354 complete_add_item(ctx, id, res, frequency_threshold,
355 items_freq, items_boost, &item_freq, &item_boost);
356 }
357 grn_table_cursor_close(ctx, cur);
358 }
359 }
360 output(ctx, items, res, tid, sortby, output_columns, offset, limit);
361 grn_obj_close(ctx, res);
362 } else {
363 ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table.");
364 }
365 GRN_OBJ_FIN(ctx, &item_boost);
366 GRN_OBJ_FIN(ctx, &item_freq);
367}
368
369static void
370correct(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
371 grn_obj *query, grn_obj *sortby,
372 grn_obj *output_columns, int offset, int limit,
373 int frequency_threshold, double conditional_probability_threshold,
374 grn_suggest_search_mode similar_search_mode)
375{
376 grn_obj *res;
377 grn_obj *items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
378 grn_obj item_freq2, item_boost;
379 GRN_INT32_INIT(&item_freq2, 0);
380 GRN_INT32_INIT(&item_boost, 0);
381 if ((res = grn_table_create(ctx, NULL, 0, NULL,
382 GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
383 grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
384 double max_score;
385 max_score = cooccurrence_search(ctx, items, items_boost, tid, res, CORRECT,
386 frequency_threshold,
387 conditional_probability_threshold);
388 GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SCORE,
389 ":", "cooccur(%f)", max_score);
390 if (GRN_TEXT_LEN(query) &&
391 ((similar_search_mode == GRN_SUGGEST_SEARCH_YES) ||
392 (similar_search_mode == GRN_SUGGEST_SEARCH_AUTO &&
393 max_score < frequency_threshold))) {
394 grn_obj *key, *index;
395 if ((key = grn_obj_column(ctx, items,
396 GRN_COLUMN_NAME_KEY,
397 GRN_COLUMN_NAME_KEY_LEN))) {
398 if (grn_column_index(ctx, key, GRN_OP_MATCH, &index, 1, NULL)) {
399 grn_select_optarg optarg;
400 memset(&optarg, 0, sizeof(grn_select_optarg));
401 optarg.mode = GRN_OP_SIMILAR;
402 optarg.similarity_threshold = 0;
403 optarg.max_size = 2;
404 grn_ii_select(ctx, (grn_ii *)index, TEXT_VALUE_LEN(query),
405 (grn_hash *)res, GRN_OP_OR, &optarg);
406 grn_obj_unlink(ctx, index);
407 GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE,
408 ":", "similar(%d)", grn_table_size(ctx, res));
409 {
410 grn_hash_cursor *hc = grn_hash_cursor_open(ctx, (grn_hash *)res, NULL,
411 0, NULL, 0, 0, -1, 0);
412 if (hc) {
413 while (grn_hash_cursor_next(ctx, hc)) {
414 void *key, *value;
415 if (grn_hash_cursor_get_key_value(ctx, hc, &key, NULL, &value)) {
416 grn_id *rp;
417 rp = key;
418 GRN_BULK_REWIND(&item_freq2);
419 GRN_BULK_REWIND(&item_boost);
420 grn_obj_get_value(ctx, items_freq2, *rp, &item_freq2);
421 grn_obj_get_value(ctx, items_boost, *rp, &item_boost);
422 if (GRN_INT32_VALUE(&item_boost) >= 0) {
423 double score;
424 grn_rset_recinfo *ri;
425 score = 1 +
426 (GRN_INT32_VALUE(&item_freq2) >> 4) +
427 GRN_INT32_VALUE(&item_boost);
428 ri = value;
429 ri->score += score;
430 if (score >= frequency_threshold) { continue; }
431 }
432 /* score < frequency_threshold || item_boost < 0 */
433 grn_hash_cursor_delete(ctx, hc, NULL);
434 }
435 }
436 grn_hash_cursor_close(ctx, hc);
437 }
438 }
439 GRN_QUERY_LOG(ctx, GRN_QUERY_LOG_SIZE,
440 ":", "filter(%d)", grn_table_size(ctx, res));
441 {
442 /* exec _score -= edit_distance(_key, "query string") for all records */
443 grn_obj *var;
444 grn_obj *expr;
445
446 GRN_EXPR_CREATE_FOR_QUERY(ctx, res, expr, var);
447 if (expr) {
448 grn_table_cursor *tc;
449 grn_obj *score = grn_obj_column(ctx, res,
450 GRN_COLUMN_NAME_SCORE,
451 GRN_COLUMN_NAME_SCORE_LEN);
452 grn_obj *key = grn_obj_column(ctx, res,
453 GRN_COLUMN_NAME_KEY,
454 GRN_COLUMN_NAME_KEY_LEN);
455 grn_expr_append_obj(ctx, expr,
456 score,
457 GRN_OP_GET_VALUE, 1);
458 grn_expr_append_obj(ctx, expr,
459 grn_ctx_get(ctx, CONST_STR_LEN("edit_distance")),
460 GRN_OP_PUSH, 1);
461 grn_expr_append_obj(ctx, expr,
462 key,
463 GRN_OP_GET_VALUE, 1);
464 grn_expr_append_const(ctx, expr, query, GRN_OP_PUSH, 1);
465 grn_expr_append_op(ctx, expr, GRN_OP_CALL, 2);
466 grn_expr_append_op(ctx, expr, GRN_OP_MINUS_ASSIGN, 2);
467
468 if ((tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, 0))) {
469 grn_id id;
470 grn_obj score_value;
471 GRN_FLOAT_INIT(&score_value, 0);
472 while ((id = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) {
473 GRN_RECORD_SET(ctx, var, id);
474 grn_expr_exec(ctx, expr, 0);
475 GRN_BULK_REWIND(&score_value);
476 grn_obj_get_value(ctx, score, id, &score_value);
477 if (GRN_FLOAT_VALUE(&score_value) < frequency_threshold) {
478 grn_table_cursor_delete(ctx, tc);
479 }
480 }
481 grn_obj_unlink(ctx, &score_value);
482 grn_table_cursor_close(ctx, tc);
483 }
484 grn_obj_unlink(ctx, score);
485 grn_obj_unlink(ctx, key);
486 grn_obj_unlink(ctx, expr);
487 } else {
488 ERR(GRN_UNKNOWN_ERROR,
489 "error on building expr. for calicurating edit distance");
490 }
491 }
492 }
493 grn_obj_unlink(ctx, key);
494 }
495 }
496 output(ctx, items, res, tid, sortby, output_columns, offset, limit);
497 grn_obj_close(ctx, res);
498 } else {
499 ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table.");
500 }
501 GRN_OBJ_FIN(ctx, &item_boost);
502 GRN_OBJ_FIN(ctx, &item_freq2);
503}
504
505static void
506suggest(grn_ctx *ctx, grn_obj *items, grn_obj *items_boost,
507 grn_obj *query, grn_obj *sortby,
508 grn_obj *output_columns, int offset, int limit,
509 int frequency_threshold, double conditional_probability_threshold)
510{
511 grn_obj *res;
512 if ((res = grn_table_create(ctx, NULL, 0, NULL,
513 GRN_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, items, NULL))) {
514 grn_id tid = grn_table_get(ctx, items, TEXT_VALUE_LEN(query));
515 cooccurrence_search(ctx, items, items_boost, tid, res, SUGGEST,
516 frequency_threshold, conditional_probability_threshold);
517 output(ctx, items, res, tid, sortby, output_columns, offset, limit);
518 grn_obj_close(ctx, res);
519 } else {
520 ERR(GRN_UNKNOWN_ERROR, "cannot create temporary table.");
521 }
522}
523
524static grn_suggest_search_mode
525parse_search_mode(grn_ctx *ctx, grn_obj *mode_text)
526{
527 grn_suggest_search_mode mode;
528 int mode_length;
529
530 mode_length = GRN_TEXT_LEN(mode_text);
531 if (mode_length == 3 &&
532 grn_strncasecmp("yes", GRN_TEXT_VALUE(mode_text), 3) == 0) {
533 mode = GRN_SUGGEST_SEARCH_YES;
534 } else if (mode_length == 2 &&
535 grn_strncasecmp("no", GRN_TEXT_VALUE(mode_text), 2) == 0) {
536 mode = GRN_SUGGEST_SEARCH_NO;
537 } else {
538 mode = GRN_SUGGEST_SEARCH_AUTO;
539 }
540
541 return mode;
542}
543
544static grn_obj *
545command_suggest(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
546{
547 grn_obj *items, *col, *items_boost;
548 int types;
549 int offset = 0;
550 int limit = DEFAULT_LIMIT;
551 int frequency_threshold = DEFAULT_FREQUENCY_THRESHOLD;
552 double conditional_probability_threshold =
553 DEFAULT_CONDITIONAL_PROBABILITY_THRESHOLD;
554 grn_suggest_search_mode prefix_search_mode;
555 grn_suggest_search_mode similar_search_mode;
556
557 types = grn_parse_suggest_types(VAR(0));
558 if (GRN_TEXT_LEN(VAR(6)) > 0) {
559 offset = grn_atoi(GRN_TEXT_VALUE(VAR(6)), GRN_BULK_CURR(VAR(6)), NULL);
560 }
561 if (GRN_TEXT_LEN(VAR(7)) > 0) {
562 limit = grn_atoi(GRN_TEXT_VALUE(VAR(7)), GRN_BULK_CURR(VAR(7)), NULL);
563 }
564 if (GRN_TEXT_LEN(VAR(8)) > 0) {
565 frequency_threshold = grn_atoi(GRN_TEXT_VALUE(VAR(8)), GRN_BULK_CURR(VAR(8)), NULL);
566 }
567 if (GRN_TEXT_LEN(VAR(9)) > 0) {
568 GRN_TEXT_PUTC(ctx, VAR(9), '\0');
569 conditional_probability_threshold = strtod(GRN_TEXT_VALUE(VAR(9)), NULL);
570 }
571
572 prefix_search_mode = parse_search_mode(ctx, VAR(10));
573 similar_search_mode = parse_search_mode(ctx, VAR(11));
574
575 if ((items = grn_ctx_get(ctx, TEXT_VALUE_LEN(VAR(1))))) {
576 if ((items_boost = grn_obj_column(ctx, items, CONST_STR_LEN("boost")))) {
577 int n_outputs = 0;
578 if (types & COMPLETE) {
579 n_outputs++;
580 }
581 if (types & CORRECT) {
582 n_outputs++;
583 }
584 if (types & SUGGEST) {
585 n_outputs++;
586 }
587 GRN_OUTPUT_MAP_OPEN("RESULT_SET", n_outputs);
588
589 if (types & COMPLETE) {
590 if ((col = grn_obj_column(ctx, items, TEXT_VALUE_LEN(VAR(2))))) {
591 GRN_OUTPUT_CSTR("complete");
592 complete(ctx, items, items_boost, col, VAR(3), VAR(4),
593 VAR(5), offset, limit,
594 frequency_threshold, conditional_probability_threshold,
595 prefix_search_mode);
596 } else {
597 ERR(GRN_INVALID_ARGUMENT, "invalid column.");
598 }
599 }
600 if (types & CORRECT) {
601 GRN_OUTPUT_CSTR("correct");
602 correct(ctx, items, items_boost, VAR(3), VAR(4),
603 VAR(5), offset, limit,
604 frequency_threshold, conditional_probability_threshold,
605 similar_search_mode);
606 }
607 if (types & SUGGEST) {
608 GRN_OUTPUT_CSTR("suggest");
609 suggest(ctx, items, items_boost, VAR(3), VAR(4),
610 VAR(5), offset, limit,
611 frequency_threshold, conditional_probability_threshold);
612 }
613 GRN_OUTPUT_MAP_CLOSE();
614 } else {
615 ERR(GRN_INVALID_ARGUMENT, "nonexistent column: <%.*s.boost>",
616 (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1)));
617 }
618 grn_obj_unlink(ctx, items);
619 } else {
620 ERR(GRN_INVALID_ARGUMENT, "nonexistent table: <%.*s>",
621 (int)GRN_TEXT_LEN(VAR(1)), GRN_TEXT_VALUE(VAR(1)));
622 }
623 return NULL;
624}
625
626static void
627learner_init_values(grn_ctx *ctx, grn_suggest_learner *learner)
628{
629 learner->post_event_id = GRN_RECORD_VALUE(learner->post_event);
630 learner->post_type_id = GRN_RECORD_VALUE(learner->post_type);
631 learner->post_item_id = GRN_RECORD_VALUE(learner->post_item);
632 learner->seq_id = GRN_RECORD_VALUE(learner->seq);
633 learner->post_time_value = GRN_TIME_VALUE(learner->post_time);
634}
635
636static void
637learner_init(grn_ctx *ctx, grn_suggest_learner *learner,
638 grn_obj *post_event, grn_obj *post_type, grn_obj *post_item,
639 grn_obj *seq, grn_obj *post_time, grn_obj *pairs)
640{
641 learner->post_event = post_event;
642 learner->post_type = post_type;
643 learner->post_item = post_item;
644 learner->seq = seq;
645 learner->post_time = post_time;
646 learner->pairs = pairs;
647
648 learner->learn_distance_in_seconds = 0;
649
650 learner_init_values(ctx, learner);
651}
652
653static void
654learner_init_columns(grn_ctx *ctx, grn_suggest_learner *learner)
655{
656 grn_id events_id, event_types_id;
657 grn_obj *seqs, *events, *post_item, *items, *pairs;
658
659 learner->seqs = seqs = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(learner->seq));
660 learner->seqs_events = grn_obj_column(ctx, seqs, CONST_STR_LEN("events"));
661
662 events_id = grn_obj_get_range(ctx, learner->seqs_events);
663 learner->events = events = grn_ctx_at(ctx, events_id);
664 learner->events_item = grn_obj_column(ctx, events, CONST_STR_LEN("item"));
665 learner->events_type = grn_obj_column(ctx, events, CONST_STR_LEN("type"));
666 learner->events_time = grn_obj_column(ctx, events, CONST_STR_LEN("time"));
667
668 event_types_id = grn_obj_get_range(ctx, learner->events_type);
669 learner->event_types = grn_obj_column(ctx, events, CONST_STR_LEN("time"));
670
671 post_item = learner->post_item;
672 learner->items = items = grn_ctx_at(ctx, GRN_OBJ_GET_DOMAIN(post_item));
673 learner->items_freq = grn_obj_column(ctx, items, CONST_STR_LEN("freq"));
674 learner->items_freq2 = grn_obj_column(ctx, items, CONST_STR_LEN("freq2"));
675 learner->items_last = grn_obj_column(ctx, items, CONST_STR_LEN("last"));
676
677 pairs = learner->pairs;
678 learner->pairs_pre = grn_obj_column(ctx, pairs, CONST_STR_LEN("pre"));
679 learner->pairs_post = grn_obj_column(ctx, pairs, CONST_STR_LEN("post"));
680 learner->pairs_freq0 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq0"));
681 learner->pairs_freq1 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq1"));
682 learner->pairs_freq2 = grn_obj_column(ctx, pairs, CONST_STR_LEN("freq2"));
683}
684
685static void
686learner_fin_columns(grn_ctx *ctx, grn_suggest_learner *learner)
687{
688 grn_obj_unlink(ctx, learner->seqs);
689 grn_obj_unlink(ctx, learner->seqs_events);
690
691 grn_obj_unlink(ctx, learner->events);
692 grn_obj_unlink(ctx, learner->events_item);
693 grn_obj_unlink(ctx, learner->events_type);
694 grn_obj_unlink(ctx, learner->events_time);
695
696 grn_obj_unlink(ctx, learner->event_types);
697
698 grn_obj_unlink(ctx, learner->items);
699 grn_obj_unlink(ctx, learner->items_freq);
700 grn_obj_unlink(ctx, learner->items_freq2);
701 grn_obj_unlink(ctx, learner->items_last);
702
703 grn_obj_unlink(ctx, learner->pairs_pre);
704 grn_obj_unlink(ctx, learner->pairs_post);
705 grn_obj_unlink(ctx, learner->pairs_freq0);
706 grn_obj_unlink(ctx, learner->pairs_freq1);
707 grn_obj_unlink(ctx, learner->pairs_freq2);
708}
709
710static void
711learner_init_weight(grn_ctx *ctx, grn_suggest_learner *learner)
712{
713 grn_obj *weight_column = NULL;
714 unsigned int weight = 1;
715
716 if (learner->configuration) {
717 weight_column = grn_obj_column(ctx,
718 learner->configuration,
719 CONST_STR_LEN("weight"));
720 }
721 if (weight_column) {
722 grn_id id;
723 id = grn_table_get(ctx, learner->configuration,
724 GRN_TEXT_VALUE(&(learner->dataset_name)),
725 GRN_TEXT_LEN(&(learner->dataset_name)));
726 if (id != GRN_ID_NIL) {
727 grn_obj weight_value;
728 GRN_UINT32_INIT(&weight_value, 0);
729 grn_obj_get_value(ctx, weight_column, id, &weight_value);
730 weight = GRN_UINT32_VALUE(&weight_value);
731 GRN_OBJ_FIN(ctx, &weight_value);
732 }
733 grn_obj_unlink(ctx, weight_column);
734 }
735
736 GRN_UINT32_INIT(&(learner->weight), 0);
737 GRN_UINT32_SET(ctx, &(learner->weight), weight);
738}
739
740static void
741learner_init_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner)
742{
743 char events_name[GRN_TABLE_MAX_KEY_SIZE];
744 unsigned int events_name_size;
745 unsigned int events_name_prefix_size;
746
747 events_name_size = grn_obj_name(ctx, learner->events,
748 events_name, GRN_TABLE_MAX_KEY_SIZE);
749 GRN_TEXT_INIT(&(learner->dataset_name), 0);
750 events_name_prefix_size = strlen("event_");
751 if (events_name_size > events_name_prefix_size) {
752 GRN_TEXT_PUT(ctx,
753 &(learner->dataset_name),
754 events_name + events_name_prefix_size,
755 events_name_size - events_name_prefix_size);
756 }
757}
758
759static void
760learner_fin_dataset_name(grn_ctx *ctx, grn_suggest_learner *learner)
761{
762 GRN_OBJ_FIN(ctx, &(learner->dataset_name));
763}
764
765static void
766learner_init_configuration(grn_ctx *ctx, grn_suggest_learner *learner)
767{
768 learner->configuration = grn_ctx_get(ctx, "configuration", -1);
769}
770
771static void
772learner_fin_configuration(grn_ctx *ctx, grn_suggest_learner *learner)
773{
774 if (learner->configuration) {
775 grn_obj_unlink(ctx, learner->configuration);
776 }
777}
778
779static void
780learner_init_buffers(grn_ctx *ctx, grn_suggest_learner *learner)
781{
782 learner_init_weight(ctx, learner);
783 GRN_RECORD_INIT(&(learner->pre_events), 0, grn_obj_id(ctx, learner->events));
784}
785
786static void
787learner_fin_buffers(grn_ctx *ctx, grn_suggest_learner *learner)
788{
789 grn_obj_unlink(ctx, &(learner->weight));
790 grn_obj_unlink(ctx, &(learner->pre_events));
791}
792
793static void
794learner_init_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner)
795{
796 grn_id items_id;
797
798 learner->key_prefix = ((uint64_t)learner->post_item_id) << 32;
799
800 items_id = grn_obj_get_range(ctx, learner->events_item);
801 GRN_RECORD_INIT(&(learner->pre_item), 0, items_id);
802
803 grn_obj_get_value(ctx, learner->seqs_events, learner->seq_id,
804 &(learner->pre_events));
805}
806
807static void
808learner_fin_submit_learn(grn_ctx *ctx, grn_suggest_learner *learner)
809{
810 grn_obj_unlink(ctx, &(learner->pre_item));
811 GRN_BULK_REWIND(&(learner->pre_events));
812}
813
814static grn_bool
815learner_is_valid_input(grn_ctx *ctx, grn_suggest_learner *learner)
816{
817 return learner->post_event_id && learner->post_item_id && learner->seq_id;
818}
819
820static void
821learner_increment(grn_ctx *ctx, grn_suggest_learner *learner,
822 grn_obj *column, grn_id record_id)
823{
824 grn_obj_set_value(ctx, column, record_id, &(learner->weight), GRN_OBJ_INCR);
825}
826
827static void
828learner_increment_item_freq(grn_ctx *ctx, grn_suggest_learner *learner,
829 grn_obj *column)
830{
831 learner_increment(ctx, learner, column, learner->post_item_id);
832}
833
834static void
835learner_set_last_post_time(grn_ctx *ctx, grn_suggest_learner *learner)
836{
837 grn_obj_set_value(ctx, learner->items_last, learner->post_item_id,
838 learner->post_time, GRN_OBJ_SET);
839}
840
841static void
842learner_learn_for_complete_and_correcnt(grn_ctx *ctx,
843 grn_suggest_learner *learner)
844{
845 grn_obj *pre_item, *post_item, *pre_events;
846 grn_obj pre_type, pre_time;
847 grn_id *ep, *es;
848 uint64_t key;
849 int64_t post_time_value;
850
851 pre_item = &(learner->pre_item);
852 post_item = learner->post_item;
853 pre_events = &(learner->pre_events);
854 post_time_value = learner->post_time_value;
855 GRN_RECORD_INIT(&pre_type, 0, grn_obj_get_range(ctx, learner->events_type));
856 GRN_TIME_INIT(&pre_time, 0);
857 ep = (grn_id *)GRN_BULK_CURR(pre_events);
858 es = (grn_id *)GRN_BULK_HEAD(pre_events);
859 while (es < ep--) {
860 grn_id pair_id;
861 int added;
862 int64_t learn_distance;
863
864 GRN_BULK_REWIND(&pre_type);
865 GRN_BULK_REWIND(&pre_time);
866 GRN_BULK_REWIND(pre_item);
867 grn_obj_get_value(ctx, learner->events_type, *ep, &pre_type);
868 grn_obj_get_value(ctx, learner->events_time, *ep, &pre_time);
869 grn_obj_get_value(ctx, learner->events_item, *ep, pre_item);
870 learn_distance = post_time_value - GRN_TIME_VALUE(&pre_time);
871 if (learn_distance >= MIN_LEARN_DISTANCE) {
872 learner->learn_distance_in_seconds =
873 (int)(learn_distance / GRN_TIME_USEC_PER_SEC);
874 break;
875 }
876 key = learner->key_prefix + GRN_RECORD_VALUE(pre_item);
877 pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t),
878 &added);
879 if (added) {
880 grn_obj_set_value(ctx, learner->pairs_pre, pair_id, pre_item,
881 GRN_OBJ_SET);
882 grn_obj_set_value(ctx, learner->pairs_post, pair_id, post_item,
883 GRN_OBJ_SET);
884 }
885 if (GRN_RECORD_VALUE(&pre_type)) {
886 learner_increment(ctx, learner, learner->pairs_freq1, pair_id);
887 break;
888 } else {
889 learner_increment(ctx, learner, learner->pairs_freq0, pair_id);
890 }
891 }
892 GRN_OBJ_FIN(ctx, &pre_type);
893 GRN_OBJ_FIN(ctx, &pre_time);
894}
895
896static void
897learner_learn_for_suggest(grn_ctx *ctx, grn_suggest_learner *learner)
898{
899 char keybuf[GRN_TABLE_MAX_KEY_SIZE];
900 int keylen = grn_table_get_key(ctx, learner->items, learner->post_item_id,
901 keybuf, GRN_TABLE_MAX_KEY_SIZE);
902 unsigned int token_flags = 0;
903 grn_token_cursor *token_cursor =
904 grn_token_cursor_open(ctx, learner->items, keybuf, keylen,
905 GRN_TOKEN_ADD, token_flags);
906 if (token_cursor) {
907 grn_id tid;
908 grn_obj *pre_item = &(learner->pre_item);
909 grn_obj *post_item = learner->post_item;
910 grn_hash *token_ids = NULL;
911 while ((tid = grn_token_cursor_next(ctx, token_cursor)) && tid != learner->post_item_id) {
912 uint64_t key;
913 int added;
914 grn_id pair_id;
915 key = learner->key_prefix + tid;
916 pair_id = grn_table_add(ctx, learner->pairs, &key, sizeof(uint64_t),
917 &added);
918 if (added) {
919 GRN_RECORD_SET(ctx, pre_item, tid);
920 grn_obj_set_value(ctx, learner->pairs_pre, pair_id,
921 pre_item, GRN_OBJ_SET);
922 grn_obj_set_value(ctx, learner->pairs_post, pair_id,
923 post_item, GRN_OBJ_SET);
924 }
925 if (!token_ids) {
926 token_ids = grn_hash_create(ctx, NULL, sizeof(grn_id), 0,
927 GRN_OBJ_TABLE_HASH_KEY|GRN_HASH_TINY);
928 }
929 if (token_ids) {
930 int token_added;
931 grn_hash_add(ctx, token_ids, &tid, sizeof(grn_id), NULL, &token_added);
932 if (token_added) {
933 learner_increment(ctx, learner, learner->pairs_freq2, pair_id);
934 }
935 }
936 }
937 if (token_ids) {
938 grn_hash_close(ctx, token_ids);
939 }
940 grn_token_cursor_close(ctx, token_cursor);
941 }
942}
943
944static void
945learner_append_post_event(grn_ctx *ctx, grn_suggest_learner *learner)
946{
947 GRN_RECORD_SET(ctx, &(learner->pre_events), learner->post_event_id);
948 grn_obj_set_value(ctx, learner->seqs_events, learner->seq_id,
949 &(learner->pre_events), GRN_OBJ_APPEND);
950}
951
952static void
953learner_learn(grn_ctx *ctx, grn_suggest_learner *learner)
954{
955 if (learner_is_valid_input(ctx, learner)) {
956 learner_init_columns(ctx, learner);
957 learner_init_dataset_name(ctx, learner);
958 learner_init_configuration(ctx, learner);
959 learner_init_buffers(ctx, learner);
960 learner_increment_item_freq(ctx, learner, learner->items_freq);
961 learner_set_last_post_time(ctx, learner);
962 if (learner->post_type_id) {
963 learner_init_submit_learn(ctx, learner);
964 learner_increment_item_freq(ctx, learner, learner->items_freq2);
965 learner_learn_for_complete_and_correcnt(ctx, learner);
966 learner_learn_for_suggest(ctx, learner);
967 learner_fin_submit_learn(ctx, learner);
968 }
969 learner_append_post_event(ctx, learner);
970 learner_fin_buffers(ctx, learner);
971 learner_fin_configuration(ctx, learner);
972 learner_fin_dataset_name(ctx, learner);
973 learner_fin_columns(ctx, learner);
974 }
975}
976
977static grn_obj *
978func_suggest_preparer(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
979{
980 int learn_distance_in_seconds = 0;
981 grn_obj *obj;
982 if (nargs == 6) {
983 grn_obj *post_event = args[0];
984 grn_obj *post_type = args[1];
985 grn_obj *post_item = args[2];
986 grn_obj *seq = args[3];
987 grn_obj *post_time = args[4];
988 grn_obj *pairs = args[5];
989 grn_suggest_learner learner;
990 learner_init(ctx, &learner,
991 post_event, post_type, post_item, seq, post_time, pairs);
992 learner_learn(ctx, &learner);
993 learn_distance_in_seconds = learner.learn_distance_in_seconds;
994 }
995 if ((obj = GRN_PROC_ALLOC(GRN_DB_UINT32, 0))) {
996 GRN_UINT32_SET(ctx, obj, learn_distance_in_seconds);
997 }
998 return obj;
999}
1000
1001grn_rc
1002GRN_PLUGIN_INIT(grn_ctx *ctx)
1003{
1004 return GRN_SUCCESS;
1005}
1006
1007grn_rc
1008GRN_PLUGIN_REGISTER(grn_ctx *ctx)
1009{
1010 grn_expr_var vars[12];
1011
1012 grn_plugin_expr_var_init(ctx, &vars[0], "types", -1);
1013 grn_plugin_expr_var_init(ctx, &vars[1], "table", -1);
1014 grn_plugin_expr_var_init(ctx, &vars[2], "column", -1);
1015 grn_plugin_expr_var_init(ctx, &vars[3], "query", -1);
1016 grn_plugin_expr_var_init(ctx, &vars[4], "sortby", -1);
1017 grn_plugin_expr_var_init(ctx, &vars[5], "output_columns", -1);
1018 grn_plugin_expr_var_init(ctx, &vars[6], "offset", -1);
1019 grn_plugin_expr_var_init(ctx, &vars[7], "limit", -1);
1020 grn_plugin_expr_var_init(ctx, &vars[8], "frequency_threshold", -1);
1021 grn_plugin_expr_var_init(ctx, &vars[9], "conditional_probability_threshold", -1);
1022 grn_plugin_expr_var_init(ctx, &vars[10], "prefix_search", -1);
1023 grn_plugin_expr_var_init(ctx, &vars[11], "similar_search", -1);
1024 grn_plugin_command_create(ctx, "suggest", -1, command_suggest, 12, vars);
1025
1026 grn_proc_create(ctx, CONST_STR_LEN("suggest_preparer"), GRN_PROC_FUNCTION,
1027 func_suggest_preparer, NULL, NULL, 0, NULL);
1028 return ctx->rc;
1029}
1030
1031grn_rc
1032GRN_PLUGIN_FIN(grn_ctx *ctx)
1033{
1034 return GRN_SUCCESS;
1035}
1036