1/* -*- c-basic-offset: 2 -*- */
2/*
3 Copyright(C) 2009-2016 Brazil
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License version 2.1 as published by the Free Software Foundation.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18
19#include "../grn_proc.h"
20#include "../grn_rset.h"
21#include "../grn_ii.h"
22
23#include <groonga/plugin.h>
24
25#include <string.h>
26
27#define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)])
28
29static uint32_t
30calc_edit_distance(grn_ctx *ctx, char *sx, char *ex, char *sy, char *ey, int flags)
31{
32 int d = 0;
33 uint32_t cx, lx, cy, ly, *dists;
34 char *px, *py;
35 for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++);
36 for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++);
37 if ((dists = GRN_PLUGIN_MALLOC(ctx, (lx + 1) * (ly + 1) * sizeof(uint32_t)))) {
38 uint32_t x, y;
39 for (x = 0; x <= lx; x++) { DIST(x, 0) = x; }
40 for (y = 0; y <= ly; y++) { DIST(0, y) = y; }
41 for (x = 1, px = sx; x <= lx; x++, px += cx) {
42 cx = grn_charlen(ctx, px, ex);
43 for (y = 1, py = sy; y <= ly; y++, py += cy) {
44 cy = grn_charlen(ctx, py, ey);
45 if (cx == cy && !memcmp(px, py, cx)) {
46 DIST(x, y) = DIST(x - 1, y - 1);
47 } else {
48 uint32_t a = DIST(x - 1, y) + 1;
49 uint32_t b = DIST(x, y - 1) + 1;
50 uint32_t c = DIST(x - 1, y - 1) + 1;
51 DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c));
52 if (flags & GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION &&
53 x > 1 && y > 1 && cx == cy &&
54 memcmp(px, py - cy, cx) == 0 &&
55 memcmp(px - cx, py, cx) == 0) {
56 uint32_t t = DIST(x - 2, y - 2) + 1;
57 DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t);
58 }
59 }
60 }
61 }
62 d = DIST(lx, ly);
63 GRN_PLUGIN_FREE(ctx, dists);
64 }
65 return d;
66}
67
68static grn_obj *
69func_edit_distance(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data)
70{
71#define N_REQUIRED_ARGS 2
72#define MAX_ARGS 3
73 int d = 0;
74 int flags = 0;
75 grn_obj *obj;
76 if (nargs >= N_REQUIRED_ARGS && nargs <= MAX_ARGS) {
77 if (nargs == MAX_ARGS && GRN_BOOL_VALUE(args[2])) {
78 flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
79 }
80 d = calc_edit_distance(ctx, GRN_TEXT_VALUE(args[0]), GRN_BULK_CURR(args[0]),
81 GRN_TEXT_VALUE(args[1]), GRN_BULK_CURR(args[1]), flags);
82 }
83 if ((obj = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, 0))) {
84 GRN_UINT32_SET(ctx, obj, d);
85 }
86 return obj;
87#undef N_REQUIRED_ARGS
88#undef MAX_ARGS
89}
90
91void
92grn_proc_init_edit_distance(grn_ctx *ctx)
93{
94 grn_proc_create(ctx, "edit_distance", -1, GRN_PROC_FUNCTION,
95 func_edit_distance, NULL, NULL, 0, NULL);
96}
97
98#define SCORE_HEAP_SIZE 256
99
100typedef struct {
101 grn_id id;
102 uint32_t score;
103} score_heap_node;
104
105typedef struct {
106 int n_entries;
107 int limit;
108 score_heap_node *nodes;
109} score_heap;
110
111static inline score_heap *
112score_heap_open(grn_ctx *ctx, int max)
113{
114 score_heap *h = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap));
115 if (!h) { return NULL; }
116 h->nodes = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap_node) * max);
117 if (!h->nodes) {
118 GRN_PLUGIN_FREE(ctx, h);
119 return NULL;
120 }
121 h->n_entries = 0;
122 h->limit = max;
123 return h;
124}
125
126static inline grn_bool
127score_heap_push(grn_ctx *ctx, score_heap *h, grn_id id, uint32_t score)
128{
129 int n, n2;
130 score_heap_node node = {id, score};
131 score_heap_node node2;
132 if (h->n_entries >= h->limit) {
133 int max = h->limit * 2;
134 score_heap_node *nodes;
135 nodes = GRN_PLUGIN_REALLOC(ctx, h->nodes, sizeof(score_heap) * max);
136 if (!nodes) {
137 return GRN_FALSE;
138 }
139 h->limit = max;
140 h->nodes = nodes;
141 }
142 h->nodes[h->n_entries] = node;
143 n = h->n_entries++;
144 while (n) {
145 n2 = (n - 1) >> 1;
146 if (h->nodes[n2].score <= h->nodes[n].score) { break; }
147 node2 = h->nodes[n];
148 h->nodes[n] = h->nodes[n2];
149 h->nodes[n2] = node2;
150 n = n2;
151 }
152 return GRN_TRUE;
153}
154
155static inline void
156score_heap_close(grn_ctx *ctx, score_heap *h)
157{
158 GRN_PLUGIN_FREE(ctx, h->nodes);
159 GRN_PLUGIN_FREE(ctx, h);
160}
161
162static grn_rc
163sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *query,
164 uint32_t max_distance, uint32_t prefix_match_size,
165 uint32_t max_expansion, int flags, grn_obj *res, grn_operator op)
166{
167 grn_table_cursor *tc;
168 char *sx = GRN_TEXT_VALUE(query);
169 char *ex = GRN_BULK_CURR(query);
170
171 if (op == GRN_OP_AND) {
172 tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID);
173 } else {
174 tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID);
175 }
176 if (tc) {
177 grn_id id;
178 grn_obj value;
179 score_heap *heap;
180 int i, n;
181 GRN_TEXT_INIT(&value, 0);
182
183 heap = score_heap_open(ctx, SCORE_HEAP_SIZE);
184 if (!heap) {
185 grn_table_cursor_close(ctx, tc);
186 grn_obj_unlink(ctx, &value);
187 return GRN_NO_MEMORY_AVAILABLE;
188 }
189
190 while ((id = grn_table_cursor_next(ctx, tc))) {
191 unsigned int distance = 0;
192 grn_obj *domain;
193 grn_id record_id;
194
195 if (op == GRN_OP_AND) {
196 grn_id *key;
197 grn_table_cursor_get_key(ctx, tc, (void **)&key);
198 record_id = *key;
199 } else {
200 record_id = id;
201 }
202 GRN_BULK_REWIND(&value);
203 grn_obj_get_value(ctx, column, record_id, &value);
204 domain = grn_ctx_at(ctx, ((&value))->header.domain);
205 if ((&(value))->header.type == GRN_VECTOR) {
206 n = grn_vector_size(ctx, &value);
207 for (i = 0; i < n; i++) {
208 unsigned int length;
209 const char *vector_value = NULL;
210 length = grn_vector_get_element(ctx, &value, i, &vector_value, NULL, NULL);
211
212 if (!prefix_match_size ||
213 (prefix_match_size > 0 && length >= prefix_match_size &&
214 !memcmp(sx, vector_value, prefix_match_size))) {
215 distance = calc_edit_distance(ctx, sx, ex,
216 (char *)vector_value,
217 (char *)vector_value + length, flags);
218 if (distance <= max_distance) {
219 score_heap_push(ctx, heap, record_id, distance);
220 break;
221 }
222 }
223 }
224 } else if ((&(value))->header.type == GRN_UVECTOR &&
225 grn_obj_is_table(ctx, domain)) {
226 n = grn_vector_size(ctx, &value);
227 for (i = 0; i < n; i++) {
228 grn_id rid;
229 char key_name[GRN_TABLE_MAX_KEY_SIZE];
230 int key_length;
231 rid = grn_uvector_get_element(ctx, &value, i, NULL);
232 key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
233
234 if (!prefix_match_size ||
235 (prefix_match_size > 0 && key_length >= prefix_match_size &&
236 !memcmp(sx, key_name, prefix_match_size))) {
237 distance = calc_edit_distance(ctx, sx, ex,
238 key_name, key_name + key_length, flags);
239 if (distance <= max_distance) {
240 score_heap_push(ctx, heap, record_id, distance);
241 break;
242 }
243 }
244 }
245 } else {
246 if (grn_obj_is_reference_column(ctx, column)) {
247 grn_id rid;
248 char key_name[GRN_TABLE_MAX_KEY_SIZE];
249 int key_length;
250 rid = GRN_RECORD_VALUE(&value);
251 key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE);
252 if (!prefix_match_size ||
253 (prefix_match_size > 0 && key_length >= prefix_match_size &&
254 !memcmp(sx, key_name, prefix_match_size))) {
255 distance = calc_edit_distance(ctx, sx, ex,
256 key_name, key_name + key_length, flags);
257 if (distance <= max_distance) {
258 score_heap_push(ctx, heap, record_id, distance);
259 }
260 }
261 } else {
262 if (!prefix_match_size ||
263 (prefix_match_size > 0 && GRN_TEXT_LEN(&value) >= prefix_match_size &&
264 !memcmp(sx, GRN_TEXT_VALUE(&value), prefix_match_size))) {
265 distance = calc_edit_distance(ctx, sx, ex,
266 GRN_TEXT_VALUE(&value),
267 GRN_BULK_CURR(&value), flags);
268 if (distance <= max_distance) {
269 score_heap_push(ctx, heap, record_id, distance);
270 }
271 }
272 }
273 }
274 grn_obj_unlink(ctx, domain);
275 }
276 grn_table_cursor_close(ctx, tc);
277 grn_obj_unlink(ctx, &value);
278
279 for (i = 0; i < heap->n_entries; i++) {
280 if (max_expansion > 0 && i >= max_expansion) {
281 break;
282 }
283 {
284 grn_posting posting;
285 posting.rid = heap->nodes[i].id;
286 posting.sid = 1;
287 posting.pos = 0;
288 posting.weight = max_distance - heap->nodes[i].score;
289 grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op);
290 }
291 }
292 grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op);
293 score_heap_close(ctx, heap);
294 }
295
296 return GRN_SUCCESS;
297}
298
299static grn_rc
300selector_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *index,
301 int nargs, grn_obj **args,
302 grn_obj *res, grn_operator op)
303{
304 grn_rc rc = GRN_SUCCESS;
305 grn_obj *target = NULL;
306 grn_obj *obj;
307 grn_obj *query;
308 uint32_t max_distance = 1;
309 uint32_t prefix_length = 0;
310 uint32_t prefix_match_size = 0;
311 uint32_t max_expansion = 0;
312 int flags = 0;
313 grn_bool use_sequential_search = GRN_FALSE;
314
315 if ((nargs - 1) < 2) {
316 GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
317 "fuzzy_search(): wrong number of arguments (%d ...)",
318 nargs - 1);
319 rc = ctx->rc;
320 goto exit;
321 }
322 obj = args[1];
323 query = args[2];
324
325 if (nargs == 4) {
326 grn_obj *options = args[3];
327
328 switch (options->header.type) {
329 case GRN_BULK :
330 max_distance = GRN_UINT32_VALUE(options);
331 break;
332 case GRN_TABLE_HASH_KEY :
333 {
334 grn_hash_cursor *cursor;
335 void *key;
336 grn_obj *value;
337 int key_size;
338 cursor = grn_hash_cursor_open(ctx, (grn_hash *)options,
339 NULL, 0, NULL, 0,
340 0, -1, 0);
341 if (!cursor) {
342 GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE,
343 "fuzzy_search(): couldn't open cursor");
344 goto exit;
345 }
346 while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) {
347 grn_hash_cursor_get_key_value(ctx, cursor, &key, &key_size,
348 (void **)&value);
349
350 if (key_size == 12 && !memcmp(key, "max_distance", 12)) {
351 max_distance = GRN_UINT32_VALUE(value);
352 } else if (key_size == 13 && !memcmp(key, "prefix_length", 13)) {
353 prefix_length = GRN_UINT32_VALUE(value);
354 } else if (key_size == 13 && !memcmp(key, "max_expansion", 13)) {
355 max_expansion = GRN_UINT32_VALUE(value);
356 } else if (key_size == 18 && !memcmp(key, "with_transposition", 18)) {
357 if (GRN_BOOL_VALUE(value)) {
358 flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION;
359 }
360 } else {
361 GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
362 "invalid option name: <%.*s>",
363 key_size, (char *)key);
364 grn_hash_cursor_close(ctx, cursor);
365 goto exit;
366 }
367 }
368 grn_hash_cursor_close(ctx, cursor);
369 }
370 break;
371 default :
372 GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
373 "fuzzy_search(): "
374 "3rd argument must be integer or object literal: <%.*s>",
375 (int)GRN_TEXT_LEN(options),
376 GRN_TEXT_VALUE(options));
377 goto exit;
378 }
379 }
380
381 if (index) {
382 target = index;
383 } else {
384 if (obj->header.type == GRN_COLUMN_INDEX) {
385 target = obj;
386 } else {
387 grn_column_index(ctx, obj, GRN_OP_FUZZY, &target, 1, NULL);
388 }
389 }
390
391 if (target) {
392 grn_obj *lexicon;
393 use_sequential_search = GRN_TRUE;
394 lexicon = grn_ctx_at(ctx, target->header.domain);
395 if (lexicon) {
396 if (lexicon->header.type == GRN_TABLE_PAT_KEY) {
397 use_sequential_search = GRN_FALSE;
398 }
399 grn_obj_unlink(ctx, lexicon);
400 }
401 } else {
402 if (grn_obj_is_key_accessor(ctx, obj) &&
403 table->header.type == GRN_TABLE_PAT_KEY) {
404 target = table;
405 } else {
406 use_sequential_search = GRN_TRUE;
407 }
408 }
409
410 if (prefix_length) {
411 const char *s = GRN_TEXT_VALUE(query);
412 const char *e = GRN_BULK_CURR(query);
413 const char *p;
414 unsigned int cl = 0;
415 unsigned int length = 0;
416 for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) {
417 length++;
418 if (length > prefix_length) {
419 break;
420 }
421 }
422 prefix_match_size = p - s;
423 }
424
425 if (use_sequential_search) {
426 rc = sequential_fuzzy_search(ctx, table, obj, query,
427 max_distance, prefix_match_size,
428 max_expansion, flags, res, op);
429 goto exit;
430 }
431
432 if (!target) {
433 grn_obj inspected;
434 GRN_TEXT_INIT(&inspected, 0);
435 grn_inspect(ctx, &inspected, target);
436 GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT,
437 "fuzzy_search(): "
438 "column must be COLUMN_INDEX or TABLE_PAT_KEY: <%.*s>",
439 (int)GRN_TEXT_LEN(&inspected),
440 GRN_TEXT_VALUE(&inspected));
441 rc = ctx->rc;
442 GRN_OBJ_FIN(ctx, &inspected);
443 } else {
444 grn_search_optarg options = {0};
445 options.mode = GRN_OP_FUZZY;
446 options.fuzzy.prefix_match_size = prefix_match_size;
447 options.fuzzy.max_distance = max_distance;
448 options.fuzzy.max_expansion = max_expansion;
449 options.fuzzy.flags = flags;
450 grn_obj_search(ctx, target, query, res, op, &options);
451 }
452
453exit :
454 return rc;
455}
456
457void
458grn_proc_init_fuzzy_search(grn_ctx *ctx)
459{
460 grn_obj *selector_proc;
461
462 selector_proc = grn_proc_create(ctx, "fuzzy_search", -1,
463 GRN_PROC_FUNCTION,
464 NULL, NULL, NULL, 0, NULL);
465 grn_proc_set_selector(ctx, selector_proc, selector_fuzzy_search);
466 grn_proc_set_selector_operator(ctx, selector_proc, GRN_OP_FUZZY);
467}
468