1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2009-2016 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | |
19 | #include "../grn_proc.h" |
20 | #include "../grn_rset.h" |
21 | #include "../grn_ii.h" |
22 | |
23 | #include <groonga/plugin.h> |
24 | |
25 | #include <string.h> |
26 | |
27 | #define DIST(ox,oy) (dists[((lx + 1) * (oy)) + (ox)]) |
28 | |
29 | static uint32_t |
30 | calc_edit_distance(grn_ctx *ctx, char *sx, char *ex, char *sy, char *ey, int flags) |
31 | { |
32 | int d = 0; |
33 | uint32_t cx, lx, cy, ly, *dists; |
34 | char *px, *py; |
35 | for (px = sx, lx = 0; px < ex && (cx = grn_charlen(ctx, px, ex)); px += cx, lx++); |
36 | for (py = sy, ly = 0; py < ey && (cy = grn_charlen(ctx, py, ey)); py += cy, ly++); |
37 | if ((dists = GRN_PLUGIN_MALLOC(ctx, (lx + 1) * (ly + 1) * sizeof(uint32_t)))) { |
38 | uint32_t x, y; |
39 | for (x = 0; x <= lx; x++) { DIST(x, 0) = x; } |
40 | for (y = 0; y <= ly; y++) { DIST(0, y) = y; } |
41 | for (x = 1, px = sx; x <= lx; x++, px += cx) { |
42 | cx = grn_charlen(ctx, px, ex); |
43 | for (y = 1, py = sy; y <= ly; y++, py += cy) { |
44 | cy = grn_charlen(ctx, py, ey); |
45 | if (cx == cy && !memcmp(px, py, cx)) { |
46 | DIST(x, y) = DIST(x - 1, y - 1); |
47 | } else { |
48 | uint32_t a = DIST(x - 1, y) + 1; |
49 | uint32_t b = DIST(x, y - 1) + 1; |
50 | uint32_t c = DIST(x - 1, y - 1) + 1; |
51 | DIST(x, y) = ((a < b) ? ((a < c) ? a : c) : ((b < c) ? b : c)); |
52 | if (flags & GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION && |
53 | x > 1 && y > 1 && cx == cy && |
54 | memcmp(px, py - cy, cx) == 0 && |
55 | memcmp(px - cx, py, cx) == 0) { |
56 | uint32_t t = DIST(x - 2, y - 2) + 1; |
57 | DIST(x, y) = ((DIST(x, y) < t) ? DIST(x, y) : t); |
58 | } |
59 | } |
60 | } |
61 | } |
62 | d = DIST(lx, ly); |
63 | GRN_PLUGIN_FREE(ctx, dists); |
64 | } |
65 | return d; |
66 | } |
67 | |
68 | static grn_obj * |
69 | func_edit_distance(grn_ctx *ctx, int nargs, grn_obj **args, grn_user_data *user_data) |
70 | { |
71 | #define N_REQUIRED_ARGS 2 |
72 | #define MAX_ARGS 3 |
73 | int d = 0; |
74 | int flags = 0; |
75 | grn_obj *obj; |
76 | if (nargs >= N_REQUIRED_ARGS && nargs <= MAX_ARGS) { |
77 | if (nargs == MAX_ARGS && GRN_BOOL_VALUE(args[2])) { |
78 | flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION; |
79 | } |
80 | d = calc_edit_distance(ctx, GRN_TEXT_VALUE(args[0]), GRN_BULK_CURR(args[0]), |
81 | GRN_TEXT_VALUE(args[1]), GRN_BULK_CURR(args[1]), flags); |
82 | } |
83 | if ((obj = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_UINT32, 0))) { |
84 | GRN_UINT32_SET(ctx, obj, d); |
85 | } |
86 | return obj; |
87 | #undef N_REQUIRED_ARGS |
88 | #undef MAX_ARGS |
89 | } |
90 | |
91 | void |
92 | grn_proc_init_edit_distance(grn_ctx *ctx) |
93 | { |
94 | grn_proc_create(ctx, "edit_distance" , -1, GRN_PROC_FUNCTION, |
95 | func_edit_distance, NULL, NULL, 0, NULL); |
96 | } |
97 | |
98 | #define SCORE_HEAP_SIZE 256 |
99 | |
100 | typedef struct { |
101 | grn_id id; |
102 | uint32_t score; |
103 | } score_heap_node; |
104 | |
105 | typedef struct { |
106 | int n_entries; |
107 | int limit; |
108 | score_heap_node *nodes; |
109 | } score_heap; |
110 | |
111 | static inline score_heap * |
112 | score_heap_open(grn_ctx *ctx, int max) |
113 | { |
114 | score_heap *h = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap)); |
115 | if (!h) { return NULL; } |
116 | h->nodes = GRN_PLUGIN_MALLOC(ctx, sizeof(score_heap_node) * max); |
117 | if (!h->nodes) { |
118 | GRN_PLUGIN_FREE(ctx, h); |
119 | return NULL; |
120 | } |
121 | h->n_entries = 0; |
122 | h->limit = max; |
123 | return h; |
124 | } |
125 | |
126 | static inline grn_bool |
127 | score_heap_push(grn_ctx *ctx, score_heap *h, grn_id id, uint32_t score) |
128 | { |
129 | int n, n2; |
130 | score_heap_node node = {id, score}; |
131 | score_heap_node node2; |
132 | if (h->n_entries >= h->limit) { |
133 | int max = h->limit * 2; |
134 | score_heap_node *nodes; |
135 | nodes = GRN_PLUGIN_REALLOC(ctx, h->nodes, sizeof(score_heap) * max); |
136 | if (!nodes) { |
137 | return GRN_FALSE; |
138 | } |
139 | h->limit = max; |
140 | h->nodes = nodes; |
141 | } |
142 | h->nodes[h->n_entries] = node; |
143 | n = h->n_entries++; |
144 | while (n) { |
145 | n2 = (n - 1) >> 1; |
146 | if (h->nodes[n2].score <= h->nodes[n].score) { break; } |
147 | node2 = h->nodes[n]; |
148 | h->nodes[n] = h->nodes[n2]; |
149 | h->nodes[n2] = node2; |
150 | n = n2; |
151 | } |
152 | return GRN_TRUE; |
153 | } |
154 | |
155 | static inline void |
156 | score_heap_close(grn_ctx *ctx, score_heap *h) |
157 | { |
158 | GRN_PLUGIN_FREE(ctx, h->nodes); |
159 | GRN_PLUGIN_FREE(ctx, h); |
160 | } |
161 | |
162 | static grn_rc |
163 | sequential_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *column, grn_obj *query, |
164 | uint32_t max_distance, uint32_t prefix_match_size, |
165 | uint32_t max_expansion, int flags, grn_obj *res, grn_operator op) |
166 | { |
167 | grn_table_cursor *tc; |
168 | char *sx = GRN_TEXT_VALUE(query); |
169 | char *ex = GRN_BULK_CURR(query); |
170 | |
171 | if (op == GRN_OP_AND) { |
172 | tc = grn_table_cursor_open(ctx, res, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID); |
173 | } else { |
174 | tc = grn_table_cursor_open(ctx, table, NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_ID); |
175 | } |
176 | if (tc) { |
177 | grn_id id; |
178 | grn_obj value; |
179 | score_heap *heap; |
180 | int i, n; |
181 | GRN_TEXT_INIT(&value, 0); |
182 | |
183 | heap = score_heap_open(ctx, SCORE_HEAP_SIZE); |
184 | if (!heap) { |
185 | grn_table_cursor_close(ctx, tc); |
186 | grn_obj_unlink(ctx, &value); |
187 | return GRN_NO_MEMORY_AVAILABLE; |
188 | } |
189 | |
190 | while ((id = grn_table_cursor_next(ctx, tc))) { |
191 | unsigned int distance = 0; |
192 | grn_obj *domain; |
193 | grn_id record_id; |
194 | |
195 | if (op == GRN_OP_AND) { |
196 | grn_id *key; |
197 | grn_table_cursor_get_key(ctx, tc, (void **)&key); |
198 | record_id = *key; |
199 | } else { |
200 | record_id = id; |
201 | } |
202 | GRN_BULK_REWIND(&value); |
203 | grn_obj_get_value(ctx, column, record_id, &value); |
204 | domain = grn_ctx_at(ctx, ((&value))->header.domain); |
205 | if ((&(value))->header.type == GRN_VECTOR) { |
206 | n = grn_vector_size(ctx, &value); |
207 | for (i = 0; i < n; i++) { |
208 | unsigned int length; |
209 | const char *vector_value = NULL; |
210 | length = grn_vector_get_element(ctx, &value, i, &vector_value, NULL, NULL); |
211 | |
212 | if (!prefix_match_size || |
213 | (prefix_match_size > 0 && length >= prefix_match_size && |
214 | !memcmp(sx, vector_value, prefix_match_size))) { |
215 | distance = calc_edit_distance(ctx, sx, ex, |
216 | (char *)vector_value, |
217 | (char *)vector_value + length, flags); |
218 | if (distance <= max_distance) { |
219 | score_heap_push(ctx, heap, record_id, distance); |
220 | break; |
221 | } |
222 | } |
223 | } |
224 | } else if ((&(value))->header.type == GRN_UVECTOR && |
225 | grn_obj_is_table(ctx, domain)) { |
226 | n = grn_vector_size(ctx, &value); |
227 | for (i = 0; i < n; i++) { |
228 | grn_id rid; |
229 | char key_name[GRN_TABLE_MAX_KEY_SIZE]; |
230 | int key_length; |
231 | rid = grn_uvector_get_element(ctx, &value, i, NULL); |
232 | key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE); |
233 | |
234 | if (!prefix_match_size || |
235 | (prefix_match_size > 0 && key_length >= prefix_match_size && |
236 | !memcmp(sx, key_name, prefix_match_size))) { |
237 | distance = calc_edit_distance(ctx, sx, ex, |
238 | key_name, key_name + key_length, flags); |
239 | if (distance <= max_distance) { |
240 | score_heap_push(ctx, heap, record_id, distance); |
241 | break; |
242 | } |
243 | } |
244 | } |
245 | } else { |
246 | if (grn_obj_is_reference_column(ctx, column)) { |
247 | grn_id rid; |
248 | char key_name[GRN_TABLE_MAX_KEY_SIZE]; |
249 | int key_length; |
250 | rid = GRN_RECORD_VALUE(&value); |
251 | key_length = grn_table_get_key(ctx, domain, rid, key_name, GRN_TABLE_MAX_KEY_SIZE); |
252 | if (!prefix_match_size || |
253 | (prefix_match_size > 0 && key_length >= prefix_match_size && |
254 | !memcmp(sx, key_name, prefix_match_size))) { |
255 | distance = calc_edit_distance(ctx, sx, ex, |
256 | key_name, key_name + key_length, flags); |
257 | if (distance <= max_distance) { |
258 | score_heap_push(ctx, heap, record_id, distance); |
259 | } |
260 | } |
261 | } else { |
262 | if (!prefix_match_size || |
263 | (prefix_match_size > 0 && GRN_TEXT_LEN(&value) >= prefix_match_size && |
264 | !memcmp(sx, GRN_TEXT_VALUE(&value), prefix_match_size))) { |
265 | distance = calc_edit_distance(ctx, sx, ex, |
266 | GRN_TEXT_VALUE(&value), |
267 | GRN_BULK_CURR(&value), flags); |
268 | if (distance <= max_distance) { |
269 | score_heap_push(ctx, heap, record_id, distance); |
270 | } |
271 | } |
272 | } |
273 | } |
274 | grn_obj_unlink(ctx, domain); |
275 | } |
276 | grn_table_cursor_close(ctx, tc); |
277 | grn_obj_unlink(ctx, &value); |
278 | |
279 | for (i = 0; i < heap->n_entries; i++) { |
280 | if (max_expansion > 0 && i >= max_expansion) { |
281 | break; |
282 | } |
283 | { |
284 | grn_posting posting; |
285 | posting.rid = heap->nodes[i].id; |
286 | posting.sid = 1; |
287 | posting.pos = 0; |
288 | posting.weight = max_distance - heap->nodes[i].score; |
289 | grn_ii_posting_add(ctx, &posting, (grn_hash *)res, op); |
290 | } |
291 | } |
292 | grn_ii_resolve_sel_and(ctx, (grn_hash *)res, op); |
293 | score_heap_close(ctx, heap); |
294 | } |
295 | |
296 | return GRN_SUCCESS; |
297 | } |
298 | |
299 | static grn_rc |
300 | selector_fuzzy_search(grn_ctx *ctx, grn_obj *table, grn_obj *index, |
301 | int nargs, grn_obj **args, |
302 | grn_obj *res, grn_operator op) |
303 | { |
304 | grn_rc rc = GRN_SUCCESS; |
305 | grn_obj *target = NULL; |
306 | grn_obj *obj; |
307 | grn_obj *query; |
308 | uint32_t max_distance = 1; |
309 | uint32_t prefix_length = 0; |
310 | uint32_t prefix_match_size = 0; |
311 | uint32_t max_expansion = 0; |
312 | int flags = 0; |
313 | grn_bool use_sequential_search = GRN_FALSE; |
314 | |
315 | if ((nargs - 1) < 2) { |
316 | GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, |
317 | "fuzzy_search(): wrong number of arguments (%d ...)" , |
318 | nargs - 1); |
319 | rc = ctx->rc; |
320 | goto exit; |
321 | } |
322 | obj = args[1]; |
323 | query = args[2]; |
324 | |
325 | if (nargs == 4) { |
326 | grn_obj *options = args[3]; |
327 | |
328 | switch (options->header.type) { |
329 | case GRN_BULK : |
330 | max_distance = GRN_UINT32_VALUE(options); |
331 | break; |
332 | case GRN_TABLE_HASH_KEY : |
333 | { |
334 | grn_hash_cursor *cursor; |
335 | void *key; |
336 | grn_obj *value; |
337 | int key_size; |
338 | cursor = grn_hash_cursor_open(ctx, (grn_hash *)options, |
339 | NULL, 0, NULL, 0, |
340 | 0, -1, 0); |
341 | if (!cursor) { |
342 | GRN_PLUGIN_ERROR(ctx, GRN_NO_MEMORY_AVAILABLE, |
343 | "fuzzy_search(): couldn't open cursor" ); |
344 | goto exit; |
345 | } |
346 | while (grn_hash_cursor_next(ctx, cursor) != GRN_ID_NIL) { |
347 | grn_hash_cursor_get_key_value(ctx, cursor, &key, &key_size, |
348 | (void **)&value); |
349 | |
350 | if (key_size == 12 && !memcmp(key, "max_distance" , 12)) { |
351 | max_distance = GRN_UINT32_VALUE(value); |
352 | } else if (key_size == 13 && !memcmp(key, "prefix_length" , 13)) { |
353 | prefix_length = GRN_UINT32_VALUE(value); |
354 | } else if (key_size == 13 && !memcmp(key, "max_expansion" , 13)) { |
355 | max_expansion = GRN_UINT32_VALUE(value); |
356 | } else if (key_size == 18 && !memcmp(key, "with_transposition" , 18)) { |
357 | if (GRN_BOOL_VALUE(value)) { |
358 | flags |= GRN_TABLE_FUZZY_SEARCH_WITH_TRANSPOSITION; |
359 | } |
360 | } else { |
361 | GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, |
362 | "invalid option name: <%.*s>" , |
363 | key_size, (char *)key); |
364 | grn_hash_cursor_close(ctx, cursor); |
365 | goto exit; |
366 | } |
367 | } |
368 | grn_hash_cursor_close(ctx, cursor); |
369 | } |
370 | break; |
371 | default : |
372 | GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, |
373 | "fuzzy_search(): " |
374 | "3rd argument must be integer or object literal: <%.*s>" , |
375 | (int)GRN_TEXT_LEN(options), |
376 | GRN_TEXT_VALUE(options)); |
377 | goto exit; |
378 | } |
379 | } |
380 | |
381 | if (index) { |
382 | target = index; |
383 | } else { |
384 | if (obj->header.type == GRN_COLUMN_INDEX) { |
385 | target = obj; |
386 | } else { |
387 | grn_column_index(ctx, obj, GRN_OP_FUZZY, &target, 1, NULL); |
388 | } |
389 | } |
390 | |
391 | if (target) { |
392 | grn_obj *lexicon; |
393 | use_sequential_search = GRN_TRUE; |
394 | lexicon = grn_ctx_at(ctx, target->header.domain); |
395 | if (lexicon) { |
396 | if (lexicon->header.type == GRN_TABLE_PAT_KEY) { |
397 | use_sequential_search = GRN_FALSE; |
398 | } |
399 | grn_obj_unlink(ctx, lexicon); |
400 | } |
401 | } else { |
402 | if (grn_obj_is_key_accessor(ctx, obj) && |
403 | table->header.type == GRN_TABLE_PAT_KEY) { |
404 | target = table; |
405 | } else { |
406 | use_sequential_search = GRN_TRUE; |
407 | } |
408 | } |
409 | |
410 | if (prefix_length) { |
411 | const char *s = GRN_TEXT_VALUE(query); |
412 | const char *e = GRN_BULK_CURR(query); |
413 | const char *p; |
414 | unsigned int cl = 0; |
415 | unsigned int length = 0; |
416 | for (p = s; p < e && (cl = grn_charlen(ctx, p, e)); p += cl) { |
417 | length++; |
418 | if (length > prefix_length) { |
419 | break; |
420 | } |
421 | } |
422 | prefix_match_size = p - s; |
423 | } |
424 | |
425 | if (use_sequential_search) { |
426 | rc = sequential_fuzzy_search(ctx, table, obj, query, |
427 | max_distance, prefix_match_size, |
428 | max_expansion, flags, res, op); |
429 | goto exit; |
430 | } |
431 | |
432 | if (!target) { |
433 | grn_obj inspected; |
434 | GRN_TEXT_INIT(&inspected, 0); |
435 | grn_inspect(ctx, &inspected, target); |
436 | GRN_PLUGIN_ERROR(ctx, GRN_INVALID_ARGUMENT, |
437 | "fuzzy_search(): " |
438 | "column must be COLUMN_INDEX or TABLE_PAT_KEY: <%.*s>" , |
439 | (int)GRN_TEXT_LEN(&inspected), |
440 | GRN_TEXT_VALUE(&inspected)); |
441 | rc = ctx->rc; |
442 | GRN_OBJ_FIN(ctx, &inspected); |
443 | } else { |
444 | grn_search_optarg options = {0}; |
445 | options.mode = GRN_OP_FUZZY; |
446 | options.fuzzy.prefix_match_size = prefix_match_size; |
447 | options.fuzzy.max_distance = max_distance; |
448 | options.fuzzy.max_expansion = max_expansion; |
449 | options.fuzzy.flags = flags; |
450 | grn_obj_search(ctx, target, query, res, op, &options); |
451 | } |
452 | |
453 | exit : |
454 | return rc; |
455 | } |
456 | |
457 | void |
458 | grn_proc_init_fuzzy_search(grn_ctx *ctx) |
459 | { |
460 | grn_obj *selector_proc; |
461 | |
462 | selector_proc = grn_proc_create(ctx, "fuzzy_search" , -1, |
463 | GRN_PROC_FUNCTION, |
464 | NULL, NULL, NULL, 0, NULL); |
465 | grn_proc_set_selector(ctx, selector_proc, selector_fuzzy_search); |
466 | grn_proc_set_selector_operator(ctx, selector_proc, GRN_OP_FUZZY); |
467 | } |
468 | |