1 | /* -*- c-basic-offset: 2 -*- */ |
2 | /* |
3 | Copyright(C) 2009-2017 Brazil |
4 | |
5 | This library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License version 2.1 as published by the Free Software Foundation. |
8 | |
9 | This library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with this library; if not, write to the Free Software |
16 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | #include "grn.h" |
19 | #include <stdio.h> |
20 | #include <fcntl.h> |
21 | #include <string.h> |
22 | #include <sys/stat.h> |
23 | |
24 | #ifdef WIN32 |
25 | # include <io.h> |
26 | # include <share.h> |
27 | #endif /* WIN32 */ |
28 | |
29 | #include "grn_ii.h" |
30 | #include "grn_ctx_impl.h" |
31 | #include "grn_token_cursor.h" |
32 | #include "grn_pat.h" |
33 | #include "grn_db.h" |
34 | #include "grn_output.h" |
35 | #include "grn_scorer.h" |
36 | #include "grn_util.h" |
37 | |
38 | #ifdef GRN_WITH_ONIGMO |
39 | # define GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH |
40 | #endif |
41 | |
42 | #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH |
43 | # include "grn_string.h" |
44 | # include <onigmo.h> |
45 | #endif |
46 | |
/* Upper bounds on the number of physical segments of an index. */
#define MAX_PSEG                 0x20000
#define MAX_PSEG_SMALL           0x00200
/*
 * MAX_PSEG_MEDIUM is large enough when all of the following hold:
 *   - there is a single source,
 *   - the source is a fixed size column or the _key of a table,
 *   - the source column is a scalar column,
 *   - the lexicon has no tokenizer.
 */
#define MAX_PSEG_MEDIUM          0x10000

/* Chunk and segment sizes, in bytes. */
#define S_CHUNK                  (1 << GRN_II_W_CHUNK)
#define W_SEGMENT                18
#define S_SEGMENT                (1 << W_SEGMENT)

/* Array elements: S_ARRAY_ELEMENT each, 2^W_ARRAY of them per segment. */
#define W_ARRAY_ELEMENT          3
#define S_ARRAY_ELEMENT          (1 << W_ARRAY_ELEMENT)
#define W_ARRAY                  (W_SEGMENT - W_ARRAY_ELEMENT)
#define ARRAY_MASK_IN_A_SEGMENT  ((1 << W_ARRAY) - 1)

/* Size in bytes of one garbage-list block (see grn_ii_ginfo). */
#define S_GARBAGE                (1<<12)

/* Flag bit and size threshold related to chunk splitting. */
#define CHUNK_SPLIT              0x80000000
#define CHUNK_SPLIT_THRESHOLD    0x60000

#define MAX_N_ELEMENTS           5
70 | |
/*
 * Declares `name`, `name_buffer` and `name_size` in the enclosing scope and
 * fills them with a printable name for `ii`, for use in log messages.
 * An object whose id is GRN_ID_NIL is reported as "(temporary)"; otherwise
 * the name comes from grn_obj_name().  Requires a `ctx` variable in scope.
 * Because it declares variables, it must be used at the start of a block.
 */
#define DEFINE_NAME(ii)                                                 \
  const char *name;                                                     \
  char name_buffer[GRN_TABLE_MAX_KEY_SIZE];                             \
  int name_size;                                                        \
  do {                                                                  \
    if (DB_OBJ(ii)->id != GRN_ID_NIL) {                                 \
      name_size = grn_obj_name(ctx, (grn_obj *)ii,                      \
                               name_buffer, GRN_TABLE_MAX_KEY_SIZE);    \
      name = name_buffer;                                               \
    } else {                                                            \
      name = "(temporary)";                                             \
      name_size = strlen(name);                                         \
    }                                                                   \
  } while (GRN_FALSE)
85 | |
/*
 * A packed position keeps a segment number in the bits above bit 15 and a
 * byte offset divided by four in the low 16 bits.
 *   LSEG(pos)          extracts the segment number.
 *   LPOS(pos)          extracts the byte offset (low 16 bits times four).
 *   SEG2POS(seg, pos)  builds a packed position from a segment number and
 *                      a byte offset.
 */
#define LSEG(pos)         ((pos) >> 16)
#define LPOS(pos)         (((pos) & 0xffff) << 2)
#define SEG2POS(seg,pos)  ((((uint32_t)(seg)) << 16) + (((uint32_t)(pos)) >> 2))
89 | |
/* Fallbacks for platforms whose headers do not define the owner
 * read/write permission bits. */
#if !defined(S_IRUSR)
# define S_IRUSR 0400
#endif /* S_IRUSR */
#if !defined(S_IWUSR)
# define S_IWUSR 0200
#endif /* S_IWUSR */
96 | |
97 | static grn_bool grn_ii_cursor_set_min_enable = GRN_TRUE; |
98 | static double grn_ii_select_too_many_index_match_ratio = -1; |
99 | static double grn_ii_estimate_size_for_query_reduce_ratio = 0.9; |
100 | static grn_bool grn_ii_overlap_token_skip_enable = GRN_FALSE; |
101 | static uint32_t grn_ii_builder_block_threshold_force = 0; |
102 | static uint32_t grn_ii_max_n_segments_small = MAX_PSEG_SMALL; |
103 | static uint32_t grn_ii_max_n_chunks_small = GRN_II_MAX_CHUNK_SMALL; |
104 | |
105 | void |
106 | grn_ii_init_from_env(void) |
107 | { |
108 | { |
109 | char grn_ii_cursor_set_min_enable_env[GRN_ENV_BUFFER_SIZE]; |
110 | grn_getenv("GRN_II_CURSOR_SET_MIN_ENABLE" , |
111 | grn_ii_cursor_set_min_enable_env, |
112 | GRN_ENV_BUFFER_SIZE); |
113 | if (strcmp(grn_ii_cursor_set_min_enable_env, "no" ) == 0) { |
114 | grn_ii_cursor_set_min_enable = GRN_FALSE; |
115 | } else { |
116 | grn_ii_cursor_set_min_enable = GRN_TRUE; |
117 | } |
118 | } |
119 | |
120 | { |
121 | char grn_ii_select_too_many_index_match_ratio_env[GRN_ENV_BUFFER_SIZE]; |
122 | grn_getenv("GRN_II_SELECT_TOO_MANY_INDEX_MATCH_RATIO" , |
123 | grn_ii_select_too_many_index_match_ratio_env, |
124 | GRN_ENV_BUFFER_SIZE); |
125 | if (grn_ii_select_too_many_index_match_ratio_env[0]) { |
126 | grn_ii_select_too_many_index_match_ratio = |
127 | atof(grn_ii_select_too_many_index_match_ratio_env); |
128 | } |
129 | } |
130 | |
131 | { |
132 | char grn_ii_estimate_size_for_query_reduce_ratio_env[GRN_ENV_BUFFER_SIZE]; |
133 | grn_getenv("GRN_II_ESTIMATE_SIZE_FOR_QUERY_REDUCE_RATIO" , |
134 | grn_ii_estimate_size_for_query_reduce_ratio_env, |
135 | GRN_ENV_BUFFER_SIZE); |
136 | if (grn_ii_estimate_size_for_query_reduce_ratio_env[0]) { |
137 | grn_ii_estimate_size_for_query_reduce_ratio = |
138 | atof(grn_ii_estimate_size_for_query_reduce_ratio_env); |
139 | } |
140 | } |
141 | |
142 | { |
143 | char grn_ii_overlap_token_skip_enable_env[GRN_ENV_BUFFER_SIZE]; |
144 | grn_getenv("GRN_II_OVERLAP_TOKEN_SKIP_ENABLE" , |
145 | grn_ii_overlap_token_skip_enable_env, |
146 | GRN_ENV_BUFFER_SIZE); |
147 | if (grn_ii_overlap_token_skip_enable_env[0]) { |
148 | grn_ii_overlap_token_skip_enable = GRN_TRUE; |
149 | } else { |
150 | grn_ii_overlap_token_skip_enable = GRN_FALSE; |
151 | } |
152 | } |
153 | |
154 | { |
155 | char grn_ii_builder_block_threshold_env[GRN_ENV_BUFFER_SIZE]; |
156 | grn_getenv("GRN_II_BUILDER_BLOCK_THRESHOLD" , |
157 | grn_ii_builder_block_threshold_env, |
158 | GRN_ENV_BUFFER_SIZE); |
159 | if (grn_ii_builder_block_threshold_env[0]) { |
160 | grn_ii_builder_block_threshold_force = |
161 | grn_atoui(grn_ii_builder_block_threshold_env, |
162 | grn_ii_builder_block_threshold_env + |
163 | strlen(grn_ii_builder_block_threshold_env), |
164 | NULL); |
165 | } else { |
166 | grn_ii_builder_block_threshold_force = 0; |
167 | } |
168 | } |
169 | |
170 | { |
171 | char grn_ii_max_n_segments_small_env[GRN_ENV_BUFFER_SIZE]; |
172 | grn_getenv("GRN_II_MAX_N_SEGMENTS_SMALL" , |
173 | grn_ii_max_n_segments_small_env, |
174 | GRN_ENV_BUFFER_SIZE); |
175 | if (grn_ii_max_n_segments_small_env[0]) { |
176 | grn_ii_max_n_segments_small = |
177 | grn_atoui(grn_ii_max_n_segments_small_env, |
178 | grn_ii_max_n_segments_small_env + |
179 | strlen(grn_ii_max_n_segments_small_env), |
180 | NULL); |
181 | if (grn_ii_max_n_segments_small > MAX_PSEG) { |
182 | grn_ii_max_n_segments_small = MAX_PSEG; |
183 | } |
184 | } |
185 | } |
186 | |
187 | { |
188 | char grn_ii_max_n_chunks_small_env[GRN_ENV_BUFFER_SIZE]; |
189 | grn_getenv("GRN_II_MAX_N_CHUNKS_SMALL" , |
190 | grn_ii_max_n_chunks_small_env, |
191 | GRN_ENV_BUFFER_SIZE); |
192 | if (grn_ii_max_n_chunks_small_env[0]) { |
193 | grn_ii_max_n_chunks_small = |
194 | grn_atoui(grn_ii_max_n_chunks_small_env, |
195 | grn_ii_max_n_chunks_small_env + |
196 | strlen(grn_ii_max_n_chunks_small_env), |
197 | NULL); |
198 | if (grn_ii_max_n_chunks_small > GRN_II_MAX_CHUNK) { |
199 | grn_ii_max_n_chunks_small = GRN_II_MAX_CHUNK; |
200 | } |
201 | } |
202 | } |
203 | } |
204 | |
205 | void |
206 | grn_ii_cursor_set_min_enable_set(grn_bool enable) |
207 | { |
208 | grn_ii_cursor_set_min_enable = enable; |
209 | } |
210 | |
211 | grn_bool |
212 | grn_ii_cursor_set_min_enable_get(void) |
213 | { |
214 | return grn_ii_cursor_set_min_enable; |
215 | } |
216 | |
217 | /* segment */ |
218 | |
219 | inline static uint32_t |
220 | segment_get(grn_ctx *ctx, grn_ii *ii) |
221 | { |
222 | uint32_t pseg; |
223 | if (ii->header->bgqtail == ((ii->header->bgqhead + 1) & (GRN_II_BGQSIZE - 1))) { |
224 | pseg = ii->header->bgqbody[ii->header->bgqtail]; |
225 | ii->header->bgqtail = (ii->header->bgqtail + 1) & (GRN_II_BGQSIZE - 1); |
226 | } else { |
227 | pseg = ii->header->pnext; |
228 | #ifndef CUT_OFF_COMPATIBILITY |
229 | if (!pseg) { |
230 | int i; |
231 | uint32_t pmax = 0; |
232 | char *used; |
233 | uint32_t max_segment = ii->seg->header->max_segment; |
234 | used = GRN_CALLOC(max_segment); |
235 | if (!used) { return max_segment; } |
236 | for (i = 0; i < GRN_II_MAX_LSEG && i < max_segment; i++) { |
237 | if ((pseg = ii->header->ainfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { |
238 | if (pseg > pmax) { pmax = pseg; } |
239 | used[pseg] = 1; |
240 | } |
241 | if ((pseg = ii->header->binfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { |
242 | if (pseg > pmax) { pmax = pseg; } |
243 | used[pseg] = 1; |
244 | } |
245 | } |
246 | for (pseg = 0; pseg < max_segment && used[pseg]; pseg++) ; |
247 | GRN_FREE(used); |
248 | ii->header->pnext = pmax + 1; |
249 | } else |
250 | #endif /* CUT_OFF_COMPATIBILITY */ |
251 | if (ii->header->pnext < ii->seg->header->max_segment) { |
252 | ii->header->pnext++; |
253 | } |
254 | } |
255 | return pseg; |
256 | } |
257 | |
258 | inline static grn_rc |
259 | segment_get_clear(grn_ctx *ctx, grn_ii *ii, uint32_t *pseg) |
260 | { |
261 | uint32_t seg = segment_get(ctx, ii); |
262 | if (seg < ii->seg->header->max_segment) { |
263 | void *p = NULL; |
264 | GRN_IO_SEG_REF(ii->seg, seg, p); |
265 | if (!p) { return GRN_NO_MEMORY_AVAILABLE; } |
266 | memset(p, 0, S_SEGMENT); |
267 | GRN_IO_SEG_UNREF(ii->seg, seg); |
268 | *pseg = seg; |
269 | return GRN_SUCCESS; |
270 | } else { |
271 | return GRN_NO_MEMORY_AVAILABLE; |
272 | } |
273 | } |
274 | |
275 | inline static grn_rc |
276 | buffer_segment_new(grn_ctx *ctx, grn_ii *ii, uint32_t *segno) |
277 | { |
278 | uint32_t lseg, pseg; |
279 | if (*segno < GRN_II_MAX_LSEG) { |
280 | if (ii->header->binfo[*segno] != GRN_II_PSEG_NOT_ASSIGNED) { |
281 | return GRN_INVALID_ARGUMENT; |
282 | } |
283 | lseg = *segno; |
284 | } else { |
285 | for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) { |
286 | if (ii->header->binfo[lseg] == GRN_II_PSEG_NOT_ASSIGNED) { break; } |
287 | } |
288 | if (lseg == GRN_II_MAX_LSEG) { return GRN_NO_MEMORY_AVAILABLE; } |
289 | *segno = lseg; |
290 | } |
291 | pseg = segment_get(ctx, ii); |
292 | if (pseg < ii->seg->header->max_segment) { |
293 | ii->header->binfo[lseg] = pseg; |
294 | if (lseg >= ii->header->bmax) { ii->header->bmax = lseg + 1; } |
295 | return GRN_SUCCESS; |
296 | } else { |
297 | return GRN_NO_MEMORY_AVAILABLE; |
298 | } |
299 | } |
300 | |
301 | static grn_rc |
302 | buffer_segment_reserve(grn_ctx *ctx, grn_ii *ii, |
303 | uint32_t *lseg0, uint32_t *pseg0, |
304 | uint32_t *lseg1, uint32_t *pseg1) |
305 | { |
306 | uint32_t i = 0; |
307 | for (;; i++) { |
308 | if (i == GRN_II_MAX_LSEG) { |
309 | DEFINE_NAME(ii); |
310 | MERR("[ii][buffer][segment][reserve] " |
311 | "couldn't find a free buffer: <%.*s>: max:<%u>" , |
312 | name_size, name, |
313 | GRN_II_MAX_LSEG); |
314 | return ctx->rc; |
315 | } |
316 | if (ii->header->binfo[i] == GRN_II_PSEG_NOT_ASSIGNED) { break; } |
317 | } |
318 | *lseg0 = i++; |
319 | for (;; i++) { |
320 | if (i == GRN_II_MAX_LSEG) { |
321 | DEFINE_NAME(ii); |
322 | MERR("[ii][buffer][segment][reserve] " |
323 | "couldn't find two free buffers: " |
324 | "<%.*s>: " |
325 | "found:<%u>, max:<%u>" , |
326 | name_size, name, |
327 | *lseg0, GRN_II_MAX_LSEG); |
328 | return ctx->rc; |
329 | } |
330 | if (ii->header->binfo[i] == GRN_II_PSEG_NOT_ASSIGNED) { break; } |
331 | } |
332 | *lseg1 = i; |
333 | if ((*pseg0 = segment_get(ctx, ii)) == ii->seg->header->max_segment) { |
334 | DEFINE_NAME(ii); |
335 | MERR("[ii][buffer][segment][reserve] " |
336 | "couldn't allocate a free segment: <%.*s>: " |
337 | "buffer:<%u>, max:<%u>" , |
338 | name_size, name, |
339 | *lseg0, ii->seg->header->max_segment); |
340 | return ctx->rc; |
341 | } |
342 | if ((*pseg1 = segment_get(ctx, ii)) == ii->seg->header->max_segment) { |
343 | DEFINE_NAME(ii); |
344 | MERR("[ii][buffer][segment][reserve] " |
345 | "couldn't allocate two free segments: " |
346 | "<%.*s>: " |
347 | "found:<%u>, not-found:<%u>, max:<%u>" , |
348 | name_size, name, |
349 | *lseg0, *lseg1, ii->seg->header->max_segment); |
350 | return ctx->rc; |
351 | } |
352 | /* |
353 | { |
354 | uint32_t pseg; |
355 | char *used = GRN_CALLOC(ii->seg->header->max_segment); |
356 | if (!used) { return GRN_NO_MEMORY_AVAILABLE; } |
357 | for (i = 0; i < GRN_II_MAX_LSEG; i++) { |
358 | if ((pseg = ii->header->ainfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { |
359 | used[pseg] = 1; |
360 | } |
361 | if ((pseg = ii->header->binfo[i]) != GRN_II_PSEG_NOT_ASSIGNED) { |
362 | used[pseg] = 1; |
363 | } |
364 | } |
365 | for (pseg = 0;; pseg++) { |
366 | if (pseg == ii->seg->header->max_segment) { |
367 | GRN_FREE(used); |
368 | return GRN_NO_MEMORY_AVAILABLE; |
369 | } |
370 | if (!used[pseg]) { break; } |
371 | } |
372 | *pseg0 = pseg++; |
373 | for (;; pseg++) { |
374 | if (pseg == ii->seg->header->max_segment) { |
375 | GRN_FREE(used); |
376 | return GRN_NO_MEMORY_AVAILABLE; |
377 | } |
378 | if (!used[pseg]) { break; } |
379 | } |
380 | *pseg1 = pseg; |
381 | GRN_FREE(used); |
382 | } |
383 | */ |
384 | return ctx->rc; |
385 | } |
386 | |
/*
 * If logical buffer segment `lseg` currently has a physical segment, that
 * physical segment is appended to the bgq ring at bgqhead and bgqhead
 * advances (modulo GRN_II_BGQSIZE).  The assertion checks that the head
 * did not catch up with the tail.  Uses the `ii` variable of the caller.
 */
#define BGQENQUE(lseg) do {                                               \
  if (ii->header->binfo[lseg] != GRN_II_PSEG_NOT_ASSIGNED) {              \
    ii->header->bgqbody[ii->header->bgqhead] = ii->header->binfo[lseg];   \
    ii->header->bgqhead =                                                 \
      (ii->header->bgqhead + 1) & (GRN_II_BGQSIZE - 1);                   \
    GRN_ASSERT(ii->header->bgqhead != ii->header->bgqtail);               \
  }                                                                       \
} while (0)
394 | |
395 | inline static void |
396 | buffer_segment_update(grn_ii *ii, uint32_t lseg, uint32_t pseg) |
397 | { |
398 | BGQENQUE(lseg); |
399 | // smb_wmb(); |
400 | ii->header->binfo[lseg] = pseg; |
401 | if (lseg >= ii->header->bmax) { ii->header->bmax = lseg + 1; } |
402 | } |
403 | |
404 | inline static void |
405 | buffer_segment_clear(grn_ii *ii, uint32_t lseg) |
406 | { |
407 | BGQENQUE(lseg); |
408 | // smb_wmb(); |
409 | ii->header->binfo[lseg] = GRN_II_PSEG_NOT_ASSIGNED; |
410 | } |
411 | |
412 | /* chunk */ |
413 | |
/*
 * Chunk usage bitmap kept in ii->header->chunks: one bit per chunk, bit
 * (offset & 7) of byte (offset >> 3).
 *
 *   HEADER_CHUNK_AT(ii, offset)   evaluates to 1 when the bit is set, else 0
 *   HEADER_CHUNK_ON(ii, offset)   sets the bit
 *   HEADER_CHUNK_OFF(ii, offset)  clears the bit
 *
 * `offset` is evaluated twice by each macro, so it must not have side
 * effects.
 */
#define HEADER_CHUNK_AT(ii,offset) \
  ((((ii)->header->chunks[((offset) >> 3)]) >> ((offset) & 7)) & 1)

#define HEADER_CHUNK_ON(ii,offset) \
  (((ii)->header->chunks[((offset) >> 3)]) |= (1 << ((offset) & 7)))

#define HEADER_CHUNK_OFF(ii,offset) \
  (((ii)->header->chunks[((offset) >> 3)]) &= ~(1 << ((offset) & 7)))
422 | |
423 | #define N_GARBAGES_TH 1 |
424 | |
425 | #define N_GARBAGES ((S_GARBAGE - (sizeof(uint32_t) * 4))/(sizeof(uint32_t))) |
426 | |
427 | typedef struct { |
428 | uint32_t head; |
429 | uint32_t tail; |
430 | uint32_t nrecs; |
431 | uint32_t next; |
432 | uint32_t recs[N_GARBAGES]; |
433 | } grn_ii_ginfo; |
434 | |
/*
 * Maps `size` bytes of chunk storage through grn_io_win_map().
 * `seg` is a chunk position whose low GRN_II_N_CHUNK_VARIATION bits select a
 * sub-chunk: the high bits become the segment argument, and the low bits,
 * scaled by 2^GRN_II_W_LEAST_CHUNK and added to `pos`, become the offset
 * argument.
 */
#define WIN_MAP(chunk,ctx,iw,seg,pos,size,mode)                          \
  grn_io_win_map(chunk, ctx, iw,                                         \
                 ((seg) >> GRN_II_N_CHUNK_VARIATION),                    \
                 (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1))        \
                  << GRN_II_W_LEAST_CHUNK) + (pos),                      \
                 size, mode)
440 | /* |
441 | static int new_histogram[32]; |
442 | static int free_histogram[32]; |
443 | */ |
444 | static grn_rc |
445 | chunk_new(grn_ctx *ctx, grn_ii *ii, uint32_t *res, uint32_t size) |
446 | { |
447 | uint32_t n_chunks; |
448 | |
449 | n_chunks = ii->chunk->header->max_segment; |
450 | |
451 | /* |
452 | if (size) { |
453 | int m, es = size - 1; |
454 | GRN_BIT_SCAN_REV(es, m); |
455 | m++; |
456 | new_histogram[m]++; |
457 | } |
458 | */ |
459 | if (size > S_CHUNK) { |
460 | int i, j; |
461 | uint32_t n = (size + S_CHUNK - 1) >> GRN_II_W_CHUNK; |
462 | for (i = 0, j = -1; i < n_chunks; i++) { |
463 | if (HEADER_CHUNK_AT(ii, i)) { |
464 | j = i; |
465 | } else { |
466 | if (i == j + n) { |
467 | j++; |
468 | *res = j << GRN_II_N_CHUNK_VARIATION; |
469 | for (; j <= i; j++) { HEADER_CHUNK_ON(ii, j); } |
470 | return GRN_SUCCESS; |
471 | } |
472 | } |
473 | } |
474 | { |
475 | DEFINE_NAME(ii); |
476 | MERR("[ii][chunk][new] index is full: " |
477 | "<%.*s>: " |
478 | "size:<%u>, n-chunks:<%u>" , |
479 | name_size, name, |
480 | size, n_chunks); |
481 | } |
482 | return ctx->rc; |
483 | } else { |
484 | uint32_t *vp; |
485 | int m, aligned_size; |
486 | if (size > (1 << GRN_II_W_LEAST_CHUNK)) { |
487 | int es = size - 1; |
488 | GRN_BIT_SCAN_REV(es, m); |
489 | m++; |
490 | } else { |
491 | m = GRN_II_W_LEAST_CHUNK; |
492 | } |
493 | aligned_size = 1 << (m - GRN_II_W_LEAST_CHUNK); |
494 | if (ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK] > N_GARBAGES_TH) { |
495 | grn_ii_ginfo *ginfo; |
496 | uint32_t *gseg; |
497 | grn_io_win iw, iw_; |
498 | iw_.addr = NULL; |
499 | gseg = &ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; |
500 | while (*gseg != GRN_II_PSEG_NOT_ASSIGNED) { |
501 | ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); |
502 | //GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); |
503 | if (!ginfo) { |
504 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
505 | { |
506 | DEFINE_NAME(ii); |
507 | MERR("[ii][chunk][new] failed to allocate garbage segment: " |
508 | "<%.*s>: " |
509 | "n-garbages:<%u>, size:<%u>, n-chunks:<%u>" , |
510 | name_size, name, |
511 | ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK], |
512 | size, |
513 | n_chunks); |
514 | } |
515 | return ctx->rc; |
516 | } |
517 | if (ginfo->next != GRN_II_PSEG_NOT_ASSIGNED || |
518 | ginfo->nrecs > N_GARBAGES_TH) { |
519 | *res = ginfo->recs[ginfo->tail]; |
520 | if (++ginfo->tail == N_GARBAGES) { ginfo->tail = 0; } |
521 | ginfo->nrecs--; |
522 | ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK]--; |
523 | if (!ginfo->nrecs) { |
524 | HEADER_CHUNK_OFF(ii, *gseg); |
525 | *gseg = ginfo->next; |
526 | } |
527 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
528 | grn_io_win_unmap(&iw); |
529 | return GRN_SUCCESS; |
530 | } |
531 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
532 | iw_ = iw; |
533 | gseg = &ginfo->next; |
534 | } |
535 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
536 | } |
537 | vp = &ii->header->free_chunks[m - GRN_II_W_LEAST_CHUNK]; |
538 | if (*vp == GRN_II_PSEG_NOT_ASSIGNED) { |
539 | int i = 0; |
540 | while (HEADER_CHUNK_AT(ii, i)) { |
541 | if (++i >= n_chunks) { |
542 | DEFINE_NAME(ii); |
543 | MERR("[ii][chunk][new] failed to find a free chunk: " |
544 | "<%.*s>: " |
545 | "index:<%u>, size:<%u>, n-chunks:<%u>" , |
546 | name_size, name, |
547 | m - GRN_II_W_LEAST_CHUNK, |
548 | size, |
549 | n_chunks); |
550 | return ctx->rc; |
551 | } |
552 | } |
553 | HEADER_CHUNK_ON(ii, i); |
554 | *vp = i << GRN_II_N_CHUNK_VARIATION; |
555 | } |
556 | *res = *vp; |
557 | *vp += 1 << (m - GRN_II_W_LEAST_CHUNK); |
558 | if (!(*vp & ((1 << GRN_II_N_CHUNK_VARIATION) - 1))) { |
559 | *vp = GRN_II_PSEG_NOT_ASSIGNED; |
560 | } |
561 | return GRN_SUCCESS; |
562 | } |
563 | } |
564 | |
565 | static grn_rc |
566 | chunk_free(grn_ctx *ctx, grn_ii *ii, |
567 | uint32_t offset, uint32_t dummy, uint32_t size) |
568 | { |
569 | /* |
570 | if (size) { |
571 | int m, es = size - 1; |
572 | GRN_BIT_SCAN_REV(es, m); |
573 | m++; |
574 | free_histogram[m]++; |
575 | } |
576 | */ |
577 | grn_io_win iw, iw_; |
578 | grn_ii_ginfo *ginfo= 0; |
579 | uint32_t seg, m, *gseg; |
580 | seg = offset >> GRN_II_N_CHUNK_VARIATION; |
581 | if (size > S_CHUNK) { |
582 | int n = (size + S_CHUNK - 1) >> GRN_II_W_CHUNK; |
583 | for (; n--; seg++) { HEADER_CHUNK_OFF(ii, seg); } |
584 | return GRN_SUCCESS; |
585 | } |
586 | if (size > (1 << GRN_II_W_LEAST_CHUNK)) { |
587 | int es = size - 1; |
588 | GRN_BIT_SCAN_REV(es, m); |
589 | m++; |
590 | } else { |
591 | m = GRN_II_W_LEAST_CHUNK; |
592 | } |
593 | gseg = &ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; |
594 | iw_.addr = NULL; |
595 | while (*gseg != GRN_II_PSEG_NOT_ASSIGNED) { |
596 | ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); |
597 | // GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); |
598 | if (!ginfo) { |
599 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
600 | return GRN_NO_MEMORY_AVAILABLE; |
601 | } |
602 | if (ginfo->nrecs < N_GARBAGES) { break; } |
603 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
604 | iw_ = iw; |
605 | gseg = &ginfo->next; |
606 | } |
607 | if (*gseg == GRN_II_PSEG_NOT_ASSIGNED) { |
608 | grn_rc rc; |
609 | if ((rc = chunk_new(ctx, ii, gseg, S_GARBAGE))) { |
610 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
611 | return rc; |
612 | } |
613 | ginfo = WIN_MAP(ii->chunk, ctx, &iw, *gseg, 0, S_GARBAGE, grn_io_rdwr); |
614 | /* |
615 | uint32_t i = 0; |
616 | while (HEADER_CHUNK_AT(ii, i)) { |
617 | if (++i >= ii->chunk->header->max_segment) { |
618 | return GRN_NO_MEMORY_AVAILABLE; |
619 | } |
620 | } |
621 | HEADER_CHUNK_ON(ii, i); |
622 | *gseg = i; |
623 | GRN_IO_SEG_MAP2(ii->chunk, *gseg, ginfo); |
624 | */ |
625 | if (!ginfo) { |
626 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
627 | return GRN_NO_MEMORY_AVAILABLE; |
628 | } |
629 | ginfo->head = 0; |
630 | ginfo->tail = 0; |
631 | ginfo->nrecs = 0; |
632 | ginfo->next = GRN_II_PSEG_NOT_ASSIGNED; |
633 | } |
634 | if (iw_.addr) { grn_io_win_unmap(&iw_); } |
635 | ginfo->recs[ginfo->head] = offset; |
636 | if (++ginfo->head == N_GARBAGES) { ginfo->head = 0; } |
637 | ginfo->nrecs++; |
638 | grn_io_win_unmap(&iw); |
639 | ii->header->ngarbages[m - GRN_II_W_LEAST_CHUNK]++; |
640 | return GRN_SUCCESS; |
641 | } |
642 | |
/* Unit size (128) and the mask selecting a position inside a unit. */
#define UNIT_SIZE  0x80
#define UNIT_MASK  (UNIT_SIZE - 1)
645 | |
646 | /* <generated> */ |
/* Packs the eight 1-bit values p[0..7] into 1 byte at rp, p[0] in the most
 * significant bit.  Inputs are not masked.  Returns rp + 1. */
static uint8_t *
pack_1(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)((p[0] << 7) + (p[1] << 6) + (p[2] << 5) + (p[3] << 4) +
                    (p[4] << 3) + (p[5] << 2) + (p[6] << 1) + p[7]);
  return rp + 1;
}
/* Expands 1 byte at dp into eight 1-bit values p[0..7], most significant
 * bit first.  Returns dp + 1. */
static uint8_t *
unpack_1(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] >> 7);
  p[1] = ((dp[0] >> 6) & 0x1);
  p[2] = ((dp[0] >> 5) & 0x1);
  p[3] = ((dp[0] >> 4) & 0x1);
  p[4] = ((dp[0] >> 3) & 0x1);
  p[5] = ((dp[0] >> 2) & 0x1);
  p[6] = ((dp[0] >> 1) & 0x1);
  p[7] = (dp[0] & 0x1);
  return dp + 1;
}
/* Packs the eight 2-bit values p[0..7] into 2 bytes at rp, p[0] in the most
 * significant bits.  Inputs are not masked.  Returns rp + 2. */
static uint8_t *
pack_2(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)((p[0] << 6) + (p[1] << 4) + (p[2] << 2) + p[3]);
  rp[1] = (uint8_t)((p[4] << 6) + (p[5] << 4) + (p[6] << 2) + p[7]);
  return rp + 2;
}
/* Expands 2 bytes at dp into eight 2-bit values p[0..7], most significant
 * bits first.  Returns dp + 2. */
static uint8_t *
unpack_2(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] >> 6);
  p[1] = ((dp[0] >> 4) & 0x3);
  p[2] = ((dp[0] >> 2) & 0x3);
  p[3] = (dp[0] & 0x3);
  p[4] = (dp[1] >> 6);
  p[5] = ((dp[1] >> 4) & 0x3);
  p[6] = ((dp[1] >> 2) & 0x3);
  p[7] = (dp[1] & 0x3);
  return dp + 2;
}
/* Packs the eight 3-bit values p[0..7] into 3 bytes at rp, p[0] in the most
 * significant bits.  Inputs are not masked.  Returns rp + 3. */
static uint8_t *
pack_3(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)((p[0] << 5) + (p[1] << 2) + (p[2] >> 1));
  rp[1] = (uint8_t)((p[2] << 7) + (p[3] << 4) + (p[4] << 1) + (p[5] >> 2));
  rp[2] = (uint8_t)((p[5] << 6) + (p[6] << 3) + p[7]);
  return rp + 3;
}
/* Expands 3 bytes at dp into eight 3-bit values p[0..7], most significant
 * bits first.  Returns dp + 3. */
static uint8_t *
unpack_3(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] >> 5);
  p[1] = ((dp[0] >> 2) & 0x7);
  p[2] = ((dp[0] << 1) & 0x7) + (dp[1] >> 7);
  p[3] = ((dp[1] >> 4) & 0x7);
  p[4] = ((dp[1] >> 1) & 0x7);
  p[5] = ((dp[1] << 2) & 0x7) + (dp[2] >> 6);
  p[6] = ((dp[2] >> 3) & 0x7);
  p[7] = (dp[2] & 0x7);
  return dp + 3;
}
/* Packs the eight 4-bit values p[0..7] into 4 bytes at rp, two values per
 * byte with the even-indexed value in the high nibble.  Inputs are not
 * masked.  Returns rp + 4. */
static uint8_t *
pack_4(uint32_t *p, uint8_t *rp)
{
  int pair;
  for (pair = 0; pair < 4; pair++) {
    rp[pair] = (uint8_t)((p[2 * pair] << 4) + p[2 * pair + 1]);
  }
  return rp + 4;
}
/* Expands 4 bytes at dp into eight 4-bit values p[0..7]; the high nibble of
 * each byte comes first.  Returns dp + 4. */
static uint8_t *
unpack_4(uint32_t *p, uint8_t *dp)
{
  int pair;
  for (pair = 0; pair < 4; pair++) {
    p[2 * pair] = (dp[pair] >> 4);
    p[2 * pair + 1] = (dp[pair] & 0xf);
  }
  return dp + 4;
}
/* Packs the eight 5-bit values p[0..7] into 5 bytes at rp, p[0] in the most
 * significant bits.  Inputs are not masked.  Returns rp + 5. */
static uint8_t *
pack_5(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)((p[0] << 3) + (p[1] >> 2));
  rp[1] = (uint8_t)((p[1] << 6) + (p[2] << 1) + (p[3] >> 4));
  rp[2] = (uint8_t)((p[3] << 4) + (p[4] >> 1));
  rp[3] = (uint8_t)((p[4] << 7) + (p[5] << 2) + (p[6] >> 3));
  rp[4] = (uint8_t)((p[6] << 5) + p[7]);
  return rp + 5;
}
/* Expands 5 bytes at dp into eight 5-bit values p[0..7], most significant
 * bits first.  Returns dp + 5. */
static uint8_t *
unpack_5(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] >> 3);
  p[1] = ((dp[0] << 2) & 0x1f) + (dp[1] >> 6);
  p[2] = ((dp[1] >> 1) & 0x1f);
  p[3] = ((dp[1] << 4) & 0x1f) + (dp[2] >> 4);
  p[4] = ((dp[2] << 1) & 0x1f) + (dp[3] >> 7);
  p[5] = ((dp[3] >> 2) & 0x1f);
  p[6] = ((dp[3] << 3) & 0x1f) + (dp[4] >> 5);
  p[7] = (dp[4] & 0x1f);
  return dp + 5;
}
/* Packs the eight 6-bit values p[0..7] into 6 bytes at rp, p[0] in the most
 * significant bits.  Inputs are not masked.  Returns rp + 6. */
static uint8_t *
pack_6(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)((p[0] << 2) + (p[1] >> 4));
  rp[1] = (uint8_t)((p[1] << 4) + (p[2] >> 2));
  rp[2] = (uint8_t)((p[2] << 6) + p[3]);
  rp[3] = (uint8_t)((p[4] << 2) + (p[5] >> 4));
  rp[4] = (uint8_t)((p[5] << 4) + (p[6] >> 2));
  rp[5] = (uint8_t)((p[6] << 6) + p[7]);
  return rp + 6;
}
/* Expands 6 bytes at dp into eight 6-bit values p[0..7], most significant
 * bits first.  Returns dp + 6. */
static uint8_t *
unpack_6(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] >> 2);
  p[1] = ((dp[0] << 4) & 0x3f) + (dp[1] >> 4);
  p[2] = ((dp[1] << 2) & 0x3f) + (dp[2] >> 6);
  p[3] = (dp[2] & 0x3f);
  p[4] = (dp[3] >> 2);
  p[5] = ((dp[3] << 4) & 0x3f) + (dp[4] >> 4);
  p[6] = ((dp[4] << 2) & 0x3f) + (dp[5] >> 6);
  p[7] = (dp[5] & 0x3f);
  return dp + 6;
}
/* Packs the eight 7-bit values p[0..7] into 7 bytes at rp, p[0] in the most
 * significant bits.  Inputs are not masked.  Returns rp + 7. */
static uint8_t *
pack_7(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)((p[0] << 1) + (p[1] >> 6));
  rp[1] = (uint8_t)((p[1] << 2) + (p[2] >> 5));
  rp[2] = (uint8_t)((p[2] << 3) + (p[3] >> 4));
  rp[3] = (uint8_t)((p[3] << 4) + (p[4] >> 3));
  rp[4] = (uint8_t)((p[4] << 5) + (p[5] >> 2));
  rp[5] = (uint8_t)((p[5] << 6) + (p[6] >> 1));
  rp[6] = (uint8_t)((p[6] << 7) + p[7]);
  return rp + 7;
}
/* Expands 7 bytes at dp into eight 7-bit values p[0..7], most significant
 * bits first.  Returns dp + 7. */
static uint8_t *
unpack_7(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] >> 1);
  p[1] = ((dp[0] << 6) & 0x7f) + (dp[1] >> 2);
  p[2] = ((dp[1] << 5) & 0x7f) + (dp[2] >> 3);
  p[3] = ((dp[2] << 4) & 0x7f) + (dp[3] >> 4);
  p[4] = ((dp[3] << 3) & 0x7f) + (dp[4] >> 5);
  p[5] = ((dp[4] << 2) & 0x7f) + (dp[5] >> 6);
  p[6] = ((dp[5] << 1) & 0x7f) + (dp[6] >> 7);
  p[7] = (dp[6] & 0x7f);
  return dp + 7;
}
/* Stores the low 8 bits of each of p[0..7] into the 8 bytes at rp.
 * Returns rp + 8. */
static uint8_t *
pack_8(uint32_t *p, uint8_t *rp)
{
  int i;
  for (i = 0; i < 8; i++) {
    rp[i] = (uint8_t)p[i];
  }
  return rp + 8;
}
/* Copies the 8 bytes at dp into p[0..7], one byte per value.
 * Returns dp + 8. */
static uint8_t *
unpack_8(uint32_t *p, uint8_t *dp)
{
  int i;
  for (i = 0; i < 8; i++) {
    p[i] = dp[i];
  }
  return dp + 8;
}
/* Packs the eight 9-bit values p[0..7] into 9 bytes at rp, p[0] in the most
 * significant bits.  Inputs are not masked.  Returns rp + 9. */
static uint8_t *
pack_9(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 1);
  rp[1] = (uint8_t)((p[0] << 7) + (p[1] >> 2));
  rp[2] = (uint8_t)((p[1] << 6) + (p[2] >> 3));
  rp[3] = (uint8_t)((p[2] << 5) + (p[3] >> 4));
  rp[4] = (uint8_t)((p[3] << 4) + (p[4] >> 5));
  rp[5] = (uint8_t)((p[4] << 3) + (p[5] >> 6));
  rp[6] = (uint8_t)((p[5] << 2) + (p[6] >> 7));
  rp[7] = (uint8_t)((p[6] << 1) + (p[7] >> 8));
  rp[8] = (uint8_t)p[7];
  return rp + 9;
}
/* Expands 9 bytes at dp into eight 9-bit values p[0..7], most significant
 * bits first.  Returns dp + 9. */
static uint8_t *
unpack_9(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] << 1) + (dp[1] >> 7);
  p[1] = ((dp[1] << 2) & 0x1ff) + (dp[2] >> 6);
  p[2] = ((dp[2] << 3) & 0x1ff) + (dp[3] >> 5);
  p[3] = ((dp[3] << 4) & 0x1ff) + (dp[4] >> 4);
  p[4] = ((dp[4] << 5) & 0x1ff) + (dp[5] >> 3);
  p[5] = ((dp[5] << 6) & 0x1ff) + (dp[6] >> 2);
  p[6] = ((dp[6] << 7) & 0x1ff) + (dp[7] >> 1);
  p[7] = ((dp[7] << 8) & 0x1ff) + dp[8];
  return dp + 9;
}
/* Packs the eight 10-bit values p[0..7] into 10 bytes at rp, p[0] in the
 * most significant bits.  Inputs are not masked.  Returns rp + 10. */
static uint8_t *
pack_10(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 2);
  rp[1] = (uint8_t)((p[0] << 6) + (p[1] >> 4));
  rp[2] = (uint8_t)((p[1] << 4) + (p[2] >> 6));
  rp[3] = (uint8_t)((p[2] << 2) + (p[3] >> 8));
  rp[4] = (uint8_t)p[3];
  rp[5] = (uint8_t)(p[4] >> 2);
  rp[6] = (uint8_t)((p[4] << 6) + (p[5] >> 4));
  rp[7] = (uint8_t)((p[5] << 4) + (p[6] >> 6));
  rp[8] = (uint8_t)((p[6] << 2) + (p[7] >> 8));
  rp[9] = (uint8_t)p[7];
  return rp + 10;
}
/* Expands 10 bytes at dp into eight 10-bit values p[0..7], most significant
 * bits first.  Returns dp + 10. */
static uint8_t *
unpack_10(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] << 2) + (dp[1] >> 6);
  p[1] = ((dp[1] << 4) & 0x3ff) + (dp[2] >> 4);
  p[2] = ((dp[2] << 6) & 0x3ff) + (dp[3] >> 2);
  p[3] = ((dp[3] << 8) & 0x3ff) + dp[4];
  p[4] = (dp[5] << 2) + (dp[6] >> 6);
  p[5] = ((dp[6] << 4) & 0x3ff) + (dp[7] >> 4);
  p[6] = ((dp[7] << 6) & 0x3ff) + (dp[8] >> 2);
  p[7] = ((dp[8] << 8) & 0x3ff) + dp[9];
  return dp + 10;
}
/* Packs the eight 11-bit values p[0..7] into 11 bytes at rp, p[0] in the
 * most significant bits.  Inputs are not masked.  Returns rp + 11. */
static uint8_t *
pack_11(uint32_t *p, uint8_t *rp)
{
  rp[0]  = (uint8_t)(p[0] >> 3);
  rp[1]  = (uint8_t)((p[0] << 5) + (p[1] >> 6));
  rp[2]  = (uint8_t)((p[1] << 2) + (p[2] >> 9));
  rp[3]  = (uint8_t)(p[2] >> 1);
  rp[4]  = (uint8_t)((p[2] << 7) + (p[3] >> 4));
  rp[5]  = (uint8_t)((p[3] << 4) + (p[4] >> 7));
  rp[6]  = (uint8_t)((p[4] << 1) + (p[5] >> 10));
  rp[7]  = (uint8_t)(p[5] >> 2);
  rp[8]  = (uint8_t)((p[5] << 6) + (p[6] >> 5));
  rp[9]  = (uint8_t)((p[6] << 3) + (p[7] >> 8));
  rp[10] = (uint8_t)p[7];
  return rp + 11;
}
/* Expands 11 bytes at dp into eight 11-bit values p[0..7], most significant
 * bits first.  Returns dp + 11. */
static uint8_t *
unpack_11(uint32_t *p, uint8_t *dp)
{
  p[0] = (dp[0] << 3) + (dp[1] >> 5);
  p[1] = ((dp[1] << 6) & 0x7ff) + (dp[2] >> 2);
  p[2] = ((dp[2] << 9) & 0x7ff) + (dp[3] << 1) + (dp[4] >> 7);
  p[3] = ((dp[4] << 4) & 0x7ff) + (dp[5] >> 4);
  p[4] = ((dp[5] << 7) & 0x7ff) + (dp[6] >> 1);
  p[5] = ((dp[6] << 10) & 0x7ff) + (dp[7] << 2) + (dp[8] >> 6);
  p[6] = ((dp[8] << 5) & 0x7ff) + (dp[9] >> 3);
  p[7] = ((dp[9] << 8) & 0x7ff) + dp[10];
  return dp + 11;
}
/* Packs the eight 12-bit values p[0..7] into 12 bytes at rp: every pair of
 * values fills three bytes, first value in the most significant bits.
 * Inputs are not masked.  Returns rp + 12. */
static uint8_t *
pack_12(uint32_t *p, uint8_t *rp)
{
  int pair;
  for (pair = 0; pair < 4; pair++) {
    const uint32_t first = p[2 * pair];
    const uint32_t second = p[2 * pair + 1];
    rp[3 * pair]     = (uint8_t)(first >> 4);
    rp[3 * pair + 1] = (uint8_t)((first << 4) + (second >> 8));
    rp[3 * pair + 2] = (uint8_t)second;
  }
  return rp + 12;
}
/* Expands 12 bytes at dp into eight 12-bit values p[0..7]: every three
 * bytes yield two values, most significant bits first.  Returns dp + 12. */
static uint8_t *
unpack_12(uint32_t *p, uint8_t *dp)
{
  int pair;
  for (pair = 0; pair < 4; pair++) {
    const uint8_t *src = dp + 3 * pair;
    p[2 * pair]     = (src[0] << 4) + (src[1] >> 4);
    p[2 * pair + 1] = ((src[1] << 8) & 0xfff) + src[2];
  }
  return dp + 12;
}
/* Packs the eight 13-bit values p[0..7] into 13 bytes at rp, p[0] in the
 * most significant bits.  Inputs are not masked.  Returns rp + 13. */
static uint8_t *
pack_13(uint32_t *p, uint8_t *rp)
{
  rp[0]  = (uint8_t)(p[0] >> 5);
  rp[1]  = (uint8_t)((p[0] << 3) + (p[1] >> 10));
  rp[2]  = (uint8_t)(p[1] >> 2);
  rp[3]  = (uint8_t)((p[1] << 6) + (p[2] >> 7));
  rp[4]  = (uint8_t)((p[2] << 1) + (p[3] >> 12));
  rp[5]  = (uint8_t)(p[3] >> 4);
  rp[6]  = (uint8_t)((p[3] << 4) + (p[4] >> 9));
  rp[7]  = (uint8_t)(p[4] >> 1);
  rp[8]  = (uint8_t)((p[4] << 7) + (p[5] >> 6));
  rp[9]  = (uint8_t)((p[5] << 2) + (p[6] >> 11));
  rp[10] = (uint8_t)(p[6] >> 3);
  rp[11] = (uint8_t)((p[6] << 5) + (p[7] >> 8));
  rp[12] = (uint8_t)p[7];
  return rp + 13;
}
/*
 * Reads 13 bytes from dp and stores the eight 13-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 13.
 */
static uint8_t *
unpack_13(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x1fff;
  p[0] = (uint32_t)((dp[0] << 5) + (dp[1] >> 3));
  p[1] = ((dp[1] << 10) & mask) + (dp[2] << 2) + (dp[3] >> 6);
  p[2] = ((dp[3] << 7) & mask) + (dp[4] >> 1);
  p[3] = ((dp[4] << 12) & mask) + (dp[5] << 4) + (dp[6] >> 4);
  p[4] = ((dp[6] << 9) & mask) + (dp[7] << 1) + (dp[8] >> 7);
  p[5] = ((dp[8] << 6) & mask) + (dp[9] >> 2);
  p[6] = ((dp[9] << 11) & mask) + (dp[10] << 3) + (dp[11] >> 5);
  p[7] = ((dp[11] << 8) & mask) + dp[12];
  return dp + 13;
}
/*
 * Packs eight 14-bit values from p into 14 bytes at rp, most significant
 * bit first.  Four values fill exactly seven bytes, so the same step runs
 * twice.  Values are not masked; each is expected to fit in 14 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_14(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 2; group++, p += 4, rp += 7) {
    rp[0] = (uint8_t)(p[0] >> 6);
    rp[1] = (uint8_t)((p[0] << 2) + (p[1] >> 12));
    rp[2] = (uint8_t)(p[1] >> 4);
    rp[3] = (uint8_t)((p[1] << 4) + (p[2] >> 10));
    rp[4] = (uint8_t)(p[2] >> 2);
    rp[5] = (uint8_t)((p[2] << 6) + (p[3] >> 8));
    rp[6] = (uint8_t)p[3];
  }
  return rp;
}
/*
 * Reads 14 bytes from dp and stores the eight 14-bit values they hold
 * (most significant bit first) into p.  Seven bytes yield four values, so
 * the same step runs twice.  Returns dp advanced by 14.
 */
static uint8_t *
unpack_14(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x3fff;
  int group;
  for (group = 0; group < 2; group++, p += 4, dp += 7) {
    p[0] = (uint32_t)((dp[0] << 6) + (dp[1] >> 2));
    p[1] = ((dp[1] << 12) & mask) + (dp[2] << 4) + (dp[3] >> 4);
    p[2] = ((dp[3] << 10) & mask) + (dp[4] << 2) + (dp[5] >> 6);
    p[3] = ((dp[5] << 8) & mask) + dp[6];
  }
  return dp;
}
/*
 * Packs eight 15-bit values from p into 15 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 15 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_15(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 7);
  rp[1] = (uint8_t)((p[0] << 1) + (p[1] >> 14));
  rp[2] = (uint8_t)(p[1] >> 6);
  rp[3] = (uint8_t)((p[1] << 2) + (p[2] >> 13));
  rp[4] = (uint8_t)(p[2] >> 5);
  rp[5] = (uint8_t)((p[2] << 3) + (p[3] >> 12));
  rp[6] = (uint8_t)(p[3] >> 4);
  rp[7] = (uint8_t)((p[3] << 4) + (p[4] >> 11));
  rp[8] = (uint8_t)(p[4] >> 3);
  rp[9] = (uint8_t)((p[4] << 5) + (p[5] >> 10));
  rp[10] = (uint8_t)(p[5] >> 2);
  rp[11] = (uint8_t)((p[5] << 6) + (p[6] >> 9));
  rp[12] = (uint8_t)(p[6] >> 1);
  rp[13] = (uint8_t)((p[6] << 7) + (p[7] >> 8));
  rp[14] = (uint8_t)p[7];
  return rp + 15;
}
/*
 * Reads 15 bytes from dp and stores the eight 15-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 15.
 */
static uint8_t *
unpack_15(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x7fff;
  p[0] = (uint32_t)((dp[0] << 7) + (dp[1] >> 1));
  p[1] = ((dp[1] << 14) & mask) + (dp[2] << 6) + (dp[3] >> 2);
  p[2] = ((dp[3] << 13) & mask) + (dp[4] << 5) + (dp[5] >> 3);
  p[3] = ((dp[5] << 12) & mask) + (dp[6] << 4) + (dp[7] >> 4);
  p[4] = ((dp[7] << 11) & mask) + (dp[8] << 3) + (dp[9] >> 5);
  p[5] = ((dp[9] << 10) & mask) + (dp[10] << 2) + (dp[11] >> 6);
  p[6] = ((dp[11] << 9) & mask) + (dp[12] << 1) + (dp[13] >> 7);
  p[7] = ((dp[13] << 8) & mask) + dp[14];
  return dp + 15;
}
/*
 * Writes the low 16 bits of each of the eight values at p to rp as two
 * bytes, high byte first.  Returns the position just past the 16 bytes
 * written.
 */
static uint8_t *
pack_16(uint32_t *p, uint8_t *rp)
{
  int n;
  for (n = 0; n < 8; n++, rp += 2) {
    rp[0] = (uint8_t)(p[n] >> 8);
    rp[1] = (uint8_t)p[n];
  }
  return rp;
}
/*
 * Reads 16 bytes from dp as eight big-endian 16-bit values and stores
 * them into p.  Returns dp advanced by 16.
 */
static uint8_t *
unpack_16(uint32_t *p, uint8_t *dp)
{
  int n;
  for (n = 0; n < 8; n++, dp += 2) {
    p[n] = (uint32_t)((dp[0] << 8) + dp[1]);
  }
  return dp;
}
/*
 * Packs eight 17-bit values from p into 17 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 17 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_17(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 9);
  rp[1] = (uint8_t)(p[0] >> 1);
  rp[2] = (uint8_t)((p[0] << 7) + (p[1] >> 10));
  rp[3] = (uint8_t)(p[1] >> 2);
  rp[4] = (uint8_t)((p[1] << 6) + (p[2] >> 11));
  rp[5] = (uint8_t)(p[2] >> 3);
  rp[6] = (uint8_t)((p[2] << 5) + (p[3] >> 12));
  rp[7] = (uint8_t)(p[3] >> 4);
  rp[8] = (uint8_t)((p[3] << 4) + (p[4] >> 13));
  rp[9] = (uint8_t)(p[4] >> 5);
  rp[10] = (uint8_t)((p[4] << 3) + (p[5] >> 14));
  rp[11] = (uint8_t)(p[5] >> 6);
  rp[12] = (uint8_t)((p[5] << 2) + (p[6] >> 15));
  rp[13] = (uint8_t)(p[6] >> 7);
  rp[14] = (uint8_t)((p[6] << 1) + (p[7] >> 16));
  rp[15] = (uint8_t)(p[7] >> 8);
  rp[16] = (uint8_t)p[7];
  return rp + 17;
}
/*
 * Reads 17 bytes from dp and stores the eight 17-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 17.
 */
static uint8_t *
unpack_17(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x1ffff;
  p[0] = (uint32_t)((dp[0] << 9) + (dp[1] << 1) + (dp[2] >> 7));
  p[1] = ((dp[2] << 10) & mask) + (dp[3] << 2) + (dp[4] >> 6);
  p[2] = ((dp[4] << 11) & mask) + (dp[5] << 3) + (dp[6] >> 5);
  p[3] = ((dp[6] << 12) & mask) + (dp[7] << 4) + (dp[8] >> 4);
  p[4] = ((dp[8] << 13) & mask) + (dp[9] << 5) + (dp[10] >> 3);
  p[5] = ((dp[10] << 14) & mask) + (dp[11] << 6) + (dp[12] >> 2);
  p[6] = ((dp[12] << 15) & mask) + (dp[13] << 7) + (dp[14] >> 1);
  p[7] = ((dp[14] << 16) & mask) + (dp[15] << 8) + dp[16];
  return dp + 17;
}
/*
 * Packs eight 18-bit values from p into 18 bytes at rp, most significant
 * bit first.  Four values fill exactly nine bytes, so the same step runs
 * twice.  Values are not masked; each is expected to fit in 18 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_18(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 2; group++, p += 4, rp += 9) {
    rp[0] = (uint8_t)(p[0] >> 10);
    rp[1] = (uint8_t)(p[0] >> 2);
    rp[2] = (uint8_t)((p[0] << 6) + (p[1] >> 12));
    rp[3] = (uint8_t)(p[1] >> 4);
    rp[4] = (uint8_t)((p[1] << 4) + (p[2] >> 14));
    rp[5] = (uint8_t)(p[2] >> 6);
    rp[6] = (uint8_t)((p[2] << 2) + (p[3] >> 16));
    rp[7] = (uint8_t)(p[3] >> 8);
    rp[8] = (uint8_t)p[3];
  }
  return rp;
}
/*
 * Reads 18 bytes from dp and stores the eight 18-bit values they hold
 * (most significant bit first) into p.  Nine bytes yield four values, so
 * the same step runs twice.  Returns dp advanced by 18.
 */
static uint8_t *
unpack_18(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x3ffff;
  int group;
  for (group = 0; group < 2; group++, p += 4, dp += 9) {
    p[0] = (uint32_t)((dp[0] << 10) + (dp[1] << 2) + (dp[2] >> 6));
    p[1] = ((dp[2] << 12) & mask) + (dp[3] << 4) + (dp[4] >> 4);
    p[2] = ((dp[4] << 14) & mask) + (dp[5] << 6) + (dp[6] >> 2);
    p[3] = ((dp[6] << 16) & mask) + (dp[7] << 8) + dp[8];
  }
  return dp;
}
/*
 * Packs eight 19-bit values from p into 19 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 19 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_19(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 11);
  rp[1] = (uint8_t)(p[0] >> 3);
  rp[2] = (uint8_t)((p[0] << 5) + (p[1] >> 14));
  rp[3] = (uint8_t)(p[1] >> 6);
  rp[4] = (uint8_t)((p[1] << 2) + (p[2] >> 17));
  rp[5] = (uint8_t)(p[2] >> 9);
  rp[6] = (uint8_t)(p[2] >> 1);
  rp[7] = (uint8_t)((p[2] << 7) + (p[3] >> 12));
  rp[8] = (uint8_t)(p[3] >> 4);
  rp[9] = (uint8_t)((p[3] << 4) + (p[4] >> 15));
  rp[10] = (uint8_t)(p[4] >> 7);
  rp[11] = (uint8_t)((p[4] << 1) + (p[5] >> 18));
  rp[12] = (uint8_t)(p[5] >> 10);
  rp[13] = (uint8_t)(p[5] >> 2);
  rp[14] = (uint8_t)((p[5] << 6) + (p[6] >> 13));
  rp[15] = (uint8_t)(p[6] >> 5);
  rp[16] = (uint8_t)((p[6] << 3) + (p[7] >> 16));
  rp[17] = (uint8_t)(p[7] >> 8);
  rp[18] = (uint8_t)p[7];
  return rp + 19;
}
/*
 * Reads 19 bytes from dp and stores the eight 19-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 19.
 */
static uint8_t *
unpack_19(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x7ffff;
  p[0] = (uint32_t)((dp[0] << 11) + (dp[1] << 3) + (dp[2] >> 5));
  p[1] = ((dp[2] << 14) & mask) + (dp[3] << 6) + (dp[4] >> 2);
  p[2] = ((dp[4] << 17) & mask) + (dp[5] << 9) + (dp[6] << 1) + (dp[7] >> 7);
  p[3] = ((dp[7] << 12) & mask) + (dp[8] << 4) + (dp[9] >> 4);
  p[4] = ((dp[9] << 15) & mask) + (dp[10] << 7) + (dp[11] >> 1);
  p[5] = ((dp[11] << 18) & mask) + (dp[12] << 10) + (dp[13] << 2) +
         (dp[14] >> 6);
  p[6] = ((dp[14] << 13) & mask) + (dp[15] << 5) + (dp[16] >> 3);
  p[7] = ((dp[16] << 16) & mask) + (dp[17] << 8) + dp[18];
  return dp + 19;
}
/*
 * Packs eight 20-bit values from p into 20 bytes at rp, most significant
 * bit first.  Two values fill exactly five bytes, so the same step runs
 * four times.  Values are not masked; each is expected to fit in 20 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_20(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 4; group++, p += 2, rp += 5) {
    rp[0] = (uint8_t)(p[0] >> 12);
    rp[1] = (uint8_t)(p[0] >> 4);
    rp[2] = (uint8_t)((p[0] << 4) + (p[1] >> 16));
    rp[3] = (uint8_t)(p[1] >> 8);
    rp[4] = (uint8_t)p[1];
  }
  return rp;
}
/*
 * Reads 20 bytes from dp and stores the eight 20-bit values they hold
 * (most significant bit first) into p.  Five bytes yield two values, so
 * the same step runs four times.  Returns dp advanced by 20.
 */
static uint8_t *
unpack_20(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0xfffff;
  int group;
  for (group = 0; group < 4; group++, p += 2, dp += 5) {
    p[0] = (uint32_t)((dp[0] << 12) + (dp[1] << 4) + (dp[2] >> 4));
    p[1] = ((dp[2] << 16) & mask) + (dp[3] << 8) + dp[4];
  }
  return dp;
}
/*
 * Packs eight 21-bit values from p into 21 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 21 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_21(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 13);
  rp[1] = (uint8_t)(p[0] >> 5);
  rp[2] = (uint8_t)((p[0] << 3) + (p[1] >> 18));
  rp[3] = (uint8_t)(p[1] >> 10);
  rp[4] = (uint8_t)(p[1] >> 2);
  rp[5] = (uint8_t)((p[1] << 6) + (p[2] >> 15));
  rp[6] = (uint8_t)(p[2] >> 7);
  rp[7] = (uint8_t)((p[2] << 1) + (p[3] >> 20));
  rp[8] = (uint8_t)(p[3] >> 12);
  rp[9] = (uint8_t)(p[3] >> 4);
  rp[10] = (uint8_t)((p[3] << 4) + (p[4] >> 17));
  rp[11] = (uint8_t)(p[4] >> 9);
  rp[12] = (uint8_t)(p[4] >> 1);
  rp[13] = (uint8_t)((p[4] << 7) + (p[5] >> 14));
  rp[14] = (uint8_t)(p[5] >> 6);
  rp[15] = (uint8_t)((p[5] << 2) + (p[6] >> 19));
  rp[16] = (uint8_t)(p[6] >> 11);
  rp[17] = (uint8_t)(p[6] >> 3);
  rp[18] = (uint8_t)((p[6] << 5) + (p[7] >> 16));
  rp[19] = (uint8_t)(p[7] >> 8);
  rp[20] = (uint8_t)p[7];
  return rp + 21;
}
/*
 * Reads 21 bytes from dp and stores the eight 21-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 21.
 */
static uint8_t *
unpack_21(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x1fffff;
  p[0] = (uint32_t)((dp[0] << 13) + (dp[1] << 5) + (dp[2] >> 3));
  p[1] = ((dp[2] << 18) & mask) + (dp[3] << 10) + (dp[4] << 2) +
         (dp[5] >> 6);
  p[2] = ((dp[5] << 15) & mask) + (dp[6] << 7) + (dp[7] >> 1);
  p[3] = ((dp[7] << 20) & mask) + (dp[8] << 12) + (dp[9] << 4) +
         (dp[10] >> 4);
  p[4] = ((dp[10] << 17) & mask) + (dp[11] << 9) + (dp[12] << 1) +
         (dp[13] >> 7);
  p[5] = ((dp[13] << 14) & mask) + (dp[14] << 6) + (dp[15] >> 2);
  p[6] = ((dp[15] << 19) & mask) + (dp[16] << 11) + (dp[17] << 3) +
         (dp[18] >> 5);
  p[7] = ((dp[18] << 16) & mask) + (dp[19] << 8) + dp[20];
  return dp + 21;
}
/*
 * Packs eight 22-bit values from p into 22 bytes at rp, most significant
 * bit first.  Four values fill exactly eleven bytes, so the same step
 * runs twice.  Values are not masked; each is expected to fit in 22 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_22(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 2; group++, p += 4, rp += 11) {
    rp[0] = (uint8_t)(p[0] >> 14);
    rp[1] = (uint8_t)(p[0] >> 6);
    rp[2] = (uint8_t)((p[0] << 2) + (p[1] >> 20));
    rp[3] = (uint8_t)(p[1] >> 12);
    rp[4] = (uint8_t)(p[1] >> 4);
    rp[5] = (uint8_t)((p[1] << 4) + (p[2] >> 18));
    rp[6] = (uint8_t)(p[2] >> 10);
    rp[7] = (uint8_t)(p[2] >> 2);
    rp[8] = (uint8_t)((p[2] << 6) + (p[3] >> 16));
    rp[9] = (uint8_t)(p[3] >> 8);
    rp[10] = (uint8_t)p[3];
  }
  return rp;
}
/*
 * Reads 22 bytes from dp and stores the eight 22-bit values they hold
 * (most significant bit first) into p.  Eleven bytes yield four values,
 * so the same step runs twice.  Returns dp advanced by 22.
 */
static uint8_t *
unpack_22(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x3fffff;
  int group;
  for (group = 0; group < 2; group++, p += 4, dp += 11) {
    p[0] = (uint32_t)((dp[0] << 14) + (dp[1] << 6) + (dp[2] >> 2));
    p[1] = ((dp[2] << 20) & mask) + (dp[3] << 12) + (dp[4] << 4) +
           (dp[5] >> 4);
    p[2] = ((dp[5] << 18) & mask) + (dp[6] << 10) + (dp[7] << 2) +
           (dp[8] >> 6);
    p[3] = ((dp[8] << 16) & mask) + (dp[9] << 8) + dp[10];
  }
  return dp;
}
/*
 * Packs eight 23-bit values from p into 23 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 23 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_23(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 15);
  rp[1] = (uint8_t)(p[0] >> 7);
  rp[2] = (uint8_t)((p[0] << 1) + (p[1] >> 22));
  rp[3] = (uint8_t)(p[1] >> 14);
  rp[4] = (uint8_t)(p[1] >> 6);
  rp[5] = (uint8_t)((p[1] << 2) + (p[2] >> 21));
  rp[6] = (uint8_t)(p[2] >> 13);
  rp[7] = (uint8_t)(p[2] >> 5);
  rp[8] = (uint8_t)((p[2] << 3) + (p[3] >> 20));
  rp[9] = (uint8_t)(p[3] >> 12);
  rp[10] = (uint8_t)(p[3] >> 4);
  rp[11] = (uint8_t)((p[3] << 4) + (p[4] >> 19));
  rp[12] = (uint8_t)(p[4] >> 11);
  rp[13] = (uint8_t)(p[4] >> 3);
  rp[14] = (uint8_t)((p[4] << 5) + (p[5] >> 18));
  rp[15] = (uint8_t)(p[5] >> 10);
  rp[16] = (uint8_t)(p[5] >> 2);
  rp[17] = (uint8_t)((p[5] << 6) + (p[6] >> 17));
  rp[18] = (uint8_t)(p[6] >> 9);
  rp[19] = (uint8_t)(p[6] >> 1);
  rp[20] = (uint8_t)((p[6] << 7) + (p[7] >> 16));
  rp[21] = (uint8_t)(p[7] >> 8);
  rp[22] = (uint8_t)p[7];
  return rp + 23;
}
/*
 * Reads 23 bytes from dp and stores the eight 23-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 23.
 */
static uint8_t *
unpack_23(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x7fffff;
  p[0] = (uint32_t)((dp[0] << 15) + (dp[1] << 7) + (dp[2] >> 1));
  p[1] = ((dp[2] << 22) & mask) + (dp[3] << 14) + (dp[4] << 6) +
         (dp[5] >> 2);
  p[2] = ((dp[5] << 21) & mask) + (dp[6] << 13) + (dp[7] << 5) +
         (dp[8] >> 3);
  p[3] = ((dp[8] << 20) & mask) + (dp[9] << 12) + (dp[10] << 4) +
         (dp[11] >> 4);
  p[4] = ((dp[11] << 19) & mask) + (dp[12] << 11) + (dp[13] << 3) +
         (dp[14] >> 5);
  p[5] = ((dp[14] << 18) & mask) + (dp[15] << 10) + (dp[16] << 2) +
         (dp[17] >> 6);
  p[6] = ((dp[17] << 17) & mask) + (dp[18] << 9) + (dp[19] << 1) +
         (dp[20] >> 7);
  p[7] = ((dp[20] << 16) & mask) + (dp[21] << 8) + dp[22];
  return dp + 23;
}
/*
 * Writes the low 24 bits of each of the eight values at p to rp as three
 * bytes, highest byte first.  Returns the position just past the 24 bytes
 * written.
 */
static uint8_t *
pack_24(uint32_t *p, uint8_t *rp)
{
  int n;
  for (n = 0; n < 8; n++, rp += 3) {
    rp[0] = (uint8_t)(p[n] >> 16);
    rp[1] = (uint8_t)(p[n] >> 8);
    rp[2] = (uint8_t)p[n];
  }
  return rp;
}
/*
 * Reads 24 bytes from dp as eight big-endian 24-bit values and stores
 * them into p.  Returns dp advanced by 24.
 */
static uint8_t *
unpack_24(uint32_t *p, uint8_t *dp)
{
  int n;
  for (n = 0; n < 8; n++, dp += 3) {
    p[n] = (uint32_t)((dp[0] << 16) + (dp[1] << 8) + dp[2]);
  }
  return dp;
}
/*
 * Packs eight 25-bit values from p into 25 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 25 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_25(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 17);
  rp[1] = (uint8_t)(p[0] >> 9);
  rp[2] = (uint8_t)(p[0] >> 1);
  rp[3] = (uint8_t)((p[0] << 7) + (p[1] >> 18));
  rp[4] = (uint8_t)(p[1] >> 10);
  rp[5] = (uint8_t)(p[1] >> 2);
  rp[6] = (uint8_t)((p[1] << 6) + (p[2] >> 19));
  rp[7] = (uint8_t)(p[2] >> 11);
  rp[8] = (uint8_t)(p[2] >> 3);
  rp[9] = (uint8_t)((p[2] << 5) + (p[3] >> 20));
  rp[10] = (uint8_t)(p[3] >> 12);
  rp[11] = (uint8_t)(p[3] >> 4);
  rp[12] = (uint8_t)((p[3] << 4) + (p[4] >> 21));
  rp[13] = (uint8_t)(p[4] >> 13);
  rp[14] = (uint8_t)(p[4] >> 5);
  rp[15] = (uint8_t)((p[4] << 3) + (p[5] >> 22));
  rp[16] = (uint8_t)(p[5] >> 14);
  rp[17] = (uint8_t)(p[5] >> 6);
  rp[18] = (uint8_t)((p[5] << 2) + (p[6] >> 23));
  rp[19] = (uint8_t)(p[6] >> 15);
  rp[20] = (uint8_t)(p[6] >> 7);
  rp[21] = (uint8_t)((p[6] << 1) + (p[7] >> 24));
  rp[22] = (uint8_t)(p[7] >> 16);
  rp[23] = (uint8_t)(p[7] >> 8);
  rp[24] = (uint8_t)p[7];
  return rp + 25;
}
/*
 * Reads 25 bytes from dp and stores the eight 25-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 25.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and shifting a byte >= 0x80 left by 24 would
 * overflow that int, which is undefined behavior.
 */
static uint8_t *
unpack_25(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x1ffffff;
  p[0] = ((uint32_t)dp[0] << 17) + ((uint32_t)dp[1] << 9) +
         ((uint32_t)dp[2] << 1) + (uint32_t)(dp[3] >> 7);
  p[1] = (((uint32_t)dp[3] << 18) & mask) + ((uint32_t)dp[4] << 10) +
         ((uint32_t)dp[5] << 2) + (uint32_t)(dp[6] >> 6);
  p[2] = (((uint32_t)dp[6] << 19) & mask) + ((uint32_t)dp[7] << 11) +
         ((uint32_t)dp[8] << 3) + (uint32_t)(dp[9] >> 5);
  p[3] = (((uint32_t)dp[9] << 20) & mask) + ((uint32_t)dp[10] << 12) +
         ((uint32_t)dp[11] << 4) + (uint32_t)(dp[12] >> 4);
  p[4] = (((uint32_t)dp[12] << 21) & mask) + ((uint32_t)dp[13] << 13) +
         ((uint32_t)dp[14] << 5) + (uint32_t)(dp[15] >> 3);
  p[5] = (((uint32_t)dp[15] << 22) & mask) + ((uint32_t)dp[16] << 14) +
         ((uint32_t)dp[17] << 6) + (uint32_t)(dp[18] >> 2);
  p[6] = (((uint32_t)dp[18] << 23) & mask) + ((uint32_t)dp[19] << 15) +
         ((uint32_t)dp[20] << 7) + (uint32_t)(dp[21] >> 1);
  p[7] = (((uint32_t)dp[21] << 24) & mask) + ((uint32_t)dp[22] << 16) +
         ((uint32_t)dp[23] << 8) + (uint32_t)dp[24];
  return dp + 25;
}
/*
 * Packs eight 26-bit values from p into 26 bytes at rp, most significant
 * bit first.  Four values fill exactly thirteen bytes, so the same step
 * runs twice.  Values are not masked; each is expected to fit in 26 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_26(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 2; group++, p += 4, rp += 13) {
    rp[0] = (uint8_t)(p[0] >> 18);
    rp[1] = (uint8_t)(p[0] >> 10);
    rp[2] = (uint8_t)(p[0] >> 2);
    rp[3] = (uint8_t)((p[0] << 6) + (p[1] >> 20));
    rp[4] = (uint8_t)(p[1] >> 12);
    rp[5] = (uint8_t)(p[1] >> 4);
    rp[6] = (uint8_t)((p[1] << 4) + (p[2] >> 22));
    rp[7] = (uint8_t)(p[2] >> 14);
    rp[8] = (uint8_t)(p[2] >> 6);
    rp[9] = (uint8_t)((p[2] << 2) + (p[3] >> 24));
    rp[10] = (uint8_t)(p[3] >> 16);
    rp[11] = (uint8_t)(p[3] >> 8);
    rp[12] = (uint8_t)p[3];
  }
  return rp;
}
/*
 * Reads 26 bytes from dp and stores the eight 26-bit values they hold
 * (most significant bit first) into p.  Thirteen bytes yield four values,
 * so the same step runs twice.  Returns dp advanced by 26.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and shifting a byte >= 0x80 left by 24 would
 * overflow that int, which is undefined behavior.
 */
static uint8_t *
unpack_26(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x3ffffff;
  int group;
  for (group = 0; group < 2; group++, p += 4, dp += 13) {
    p[0] = ((uint32_t)dp[0] << 18) + ((uint32_t)dp[1] << 10) +
           ((uint32_t)dp[2] << 2) + (uint32_t)(dp[3] >> 6);
    p[1] = (((uint32_t)dp[3] << 20) & mask) + ((uint32_t)dp[4] << 12) +
           ((uint32_t)dp[5] << 4) + (uint32_t)(dp[6] >> 4);
    p[2] = (((uint32_t)dp[6] << 22) & mask) + ((uint32_t)dp[7] << 14) +
           ((uint32_t)dp[8] << 6) + (uint32_t)(dp[9] >> 2);
    p[3] = (((uint32_t)dp[9] << 24) & mask) + ((uint32_t)dp[10] << 16) +
           ((uint32_t)dp[11] << 8) + (uint32_t)dp[12];
  }
  return dp;
}
/*
 * Packs eight 27-bit values from p into 27 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 27 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_27(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 19);
  rp[1] = (uint8_t)(p[0] >> 11);
  rp[2] = (uint8_t)(p[0] >> 3);
  rp[3] = (uint8_t)((p[0] << 5) + (p[1] >> 22));
  rp[4] = (uint8_t)(p[1] >> 14);
  rp[5] = (uint8_t)(p[1] >> 6);
  rp[6] = (uint8_t)((p[1] << 2) + (p[2] >> 25));
  rp[7] = (uint8_t)(p[2] >> 17);
  rp[8] = (uint8_t)(p[2] >> 9);
  rp[9] = (uint8_t)(p[2] >> 1);
  rp[10] = (uint8_t)((p[2] << 7) + (p[3] >> 20));
  rp[11] = (uint8_t)(p[3] >> 12);
  rp[12] = (uint8_t)(p[3] >> 4);
  rp[13] = (uint8_t)((p[3] << 4) + (p[4] >> 23));
  rp[14] = (uint8_t)(p[4] >> 15);
  rp[15] = (uint8_t)(p[4] >> 7);
  rp[16] = (uint8_t)((p[4] << 1) + (p[5] >> 26));
  rp[17] = (uint8_t)(p[5] >> 18);
  rp[18] = (uint8_t)(p[5] >> 10);
  rp[19] = (uint8_t)(p[5] >> 2);
  rp[20] = (uint8_t)((p[5] << 6) + (p[6] >> 21));
  rp[21] = (uint8_t)(p[6] >> 13);
  rp[22] = (uint8_t)(p[6] >> 5);
  rp[23] = (uint8_t)((p[6] << 3) + (p[7] >> 24));
  rp[24] = (uint8_t)(p[7] >> 16);
  rp[25] = (uint8_t)(p[7] >> 8);
  rp[26] = (uint8_t)p[7];
  return rp + 27;
}
/*
 * Reads 27 bytes from dp and stores the eight 27-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 27.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and the shifts by 24, 25 and 26 here would
 * overflow that int for large byte values, which is undefined behavior.
 */
static uint8_t *
unpack_27(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x7ffffff;
  p[0] = ((uint32_t)dp[0] << 19) + ((uint32_t)dp[1] << 11) +
         ((uint32_t)dp[2] << 3) + (uint32_t)(dp[3] >> 5);
  p[1] = (((uint32_t)dp[3] << 22) & mask) + ((uint32_t)dp[4] << 14) +
         ((uint32_t)dp[5] << 6) + (uint32_t)(dp[6] >> 2);
  p[2] = (((uint32_t)dp[6] << 25) & mask) + ((uint32_t)dp[7] << 17) +
         ((uint32_t)dp[8] << 9) + ((uint32_t)dp[9] << 1) +
         (uint32_t)(dp[10] >> 7);
  p[3] = (((uint32_t)dp[10] << 20) & mask) + ((uint32_t)dp[11] << 12) +
         ((uint32_t)dp[12] << 4) + (uint32_t)(dp[13] >> 4);
  p[4] = (((uint32_t)dp[13] << 23) & mask) + ((uint32_t)dp[14] << 15) +
         ((uint32_t)dp[15] << 7) + (uint32_t)(dp[16] >> 1);
  p[5] = (((uint32_t)dp[16] << 26) & mask) + ((uint32_t)dp[17] << 18) +
         ((uint32_t)dp[18] << 10) + ((uint32_t)dp[19] << 2) +
         (uint32_t)(dp[20] >> 6);
  p[6] = (((uint32_t)dp[20] << 21) & mask) + ((uint32_t)dp[21] << 13) +
         ((uint32_t)dp[22] << 5) + (uint32_t)(dp[23] >> 3);
  p[7] = (((uint32_t)dp[23] << 24) & mask) + ((uint32_t)dp[24] << 16) +
         ((uint32_t)dp[25] << 8) + (uint32_t)dp[26];
  return dp + 27;
}
/*
 * Packs eight 28-bit values from p into 28 bytes at rp, most significant
 * bit first.  Two values fill exactly seven bytes, so the same step runs
 * four times.  Values are not masked; each is expected to fit in 28 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_28(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 4; group++, p += 2, rp += 7) {
    rp[0] = (uint8_t)(p[0] >> 20);
    rp[1] = (uint8_t)(p[0] >> 12);
    rp[2] = (uint8_t)(p[0] >> 4);
    rp[3] = (uint8_t)((p[0] << 4) + (p[1] >> 24));
    rp[4] = (uint8_t)(p[1] >> 16);
    rp[5] = (uint8_t)(p[1] >> 8);
    rp[6] = (uint8_t)p[1];
  }
  return rp;
}
/*
 * Reads 28 bytes from dp and stores the eight 28-bit values they hold
 * (most significant bit first) into p.  Seven bytes yield two values, so
 * the same step runs four times.  Returns dp advanced by 28.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and shifting a byte >= 0x80 left by 24 would
 * overflow that int, which is undefined behavior.
 */
static uint8_t *
unpack_28(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0xfffffff;
  int group;
  for (group = 0; group < 4; group++, p += 2, dp += 7) {
    p[0] = ((uint32_t)dp[0] << 20) + ((uint32_t)dp[1] << 12) +
           ((uint32_t)dp[2] << 4) + (uint32_t)(dp[3] >> 4);
    p[1] = (((uint32_t)dp[3] << 24) & mask) + ((uint32_t)dp[4] << 16) +
           ((uint32_t)dp[5] << 8) + (uint32_t)dp[6];
  }
  return dp;
}
/*
 * Packs eight 29-bit values from p into 29 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 29 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_29(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 21);
  rp[1] = (uint8_t)(p[0] >> 13);
  rp[2] = (uint8_t)(p[0] >> 5);
  rp[3] = (uint8_t)((p[0] << 3) + (p[1] >> 26));
  rp[4] = (uint8_t)(p[1] >> 18);
  rp[5] = (uint8_t)(p[1] >> 10);
  rp[6] = (uint8_t)(p[1] >> 2);
  rp[7] = (uint8_t)((p[1] << 6) + (p[2] >> 23));
  rp[8] = (uint8_t)(p[2] >> 15);
  rp[9] = (uint8_t)(p[2] >> 7);
  rp[10] = (uint8_t)((p[2] << 1) + (p[3] >> 28));
  rp[11] = (uint8_t)(p[3] >> 20);
  rp[12] = (uint8_t)(p[3] >> 12);
  rp[13] = (uint8_t)(p[3] >> 4);
  rp[14] = (uint8_t)((p[3] << 4) + (p[4] >> 25));
  rp[15] = (uint8_t)(p[4] >> 17);
  rp[16] = (uint8_t)(p[4] >> 9);
  rp[17] = (uint8_t)(p[4] >> 1);
  rp[18] = (uint8_t)((p[4] << 7) + (p[5] >> 22));
  rp[19] = (uint8_t)(p[5] >> 14);
  rp[20] = (uint8_t)(p[5] >> 6);
  rp[21] = (uint8_t)((p[5] << 2) + (p[6] >> 27));
  rp[22] = (uint8_t)(p[6] >> 19);
  rp[23] = (uint8_t)(p[6] >> 11);
  rp[24] = (uint8_t)(p[6] >> 3);
  rp[25] = (uint8_t)((p[6] << 5) + (p[7] >> 24));
  rp[26] = (uint8_t)(p[7] >> 16);
  rp[27] = (uint8_t)(p[7] >> 8);
  rp[28] = (uint8_t)p[7];
  return rp + 29;
}
/*
 * Reads 29 bytes from dp and stores the eight 29-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 29.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and the shifts by 24 through 28 here would
 * overflow that int for large byte values, which is undefined behavior.
 */
static uint8_t *
unpack_29(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x1fffffff;
  p[0] = ((uint32_t)dp[0] << 21) + ((uint32_t)dp[1] << 13) +
         ((uint32_t)dp[2] << 5) + (uint32_t)(dp[3] >> 3);
  p[1] = (((uint32_t)dp[3] << 26) & mask) + ((uint32_t)dp[4] << 18) +
         ((uint32_t)dp[5] << 10) + ((uint32_t)dp[6] << 2) +
         (uint32_t)(dp[7] >> 6);
  p[2] = (((uint32_t)dp[7] << 23) & mask) + ((uint32_t)dp[8] << 15) +
         ((uint32_t)dp[9] << 7) + (uint32_t)(dp[10] >> 1);
  p[3] = (((uint32_t)dp[10] << 28) & mask) + ((uint32_t)dp[11] << 20) +
         ((uint32_t)dp[12] << 12) + ((uint32_t)dp[13] << 4) +
         (uint32_t)(dp[14] >> 4);
  p[4] = (((uint32_t)dp[14] << 25) & mask) + ((uint32_t)dp[15] << 17) +
         ((uint32_t)dp[16] << 9) + ((uint32_t)dp[17] << 1) +
         (uint32_t)(dp[18] >> 7);
  p[5] = (((uint32_t)dp[18] << 22) & mask) + ((uint32_t)dp[19] << 14) +
         ((uint32_t)dp[20] << 6) + (uint32_t)(dp[21] >> 2);
  p[6] = (((uint32_t)dp[21] << 27) & mask) + ((uint32_t)dp[22] << 19) +
         ((uint32_t)dp[23] << 11) + ((uint32_t)dp[24] << 3) +
         (uint32_t)(dp[25] >> 5);
  p[7] = (((uint32_t)dp[25] << 24) & mask) + ((uint32_t)dp[26] << 16) +
         ((uint32_t)dp[27] << 8) + (uint32_t)dp[28];
  return dp + 29;
}
/*
 * Packs eight 30-bit values from p into 30 bytes at rp, most significant
 * bit first.  Four values fill exactly fifteen bytes, so the same step
 * runs twice.  Values are not masked; each is expected to fit in 30 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_30(uint32_t *p, uint8_t *rp)
{
  int group;
  for (group = 0; group < 2; group++, p += 4, rp += 15) {
    rp[0] = (uint8_t)(p[0] >> 22);
    rp[1] = (uint8_t)(p[0] >> 14);
    rp[2] = (uint8_t)(p[0] >> 6);
    rp[3] = (uint8_t)((p[0] << 2) + (p[1] >> 28));
    rp[4] = (uint8_t)(p[1] >> 20);
    rp[5] = (uint8_t)(p[1] >> 12);
    rp[6] = (uint8_t)(p[1] >> 4);
    rp[7] = (uint8_t)((p[1] << 4) + (p[2] >> 26));
    rp[8] = (uint8_t)(p[2] >> 18);
    rp[9] = (uint8_t)(p[2] >> 10);
    rp[10] = (uint8_t)(p[2] >> 2);
    rp[11] = (uint8_t)((p[2] << 6) + (p[3] >> 24));
    rp[12] = (uint8_t)(p[3] >> 16);
    rp[13] = (uint8_t)(p[3] >> 8);
    rp[14] = (uint8_t)p[3];
  }
  return rp;
}
/*
 * Reads 30 bytes from dp and stores the eight 30-bit values they hold
 * (most significant bit first) into p.  Fifteen bytes yield four values,
 * so the same step runs twice.  Returns dp advanced by 30.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and the shifts by 24, 26 and 28 here would
 * overflow that int for large byte values, which is undefined behavior.
 */
static uint8_t *
unpack_30(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x3fffffff;
  int group;
  for (group = 0; group < 2; group++, p += 4, dp += 15) {
    p[0] = ((uint32_t)dp[0] << 22) + ((uint32_t)dp[1] << 14) +
           ((uint32_t)dp[2] << 6) + (uint32_t)(dp[3] >> 2);
    p[1] = (((uint32_t)dp[3] << 28) & mask) + ((uint32_t)dp[4] << 20) +
           ((uint32_t)dp[5] << 12) + ((uint32_t)dp[6] << 4) +
           (uint32_t)(dp[7] >> 4);
    p[2] = (((uint32_t)dp[7] << 26) & mask) + ((uint32_t)dp[8] << 18) +
           ((uint32_t)dp[9] << 10) + ((uint32_t)dp[10] << 2) +
           (uint32_t)(dp[11] >> 6);
    p[3] = (((uint32_t)dp[11] << 24) & mask) + ((uint32_t)dp[12] << 16) +
           ((uint32_t)dp[13] << 8) + (uint32_t)dp[14];
  }
  return dp;
}
/*
 * Packs eight 31-bit values from p into 31 bytes at rp, most significant
 * bit first.  Values are not masked; each is expected to fit in 31 bits.
 * Returns the position just past the last byte written.
 */
static uint8_t *
pack_31(uint32_t *p, uint8_t *rp)
{
  rp[0] = (uint8_t)(p[0] >> 23);
  rp[1] = (uint8_t)(p[0] >> 15);
  rp[2] = (uint8_t)(p[0] >> 7);
  rp[3] = (uint8_t)((p[0] << 1) + (p[1] >> 30));
  rp[4] = (uint8_t)(p[1] >> 22);
  rp[5] = (uint8_t)(p[1] >> 14);
  rp[6] = (uint8_t)(p[1] >> 6);
  rp[7] = (uint8_t)((p[1] << 2) + (p[2] >> 29));
  rp[8] = (uint8_t)(p[2] >> 21);
  rp[9] = (uint8_t)(p[2] >> 13);
  rp[10] = (uint8_t)(p[2] >> 5);
  rp[11] = (uint8_t)((p[2] << 3) + (p[3] >> 28));
  rp[12] = (uint8_t)(p[3] >> 20);
  rp[13] = (uint8_t)(p[3] >> 12);
  rp[14] = (uint8_t)(p[3] >> 4);
  rp[15] = (uint8_t)((p[3] << 4) + (p[4] >> 27));
  rp[16] = (uint8_t)(p[4] >> 19);
  rp[17] = (uint8_t)(p[4] >> 11);
  rp[18] = (uint8_t)(p[4] >> 3);
  rp[19] = (uint8_t)((p[4] << 5) + (p[5] >> 26));
  rp[20] = (uint8_t)(p[5] >> 18);
  rp[21] = (uint8_t)(p[5] >> 10);
  rp[22] = (uint8_t)(p[5] >> 2);
  rp[23] = (uint8_t)((p[5] << 6) + (p[6] >> 25));
  rp[24] = (uint8_t)(p[6] >> 17);
  rp[25] = (uint8_t)(p[6] >> 9);
  rp[26] = (uint8_t)(p[6] >> 1);
  rp[27] = (uint8_t)((p[6] << 7) + (p[7] >> 24));
  rp[28] = (uint8_t)(p[7] >> 16);
  rp[29] = (uint8_t)(p[7] >> 8);
  rp[30] = (uint8_t)p[7];
  return rp + 31;
}
/*
 * Reads 31 bytes from dp and stores the eight 31-bit values they hold
 * (most significant bit first) into p.  Returns dp advanced by 31.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and the shifts by 24 through 30 here would
 * overflow that int for large byte values, which is undefined behavior.
 */
static uint8_t *
unpack_31(uint32_t *p, uint8_t *dp)
{
  const uint32_t mask = 0x7fffffff;
  p[0] = ((uint32_t)dp[0] << 23) + ((uint32_t)dp[1] << 15) +
         ((uint32_t)dp[2] << 7) + (uint32_t)(dp[3] >> 1);
  p[1] = (((uint32_t)dp[3] << 30) & mask) + ((uint32_t)dp[4] << 22) +
         ((uint32_t)dp[5] << 14) + ((uint32_t)dp[6] << 6) +
         (uint32_t)(dp[7] >> 2);
  p[2] = (((uint32_t)dp[7] << 29) & mask) + ((uint32_t)dp[8] << 21) +
         ((uint32_t)dp[9] << 13) + ((uint32_t)dp[10] << 5) +
         (uint32_t)(dp[11] >> 3);
  p[3] = (((uint32_t)dp[11] << 28) & mask) + ((uint32_t)dp[12] << 20) +
         ((uint32_t)dp[13] << 12) + ((uint32_t)dp[14] << 4) +
         (uint32_t)(dp[15] >> 4);
  p[4] = (((uint32_t)dp[15] << 27) & mask) + ((uint32_t)dp[16] << 19) +
         ((uint32_t)dp[17] << 11) + ((uint32_t)dp[18] << 3) +
         (uint32_t)(dp[19] >> 5);
  p[5] = (((uint32_t)dp[19] << 26) & mask) + ((uint32_t)dp[20] << 18) +
         ((uint32_t)dp[21] << 10) + ((uint32_t)dp[22] << 2) +
         (uint32_t)(dp[23] >> 6);
  p[6] = (((uint32_t)dp[23] << 25) & mask) + ((uint32_t)dp[24] << 17) +
         ((uint32_t)dp[25] << 9) + ((uint32_t)dp[26] << 1) +
         (uint32_t)(dp[27] >> 7);
  p[7] = (((uint32_t)dp[27] << 24) & mask) + ((uint32_t)dp[28] << 16) +
         ((uint32_t)dp[29] << 8) + (uint32_t)dp[30];
  return dp + 31;
}
/*
 * Writes each of the eight 32-bit values at p to rp as four bytes,
 * highest byte first.  Returns the position just past the 32 bytes
 * written.
 */
static uint8_t *
pack_32(uint32_t *p, uint8_t *rp)
{
  int n;
  for (n = 0; n < 8; n++, rp += 4) {
    rp[0] = (uint8_t)(p[n] >> 24);
    rp[1] = (uint8_t)(p[n] >> 16);
    rp[2] = (uint8_t)(p[n] >> 8);
    rp[3] = (uint8_t)p[n];
  }
  return rp;
}
/*
 * Reads 32 bytes from dp as eight big-endian 32-bit values and stores
 * them into p.  Returns dp advanced by 32.
 *
 * Every byte is widened to uint32_t before it is shifted: a uint8_t is
 * promoted to signed int, and shifting a byte >= 0x80 left by 24 would
 * overflow that int, which is undefined behavior.
 */
static uint8_t *
unpack_32(uint32_t *p, uint8_t *dp)
{
  int n;
  for (n = 0; n < 8; n++, dp += 4) {
    p[n] = ((uint32_t)dp[0] << 24) + ((uint32_t)dp[1] << 16) +
           ((uint32_t)dp[2] << 8) + (uint32_t)dp[3];
  }
  return dp;
}
1613 | /* </generated> */ |
1614 | |
/*
 * Packs i values of w bits each from p into rp, most significant bit
 * first, and returns the position just past the last byte written.
 *
 * Complete groups of eight values are handed to the width-specific
 * packer; a width outside 1..32 writes nothing for those groups.  The
 * remaining (fewer than eight) values are packed bit by bit, and a
 * partially filled final byte is written with its unused low bits zero.
 */
static uint8_t *
pack_(uint32_t *p, uint32_t i, int w, uint8_t *rp)
{
  uint32_t *tail_end;
  uint8_t pending;
  int shift;

  for (; i >= 8; p += 8, i -= 8) {
    switch (w) {
    case 0 : break;
    case 1 : rp = pack_1(p, rp); break;
    case 2 : rp = pack_2(p, rp); break;
    case 3 : rp = pack_3(p, rp); break;
    case 4 : rp = pack_4(p, rp); break;
    case 5 : rp = pack_5(p, rp); break;
    case 6 : rp = pack_6(p, rp); break;
    case 7 : rp = pack_7(p, rp); break;
    case 8 : rp = pack_8(p, rp); break;
    case 9 : rp = pack_9(p, rp); break;
    case 10 : rp = pack_10(p, rp); break;
    case 11 : rp = pack_11(p, rp); break;
    case 12 : rp = pack_12(p, rp); break;
    case 13 : rp = pack_13(p, rp); break;
    case 14 : rp = pack_14(p, rp); break;
    case 15 : rp = pack_15(p, rp); break;
    case 16 : rp = pack_16(p, rp); break;
    case 17 : rp = pack_17(p, rp); break;
    case 18 : rp = pack_18(p, rp); break;
    case 19 : rp = pack_19(p, rp); break;
    case 20 : rp = pack_20(p, rp); break;
    case 21 : rp = pack_21(p, rp); break;
    case 22 : rp = pack_22(p, rp); break;
    case 23 : rp = pack_23(p, rp); break;
    case 24 : rp = pack_24(p, rp); break;
    case 25 : rp = pack_25(p, rp); break;
    case 26 : rp = pack_26(p, rp); break;
    case 27 : rp = pack_27(p, rp); break;
    case 28 : rp = pack_28(p, rp); break;
    case 29 : rp = pack_29(p, rp); break;
    case 30 : rp = pack_30(p, rp); break;
    case 31 : rp = pack_31(p, rp); break;
    case 32 : rp = pack_32(p, rp); break;
    default : break;
    }
  }

  /*
   * Tail: `shift` is how far the current value must move left to sit at
   * the top of the free part of the pending byte; a negative shift means
   * the value is wider than the free space and its upper bits go out
   * first.
   */
  tail_end = p + i;
  shift = 8 - w;
  pending = 0;
  while (p < tail_end) {
    if (shift > 0) {
      /* Value fits with room to spare: stash it and move on. */
      pending += *p++ << shift;
      shift -= w;
    } else if (shift < 0) {
      /* Value overflows the byte: emit its upper bits, keep the value. */
      *rp++ = pending + (*p >> -shift);
      shift += 8;
      pending = 0;
    } else {
      /* Value ends exactly on the byte boundary. */
      *rp++ = pending + *p++;
      shift = 8 - w;
      pending = 0;
    }
  }
  /* Flush a byte that was started but not completed. */
  if (shift + w != 8) { *rp++ = pending; }
  return rp;
}
1679 | |
/*
 * Encodes one unit of `i` values from `p` into `rp`.
 *
 * `freq[n]` must hold the number of values that need exactly n bits
 * (freq[0] counts zeros). The smallest width `w` that covers at least
 * i - i/8 of the values is chosen; values that do not fit in `w` bits are
 * "exceptions" whose excess (value - 2^w) is stored separately.
 *
 * Output layout:
 *   - no exceptions: [w] [bit-packed values]
 *   - exceptions   : [w + 0x80] [number of exceptions] [first index, only in
 *                    the chained form] [bit-packed values] [exception data]
 *
 * NOTE: the contents of `p` are modified (exception slots are overwritten).
 * NOTE(review): `1 << w` is evaluated as int; w == 31 would overflow a
 * signed int -- confirm whether that width can reach this point.
 *
 * Returns the output position just past the last byte written.
 */
static uint8_t *
pack(uint32_t *p, uint32_t i, uint8_t *freq, uint8_t *rp)
{
  int32_t k, w;
  /* ebuf collects the exception data that is appended after the packed values. */
  uint8_t ebuf[UNIT_SIZE], *ep = ebuf;
  uint32_t s, *pe = p + i, r, th = i - (i >> 3);
  /* Find the smallest width whose cumulative count reaches the threshold. */
  for (w = 0, s = 0; w <= 32; w++) {
    if ((s += freq[w]) >= th) { break; }
  }
  if (i == s) {
    /* Every value fits in w bits: no exception section. */
    *rp++ = w;
    return pack_(p, i, w, rp);
  }
  r = 1 << w;
  *rp++ = w + 0x80;
  *rp++ = i - s;
  if (r >= UNIT_SIZE) {
    /*
     * Chained form: each exception slot is overwritten with the index of
     * the next exception (0 terminates the chain), and the index of the
     * first exception is written to the output as a single byte.
     */
    uint32_t first, *last = &first;
    for (k = 0; p < pe; p++, k++) {
      if (*p >= r) {
        GRN_B_ENC(*p - r, ep);
        *last = k;
        last = p;
      }
    }
    *last = 0;
    *rp++ = (uint8_t) first;
  } else {
    /*
     * Inline form: each exception is stored as its index byte followed by
     * the encoded excess, and the slot itself is zeroed.
     */
    for (k = 0; p < pe; p++, k++) {
      if (*p >= r) {
        *ep++ = k;
        GRN_B_ENC(*p - r, ep);
        *p = 0;
      }
    }
  }
  /* p now equals pe; rewind to the start of the unit for bit-packing. */
  rp = pack_(p - i, i, w, rp);
  grn_memcpy(rp, ebuf, ep - ebuf);
  return rp + (ep - ebuf);
}
1720 | |
1721 | int |
1722 | grn_p_enc(grn_ctx *ctx, uint32_t *data, uint32_t data_size, uint8_t **res) |
1723 | { |
1724 | uint8_t *rp, freq[33]; |
1725 | uint32_t j, *dp, *dpe, d, w, buf[UNIT_SIZE]; |
1726 | *res = rp = GRN_MALLOC(data_size * sizeof(uint32_t) * 2); |
1727 | GRN_B_ENC(data_size, rp); |
1728 | memset(freq, 0, 33); |
1729 | for (j = 0, dp = data, dpe = dp + data_size; dp < dpe; j++, dp++) { |
1730 | if (j == UNIT_SIZE) { |
1731 | rp = pack(buf, j, freq, rp); |
1732 | memset(freq, 0, 33); |
1733 | j = 0; |
1734 | } |
1735 | if ((d = buf[j] = *dp)) { |
1736 | GRN_BIT_SCAN_REV(d, w); |
1737 | freq[w + 1]++; |
1738 | } else { |
1739 | freq[0]++; |
1740 | } |
1741 | } |
1742 | if (j) { rp = pack(buf, j, freq, rp); } |
1743 | return rp - *res; |
1744 | } |
1745 | |
/* Bit flags stored in datavec.flags. */
#define USE_P_ENC (1<<0) /* Use PForDelta */
#define CUT_OFF (1<<1) /* Deprecated */
#define ODD (1<<2) /* Variable size data */

/*
 * One vector of uint32_t values.
 *
 * The functions below operate on arrays of `dvlen + 1` entries: entries
 * 0 .. dvlen-1 are the vectors, and the `data` member of entry `dvlen`
 * marks the end of the single allocation that backs all of them.
 */
typedef struct {
  uint32_t *data; /* first element of this vector */
  uint32_t data_size; /* number of elements */
  uint32_t flags; /* combination of USE_P_ENC, CUT_OFF and ODD */
} datavec;
1755 | |
1756 | static grn_rc |
1757 | datavec_reset(grn_ctx *ctx, datavec *dv, uint32_t dvlen, |
1758 | size_t unitsize, size_t totalsize) |
1759 | { |
1760 | int i; |
1761 | if (!dv[0].data || dv[dvlen].data < dv[0].data + totalsize) { |
1762 | if (dv[0].data) { GRN_FREE(dv[0].data); } |
1763 | if (!(dv[0].data = GRN_MALLOC(totalsize * sizeof(uint32_t)))) { |
1764 | MERR("[ii][data-vector][reset] failed to allocate data: " |
1765 | "length:<%u>, " |
1766 | "unit-size:<%" GRN_FMT_SIZE ">, " |
1767 | "total-size:<%" GRN_FMT_SIZE ">" , |
1768 | dvlen, |
1769 | unitsize, |
1770 | totalsize); |
1771 | return ctx->rc; |
1772 | } |
1773 | dv[dvlen].data = dv[0].data + totalsize; |
1774 | } |
1775 | for (i = 1; i < dvlen; i++) { |
1776 | dv[i].data = dv[i - 1].data + unitsize; |
1777 | } |
1778 | return GRN_SUCCESS; |
1779 | } |
1780 | |
1781 | static grn_rc |
1782 | datavec_init(grn_ctx *ctx, datavec *dv, uint32_t dvlen, |
1783 | size_t unitsize, size_t totalsize) |
1784 | { |
1785 | int i; |
1786 | if (!totalsize) { |
1787 | memset(dv, 0, sizeof(datavec) * (dvlen + 1)); |
1788 | return GRN_SUCCESS; |
1789 | } |
1790 | if (!(dv[0].data = GRN_MALLOC(totalsize * sizeof(uint32_t)))) { |
1791 | MERR("[ii][data-vector][init] failed to allocate data: " |
1792 | "length:<%u>, " |
1793 | "unit-size:<%" GRN_FMT_SIZE ">, " |
1794 | "total-size:<%" GRN_FMT_SIZE ">" , |
1795 | dvlen, |
1796 | unitsize, |
1797 | totalsize); |
1798 | return ctx->rc; |
1799 | } |
1800 | dv[dvlen].data = dv[0].data + totalsize; |
1801 | for (i = 1; i < dvlen; i++) { |
1802 | dv[i].data = dv[i - 1].data + unitsize; |
1803 | } |
1804 | return GRN_SUCCESS; |
1805 | } |
1806 | |
1807 | static void |
1808 | datavec_fin(grn_ctx *ctx, datavec *dv) |
1809 | { |
1810 | if (dv[0].data) { GRN_FREE(dv[0].data); } |
1811 | } |
1812 | |
/*
 * Encodes `dvlen` vectors into `res` and returns the number of bytes
 * written (0 when there is nothing to encode or the input is invalid).
 *
 * All vectors must have dv[0].data_size elements, except the last one,
 * which may have more.
 *
 * Output layout:
 *   - no vector has USE_P_ENC:
 *       [(df << 1) + 1] [every value, variable-length encoded]
 *   - otherwise:
 *       [usep << 1] [df] [pgap, only if the last vector has ODD]
 *       then each vector, either packed in units of UNIT_SIZE (USE_P_ENC)
 *       or variable-length encoded value by value.
 *     usep has bit l set when dv[l] has USE_P_ENC; pgap is the number of
 *     extra elements in the last vector.
 *
 * NOTE(review): `res` is written without a size parameter; the caller is
 * presumably responsible for providing a large enough buffer -- confirm.
 */
size_t
grn_p_encv(grn_ctx *ctx, datavec *dv, uint32_t dvlen, uint8_t *res)
{
  uint8_t *rp = res, freq[33];
  uint32_t pgap, usep, l, df, data_size, *dp, *dpe;
  if (!dvlen || !(df = dv[0].data_size)) { return 0; }
  /* Validate the sizes and collect the per-vector USE_P_ENC bits. */
  for (usep = 0, data_size = 0, l = 0; l < dvlen; l++) {
    uint32_t dl = dv[l].data_size;
    if (dl < df || ((dl > df) && (l != dvlen - 1))) {
      /* invalid argument */
      return 0;
    }
    usep += (dv[l].flags & USE_P_ENC) << l;
    data_size += dl;
  }
  pgap = data_size - df * dvlen;
  if (!usep) {
    /* The low bit of the first value marks the "plain" layout. */
    GRN_B_ENC((df << 1) + 1, rp);
    for (l = 0; l < dvlen; l++) {
      for (dp = dv[l].data, dpe = dp + dv[l].data_size; dp < dpe; dp++) {
        GRN_B_ENC(*dp, rp);
      }
    }
  } else {
    uint32_t buf[UNIT_SIZE];
    GRN_B_ENC((usep << 1), rp);
    GRN_B_ENC(df, rp);
    if (dv[dvlen - 1].flags & ODD) {
      GRN_B_ENC(pgap, rp);
    } else {
      GRN_ASSERT(!pgap);
    }
    for (l = 0; l < dvlen; l++) {
      dp = dv[l].data;
      dpe = dp + dv[l].data_size;
      if ((dv[l].flags & USE_P_ENC)) {
        uint32_t j = 0, d;
        memset(freq, 0, 33);
        while (dp < dpe) {
          if (j == UNIT_SIZE) {
            /* A unit is full: emit it and start the next one. */
            rp = pack(buf, j, freq, rp);
            memset(freq, 0, 33);
            j = 0;
          }
          if ((d = buf[j++] = *dp++)) {
            /* freq[n] counts the values that need exactly n bits. */
            uint32_t w;
            GRN_BIT_SCAN_REV(d, w);
            freq[w + 1]++;
          } else {
            freq[0]++;
          }
        }
        /* Emit the last, possibly partial, unit. */
        if (j) { rp = pack(buf, j, freq, rp); }
      } else {
        while (dp < dpe) { GRN_B_ENC(*dp++, rp); }
      }
    }
  }
  return rp - res;
}
1873 | |
/*
 * Bounds-checked decoder for one variable-length encoded value.
 *
 * Reads a value from `p` into `v` and advances `p`. When the encoded value
 * would extend beyond `pe`, the macro executes `return 0;` in the ENCLOSING
 * function, so it may only be used in functions for which returning 0 is a
 * valid failure result.
 *
 * Encoding, selected by the first byte:
 *   0x00-0x8e : the byte is the value itself
 *   0x8f      : the next 4 bytes hold the value (copied as they are)
 *   0x90-0x9f : 3 more bytes follow, result is offset by 0x20408f
 *   0xa0-0xbf : 2 more bytes follow, result is offset by 0x408f
 *   0xc0-0xff : 1 more byte follows, result is offset by 0x8f
 *
 * `v` and `p` must be lvalues; `p` and `pe` are evaluated more than once.
 */
#define GRN_B_DEC_CHECK(v,p,pe) do { \
  uint8_t *_p = (uint8_t *)p; \
  uint32_t _v; \
  if (_p >= pe) { return 0; } \
  _v = *_p++; \
  switch (_v >> 4) { \
  case 0x08 : \
    if (_v == 0x8f) { \
      if (_p + sizeof(uint32_t) > pe) { return 0; } \
      grn_memcpy(&_v, _p, sizeof(uint32_t)); \
      _p += sizeof(uint32_t); \
    } \
    break; \
  case 0x09 : \
    if (_p + 3 > pe) { return 0; } \
    _v = (_v - 0x90) * 0x100 + *_p++; \
    _v = _v * 0x100 + *_p++; \
    _v = _v * 0x100 + *_p++ + 0x20408f; \
    break; \
  case 0x0a : \
  case 0x0b : \
    if (_p + 2 > pe) { return 0; } \
    _v = (_v - 0xa0) * 0x100 + *_p++; \
    _v = _v * 0x100 + *_p++ + 0x408f; \
    break; \
  case 0x0c : \
  case 0x0d : \
  case 0x0e : \
  case 0x0f : \
    if (_p + 1 > pe) { return 0; } \
    _v = (_v - 0xc0) * 0x100 + *_p++ + 0x8f; \
    break; \
  } \
  v = _v; \
  p = _p; \
} while (0)
1910 | |
/*
 * Decodes one unit written by pack(): `i` values are read from `dp`
 * (bounded by `dpe`) into `rp`.
 *
 * The first byte is the bit width `w`; when its top bit is set, the unit
 * has exceptions and the next byte is their count `ne`. When the mask for
 * `w` is at least UNIT_MASK, one more byte gives the index of the first
 * exception (chained form).
 *
 * Returns the input position after the unit, NULL when a group of eight
 * packed values would run past `dpe`, or 0 (via GRN_B_DEC_CHECK) when an
 * exception value would run past `dpe`.
 *
 * NOTE(review): the header bytes (w, ne, k) and the exception index bytes
 * are read without checking `dpe`, and `1 << w` is not range-checked --
 * confirm that callers only pass trusted data.
 */
static uint8_t *
unpack(uint8_t *dp, uint8_t *dpe, int i, uint32_t *rp)
{
  /* ne: number of exceptions, k: index of an exception slot. */
  uint8_t ne = 0, k = 0, w = *dp++;
  /* m: mask of the low w bits. */
  uint32_t m, *p = rp;
  if (w & 0x80) {
    ne = *dp++;
    w -= 0x80;
    m = (1 << w) - 1;
    if (m >= UNIT_MASK) { k = *dp++; }
  } else {
    m = (1 << w) - 1;
  }
  if (w) {
    /* Eight values of w bits occupy exactly w bytes. */
    while (i >= 8) {
      if (dp + w > dpe) { return NULL; }
      switch (w) {
      case 1 : dp = unpack_1(p, dp); break;
      case 2 : dp = unpack_2(p, dp); break;
      case 3 : dp = unpack_3(p, dp); break;
      case 4 : dp = unpack_4(p, dp); break;
      case 5 : dp = unpack_5(p, dp); break;
      case 6 : dp = unpack_6(p, dp); break;
      case 7 : dp = unpack_7(p, dp); break;
      case 8 : dp = unpack_8(p, dp); break;
      case 9 : dp = unpack_9(p, dp); break;
      case 10 : dp = unpack_10(p, dp); break;
      case 11 : dp = unpack_11(p, dp); break;
      case 12 : dp = unpack_12(p, dp); break;
      case 13 : dp = unpack_13(p, dp); break;
      case 14 : dp = unpack_14(p, dp); break;
      case 15 : dp = unpack_15(p, dp); break;
      case 16 : dp = unpack_16(p, dp); break;
      case 17 : dp = unpack_17(p, dp); break;
      case 18 : dp = unpack_18(p, dp); break;
      case 19 : dp = unpack_19(p, dp); break;
      case 20 : dp = unpack_20(p, dp); break;
      case 21 : dp = unpack_21(p, dp); break;
      case 22 : dp = unpack_22(p, dp); break;
      case 23 : dp = unpack_23(p, dp); break;
      case 24 : dp = unpack_24(p, dp); break;
      case 25 : dp = unpack_25(p, dp); break;
      case 26 : dp = unpack_26(p, dp); break;
      case 27 : dp = unpack_27(p, dp); break;
      case 28 : dp = unpack_28(p, dp); break;
      case 29 : dp = unpack_29(p, dp); break;
      case 30 : dp = unpack_30(p, dp); break;
      case 31 : dp = unpack_31(p, dp); break;
      case 32 : dp = unpack_32(p, dp); break;
      }
      i -= 8;
      p += 8;
    }
    /* Generic bit reader for the remaining (< 8) values. */
    {
      /*
       * b: right shift that extracts the current value from the current
       *    byte; a negative b means the value continues in the next byte.
       * v: bits of the current value collected so far.
       */
      int b;
      uint32_t v, *pe;
      for (b = 8 - w, v = 0, pe = p + i; p < pe && dp < dpe;) {
        if (b > 0) {
          *p++ = v + ((*dp >> b) & m);
          b -= w;
          v = 0;
        } else if (b < 0) {
          v += (*dp++ << -b) & m;
          b += 8;
        } else {
          *p++ = v + (*dp++ & m);
          b = 8 - w;
          v = 0;
        }
      }
      /* Skip the partially used last byte. */
      if (b + w != 8) { dp++; }
    }
  } else {
    /* Width 0: every value is zero and no packed bytes were stored. */
    memset(p, 0, sizeof(uint32_t) * i);
  }
  if (ne) {
    if (m >= UNIT_MASK) {
      /*
       * Chained form: each exception slot holds the index of the next
       * exception; it is replaced by the decoded excess plus 2^w.
       */
      uint32_t *pp;
      while (ne--) {
        pp = &rp[k];
        k = *pp;
        GRN_B_DEC_CHECK(*pp, dp, dpe);
        *pp += (m + 1);
      }
    } else {
      /* Inline form: an index byte followed by the encoded excess. */
      while (ne--) {
        k = *dp++;
        GRN_B_DEC_CHECK(rp[k], dp, dpe);
        rp[k] += (m + 1);
      }
    }
  }
  return dp;
}
2005 | |
2006 | int |
2007 | grn_p_dec(grn_ctx *ctx, uint8_t *data, uint32_t data_size, uint32_t nreq, uint32_t **res) |
2008 | { |
2009 | uint8_t *dp = data, *dpe = data + data_size; |
2010 | uint32_t rest, orig_size, *rp, *rpe; |
2011 | GRN_B_DEC(orig_size, dp); |
2012 | if (!orig_size) { |
2013 | if (!nreq || nreq > data_size) { nreq = data_size; } |
2014 | if ((*res = rp = GRN_MALLOC(nreq * 4))) { |
2015 | for (rpe = rp + nreq; dp < data + data_size && rp < rpe; rp++) { |
2016 | GRN_B_DEC(*rp, dp); |
2017 | } |
2018 | } |
2019 | return rp - *res; |
2020 | } else { |
2021 | if (!(*res = rp = GRN_MALLOC(orig_size * sizeof(uint32_t)))) { |
2022 | return 0; |
2023 | } |
2024 | if (!nreq || nreq > orig_size) { nreq = orig_size; } |
2025 | for (rest = nreq; rest >= UNIT_SIZE; rest -= UNIT_SIZE) { |
2026 | if (!(dp = unpack(dp, dpe, UNIT_SIZE, rp))) { return 0; } |
2027 | rp += UNIT_SIZE; |
2028 | } |
2029 | if (rest) { if (!(dp = unpack(dp, dpe, rest, rp))) { return 0; } } |
2030 | GRN_ASSERT(data + data_size == dp); |
2031 | return nreq; |
2032 | } |
2033 | } |
2034 | |
2035 | int |
2036 | grn_p_decv(grn_ctx *ctx, uint8_t *data, uint32_t data_size, datavec *dv, uint32_t dvlen) |
2037 | { |
2038 | size_t size; |
2039 | uint32_t df, l, i, *rp, nreq; |
2040 | uint8_t *dp = data, *dpe = data + data_size; |
2041 | if (!data_size) { |
2042 | dv[0].data_size = 0; |
2043 | return 0; |
2044 | } |
2045 | for (nreq = 0; nreq < dvlen; nreq++) { |
2046 | if (dv[nreq].flags & CUT_OFF) { break; } |
2047 | } |
2048 | if (!nreq) { return 0; } |
2049 | GRN_B_DEC_CHECK(df, dp, dpe); |
2050 | if ((df & 1)) { |
2051 | df >>= 1; |
2052 | size = nreq == dvlen ? data_size : df * nreq; |
2053 | if (dv[dvlen].data < dv[0].data + size) { |
2054 | if (dv[0].data) { GRN_FREE(dv[0].data); } |
2055 | if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } |
2056 | dv[dvlen].data = rp + size; |
2057 | } else { |
2058 | rp = dv[0].data; |
2059 | } |
2060 | for (l = 0; l < dvlen; l++) { |
2061 | if (dv[l].flags & CUT_OFF) { break; } |
2062 | dv[l].data = rp; |
2063 | if (l < dvlen - 1) { |
2064 | for (i = 0; i < df; i++, rp++) { GRN_B_DEC_CHECK(*rp, dp, dpe); } |
2065 | } else { |
2066 | for (i = 0; dp < dpe; i++, rp++) { GRN_B_DEC_CHECK(*rp, dp, dpe); } |
2067 | } |
2068 | dv[l].data_size = i; |
2069 | } |
2070 | } else { |
2071 | uint32_t n, rest, usep = df >> 1; |
2072 | GRN_B_DEC_CHECK(df, dp, dpe); |
2073 | if (dv[dvlen -1].flags & ODD) { |
2074 | GRN_B_DEC_CHECK(rest, dp, dpe); |
2075 | } else { |
2076 | rest = 0; |
2077 | } |
2078 | size = df * nreq + (nreq == dvlen ? rest : 0); |
2079 | if (dv[dvlen].data < dv[0].data + size) { |
2080 | if (dv[0].data) { GRN_FREE(dv[0].data); } |
2081 | if (!(rp = GRN_MALLOC(size * sizeof(uint32_t)))) { return 0; } |
2082 | dv[dvlen].data = rp + size; |
2083 | } else { |
2084 | rp = dv[0].data; |
2085 | } |
2086 | for (l = 0; l < dvlen; l++) { |
2087 | if (dv[l].flags & CUT_OFF) { break; } |
2088 | dv[l].data = rp; |
2089 | dv[l].data_size = n = (l < dvlen - 1) ? df : df + rest; |
2090 | if (usep & (1 << l)) { |
2091 | for (; n >= UNIT_SIZE; n -= UNIT_SIZE) { |
2092 | if (!(dp = unpack(dp, dpe, UNIT_SIZE, rp))) { return 0; } |
2093 | rp += UNIT_SIZE; |
2094 | } |
2095 | if (n) { |
2096 | if (!(dp = unpack(dp, dpe, n, rp))) { return 0; } |
2097 | rp += n; |
2098 | } |
2099 | dv[l].flags |= USE_P_ENC; |
2100 | } else { |
2101 | for (; n; n--, rp++) { |
2102 | GRN_B_DEC_CHECK(*rp, dp, dpe); |
2103 | } |
2104 | } |
2105 | } |
2106 | GRN_ASSERT(dp == dpe); |
2107 | if (dp != dpe) { |
2108 | GRN_LOG(ctx, GRN_LOG_DEBUG, "data_size=%d, %" GRN_FMT_LLD, |
2109 | data_size, (long long int)(dpe - dp)); |
2110 | } |
2111 | } |
2112 | return rp - dv[0].data; |
2113 | } |
2114 | |
2115 | int |
2116 | grn_b_enc(grn_ctx *ctx, uint32_t *data, uint32_t data_size, uint8_t **res) |
2117 | { |
2118 | uint8_t *rp; |
2119 | uint32_t *dp, i; |
2120 | *res = rp = GRN_MALLOC(data_size * sizeof(uint32_t) * 2); |
2121 | GRN_B_ENC(data_size, rp); |
2122 | for (i = data_size, dp = data; i; i--, dp++) { |
2123 | GRN_B_ENC(*dp, rp); |
2124 | } |
2125 | return rp - *res; |
2126 | } |
2127 | |
2128 | int |
2129 | grn_b_dec(grn_ctx *ctx, uint8_t *data, uint32_t data_size, uint32_t **res) |
2130 | { |
2131 | uint32_t i, *rp, orig_size; |
2132 | uint8_t *dp = data; |
2133 | GRN_B_DEC(orig_size, dp); |
2134 | *res = rp = GRN_MALLOC(orig_size * sizeof(uint32_t)); |
2135 | for (i = orig_size; i; i--, rp++) { |
2136 | GRN_B_DEC(*rp, dp); |
2137 | } |
2138 | return orig_size; |
2139 | } |
2140 | |
2141 | /* buffer */ |
2142 | |
/*
 * Per-term entry of a buffer segment.
 *
 * pos_in_buffer is the position of the first record of the term's record
 * list (0 when the list is empty); size_in_buffer is incremented whenever
 * a record is appended to the end of that list.
 */
typedef struct {
  uint32_t tid; /* presumably the term id in the lexicon -- confirm */
  uint32_t size_in_chunk;
  uint32_t pos_in_chunk;
  uint16_t size_in_buffer;
  uint16_t pos_in_buffer;
} buffer_term;
2150 | |
/*
 * Header of one record in a buffer segment; the encoded payload follows it
 * directly. Positions are counted in units of sizeof(buffer_rec) from the
 * start of the buffer (see BUFFER_REC_AT).
 */
typedef struct {
  uint16_t step; /* position of the next record in the list, 0 at the end */
  uint16_t jump; /* position of a later record to skip to; 0 = none, 1 = deleted */
} buffer_rec;
2155 | |
2156 | typedef struct { |
2157 | uint32_t chunk; |
2158 | uint32_t chunk_size; |
2159 | uint32_t buffer_free; |
2160 | uint16_t nterms; |
2161 | uint16_t nterms_void; |
2162 | } ; |
2163 | |
2164 | struct grn_ii_buffer { |
2165 | buffer_header ; |
2166 | buffer_term terms[(S_SEGMENT - sizeof(buffer_header))/sizeof(buffer_term)]; |
2167 | }; |
2168 | |
2169 | typedef struct grn_ii_buffer buffer; |
2170 | |
2171 | inline static uint32_t |
2172 | buffer_open(grn_ctx *ctx, grn_ii *ii, uint32_t pos, buffer_term **bt, buffer **b) |
2173 | { |
2174 | byte *p = NULL; |
2175 | uint16_t lseg = (uint16_t) (LSEG(pos)); |
2176 | uint32_t pseg = ii->header->binfo[lseg]; |
2177 | if (pseg != GRN_II_PSEG_NOT_ASSIGNED) { |
2178 | GRN_IO_SEG_REF(ii->seg, pseg, p); |
2179 | if (!p) { return GRN_II_PSEG_NOT_ASSIGNED; } |
2180 | if (b) { *b = (buffer *)p; } |
2181 | if (bt) { *bt = (buffer_term *)(p + LPOS(pos)); } |
2182 | } |
2183 | return pseg; |
2184 | } |
2185 | |
2186 | inline static grn_rc |
2187 | buffer_close(grn_ctx *ctx, grn_ii *ii, uint32_t pseg) |
2188 | { |
2189 | if (pseg >= ii->seg->header->max_segment) { |
2190 | GRN_LOG(ctx, GRN_LOG_NOTICE, "invalid pseg buffer_close(%d)" , pseg); |
2191 | return GRN_INVALID_ARGUMENT; |
2192 | } |
2193 | GRN_IO_SEG_UNREF(ii->seg, pseg); |
2194 | return GRN_SUCCESS; |
2195 | } |
2196 | |
2197 | inline static uint32_t |
2198 | buffer_open_if_capable(grn_ctx *ctx, grn_ii *ii, int32_t seg, int size, buffer **b) |
2199 | { |
2200 | uint32_t pseg, pos = SEG2POS(seg, 0); |
2201 | if ((pseg = buffer_open(ctx, ii, pos, NULL, b)) != GRN_II_PSEG_NOT_ASSIGNED) { |
2202 | uint16_t nterms = (*b)->header.nterms - (*b)->header.nterms_void; |
2203 | if (!((nterms < 4096 || |
2204 | (ii->header->total_chunk_size >> ((nterms >> 8) - 6)) |
2205 | > (*b)->header.chunk_size) && |
2206 | ((*b)->header.buffer_free >= size + sizeof(buffer_term)))) { |
2207 | buffer_close(ctx, ii, pseg); |
2208 | return GRN_II_PSEG_NOT_ASSIGNED; |
2209 | } |
2210 | } |
2211 | return pseg; |
2212 | } |
2213 | |
/*
 * Identifier of one posting. Postings are ordered by rid first and by sid
 * second. Indexes without GRN_OBJ_WITH_SECTION always use sid 1.
 */
typedef struct {
  uint32_t rid;
  uint32_t sid;
} docid;
2218 | |
/* A jump value of 1 marks a record as deleted. */
#define BUFFER_REC_DEL(r) ((r)->jump = 1)
#define BUFFER_REC_DELETED(r) ((r)->jump == 1)

/*
 * Conversion between a record pointer and its position in buffer `b`.
 * Positions are counted in units of sizeof(buffer_rec) from the start of
 * the buffer.
 */
#define BUFFER_REC_AT(b,pos) ((buffer_rec *)(b) + (pos))
#define BUFFER_REC_POS(b,rec) ((uint16_t)((rec) - (buffer_rec *)(b)))
2224 | |
2225 | inline static void |
2226 | buffer_term_dump(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_term *bt) |
2227 | { |
2228 | int pos, rid, sid; |
2229 | uint8_t *p; |
2230 | buffer_rec *r; |
2231 | |
2232 | if (!grn_logger_pass(ctx, GRN_LOG_DEBUG)) { |
2233 | return; |
2234 | } |
2235 | |
2236 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
2237 | "b=(%x %u %u %u)" , b->header.chunk, b->header.chunk_size, |
2238 | b->header.buffer_free, b->header.nterms); |
2239 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
2240 | "bt=(%u %u %u %u %u)" , bt->tid, bt->size_in_chunk, bt->pos_in_chunk, |
2241 | bt->size_in_buffer, bt->pos_in_buffer); |
2242 | for (pos = bt->pos_in_buffer; pos; pos = r->step) { |
2243 | r = BUFFER_REC_AT(b, pos); |
2244 | p = GRN_NEXT_ADDR(r); |
2245 | GRN_B_DEC(rid, p); |
2246 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
2247 | GRN_B_DEC(sid, p); |
2248 | } else { |
2249 | sid = 1; |
2250 | } |
2251 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
2252 | "%d=(%d:%d),(%d:%d)" , pos, r->jump, r->step, rid, sid); |
2253 | } |
2254 | } |
2255 | |
2256 | inline static grn_rc |
2257 | check_jump(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_rec *r, int j) |
2258 | { |
2259 | uint16_t i = BUFFER_REC_POS(b, r); |
2260 | uint8_t *p; |
2261 | buffer_rec *r2; |
2262 | docid id, id2; |
2263 | if (!j) { return GRN_SUCCESS; } |
2264 | p = GRN_NEXT_ADDR(r); |
2265 | GRN_B_DEC(id.rid, p); |
2266 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
2267 | GRN_B_DEC(id.sid, p); |
2268 | } else { |
2269 | id.sid = 1; |
2270 | } |
2271 | if (j == 1) { |
2272 | GRN_LOG(ctx, GRN_LOG_DEBUG, "deleting! %d(%d:%d)" , i, id.rid, id.sid); |
2273 | return GRN_SUCCESS; |
2274 | } |
2275 | r2 = BUFFER_REC_AT(b, j); |
2276 | p = GRN_NEXT_ADDR(r2); |
2277 | GRN_B_DEC(id2.rid, p); |
2278 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
2279 | GRN_B_DEC(id2.sid, p); |
2280 | } else { |
2281 | id2.sid = 1; |
2282 | } |
2283 | if (r2->step == i) { |
2284 | GRN_LOG(ctx, GRN_LOG_EMERG, "cycle! %d(%d:%d)<->%d(%d:%d)" , |
2285 | i, id.rid, id.sid, j, id2.rid, id2.sid); |
2286 | return GRN_FILE_CORRUPT; |
2287 | } |
2288 | if (id2.rid < id.rid || (id2.rid == id.rid && id2.sid <= id.sid)) { |
2289 | GRN_LOG(ctx, GRN_LOG_CRIT, |
2290 | "invalid jump! %d(%d:%d)(%d:%d)->%d(%d:%d)(%d:%d)" , |
2291 | i, r->jump, r->step, id.rid, id.sid, j, r2->jump, r2->step, |
2292 | id2.rid, id2.sid); |
2293 | return GRN_FILE_CORRUPT; |
2294 | } |
2295 | return GRN_SUCCESS; |
2296 | } |
2297 | |
/*
 * Installs jump target `to` on record `from`, then pushes the jump target
 * that was displaced onto the next record in the list, and so on.
 *
 * The walk stops when the target is 0 or 1, after 100 records, or as soon
 * as an update would be pointless or unsafe (target is the record itself,
 * target is deleted, target equals the current jump or the current step).
 *
 * @return GRN_SUCCESS, ctx->rc (set to GRN_FILE_CORRUPT) when check_jump()
 *         rejects a jump, or GRN_FILE_CORRUPT when the list ends before
 *         the walk is finished.
 */
inline static grn_rc
set_jump_r(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_rec *from, int to)
{
  /* i: jump value displaced from r, j: jump target to install on r. */
  int i, j, max_jump = 100;
  buffer_rec *r, *r2;
  for (r = from, j = to; j > 1 && max_jump--; r = BUFFER_REC_AT(b, r->step)) {
    r2 = BUFFER_REC_AT(b, j);
    if (r == r2) { break; }
    if (BUFFER_REC_DELETED(r2)) { break; }
    if (j == (i = r->jump)) { break; }
    if (j == r->step) { break; }
    if (check_jump(ctx, ii, b, r, j)) {
      ERR(GRN_FILE_CORRUPT, "check_jump failed" );
      return ctx->rc;
    }
    r->jump = j;
    /* The displaced target is handed on to the next record. */
    j = i;
    if (!r->step) { return GRN_FILE_CORRUPT; }
  }
  return GRN_SUCCESS;
}
2319 | |
/*
 * Stores in `n` the number of set bits in the low 32 bits of `x`.
 * Adjacent bit fields are summed pairwise, doubling the field width at
 * every step (1, 2, 4, 8 and 16 bits). `n` must be an lvalue.
 */
#define GET_NUM_BITS(x,n) do {\
  n = x;\
  n = (n & 0x55555555) + ((n >> 1) & 0x55555555);\
  n = (n & 0x33333333) + ((n >> 2) & 0x33333333);\
  n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F0F);\
  n = (n & 0x00FF00FF) + ((n >> 8) & 0x00FF00FF);\
  n = (n & 0x0000FFFF) + ((n >>16) & 0x0000FFFF);\
} while (0)
2328 | |
/*
 * Links the new record `rnew` into the record list of term `bt`.
 *
 * The payload `bs` (size - sizeof(buffer_rec) bytes) is copied behind the
 * record header first. The list is then walked in (rid, sid) order, using
 * the jump links to skip ahead, until either the end of the list or the
 * first record that is not smaller than `u` is found:
 *   - at the end of the list the record is appended;
 *   - otherwise it is inserted in front of the found record. A record with
 *     the same (rid, sid) is marked deleted and unlinked; when u->sid is 0,
 *     all consecutive records with the same rid are marked deleted and
 *     unlinked.
 *
 * If the list turns out not to be sorted, the problem is reported, the
 * term's list is reset to empty and the walk restarts, so the new record
 * becomes the only record of the term.
 *
 * @return ctx->rc
 */
inline static grn_rc
buffer_put(grn_ctx *ctx, grn_ii *ii, buffer *b, buffer_term *bt,
           buffer_rec *rnew, uint8_t *bs, grn_ii_updspec *u, int size)
{
  uint8_t *p;
  /* id_post: id of the previously visited record, used to detect disorder. */
  docid id_curr = {0, 0}, id_start = {0, 0}, id_post = {0, 0};
  buffer_rec *r_curr, *r_start = NULL;
  /* lastp points at the link (list head or a step field) being examined. */
  uint16_t last = 0, *lastp = &bt->pos_in_buffer, pos = BUFFER_REC_POS(b, rnew);
  int vdelta = 0, delta, delta0 = 0, vhops = 0, nhops = 0, reset = 1;
  grn_memcpy(GRN_NEXT_ADDR(rnew), bs, size - sizeof(buffer_rec));
  for (;;) {
    if (!*lastp) {
      /* End of the list: append the new record. */
      rnew->step = 0;
      rnew->jump = 0;
      // smb_wmb();
      *lastp = pos;
      if (bt->size_in_buffer++ > 1) {
        /* The head record's jump is pointed at the new last record. */
        buffer_rec *rhead = BUFFER_REC_AT(b, bt->pos_in_buffer);
        rhead->jump = pos;
        if (!(bt->size_in_buffer & 1)) {
          int n;
          buffer_rec *r = BUFFER_REC_AT(b, rhead->step), *r2;
          GET_NUM_BITS(bt->size_in_buffer, n);
          /* Follow up to n jump links, stopping at a deleted target. */
          while (n-- && (r->jump > 1)) {
            r2 = BUFFER_REC_AT(b, r->jump);
            if (BUFFER_REC_DELETED(r2)) { break; }
            r = r2;
          }
          if (r != rnew) { set_jump_r(ctx, ii, b, r, last); }
        }
      }
      break;
    }
    /* Decode the id of the record the current link points at. */
    r_curr = BUFFER_REC_AT(b, *lastp);
    p = GRN_NEXT_ADDR(r_curr);
    GRN_B_DEC(id_curr.rid, p);
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
      GRN_B_DEC(id_curr.sid, p);
    } else {
      id_curr.sid = 1;
    }
    if (id_curr.rid < id_post.rid ||
        (id_curr.rid == id_post.rid && id_curr.sid < id_post.sid)) {
      /* The list went backwards: report it and start over with an empty list. */
      {
        DEFINE_NAME(ii);
        CRIT(GRN_FILE_CORRUPT,
             "[ii][buffer][put] loop is found: "
             "<%.*s>: "
             "(%d:%d)->(%d:%d)" ,
             name_size, name,
             id_post.rid, id_post.sid, id_curr.rid, id_curr.sid);
      }
      buffer_term_dump(ctx, ii, b, bt);
      bt->pos_in_buffer = 0;
      bt->size_in_buffer = 0;
      lastp = &bt->pos_in_buffer;
      continue;
    }
    id_post.rid = id_curr.rid;
    id_post.sid = id_curr.sid;
    if (u->rid < id_curr.rid || (u->rid == id_curr.rid && u->sid <= id_curr.sid)) {
      /* Insertion point found: the new record goes in front of r_curr. */
      uint16_t step = *lastp, jump = r_curr->jump;
      if (u->rid == id_curr.rid) {
        if (u->sid == 0) {
          /* Mark every consecutive record of this rid deleted and skip it. */
          while (id_curr.rid == u->rid) {
            BUFFER_REC_DEL(r_curr);
            if (!(step = r_curr->step)) { break; }
            r_curr = BUFFER_REC_AT(b, step);
            p = GRN_NEXT_ADDR(r_curr);
            GRN_B_DEC(id_curr.rid, p);
            if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
              GRN_B_DEC(id_curr.sid, p);
            } else {
              id_curr.sid = 1;
            }
          }
        } else if (u->sid == id_curr.sid) {
          /* Same posting: the old record is replaced by the new one. */
          BUFFER_REC_DEL(r_curr);
          step = r_curr->step;
        }
      }
      rnew->step = step;
      /* Inherit the jump of the first found record only if it is valid. */
      rnew->jump = check_jump(ctx, ii, b, rnew, jump) ? 0 : jump;
      // smb_wmb();
      *lastp = pos;
      break;
    }

    /* Bookkeeping that decides when set_jump_r() is called during the walk. */
    if (reset) {
      r_start = r_curr;
      id_start.rid = id_curr.rid;
      id_start.sid = id_curr.sid;
      if (!(delta0 = u->rid - id_start.rid)) { delta0 = u->sid - id_start.sid; }
      nhops = 0;
      vhops = 1;
      vdelta = delta0 >> 1;
    } else {
      if (!(delta = id_curr.rid - id_start.rid)) {
        delta = id_curr.sid - id_start.sid;
      }
      if (vdelta < delta) {
        vdelta += (delta0 >> ++vhops);
        r_start = r_curr;
      }
      if (nhops > vhops) {
        set_jump_r(ctx, ii, b, r_start, *lastp);
      } else {
        nhops++;
      }
    }

    /* Advance: by default to the next record ... */
    last = *lastp;
    lastp = &r_curr->step;
    reset = 0;
    {
      /* ... but follow the jump link when its target is still before u. */
      uint16_t posj = r_curr->jump;
      if (posj > 1) {
        buffer_rec *rj = BUFFER_REC_AT(b, posj);
        if (!BUFFER_REC_DELETED(rj)) {
          docid idj;
          p = GRN_NEXT_ADDR(rj);
          GRN_B_DEC(idj.rid, p);
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
            GRN_B_DEC(idj.sid, p);
          } else {
            idj.sid = 1;
          }
          if (idj.rid < u->rid || (idj.rid == u->rid && idj.sid < u->sid)) {
            last = posj;
            lastp = &rj->step;
          } else {
            reset = 1;
          }
        }
      }
    }
  }
  return ctx->rc;
}
2468 | |
2469 | /* array */ |
2470 | |
2471 | inline static uint32_t * |
2472 | array_at(grn_ctx *ctx, grn_ii *ii, uint32_t id) |
2473 | { |
2474 | byte *p = NULL; |
2475 | uint32_t seg, pseg; |
2476 | if (id > GRN_ID_MAX) { return NULL; } |
2477 | seg = id >> W_ARRAY; |
2478 | if ((pseg = ii->header->ainfo[seg]) == GRN_II_PSEG_NOT_ASSIGNED) { |
2479 | return NULL; |
2480 | } |
2481 | GRN_IO_SEG_REF(ii->seg, pseg, p); |
2482 | if (!p) { return NULL; } |
2483 | return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * S_ARRAY_ELEMENT); |
2484 | } |
2485 | |
2486 | inline static uint32_t * |
2487 | array_get(grn_ctx *ctx, grn_ii *ii, uint32_t id) |
2488 | { |
2489 | byte *p = NULL; |
2490 | uint16_t seg; |
2491 | uint32_t pseg; |
2492 | if (id > GRN_ID_MAX) { return NULL; } |
2493 | seg = id >> W_ARRAY; |
2494 | if ((pseg = ii->header->ainfo[seg]) == GRN_II_PSEG_NOT_ASSIGNED) { |
2495 | if (segment_get_clear(ctx, ii, &pseg)) { return NULL; } |
2496 | ii->header->ainfo[seg] = pseg; |
2497 | if (seg >= ii->header->amax) { ii->header->amax = seg + 1; } |
2498 | } |
2499 | GRN_IO_SEG_REF(ii->seg, pseg, p); |
2500 | if (!p) { return NULL; } |
2501 | return (uint32_t *)(p + (id & ARRAY_MASK_IN_A_SEGMENT) * S_ARRAY_ELEMENT); |
2502 | } |
2503 | |
2504 | inline static void |
2505 | array_unref(grn_ii *ii, uint32_t id) |
2506 | { |
2507 | GRN_IO_SEG_UNREF(ii->seg, ii->header->ainfo[id >> W_ARRAY]); |
2508 | } |
2509 | |
2510 | /* updspec */ |
2511 | |
2512 | grn_ii_updspec * |
2513 | grn_ii_updspec_open(grn_ctx *ctx, uint32_t rid, uint32_t sid) |
2514 | { |
2515 | grn_ii_updspec *u; |
2516 | if (!(u = GRN_MALLOC(sizeof(grn_ii_updspec)))) { return NULL; } |
2517 | u->rid = rid; |
2518 | u->sid = sid; |
2519 | u->weight = 0; |
2520 | u->tf = 0; |
2521 | u->atf = 0; |
2522 | u->pos = NULL; |
2523 | u->tail = NULL; |
2524 | // u->vnodes = NULL; |
2525 | return u; |
2526 | } |
2527 | |
2528 | #define GRN_II_MAX_TF 0x1ffff |
2529 | |
2530 | grn_rc |
2531 | grn_ii_updspec_add(grn_ctx *ctx, grn_ii_updspec *u, int pos, int32_t weight) |
2532 | { |
2533 | struct _grn_ii_pos *p; |
2534 | u->atf++; |
2535 | if (u->tf >= GRN_II_MAX_TF) { return GRN_SUCCESS; } |
2536 | if (!(p = GRN_MALLOC(sizeof(struct _grn_ii_pos)))) { |
2537 | return GRN_NO_MEMORY_AVAILABLE; |
2538 | } |
2539 | u->weight += weight; |
2540 | p->pos = pos; |
2541 | p->next = NULL; |
2542 | if (u->tail) { |
2543 | u->tail->next = p; |
2544 | } else { |
2545 | u->pos = p; |
2546 | } |
2547 | u->tail = p; |
2548 | u->tf++; |
2549 | return GRN_SUCCESS; |
2550 | } |
2551 | |
2552 | int |
2553 | grn_ii_updspec_cmp(grn_ii_updspec *a, grn_ii_updspec *b) |
2554 | { |
2555 | struct _grn_ii_pos *pa, *pb; |
2556 | if (a->rid != b->rid) { return a->rid - b->rid; } |
2557 | if (a->sid != b->sid) { return a->sid - b->sid; } |
2558 | if (a->weight != b->weight) { return a->weight - b->weight; } |
2559 | if (a->tf != b->tf) { return a->tf - b->tf; } |
2560 | for (pa = a->pos, pb = b->pos; pa && pb; pa = pa->next, pb = pb->next) { |
2561 | if (pa->pos != pb->pos) { return pa->pos - pb->pos; } |
2562 | } |
2563 | if (pa) { return 1; } |
2564 | if (pb) { return -1; } |
2565 | return 0; |
2566 | } |
2567 | |
2568 | grn_rc |
2569 | grn_ii_updspec_close(grn_ctx *ctx, grn_ii_updspec *u) |
2570 | { |
2571 | struct _grn_ii_pos *p = u->pos, *q; |
2572 | while (p) { |
2573 | q = p->next; |
2574 | GRN_FREE(p); |
2575 | p = q; |
2576 | } |
2577 | GRN_FREE(u); |
2578 | return GRN_SUCCESS; |
2579 | } |
2580 | |
2581 | inline static uint8_t * |
2582 | encode_rec(grn_ctx *ctx, grn_ii *ii, grn_ii_updspec *u, unsigned int *size, int deletep) |
2583 | { |
2584 | uint8_t *br, *p; |
2585 | struct _grn_ii_pos *pp; |
2586 | uint32_t lpos, tf, weight; |
2587 | if (deletep) { |
2588 | tf = 0; |
2589 | weight = 0; |
2590 | } else { |
2591 | tf = u->tf; |
2592 | weight = u->weight; |
2593 | } |
2594 | if (!(br = GRN_MALLOC((tf + 4) * 5))) { |
2595 | return NULL; |
2596 | } |
2597 | p = br; |
2598 | GRN_B_ENC(u->rid, p); |
2599 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
2600 | GRN_B_ENC(u->sid, p); |
2601 | } else { |
2602 | u->sid = 1; |
2603 | } |
2604 | GRN_B_ENC(tf, p); |
2605 | if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { GRN_B_ENC(weight, p); } |
2606 | if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) { |
2607 | for (lpos = 0, pp = u->pos; pp && tf--; lpos = pp->pos, pp = pp->next) { |
2608 | GRN_B_ENC(pp->pos - lpos, p); |
2609 | } |
2610 | } |
2611 | while (((intptr_t)p & 0x03)) { *p++ = 0; } |
2612 | *size = (unsigned int) ((p - br) + sizeof(buffer_rec)); |
2613 | return br; |
2614 | } |
2615 | |
/*
 * Argument bundle passed to lexicon_deletable() through the table delete
 * option.
 */
typedef struct {
  grn_ii *ii; /* index whose array is checked for the term */
  grn_hash *h; /* hash looked up by term id; its values are grn_ii_updspec pointers */
} lexicon_deletable_arg;
2620 | |
#ifdef CASCADE_DELETE_LEXICON
/*
 * Delete-option callback: decides whether term `tid` may be removed from
 * the lexicon.
 *
 * Returns 1 (deletable) only when the index array has no non-zero first
 * element for the term and either the hash has no entry for it (and no
 * error is pending) or the entry's update spec has tf == 0 or sid == 0.
 * Returns 0 otherwise, including when no hash was supplied.
 */
static int
lexicon_deletable(grn_ctx *ctx, grn_obj *lexicon, grn_id tid, void *arg)
{
  lexicon_deletable_arg *deletable_arg = (lexicon_deletable_arg *)arg;
  grn_hash *h = deletable_arg->h;
  grn_ii *ii = deletable_arg->ii;
  uint32_t *a;
  grn_ii_updspec **u;

  if (h == NULL) {
    return 0;
  }

  a = array_at(ctx, ii, tid);
  if (a != NULL) {
    /* Read the element before the segment reference is dropped. */
    uint32_t in_use = a[0];
    array_unref(ii, tid);
    if (in_use) {
      return 0;
    }
  }

  if (!grn_hash_get(ctx, h, &tid, sizeof(grn_id), (void **) &u)) {
    if (ERRP(ctx, GRN_ERROR)) {
      return 0;
    }
    return 1;
  }
  return (!(*u)->tf || !(*u)->sid) ? 1 : 0;
}
#endif /* CASCADE_DELETE_LEXICON */
2646 | |
2647 | inline static void |
2648 | lexicon_delete(grn_ctx *ctx, grn_ii *ii, uint32_t tid, grn_hash *h) |
2649 | { |
2650 | #ifdef CASCADE_DELETE_LEXICON |
2651 | lexicon_deletable_arg arg = {ii, h}; |
2652 | grn_table_delete_optarg optarg = {0, lexicon_deletable, &arg}; |
2653 | _grn_table_delete_by_id(ctx, ii->lexicon, tid, &optarg); |
2654 | #endif /* CASCADE_DELETE_LEXICON */ |
2655 | } |
2656 | |
/*
 * One posting as handled by the merge macros below
 * (GETNEXTC/GETNEXTB/PUTNEXT_/PUTNEXTC/PUTNEXTB/MERGE_BC).
 */
typedef struct {
  grn_id rid;       /* record ID; the macros set it to 0 when a source is exhausted */
  uint32_t sid;     /* section ID; forced to 1 when GRN_OBJ_WITH_SECTION is off */
  uint32_t tf;      /* term frequency; also the number of positions copied per posting */
  uint32_t weight;  /* only read/written when GRN_OBJ_WITH_WEIGHT is set */
  uint32_t flags;   /* not referenced by the merge macros that follow */
} docinfo;
2664 | |
/*
 * GETNEXTC(): load the next posting of the decoded chunk into `cid`.
 *
 * Relies on these names at the expansion site: ii, cid, sdf (number of
 * chunk postings still unread), srp (record-ID gaps), ssp (section-ID gaps),
 * stp (tf - 1 values), sop (weights) and snp (position cursor).
 *
 * While sdf is non-zero:
 *   - the record-ID gap is added to cid.rid; a non-zero gap restarts
 *     cid.sid from 0;
 *   - snp is advanced past the positions of the previous posting (the old
 *     cid.tf) before cid.tf is replaced by 1 + *stp;
 *   - weight is read only with GRN_OBJ_WITH_WEIGHT; the section ID is
 *     accumulated as 1 + gap with GRN_OBJ_WITH_SECTION, otherwise fixed to 1.
 * When sdf is 0, cid.rid is set to 0, which the other macros treat as
 * "no more chunk postings".
 */
#define GETNEXTC() do {\
  if (sdf) {\
    uint32_t dgap = *srp++;\
    cid.rid += dgap;\
    if (dgap) { cid.sid = 0; }\
    snp += cid.tf;\
    cid.tf = 1 + *stp++;\
    if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { cid.weight = *sop++; }\
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
      cid.sid += 1 + *ssp++;\
    } else {\
      cid.sid = 1;\
    }\
    sdf--;\
  } else {\
    cid.rid = 0;\
  }\
} while (0)
2683 | |
/*
 * PUTNEXT_(id): append posting `id` to the output vectors, delta-encoded
 * against the previously emitted posting `lid`.
 *
 * Relies on these names at the expansion site: ii, lid, ridp, sidp, tfp
 * and weightp (output cursors).
 *
 *   - *ridp receives the record-ID gap (id.rid - lid.rid);
 *   - with GRN_OBJ_WITH_SECTION, *sidp receives the section gap minus 1
 *     (the gap restarts from id.sid when the record changed);
 *   - *tfp receives id.tf - 1;
 *   - with GRN_OBJ_WITH_WEIGHT, *weightp receives id.weight.
 *
 * Only lid.rid and lid.sid are updated afterwards; lid.tf and lid.weight
 * are left untouched. Positions are written by the caller macro.
 * `id` is evaluated several times, so pass a plain variable.
 */
#define PUTNEXT_(id) do {\
  uint32_t dgap = id.rid - lid.rid;\
  uint32_t sgap = (dgap ? id.sid : id.sid - lid.sid) - 1;\
  *ridp++ = dgap;\
  if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
    *sidp++ = sgap;\
  }\
  *tfp++ = id.tf - 1;\
  if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { *weightp++ = id.weight; }\
  lid.rid = id.rid;\
  lid.sid = id.sid;\
} while (0)
2696 | |
/*
 * PUTNEXTC(): emit the current chunk posting `cid` and advance to the next.
 *
 * Relies on these names at the expansion site: ctx, ii, bt, cid, lid, snp,
 * posp, spos, plus everything PUTNEXT_() and GETNEXTC() need.
 *
 * When cid.rid is non-zero:
 *   - cid.tf != 0: the posting must sort strictly after `lid`
 *     (by rid, then sid). If not, GRN_FILE_CORRUPT is raised via CRIT and
 *     the `break` leaves this macro's own do/while(0), so the trailing
 *     GETNEXTC() is skipped. Otherwise the posting is appended with
 *     PUTNEXT_() and, with GRN_OBJ_WITH_POSITION, its cid.tf position
 *     values are copied from snp to posp and summed into spos.
 *   - cid.tf == 0: GRN_FILE_CORRUPT is raised and the macro is left the
 *     same way.
 * In every non-error case GETNEXTC() is called last.
 * Callers detect the error cases by checking ctx->rc afterwards.
 */
#define PUTNEXTC() do {\
  if (cid.rid) {\
    if (cid.tf) {\
      if (lid.rid > cid.rid || (lid.rid == cid.rid && lid.sid >= cid.sid)) {\
        DEFINE_NAME(ii);\
        CRIT(GRN_FILE_CORRUPT,\
             "[ii][broken] posting in list is larger than posting in chunk: "\
             "<%.*s>: (%d:%d) -> (%d:%d)",\
             name_size, name, lid.rid, lid.sid, cid.rid, cid.sid);\
        break;\
      }\
      PUTNEXT_(cid);\
      if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {\
        uint32_t i;\
        for (i = 0; i < cid.tf; i++) {\
          *posp++ = snp[i];\
          spos += snp[i];\
        }\
      }\
    } else {\
      DEFINE_NAME(ii);\
      CRIT(GRN_FILE_CORRUPT,\
           "[ii][broken] invalid posting in chunk: <%.*s>: (%d,%d)",\
           name_size, name, bt->tid, cid.rid);\
      break;\
    }\
  }\
  GETNEXTC();\
} while (0)
2726 | |
/*
 * GETNEXTB(): load the header (rid, sid) of the next buffer record into `bid`.
 *
 * Relies on these names at the expansion site: ctx, ii, sb (source buffer),
 * nextb (position of the next record, 0 = end of list), sbp (byte cursor
 * into the record) and bid.
 *
 * When nextb is non-zero the record at BUFFER_REC_AT(sb, nextb) is opened,
 * sbp is pointed at its payload and rid (and sid, with
 * GRN_OBJ_WITH_SECTION; otherwise sid = 1) are decoded. sbp is left just
 * after those fields so that PUTNEXTB() can continue decoding tf, weight
 * and positions. The new (rid, sid) must sort strictly after the previous
 * bid; otherwise GRN_FILE_CORRUPT is raised via CRIT and the `break` leaves
 * this macro without advancing nextb. On success nextb becomes br->step.
 * When nextb is 0, bid.rid is set to 0 ("no more buffer postings").
 */
#define GETNEXTB() do {\
  if (nextb) {\
    uint32_t lrid = bid.rid, lsid = bid.sid;\
    buffer_rec *br = BUFFER_REC_AT(sb, nextb);\
    sbp = GRN_NEXT_ADDR(br);\
    GRN_B_DEC(bid.rid, sbp);\
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {\
      GRN_B_DEC(bid.sid, sbp);\
    } else {\
      bid.sid = 1;\
    }\
    if (lrid > bid.rid || (lrid == bid.rid && lsid >= bid.sid)) {\
      DEFINE_NAME(ii);\
      CRIT(GRN_FILE_CORRUPT,\
           "[ii][broken] postings in block aren't sorted: "\
           "<%.*s>: (%d:%d) -> (%d:%d)",\
           name_size, name, lrid, lsid, bid.rid, bid.sid);\
      break;\
    }\
    nextb = br->step;\
  } else {\
    bid.rid = 0;\
  }\
} while (0)
2751 | |
/*
 * PUTNEXTB(): emit the current buffer posting `bid` and advance to the next.
 *
 * Relies on these names at the expansion site: ctx, ii, bid, lid, sbp,
 * posp, spos, plus everything PUTNEXT_() and GETNEXTB() need.
 *
 * When both bid.rid and bid.sid are non-zero, tf is decoded from sbp.
 * A posting with tf == 0 emits nothing. With tf > 0 the posting must sort
 * strictly after `lid`; otherwise GRN_FILE_CORRUPT is raised via CRIT and
 * the `break` leaves this macro's own do/while(0), skipping the trailing
 * GETNEXTB(). Otherwise the weight is decoded (GRN_OBJ_WITH_WEIGHT), the
 * posting is appended with PUTNEXT_() and, with GRN_OBJ_WITH_POSITION,
 * bid.tf position values are decoded into posp and summed into spos
 * (the countdown loop consumes bid.tf).
 * In every non-error case GETNEXTB() is called last.
 */
#define PUTNEXTB() do {\
  if (bid.rid && bid.sid) {\
    GRN_B_DEC(bid.tf, sbp);\
    if (bid.tf > 0) {\
      if (lid.rid > bid.rid || (lid.rid == bid.rid && lid.sid >= bid.sid)) {\
        DEFINE_NAME(ii);\
        CRIT(GRN_FILE_CORRUPT,\
             "[ii][broken] posting in list is larger than posting in buffer: "\
             "<%.*s>: (%d:%d) -> (%d:%d)",\
             name_size, name, lid.rid, lid.sid, bid.rid, bid.sid);\
        break;\
      }\
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {\
        GRN_B_DEC(bid.weight, sbp);\
      }\
      PUTNEXT_(bid);\
      if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {\
        while (bid.tf--) { GRN_B_DEC(*posp, sbp); spos += *posp++; }\
      }\
    }\
  }\
  GETNEXTB();\
} while (0)
2775 | |
/*
 * MERGE_BC(cond): merge the buffer posting stream (`bid`) with the chunk
 * posting stream (`cid`) into the output vectors, one step per iteration,
 * repeating while `cond` is true. `cond` is re-evaluated after every step.
 *
 * Per iteration:
 *   - both streams have a posting (bid.rid and cid.rid non-zero):
 *       cid.rid < bid.rid            -> PUTNEXTC()
 *       bid.rid < cid.rid            -> PUTNEXTB()
 *       same rid, bid.sid != 0:
 *         cid.sid < bid.sid          -> PUTNEXTC()
 *         otherwise                  -> PUTNEXTB(); when the sids are equal
 *                                       the chunk posting is first skipped
 *                                       with GETNEXTC(), so the buffer
 *                                       posting takes its place
 *       same rid, bid.sid == 0       -> GETNEXTC() only: the chunk posting
 *                                       is dropped and `bid` is not advanced
 *   - only the buffer has a posting  -> PUTNEXTB()
 *   - only the chunk has a posting   -> PUTNEXTC()
 *   - neither has a posting          -> leave the loop
 * After every PUTNEXTB()/PUTNEXTC() the loop is left if ctx->rc is no
 * longer GRN_SUCCESS; callers must check ctx->rc after the macro.
 */
#define MERGE_BC(cond) do {\
  if (bid.rid) {\
    if (cid.rid) {\
      if (cid.rid < bid.rid) {\
        PUTNEXTC();\
        if (ctx->rc != GRN_SUCCESS) { break; }\
      } else {\
        if (bid.rid < cid.rid) {\
          PUTNEXTB();\
          if (ctx->rc != GRN_SUCCESS) { break; }\
        } else {\
          if (bid.sid) {\
            if (cid.sid < bid.sid) {\
              PUTNEXTC();\
              if (ctx->rc != GRN_SUCCESS) { break; }\
            } else {\
              if (bid.sid == cid.sid) { GETNEXTC(); }\
              PUTNEXTB();\
              if (ctx->rc != GRN_SUCCESS) { break; }\
            }\
          } else {\
            GETNEXTC();\
          }\
        }\
      }\
    } else {\
      PUTNEXTB();\
      if (ctx->rc != GRN_SUCCESS) { break; }\
    }\
  } else {\
    if (cid.rid) {\
      PUTNEXTC();\
      if (ctx->rc != GRN_SUCCESS) { break; }\
    } else {\
      break;\
    }\
  }\
} while (cond)
2814 | |
/*
 * Descriptor of one separately stored chunk of a term's posting list.
 * A size of 0 marks an entry that is skipped when the list of chunks is
 * written back (see buffer_merge()).
 */
typedef struct {
  uint32_t segno;  /* chunk number; filled from chunk_new() by chunk_flush() */
  uint32_t size;   /* encoded size in bytes; 0 when nothing is stored */
  uint32_t dgap;   /* record-ID gap: buffer_merge() sums these into a running
                      record ID and stores (last rid - running rid) for a
                      newly created chunk */
} chunk_info;
2820 | |
2821 | static grn_rc |
2822 | chunk_flush(grn_ctx *ctx, grn_ii *ii, chunk_info *cinfo, uint8_t *enc, uint32_t encsize) |
2823 | { |
2824 | uint8_t *dc; |
2825 | uint32_t dcn; |
2826 | grn_io_win dw; |
2827 | if (encsize) { |
2828 | chunk_new(ctx, ii, &dcn, encsize); |
2829 | if (ctx->rc == GRN_SUCCESS) { |
2830 | if ((dc = WIN_MAP(ii->chunk, ctx, &dw, dcn, 0, encsize, grn_io_wronly))) { |
2831 | grn_memcpy(dc, enc, encsize); |
2832 | grn_io_win_unmap(&dw); |
2833 | cinfo->segno = dcn; |
2834 | cinfo->size = encsize; |
2835 | } else { |
2836 | chunk_free(ctx, ii, dcn, 0, encsize); |
2837 | { |
2838 | DEFINE_NAME(ii); |
2839 | MERR("[ii][chunk][flush] failed to allocate a destination chunk: " |
2840 | "<%.*s> :" |
2841 | "segment:<%u>, size:<%u>" , |
2842 | name_size, name, |
2843 | dcn, encsize); |
2844 | } |
2845 | } |
2846 | } |
2847 | } else { |
2848 | cinfo->segno = 0; |
2849 | cinfo->size = 0; |
2850 | } |
2851 | return ctx->rc; |
2852 | } |
2853 | |
/*
 * Merge pending buffer postings of one term into one separately stored
 * chunk and store the merged result as a new chunk.
 *
 * ctx, ii   context and index.
 * sb        source buffer; read by GETNEXTB() to follow the record list.
 * bt        buffer term being merged (used in PUTNEXTC()'s error message).
 * cinfo     in: the chunk to merge (segno, size); out: rewritten by
 *           chunk_flush() to describe the new chunk.
 * rid       the merge loop keeps running while bid.rid <= rid or chunk
 *           postings remain (see the MERGE_BC() condition below).
 * dv        output data vectors; reset here to fit the decoded chunk.
 * nextbp, sbpp, bidp
 *           in/out cursor over the buffer records (next record position,
 *           byte cursor, current posting). They are written back only when
 *           the merge loop finished with ctx->rc == GRN_SUCCESS.
 * balance   incremented by (postings written - postings read from chunk).
 *
 * Returns ctx->rc. The old chunk is freed only after the new one was
 * stored successfully.
 *
 * Many locals (sdf, srp, ssp, stp, sop, snp, cid, lid, bid, nextb, sbp,
 * ridp, sidp, tfp, weightp, posp, spos) are consumed by name by the merge
 * macros, so they must keep these exact names.
 */
static grn_rc
chunk_merge(grn_ctx *ctx, grn_ii *ii, buffer *sb, buffer_term *bt,
            chunk_info *cinfo, grn_id rid, datavec *dv,
            uint16_t *nextbp, uint8_t **sbpp, docinfo *bidp, int32_t *balance)
{
  grn_io_win sw;
  uint64_t spos = 0;
  uint32_t segno = cinfo->segno, size = cinfo->size, sdf = 0, ndf = 0;
  uint32_t *ridp = NULL, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL;
  docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = *bidp;
  uint8_t *scp = WIN_MAP(ii->chunk, ctx, &sw, segno, 0, size, grn_io_rdonly);

  if (scp) {
    uint16_t nextb = *nextbp;
    uint32_t snn = 0, *srp, *ssp = NULL, *stp, *sop = NULL, *snp;
    uint8_t *sbp = *sbpp;
    datavec rdv[MAX_N_ELEMENTS + 1];
    size_t bufsize = S_SEGMENT * ii->n_elements;
    datavec_init(ctx, rdv, ii->n_elements, 0, 0);
    if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
      rdv[ii->n_elements - 1].flags = ODD;
    }
    /* Decode the source chunk; the returned value enlarges the size
     * requested for the output vectors below. */
    bufsize += grn_p_decv(ctx, scp, cinfo->size, rdv, ii->n_elements);
    // (df in chunk list) = a[1] - sdf;
    {
      /* Bind the read cursors to the decoded vectors. The vector order is
       * rid, [sid], tf, [weight], position; optional ones exist only when
       * the corresponding index flag is set. */
      int j = 0;
      sdf = rdv[j].data_size;
      srp = rdv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; }
      stp = rdv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; }
      snn = rdv[j].data_size; /* assigned but not read afterwards in this function */
      snp = rdv[j].data;
    }
    datavec_reset(ctx, dv, ii->n_elements, sdf + S_SEGMENT, bufsize);
    if (ctx->rc == GRN_SUCCESS) {
      {
        /* Bind the write cursors to the output vectors, same order. */
        int j = 0;
        ridp = dv[j++].data;
        if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { sidp = dv[j++].data; }
        tfp = dv[j++].data;
        if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { weightp = dv[j++].data; }
        posp = dv[j].data;
      }
      GETNEXTC();
      MERGE_BC(bid.rid <= rid || cid.rid);
      if (ctx->rc == GRN_SUCCESS) {
        /* Hand the advanced buffer cursor back to the caller. */
        *sbpp = sbp;
        *nextbp = nextb;
        *bidp = bid;
        GRN_ASSERT(posp < dv[ii->n_elements].data);
        ndf = ridp - dv[0].data;
      }
    }
    datavec_fin(ctx, rdv);
    grn_io_win_unmap(&sw);
  } else {
    DEFINE_NAME(ii);
    MERR("[ii][chunk][merge] failed to allocate a source chunk: "
         "<%.*s> :"
         "record:<%u>, segment:<%u>, size:<%u>" ,
         name_size, name,
         rid,
         segno,
         size);
  }
  if (ctx->rc == GRN_SUCCESS) {
    int j = 0;
    uint8_t *enc;
    uint32_t encsize;
    uint32_t np = posp - dv[ii->n_elements - 1].data;
    /* Choose per vector whether USE_P_ENC is requested, based on the
     * number of postings/positions and on the last rid / position sum. */
    uint32_t f_s = (ndf < 3) ? 0 : USE_P_ENC;
    uint32_t f_d = ((ndf < 16) || (ndf <= (lid.rid >> 8))) ? 0 : USE_P_ENC;
    dv[j].data_size = ndf; dv[j++].flags = f_d;
    if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
      dv[j].data_size = ndf; dv[j++].flags = f_s;
    }
    dv[j].data_size = ndf; dv[j++].flags = f_s;
    if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
      dv[j].data_size = ndf; dv[j++].flags = f_s;
    }
    if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
      uint32_t f_p = ((np < 32) || (np <= (spos >> 13))) ? 0 : USE_P_ENC;
      dv[j].data_size = np; dv[j].flags = f_p|ODD;
    }
    /* NOTE(review): the encode buffer is sized as (ndf * 4 + np) * 2 bytes;
     * whether that always bounds the output of grn_p_encv() cannot be
     * confirmed from this function — verify against the encoder. */
    if ((enc = GRN_MALLOC((ndf * 4 + np) * 2))) {
      encsize = grn_p_encv(ctx, dv, ii->n_elements, enc);
      chunk_flush(ctx, ii, cinfo, enc, encsize);
      if (ctx->rc == GRN_SUCCESS) {
        /* The new chunk is in place; release the chunk that was merged. */
        chunk_free(ctx, ii, segno, 0, size);
      }
      GRN_FREE(enc);
    } else {
      DEFINE_NAME(ii);
      MERR("[ii][chunk][merge] failed to allocate a encode buffer: "
           "<%.*s> :"
           "record:<%u>, segment:<%u>, size:<%u>" ,
           name_size, name,
           rid,
           segno,
           size);
    }
  }
  /* Executed on failure paths too; callers check the returned rc. */
  *balance += (ndf - sdf);
  return ctx->rc;
}
2960 | |
2961 | static void |
2962 | buffer_merge_dump_datavec(grn_ctx *ctx, |
2963 | grn_ii *ii, |
2964 | datavec *dv, |
2965 | datavec *rdv) |
2966 | { |
2967 | int i, j; |
2968 | grn_obj buffer; |
2969 | |
2970 | GRN_TEXT_INIT(&buffer, 0); |
2971 | for (i = 0; i < ii->n_elements; i++) { |
2972 | GRN_LOG(ctx, GRN_LOG_DEBUG, "rdv[%d] data_size=%d, flags=%d" , |
2973 | i, rdv[i].data_size, rdv[i].flags); |
2974 | GRN_BULK_REWIND(&buffer); |
2975 | for (j = 0; j < rdv[i].data_size;) { |
2976 | grn_text_printf(ctx, &buffer, " %d" , rdv[i].data[j]); |
2977 | j++; |
2978 | if (!(j % 32) || j == rdv[i].data_size) { |
2979 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
2980 | "rdv[%d].data[%d]%.*s" , |
2981 | i, j, |
2982 | (int)GRN_TEXT_LEN(&buffer), |
2983 | GRN_TEXT_VALUE(&buffer)); |
2984 | GRN_BULK_REWIND(&buffer); |
2985 | } |
2986 | } |
2987 | } |
2988 | |
2989 | for (i = 0; i < ii->n_elements; i++) { |
2990 | GRN_LOG(ctx, GRN_LOG_DEBUG, "dv[%d] data_size=%d, flags=%d" , |
2991 | i, dv[i].data_size, dv[i].flags); |
2992 | GRN_BULK_REWIND(&buffer); |
2993 | for (j = 0; j < dv[i].data_size;) { |
2994 | grn_text_printf(ctx, &buffer, " %d" , dv[i].data[j]); |
2995 | j++; |
2996 | if (!(j % 32) || j == dv[i].data_size) { |
2997 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
2998 | "dv[%d].data[%d]%.*s" , |
2999 | i, j, |
3000 | (int)GRN_TEXT_LEN(&buffer), |
3001 | GRN_TEXT_VALUE(&buffer)); |
3002 | GRN_BULK_REWIND(&buffer); |
3003 | } |
3004 | } |
3005 | } |
3006 | |
3007 | GRN_OBJ_FIN(ctx, &buffer); |
3008 | } |
3009 | |
/*
 * Merge every term of a buffer with its chunk data.
 *
 * For each term in db->terms, the buffer records reachable from
 * bt->pos_in_buffer (read from `sb`) are merged with the term's postings in
 * the source chunk `sc`, and the result is encoded into the destination
 * chunk memory `dc`. The term entry in `db` is rewritten in place to point
 * at the new data, or cleared when the term ends up with no postings.
 *
 * ctx, ii   context and index.
 * seg       buffer segment number (used only in an error message here).
 * h         passed through to lexicon_delete().
 * sb        source buffer.
 * sc        mapped source chunk, or NULL when the buffer has no chunk.
 * db        destination buffer; its terms and header are updated.
 * dc        destination chunk memory, written sequentially.
 *
 * Returns ctx->rc. On the early error returns both data vectors (and the
 * chunk_info array, if any) are released before returning.
 *
 * If dc doesn't have enough space, the program may crash.
 * TODO: Support auto space extension or max size check.
 *
 * Many locals are consumed by name by the merge macros
 * (GETNEXTB/GETNEXTC/MERGE_BC ...), so they must keep their exact names.
 */
static grn_rc
buffer_merge(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h,
             buffer *sb, uint8_t *sc, buffer *db, uint8_t *dc)
{
  buffer_term *bt;
  uint8_t *sbp = NULL, *dcp = dc;
  datavec dv[MAX_N_ELEMENTS + 1];
  datavec rdv[MAX_N_ELEMENTS + 1];
  uint16_t n = db->header.nterms, nterms_void = 0;
  /* NOTE(review): this divides by sb->header.nterms; a source buffer with
   * zero terms would divide by zero — confirm callers never pass one. */
  size_t unitsize = (S_SEGMENT + sb->header.chunk_size / sb->header.nterms) * 2;
  // size_t unitsize = (S_SEGMENT + sb->header.chunk_size) * 2 + (1<<24);
  size_t totalsize = unitsize * ii->n_elements;
  //todo : realloc
  datavec_init(ctx, dv, ii->n_elements, unitsize, totalsize);
  if (ctx->rc != GRN_SUCCESS) {
    DEFINE_NAME(ii);
    ERR(ctx->rc,
        "[ii][buffer][merge] failed to initialize data vector: "
        "<%.*s>: "
        "unit-size:<%" GRN_FMT_SIZE ">, "
        "total-size:<%" GRN_FMT_SIZE ">" ,
        name_size, name,
        unitsize,
        totalsize);
    return ctx->rc;
  }
  datavec_init(ctx, rdv, ii->n_elements, 0, 0);
  if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
    rdv[ii->n_elements - 1].flags = ODD;
  }
  for (bt = db->terms; n; n--, bt++) {
    uint16_t nextb;
    uint64_t spos = 0;
    int32_t balance = 0;
    uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp, nchunks = 0;
    uint32_t nvchunks = 0;
    chunk_info *cinfo = NULL;
    grn_id crid = GRN_ID_NIL;
    docinfo cid = {0, 0, 0, 0, 0}, lid = {0, 0, 0, 0, 0}, bid = {0, 0};
    uint32_t sdf = 0, snn = 0, ndf;
    uint32_t *srp = NULL, *ssp = NULL, *stp = NULL, *sop = NULL, *snp = NULL;
    /* Already-void term slot: just count it. */
    if (!bt->tid) {
      nterms_void++;
      continue;
    }
    /* No buffer records for this term: copy its chunk data verbatim. */
    if (!bt->pos_in_buffer) {
      GRN_ASSERT(!bt->size_in_buffer);
      if (bt->size_in_chunk) {
        grn_memcpy(dcp, sc + bt->pos_in_chunk, bt->size_in_chunk);
        bt->pos_in_chunk = (uint32_t)(dcp - dc);
        dcp += bt->size_in_chunk;
      }
      continue;
    }
    nextb = bt->pos_in_buffer;
    GETNEXTB();
    if (sc && bt->size_in_chunk) {
      uint8_t *scp = sc + bt->pos_in_chunk;
      uint8_t *sce = scp + bt->size_in_chunk;
      size_t size = S_SEGMENT * ii->n_elements;
      if ((bt->tid & CHUNK_SPLIT)) {
        /* The term's chunk data starts with a list of separately stored
         * chunks: a count followed by (segno, size, dgap) triples. */
        int i;
        GRN_B_DEC(nchunks, scp);
        /* One extra slot is reserved for a chunk that may be split off
         * further below. */
        if (!(cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) {
          datavec_fin(ctx, dv);
          datavec_fin(ctx, rdv);
          {
            DEFINE_NAME(ii);
            MERR("[ii][buffer][merge] failed to allocate chunk info: "
                 "<%.*s> :"
                 "segment:<%u>, "
                 "n-chunks:<%u>, "
                 "unit-size:<%" GRN_FMT_SIZE ">, "
                 "total-size:<%" GRN_FMT_SIZE ">" ,
                 name_size, name,
                 seg,
                 nchunks,
                 unitsize,
                 totalsize);
          }
          return ctx->rc;
        }
        for (i = 0; i < nchunks; i++) {
          GRN_B_DEC(cinfo[i].segno, scp);
          GRN_B_DEC(cinfo[i].size, scp);
          GRN_B_DEC(cinfo[i].dgap, scp);
          crid += cinfo[i].dgap;
          /* Only chunks that may receive buffer postings are rewritten. */
          if (bid.rid <= crid) {
            chunk_merge(ctx, ii, sb, bt, &cinfo[i], crid, dv,
                        &nextb, &sbp, &bid, &balance);
            if (ctx->rc != GRN_SUCCESS) {
              if (cinfo) { GRN_FREE(cinfo); }
              datavec_fin(ctx, dv);
              datavec_fin(ctx, rdv);
              {
                DEFINE_NAME(ii);
                ERR(ctx->rc,
                    "[ii][buffer][merge] failed to merge chunk: "
                    "<%.*s>: "
                    "chunk:<%u>, "
                    "n-chunks:<%u>" ,
                    name_size, name,
                    i,
                    nchunks);
              }
              return ctx->rc;
            }
          }
          if (cinfo[i].size) {
            nvchunks++;
          } else {
            /* The chunk became empty: undo its contribution to crid. */
            crid -= cinfo[i].dgap;
            /* NOTE(review): cinfo[i + 1].dgap is overwritten by the
             * GRN_B_DEC at the top of the next iteration (and, for the
             * extra last slot, by the assignment made when a new chunk is
             * split off below), so this adjustment appears to have no
             * lasting effect — confirm the intent. */
            cinfo[i + 1].dgap += cinfo[i].dgap;
          }
        }
      }
      /* Whatever follows in the term's chunk data is an inline encoded
       * posting list; decode it and bind the read cursors to it. */
      if (sce > scp) {
        size += grn_p_decv(ctx, scp, sce - scp, rdv, ii->n_elements);
        {
          int j = 0;
          sdf = rdv[j].data_size;
          srp = rdv[j++].data;
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; }
          stp = rdv[j++].data;
          if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; }
          snn = rdv[j].data_size;
          snp = rdv[j].data;
        }
        datavec_reset(ctx, dv, ii->n_elements, sdf + S_SEGMENT, size);
        if (ctx->rc != GRN_SUCCESS) {
          if (cinfo) { GRN_FREE(cinfo); }
          datavec_fin(ctx, dv);
          datavec_fin(ctx, rdv);
          {
            DEFINE_NAME(ii);
            ERR(ctx->rc,
                "[ii][buffer][merge] failed to reset data vector: "
                "<%.*s>: "
                "unit-size:<%" GRN_FMT_SIZE ">, "
                "total-size:<%" GRN_FMT_SIZE ">" ,
                name_size, name,
                (size_t)(sdf + S_SEGMENT),
                size);
          }
          return ctx->rc;
        }
      }
    }
    {
      /* Bind the write cursors to the output vectors
       * (rid, [sid], tf, [weight], position). */
      int j = 0;
      ridp = dv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { sidp = dv[j++].data; }
      tfp = dv[j++].data;
      if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { weightp = dv[j++].data; }
      posp = dv[j].data;
    }
    /* Merge the remaining buffer records with the inline posting list
     * until both are exhausted. */
    GETNEXTC();
    MERGE_BC(1);
    if (ctx->rc != GRN_SUCCESS) {
      if (cinfo) { GRN_FREE(cinfo); }
      datavec_fin(ctx, dv);
      datavec_fin(ctx, rdv);
      {
        DEFINE_NAME(ii);
        ERR(ctx->rc,
            "[ii][buffer][merge] failed to merge chunk: <%.*s>" ,
            name_size, name);
      }
      return ctx->rc;
    }
    GRN_ASSERT(posp < dv[ii->n_elements].data);
    /* Number of postings written to the output vectors. */
    ndf = ridp - dv[0].data;
    /*
    {
      grn_obj buf;
      uint32_t rid, sid, tf, i, pos, *pp;
      GRN_TEXT_INIT(&buf, 0);
      rid = 0;
      pp = dv[3].data;
      for (i = 0; i < ndf; i++) {
        GRN_BULK_REWIND(&buf);
        rid += dv[0].data[i];
        if (dv[0].data[i]) { sid = 0; }
        sid += dv[1].data[i] + 1;
        tf = dv[2].data[i] + 1;
        pos = 0;
        grn_text_itoa(ctx, &buf, rid);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        grn_text_itoa(ctx, &buf, sid);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        grn_text_itoa(ctx, &buf, tf);
        GRN_TEXT_PUTC(ctx, &buf, ':');
        while (tf--) {
          pos += *pp++;
          grn_text_itoa(ctx, &buf, pos);
          if (tf) { GRN_TEXT_PUTC(ctx, &buf, ','); }
        }
        GRN_TEXT_PUTC(ctx, &buf, '\0');
        GRN_LOG(ctx, GRN_LOG_DEBUG, "Posting:%s", GRN_TEXT_VALUE(&buf));
      }
      GRN_OBJ_FIN(ctx, &buf);
    }
    */
    {
      grn_id tid = bt->tid & GRN_ID_MAX;
      uint32_t *a = array_at(ctx, ii, tid);
      if (!a) {
        GRN_LOG(ctx, GRN_LOG_DEBUG, "array_entry not found tid=%d" , tid);
        memset(bt, 0, sizeof(buffer_term));
        nterms_void++;
      } else {
        if (!ndf && !nvchunks) {
          /* Nothing is left for this term: clear its array entry and
           * its buffer term slot. */
          a[0] = 0;
          a[1] = 0;
          lexicon_delete(ctx, ii, tid, h);
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        /* NOTE(review): the two branches below pack a single posting
         * directly into a[0]/a[1] (low bit of a[0] set). Both require
         * lid.tf == 1, but PUTNEXT_() only updates lid.rid and lid.sid and
         * lid starts zeroed, so these conditions look unreachable as
         * written — confirm. */
        } else if ((ii->header->flags & GRN_OBJ_WITH_SECTION)
                   && !nvchunks && ndf == 1 && lid.rid < 0x100000 &&
                   lid.sid < 0x800 && lid.tf == 1 && lid.weight == 0) {
          a[0] = (lid.rid << 12) + (lid.sid << 1) + 1;
          a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ? posp[-1] : 0;
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else if (!(ii->header->flags & GRN_OBJ_WITH_SECTION)
                   && !nvchunks && ndf == 1 && lid.tf == 1 && lid.weight == 0) {
          a[0] = (lid.rid << 1) + 1;
          a[1] = (ii->header->flags & GRN_OBJ_WITH_POSITION) ? posp[-1] : 0;
          memset(bt, 0, sizeof(buffer_term));
          nterms_void++;
        } else {
          int j = 0;
          uint8_t *dcp0;
          uint32_t encsize;
          /* Choose per vector whether USE_P_ENC is requested. */
          uint32_t f_s = (ndf < 3) ? 0 : USE_P_ENC;
          uint32_t f_d = ((ndf < 16) || (ndf <= (lid.rid >> 8))) ? 0 : USE_P_ENC;
          dv[j].data_size = ndf; dv[j++].flags = f_d;
          if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
            dv[j].data_size = ndf; dv[j++].flags = f_s;
          }
          dv[j].data_size = ndf; dv[j++].flags = f_s;
          if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) {
            dv[j].data_size = ndf; dv[j++].flags = f_s;
          }
          if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
            uint32_t np = posp - dv[ii->n_elements - 1].data;
            uint32_t f_p = ((np < 32) || (np <= (spos >> 13))) ? 0 : USE_P_ENC;
            dv[j].data_size = np; dv[j].flags = f_p|ODD;
          }
          /* Remember where this term's data starts in dc. */
          dcp0 = dcp;
          /* Adjust a[1] by the net change in the number of postings:
           * inline list (ndf - sdf) plus the rewritten chunks (balance). */
          a[1] = (bt->size_in_chunk ? a[1] : 0) + (ndf - sdf) + balance;
          if (nvchunks) {
            /* Write the list of still non-empty separately stored chunks. */
            int i;
            GRN_B_ENC(nvchunks, dcp);
            for (i = 0; i < nchunks; i++) {
              if (cinfo[i].size) {
                GRN_B_ENC(cinfo[i].segno, dcp);
                GRN_B_ENC(cinfo[i].size, dcp);
                GRN_B_ENC(cinfo[i].dgap, dcp);
              }
            }
          }
          encsize = grn_p_encv(ctx, dv, ii->n_elements, dcp);

          /* Debug aid: report and dump the vectors when the data written so
           * far reaches sb->header.chunk_size + S_SEGMENT. */
          if (grn_logger_pass(ctx, GRN_LOG_DEBUG)) {
            if (sb->header.chunk_size + S_SEGMENT <= (dcp - dc) + encsize) {
              GRN_LOG(ctx, GRN_LOG_DEBUG,
                      "cs(%d)+(%d)=(%d)"
                      "<=(%" GRN_FMT_LLD ")+(%d)="
                      "(%" GRN_FMT_LLD ")" ,
                      sb->header.chunk_size,
                      S_SEGMENT,
                      sb->header.chunk_size + S_SEGMENT,
                      (long long int)(dcp - dc),
                      encsize,
                      (long long int)((dcp - dc) + encsize));
              buffer_merge_dump_datavec(ctx, ii, dv, rdv);
            }
          }

          /* A large inline list is moved into a chunk of its own; the term
           * then keeps only the list of chunk descriptors in dc. If the
           * allocation or chunk_flush() fails, the else branch keeps the
           * inline encoding instead. */
          if (encsize > CHUNK_SPLIT_THRESHOLD &&
              (cinfo || (cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) &&
              !chunk_flush(ctx, ii, &cinfo[nchunks], dcp, encsize)) {
            int i;
            cinfo[nchunks].dgap = lid.rid - crid;
            nvchunks++;
            /* Rewind and rewrite the chunk list including the new chunk. */
            dcp = dcp0;
            GRN_B_ENC(nvchunks, dcp);
            for (i = 0; i <= nchunks; i++) {
              if (cinfo[i].size) {
                GRN_B_ENC(cinfo[i].segno, dcp);
                GRN_B_ENC(cinfo[i].size, dcp);
                GRN_B_ENC(cinfo[i].dgap, dcp);
              }
            }
            GRN_LOG(ctx, GRN_LOG_DEBUG, "split (%d) encsize=%d" , tid, encsize);
            bt->tid |= CHUNK_SPLIT;
          } else {
            dcp += encsize;
            if (!nvchunks) {
              bt->tid &= ~CHUNK_SPLIT;
            }
          }
          /* All buffer records are now merged into the chunk data. */
          bt->pos_in_chunk = (uint32_t)(dcp0 - dc);
          bt->size_in_chunk = (uint32_t)(dcp - dcp0);
          bt->size_in_buffer = 0;
          bt->pos_in_buffer = 0;
        }
        array_unref(ii, tid);
      }
    }
    if (cinfo) { GRN_FREE(cinfo); }
  }
  datavec_fin(ctx, rdv);
  datavec_fin(ctx, dv);
  /* Publish the totals in the destination buffer header. */
  db->header.chunk_size = (uint32_t)(dcp - dc);
  db->header.buffer_free =
    S_SEGMENT - sizeof(buffer_header) - db->header.nterms * sizeof(buffer_term);
  db->header.nterms_void = nterms_void;
  return ctx->rc;
}
3334 | |
3335 | static void |
3336 | fake_map(grn_ctx *ctx, grn_io *io, grn_io_win *iw, void *addr, uint32_t seg, uint32_t size) |
3337 | { |
3338 | iw->ctx = ctx; |
3339 | iw->diff = 0; |
3340 | iw->io = io; |
3341 | iw->mode = grn_io_wronly; |
3342 | iw->segment = ((seg) >> GRN_II_N_CHUNK_VARIATION); |
3343 | iw->offset = (((seg) & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << GRN_II_W_LEAST_CHUNK); |
3344 | iw->size = size; |
3345 | iw->cached = 0; |
3346 | iw->addr = addr; |
3347 | } |
3348 | |
/*
 * Flush buffer segment `seg`: merge its buffer records into its chunk data
 * and switch the segment over to a freshly written buffer and chunk.
 *
 * Steps:
 *   1. Reject a segment that has no physical segment assigned.
 *   2. Obtain a destination segment `ds` and open the source buffer `sb`.
 *   3. Allocate destination chunk memory `dc` and map the source chunk
 *      `sc` (skipped when the buffer has no chunk).
 *   4. Copy the term table into the destination buffer `db` and run
 *      buffer_merge().
 *   5. Allocate a chunk of the resulting size, write `dc` out through a
 *      hand-built window (fake_map() + grn_io_win_unmap()), point `seg`
 *      at `ds`, and release the old chunk.
 *
 * `h` is passed through to buffer_merge(). Returns ctx->rc.
 *
 * NOTE(review): the early returns after segment_get() do not hand `ds`
 * back in the code visible here — confirm whether that is needed.
 */
static grn_rc
buffer_flush(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h)
{
  grn_io_win sw, dw;
  buffer *sb, *db = NULL;
  uint8_t *dc, *sc = NULL;
  uint32_t ds, pseg, scn, dcn = 0;
  if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) {
    DEFINE_NAME(ii);
    CRIT(GRN_FILE_CORRUPT,
         "[ii][buffer][flush] invalid segment: "
         "<%.*s> :"
         "request:<%u>, max:<%u>" ,
         name_size, name,
         seg, ii->seg->header->max_segment);
    return ctx->rc;
  }
  /* segment_get() returning max_segment means no segment is available. */
  if ((ds = segment_get(ctx, ii)) == ii->seg->header->max_segment) {
    DEFINE_NAME(ii);
    MERR("[ii][buffer][flush] segment is full: "
         "<%.*s> :"
         "request:<%u>, max:<%u>" ,
         name_size, name,
         seg, ii->seg->header->max_segment);
    return ctx->rc;
  }
  pseg = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb);
  if (pseg == GRN_II_PSEG_NOT_ASSIGNED) {
    DEFINE_NAME(ii);
    MERR("[ii][buffer][flush] failed to open buffer: "
         "<%.*s> :"
         "segment:<%u>, position:<%u>, max:<%u>" ,
         name_size, name,
         seg, SEG2POS(seg, 0), ii->seg->header->max_segment);
    return ctx->rc;
  }
  {
    GRN_IO_SEG_REF(ii->seg, ds, db);
    if (db) {
      uint32_t actual_chunk_size = 0;
      uint32_t max_dest_chunk_size = sb->header.chunk_size + S_SEGMENT;
      /* Twice the expected maximum is allocated; buffer_merge() performs
       * no bounds check on dc. */
      if ((dc = GRN_MALLOC(max_dest_chunk_size * 2))) {
        /* A buffer without a chunk needs no source mapping (sc stays NULL). */
        if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED ||
            (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0,
                          sb->header.chunk_size, grn_io_rdonly))) {
          uint16_t n = sb->header.nterms;
          /* Start the destination buffer from a copy of the term table. */
          memset(db, 0, S_SEGMENT);
          grn_memcpy(db->terms, sb->terms, n * sizeof(buffer_term));
          db->header.nterms = n;
          buffer_merge(ctx, ii, seg, h, sb, sc, db, dc);
          if (ctx->rc == GRN_SUCCESS) {
            actual_chunk_size = db->header.chunk_size;
            if (actual_chunk_size > 0) {
              chunk_new(ctx, ii, &dcn, actual_chunk_size);
            }
            if (ctx->rc == GRN_SUCCESS) {
              grn_rc rc;
              db->header.chunk =
                actual_chunk_size ? dcn : GRN_II_PSEG_NOT_ASSIGNED;
              /* Describe dc as a write-only window on chunk dcn and unmap
               * it. NOTE(review): dc is not freed on the success path
               * here; presumably grn_io_win_unmap() writes it out and
               * releases it — confirm, also with respect to the
               * GRN_FREE(dc) in the failure branch below. */
              fake_map(ctx, ii->chunk, &dw, dc, dcn, actual_chunk_size);
              rc = grn_io_win_unmap(&dw);
              if (rc == GRN_SUCCESS) {
                /* Switch seg over to the new buffer, then drop the old
                 * chunk and keep total_chunk_size in step. */
                buffer_segment_update(ii, seg, ds);
                ii->header->total_chunk_size += actual_chunk_size;
                if (scn != GRN_II_PSEG_NOT_ASSIGNED) {
                  grn_io_win_unmap(&sw);
                  chunk_free(ctx, ii, scn, 0, sb->header.chunk_size);
                  ii->header->total_chunk_size -= sb->header.chunk_size;
                }
              } else {
                GRN_FREE(dc);
                if (actual_chunk_size) {
                  chunk_free(ctx, ii, dcn, 0, actual_chunk_size);
                }
                if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
                {
                  DEFINE_NAME(ii);
                  ERR(rc,
                      "[ii][buffer][flush] failed to unmap a destination chunk: "
                      "<%.*s> : "
                      "segment:<%u>, destination-segment:<%u>, actual-size:<%u>" ,
                      name_size, name,
                      seg,
                      dcn,
                      actual_chunk_size);
                }
              }
            } else {
              /* chunk_new() failed; it is expected to have set ctx->rc. */
              GRN_FREE(dc);
              if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
            }
          } else {
            /* buffer_merge() failed and has already reported the error. */
            GRN_FREE(dc);
            if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); }
          }
        } else {
          GRN_FREE(dc);
          {
            DEFINE_NAME(ii);
            MERR("[ii][buffer][flush] failed to map a source chunk: "
                 "<%.*s> :"
                 "segment:<%u>, source-segment:<%u>, chunk-size:<%u>" ,
                 name_size, name,
                 seg,
                 scn,
                 sb->header.chunk_size);
          }
        }
      } else {
        DEFINE_NAME(ii);
        MERR("[ii][buffer][flush] failed to allocate a destination chunk: "
             "<%.*s> :"
             "segment:<%u>, destination-segment:<%u>" ,
             name_size, name,
             seg,
             ds);
      }
      GRN_IO_SEG_UNREF(ii->seg, ds);
    } else {
      DEFINE_NAME(ii);
      MERR("[ii][buffer][flush] failed to allocate a destination segment: "
           "<%.*s> :"
           "segment:<%u>, destination-segment:<%u>" ,
           name_size, name,
           seg,
           ds);
    }
    /* The source buffer is closed on every path that opened it. */
    buffer_close(ctx, ii, pseg);
  }
  return ctx->rc;
}
3480 | |
3481 | void |
3482 | grn_ii_buffer_check(grn_ctx *ctx, grn_ii *ii, uint32_t seg) |
3483 | { |
3484 | grn_io_win sw; |
3485 | buffer *sb; |
3486 | uint8_t *sc = NULL; |
3487 | uint32_t pseg, scn, nterms_with_corrupt_chunk = 0, nterm_with_chunk = 0; |
3488 | uint32_t ndeleted_terms_with_value = 0; |
3489 | buffer_term *bt; |
3490 | uint8_t *sbp = NULL; |
3491 | datavec rdv[MAX_N_ELEMENTS + 1]; |
3492 | uint16_t n; |
3493 | int nterms_void = 0; |
3494 | int size_in_buffer = 0; |
3495 | grn_obj buf; |
3496 | size_t lower_bound; |
3497 | int64_t nloops = 0, nviolations = 0; |
3498 | if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) { |
3499 | GRN_OUTPUT_BOOL(GRN_FALSE); |
3500 | return; |
3501 | } |
3502 | pseg = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb); |
3503 | if (pseg == GRN_II_PSEG_NOT_ASSIGNED) { |
3504 | GRN_OUTPUT_BOOL(GRN_FALSE); |
3505 | return; |
3506 | } |
3507 | lower_bound = |
3508 | (sb->header.buffer_free + sizeof(buffer_term) * sb->header.nterms) |
3509 | / sizeof(buffer_rec); |
3510 | datavec_init(ctx, rdv, ii->n_elements, 0, 0); |
3511 | if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) { |
3512 | rdv[ii->n_elements - 1].flags = ODD; |
3513 | } |
3514 | GRN_OUTPUT_MAP_OPEN("BUFFER" , -1); |
3515 | GRN_OUTPUT_CSTR("buffer id" ); |
3516 | GRN_OUTPUT_INT64(seg); |
3517 | if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED) { |
3518 | GRN_OUTPUT_CSTR("void chunk size" ); |
3519 | GRN_OUTPUT_INT64(sb->header.chunk_size); |
3520 | } else { |
3521 | if ((sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, sb->header.chunk_size, |
3522 | grn_io_rdonly))) { |
3523 | GRN_OUTPUT_CSTR("chunk size" ); |
3524 | GRN_OUTPUT_INT64(sb->header.chunk_size); |
3525 | } else { |
3526 | GRN_OUTPUT_CSTR("unmappable chunk size" ); |
3527 | GRN_OUTPUT_INT64(sb->header.chunk_size); |
3528 | } |
3529 | } |
3530 | GRN_OUTPUT_CSTR("buffer term" ); |
3531 | GRN_OUTPUT_ARRAY_OPEN("TERMS" , sb->header.nterms); |
3532 | |
3533 | GRN_OBJ_INIT(&buf, GRN_BULK, 0, ii->lexicon->header.domain); |
3534 | for (bt = sb->terms, n = sb->header.nterms; n; n--, bt++) { |
3535 | grn_id tid, tid_; |
3536 | char key[GRN_TABLE_MAX_KEY_SIZE]; |
3537 | int key_size; |
3538 | uint16_t nextb; |
3539 | uint32_t nchunks = 0; |
3540 | chunk_info *cinfo = NULL; |
3541 | grn_id crid = GRN_ID_NIL; |
3542 | docinfo bid = {0, 0}; |
3543 | uint32_t sdf = 0, snn = 0; |
3544 | uint32_t *srp = NULL, *ssp = NULL, *stp = NULL, *sop = NULL, *snp = NULL; |
3545 | if (!bt->tid && !bt->pos_in_buffer && !bt->size_in_buffer) { |
3546 | nterms_void++; |
3547 | continue; |
3548 | } |
3549 | GRN_OUTPUT_ARRAY_OPEN("TERM" , -1); |
3550 | tid = (bt->tid & GRN_ID_MAX); |
3551 | key_size = grn_table_get_key(ctx, ii->lexicon, tid, key, |
3552 | GRN_TABLE_MAX_KEY_SIZE); |
3553 | tid_ = grn_table_get(ctx, ii->lexicon, key, key_size); |
3554 | GRN_TEXT_SET(ctx, &buf, key, key_size); |
3555 | GRN_OUTPUT_OBJ(&buf, NULL); |
3556 | GRN_OUTPUT_INT64(bt->tid); |
3557 | GRN_OUTPUT_INT64(tid_); |
3558 | nextb = bt->pos_in_buffer; |
3559 | size_in_buffer += bt->size_in_buffer; |
3560 | if (tid != tid_ && (bt->size_in_buffer || bt->size_in_chunk)) { |
3561 | ndeleted_terms_with_value++; |
3562 | } |
3563 | GETNEXTB(); |
3564 | GRN_OUTPUT_INT64(bt->size_in_buffer); |
3565 | GRN_OUTPUT_INT64(bt->size_in_chunk); |
3566 | if (sc && bt->size_in_chunk) { |
3567 | uint8_t *scp = sc + bt->pos_in_chunk; |
3568 | uint8_t *sce = scp + bt->size_in_chunk; |
3569 | size_t size = S_SEGMENT * ii->n_elements; |
3570 | if ((bt->tid & CHUNK_SPLIT)) { |
3571 | int i; |
3572 | GRN_B_DEC(nchunks, scp); |
3573 | if (!(cinfo = GRN_MALLOCN(chunk_info, nchunks + 1))) { |
3574 | datavec_fin(ctx, rdv); |
3575 | GRN_OBJ_FIN(ctx, &buf); |
3576 | return; |
3577 | } |
3578 | for (i = 0; i < nchunks; i++) { |
3579 | GRN_B_DEC(cinfo[i].segno, scp); |
3580 | GRN_B_DEC(cinfo[i].size, scp); |
3581 | GRN_B_DEC(cinfo[i].dgap, scp); |
3582 | crid += cinfo[i].dgap; |
3583 | } |
3584 | } |
3585 | if (sce > scp) { |
3586 | size += grn_p_decv(ctx, scp, sce - scp, rdv, ii->n_elements); |
3587 | { |
3588 | int j = 0; |
3589 | sdf = rdv[j].data_size; |
3590 | GRN_OUTPUT_INT64(sdf); |
3591 | srp = rdv[j++].data; |
3592 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { ssp = rdv[j++].data; } |
3593 | if (sdf != rdv[j].data_size) { |
3594 | nterms_with_corrupt_chunk++; |
3595 | } |
3596 | stp = rdv[j++].data; |
3597 | if ((ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { sop = rdv[j++].data; } |
3598 | GRN_OUTPUT_INT64(rdv[j].data_size); |
3599 | snn = rdv[j].data_size; |
3600 | snp = rdv[j].data; |
3601 | } |
3602 | nterm_with_chunk++; |
3603 | } |
3604 | } |
3605 | { |
3606 | uint16_t pos; |
3607 | grn_id rid, sid, rid_ = 0, sid_ = 0; |
3608 | uint8_t *p; |
3609 | buffer_rec *r; |
3610 | for (pos = bt->pos_in_buffer; pos; pos = r->step) { |
3611 | if (pos < lower_bound) { |
3612 | nviolations++; |
3613 | } |
3614 | r = BUFFER_REC_AT(sb, pos); |
3615 | p = GRN_NEXT_ADDR(r); |
3616 | GRN_B_DEC(rid, p); |
3617 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
3618 | GRN_B_DEC(sid, p); |
3619 | } else { |
3620 | sid = 1; |
3621 | } |
3622 | if (rid < rid_ || (rid == rid_ && sid < sid_)) { |
3623 | nloops++; |
3624 | } |
3625 | rid_ = rid; |
3626 | sid_ = sid; |
3627 | } |
3628 | } |
3629 | GRN_OUTPUT_ARRAY_CLOSE(); |
3630 | if (cinfo) { GRN_FREE(cinfo); } |
3631 | } |
3632 | GRN_OBJ_FIN(ctx, &buf); |
3633 | |
3634 | GRN_OUTPUT_ARRAY_CLOSE(); |
3635 | GRN_OUTPUT_CSTR("buffer free" ); |
3636 | GRN_OUTPUT_INT64(sb->header.buffer_free); |
3637 | GRN_OUTPUT_CSTR("size in buffer" ); |
3638 | GRN_OUTPUT_INT64(size_in_buffer); |
3639 | GRN_OUTPUT_CSTR("nterms" ); |
3640 | GRN_OUTPUT_INT64(sb->header.nterms); |
3641 | if (nterms_void != sb->header.nterms_void) { |
3642 | GRN_OUTPUT_CSTR("nterms void gap" ); |
3643 | GRN_OUTPUT_INT64(nterms_void - sb->header.nterms_void); |
3644 | } |
3645 | GRN_OUTPUT_CSTR("nterms with chunk" ); |
3646 | GRN_OUTPUT_INT64(nterm_with_chunk); |
3647 | if (nterms_with_corrupt_chunk) { |
3648 | GRN_OUTPUT_CSTR("nterms with corrupt chunk" ); |
3649 | GRN_OUTPUT_INT64(nterms_with_corrupt_chunk); |
3650 | } |
3651 | if (ndeleted_terms_with_value) { |
3652 | GRN_OUTPUT_CSTR("number of deleted terms with value" ); |
3653 | GRN_OUTPUT_INT64(ndeleted_terms_with_value); |
3654 | } |
3655 | if (nloops) { |
3656 | GRN_OUTPUT_CSTR("number of loops" ); |
3657 | GRN_OUTPUT_INT64(nloops); |
3658 | } |
3659 | if (nviolations) { |
3660 | GRN_OUTPUT_CSTR("number of violations" ); |
3661 | GRN_OUTPUT_INT64(nviolations); |
3662 | } |
3663 | GRN_OUTPUT_MAP_CLOSE(); |
3664 | datavec_fin(ctx, rdv); |
3665 | if (sc) { grn_io_win_unmap(&sw); } |
3666 | buffer_close(ctx, ii, pseg); |
3667 | } |
3668 | |
/*
 * Sort entry used while splitting a buffer: pairs one buffer term with the
 * lexicon key it is ordered by (see term_split() and term_compar()).
 */
typedef struct {
  buffer_term *bt;    /* the term slot inside the source buffer */
  const char *key;    /* key bytes as returned by _grn_table_key() */
  uint32_t key_size;  /* length of key in bytes */
} term_sort;
3674 | |
3675 | static int |
3676 | term_compar(const void *t1, const void *t2) |
3677 | { |
3678 | int r; |
3679 | const term_sort *x = (term_sort *)t1, *y = (term_sort *)t2; |
3680 | if (x->key_size > y->key_size) { |
3681 | r = memcmp(x->key, y->key, y->key_size); |
3682 | return r ? r : x->key_size - y->key_size; |
3683 | } else { |
3684 | r = memcmp(x->key, y->key, x->key_size); |
3685 | return r ? r : x->key_size - y->key_size; |
3686 | } |
3687 | } |
3688 | |
3689 | static grn_rc |
3690 | term_split(grn_ctx *ctx, grn_obj *lexicon, buffer *sb, buffer *db0, buffer *db1) |
3691 | { |
3692 | uint16_t i, n, *nt; |
3693 | buffer_term *bt; |
3694 | uint32_t s, th = (sb->header.chunk_size + sb->header.nterms) >> 1; |
3695 | term_sort *ts = GRN_MALLOC(sb->header.nterms * sizeof(term_sort)); |
3696 | if (!ts) { return GRN_NO_MEMORY_AVAILABLE; } |
3697 | for (i = 0, n = sb->header.nterms, bt = sb->terms; n; bt++, n--) { |
3698 | if (bt->tid) { |
3699 | grn_id tid = bt->tid & GRN_ID_MAX; |
3700 | ts[i].key = _grn_table_key(ctx, lexicon, tid, &ts[i].key_size); |
3701 | ts[i].bt = bt; |
3702 | i++; |
3703 | } |
3704 | } |
3705 | qsort(ts, i, sizeof(term_sort), term_compar); |
3706 | memset(db0, 0, S_SEGMENT); |
3707 | bt = db0->terms; |
3708 | nt = &db0->header.nterms; |
3709 | for (s = 0; n + 1 < i && s <= th; n++, bt++) { |
3710 | grn_memcpy(bt, ts[n].bt, sizeof(buffer_term)); |
3711 | (*nt)++; |
3712 | s += ts[n].bt->size_in_chunk + 1; |
3713 | } |
3714 | memset(db1, 0, S_SEGMENT); |
3715 | bt = db1->terms; |
3716 | nt = &db1->header.nterms; |
3717 | for (; n < i; n++, bt++) { |
3718 | grn_memcpy(bt, ts[n].bt, sizeof(buffer_term)); |
3719 | (*nt)++; |
3720 | } |
3721 | GRN_FREE(ts); |
3722 | GRN_LOG(ctx, GRN_LOG_DEBUG, "d0=%d d1=%d" , |
3723 | db0->header.nterms, db1->header.nterms); |
3724 | return GRN_SUCCESS; |
3725 | } |
3726 | |
3727 | static void |
3728 | array_update(grn_ctx *ctx, grn_ii *ii, uint32_t dls, buffer *db) |
3729 | { |
3730 | uint16_t n; |
3731 | buffer_term *bt; |
3732 | uint32_t *a, pos = SEG2POS(dls, sizeof(buffer_header)); |
3733 | for (n = db->header.nterms, bt = db->terms; n; n--, bt++) { |
3734 | if (bt->tid) { |
3735 | grn_id tid = bt->tid & GRN_ID_MAX; |
3736 | if ((a = array_at(ctx, ii, tid))) { |
3737 | a[0] = pos; |
3738 | array_unref(ii, tid); |
3739 | } else { |
3740 | GRN_LOG(ctx, GRN_LOG_WARNING, "array_at failed (%d)" , tid); |
3741 | } |
3742 | } |
3743 | pos += sizeof(buffer_term) >> 2; |
3744 | } |
3745 | } |
3746 | |
3747 | static grn_rc |
3748 | buffer_split(grn_ctx *ctx, grn_ii *ii, uint32_t seg, grn_hash *h) |
3749 | { |
3750 | grn_io_win sw, dw0, dw1; |
3751 | buffer *sb, *db0 = NULL, *db1 = NULL; |
3752 | uint8_t *sc = NULL, *dc0, *dc1; |
3753 | uint32_t dps0 = 0, dps1 = 0, dls0 = 0, dls1 = 0, sps, scn, dcn0 = 0, dcn1 = 0; |
3754 | if (ii->header->binfo[seg] == GRN_II_PSEG_NOT_ASSIGNED) { |
3755 | DEFINE_NAME(ii); |
3756 | CRIT(GRN_FILE_CORRUPT, |
3757 | "[ii][buffer][split] invalid segment: " |
3758 | "<%.*s> :" |
3759 | "request:<%u>, max:<%u>" , |
3760 | name_size, name, |
3761 | seg, ii->seg->header->max_segment); |
3762 | return ctx->rc; |
3763 | } |
3764 | buffer_segment_reserve(ctx, ii, &dls0, &dps0, &dls1, &dps1); |
3765 | if (ctx->rc != GRN_SUCCESS) { |
3766 | DEFINE_NAME(ii); |
3767 | ERR(ctx->rc, |
3768 | "[ii][buffer][split] failed to reserve buffer segments: " |
3769 | "<%.*s> :" |
3770 | "request:<%u>, max:<%u>" , |
3771 | name_size, name, |
3772 | seg, ii->seg->header->max_segment); |
3773 | return ctx->rc; |
3774 | } |
3775 | sps = buffer_open(ctx, ii, SEG2POS(seg, 0), NULL, &sb); |
3776 | if (sps == GRN_II_PSEG_NOT_ASSIGNED) { |
3777 | DEFINE_NAME(ii); |
3778 | MERR("[ii][buffer][split] failed to open buffer: " |
3779 | "<%.*s> :" |
3780 | "segment:<%u>, position:<%u>, max-segment:<%u>" , |
3781 | name_size, name, |
3782 | seg, SEG2POS(seg, 0), ii->seg->header->max_segment); |
3783 | } else { |
3784 | GRN_IO_SEG_REF(ii->seg, dps0, db0); |
3785 | if (db0) { |
3786 | GRN_IO_SEG_REF(ii->seg, dps1, db1); |
3787 | if (db1) { |
3788 | uint32_t actual_db0_chunk_size = 0; |
3789 | uint32_t actual_db1_chunk_size = 0; |
3790 | uint32_t max_dest_chunk_size = sb->header.chunk_size + S_SEGMENT; |
3791 | if ((dc0 = GRN_MALLOC(max_dest_chunk_size * 2))) { |
3792 | if ((dc1 = GRN_MALLOC(max_dest_chunk_size * 2))) { |
3793 | if ((scn = sb->header.chunk) == GRN_II_PSEG_NOT_ASSIGNED || |
3794 | (sc = WIN_MAP(ii->chunk, ctx, &sw, scn, 0, |
3795 | sb->header.chunk_size, grn_io_rdonly))) { |
3796 | term_split(ctx, ii->lexicon, sb, db0, db1); |
3797 | buffer_merge(ctx, ii, seg, h, sb, sc, db0, dc0); |
3798 | if (ctx->rc == GRN_SUCCESS) { |
3799 | actual_db0_chunk_size = db0->header.chunk_size; |
3800 | if (actual_db0_chunk_size > 0) { |
3801 | chunk_new(ctx, ii, &dcn0, actual_db0_chunk_size); |
3802 | } |
3803 | if (ctx->rc == GRN_SUCCESS) { |
3804 | grn_rc rc; |
3805 | db0->header.chunk = |
3806 | actual_db0_chunk_size ? dcn0 : GRN_II_PSEG_NOT_ASSIGNED; |
3807 | fake_map(ctx, ii->chunk, &dw0, dc0, dcn0, actual_db0_chunk_size); |
3808 | rc = grn_io_win_unmap(&dw0); |
3809 | if (rc == GRN_SUCCESS) { |
3810 | buffer_merge(ctx, ii, seg, h, sb, sc, db1, dc1); |
3811 | if (ctx->rc == GRN_SUCCESS) { |
3812 | actual_db1_chunk_size = db1->header.chunk_size; |
3813 | if (actual_db1_chunk_size > 0) { |
3814 | chunk_new(ctx, ii, &dcn1, actual_db1_chunk_size); |
3815 | } |
3816 | if (ctx->rc == GRN_SUCCESS) { |
3817 | fake_map(ctx, ii->chunk, &dw1, dc1, dcn1, |
3818 | actual_db1_chunk_size); |
3819 | rc = grn_io_win_unmap(&dw1); |
3820 | if (rc == GRN_SUCCESS) { |
3821 | db1->header.chunk = |
3822 | actual_db1_chunk_size ? dcn1 : GRN_II_PSEG_NOT_ASSIGNED; |
3823 | buffer_segment_update(ii, dls0, dps0); |
3824 | buffer_segment_update(ii, dls1, dps1); |
3825 | array_update(ctx, ii, dls0, db0); |
3826 | array_update(ctx, ii, dls1, db1); |
3827 | buffer_segment_clear(ii, seg); |
3828 | ii->header->total_chunk_size += actual_db0_chunk_size; |
3829 | ii->header->total_chunk_size += actual_db1_chunk_size; |
3830 | if (scn != GRN_II_PSEG_NOT_ASSIGNED) { |
3831 | grn_io_win_unmap(&sw); |
3832 | chunk_free(ctx, ii, scn, 0, sb->header.chunk_size); |
3833 | ii->header->total_chunk_size -= sb->header.chunk_size; |
3834 | } |
3835 | } else { |
3836 | if (actual_db1_chunk_size) { |
3837 | chunk_free(ctx, ii, dcn1, 0, actual_db1_chunk_size); |
3838 | } |
3839 | if (actual_db0_chunk_size) { |
3840 | chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); |
3841 | } |
3842 | GRN_FREE(dc1); |
3843 | if (scn != GRN_II_PSEG_NOT_ASSIGNED) { |
3844 | grn_io_win_unmap(&sw); |
3845 | } |
3846 | { |
3847 | DEFINE_NAME(ii); |
3848 | ERR(rc, |
3849 | "[ii][buffer[merge] " |
3850 | "failed to unmap a destination chunk2: " |
3851 | "<%.*s> :" |
3852 | "segment:<%u>, " |
3853 | "destination-chunk1:<%u>, " |
3854 | "destination-chunk2:<%u>, " |
3855 | "actual-size1:<%u>, " |
3856 | "actual-size2:<%u>" , |
3857 | name_size, name, |
3858 | seg, |
3859 | dcn0, |
3860 | dcn1, |
3861 | actual_db0_chunk_size, |
3862 | actual_db1_chunk_size); |
3863 | } |
3864 | } |
3865 | } else { |
3866 | if (actual_db0_chunk_size) { |
3867 | chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); |
3868 | } |
3869 | GRN_FREE(dc1); |
3870 | if (scn != GRN_II_PSEG_NOT_ASSIGNED) { |
3871 | grn_io_win_unmap(&sw); |
3872 | } |
3873 | } |
3874 | } else { |
3875 | if (actual_db0_chunk_size) { |
3876 | chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); |
3877 | } |
3878 | GRN_FREE(dc1); |
3879 | if (scn != GRN_II_PSEG_NOT_ASSIGNED) { |
3880 | grn_io_win_unmap(&sw); |
3881 | } |
3882 | } |
3883 | } else { |
3884 | if (actual_db0_chunk_size) { |
3885 | chunk_free(ctx, ii, dcn0, 0, actual_db0_chunk_size); |
3886 | } |
3887 | GRN_FREE(dc1); |
3888 | GRN_FREE(dc0); |
3889 | if (scn != GRN_II_PSEG_NOT_ASSIGNED) { |
3890 | grn_io_win_unmap(&sw); |
3891 | } |
3892 | { |
3893 | DEFINE_NAME(ii); |
3894 | ERR(rc, |
3895 | "[ii][buffer[merge] " |
3896 | "failed to unmap a destination chunk1: " |
3897 | "<%.*s> :" |
3898 | "segment:<%u>, " |
3899 | "destination-chunk1:<%u>, " |
3900 | "actual-size1:<%u>" , |
3901 | name_size, name, |
3902 | seg, |
3903 | dcn0, |
3904 | actual_db0_chunk_size); |
3905 | } |
3906 | } |
3907 | } else { |
3908 | GRN_FREE(dc1); |
3909 | GRN_FREE(dc0); |
3910 | if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } |
3911 | } |
3912 | } else { |
3913 | GRN_FREE(dc1); |
3914 | GRN_FREE(dc0); |
3915 | if (scn != GRN_II_PSEG_NOT_ASSIGNED) { grn_io_win_unmap(&sw); } |
3916 | } |
3917 | } else { |
3918 | GRN_FREE(dc1); |
3919 | GRN_FREE(dc0); |
3920 | { |
3921 | DEFINE_NAME(ii); |
3922 | MERR("[ii][buffer][split] failed to map a source chunk: " |
3923 | "<%.*s> :" |
3924 | "segment:<%u>, " |
3925 | "source-segment:<%u>, " |
3926 | "chunk-size:<%u>" , |
3927 | name_size, name, |
3928 | seg, |
3929 | scn, |
3930 | sb->header.chunk_size); |
3931 | } |
3932 | } |
3933 | } else { |
3934 | GRN_FREE(dc0); |
3935 | { |
3936 | DEFINE_NAME(ii); |
3937 | MERR("[ii][buffer][split] " |
3938 | "failed to allocate a destination chunk2: " |
3939 | "<%.*s> :" |
3940 | "segment:<%u>, " |
3941 | "destination-segment1:<%u>, " |
3942 | "destination-segment2:<%u>" , |
3943 | name_size, name, |
3944 | seg, |
3945 | dps0, |
3946 | dps1); |
3947 | } |
3948 | } |
3949 | } else { |
3950 | DEFINE_NAME(ii); |
3951 | MERR("[ii][buffer][split] failed to allocate a destination chunk1: " |
3952 | "<%.*s>: " |
3953 | "segment:<%u>, " |
3954 | "destination-segment1:<%u>, " |
3955 | "destination-segment2:<%u>" , |
3956 | name_size, name, |
3957 | seg, |
3958 | dps0, |
3959 | dps1); |
3960 | } |
3961 | GRN_IO_SEG_UNREF(ii->seg, dps1); |
3962 | } else { |
3963 | DEFINE_NAME(ii); |
3964 | MERR("[ii][buffer][split] failed to allocate a destination segment2: " |
3965 | "<%.*s>: " |
3966 | "segment:<%u>, " |
3967 | "destination-segment1:<%u>, " |
3968 | "destination-segment2:<%u>" , |
3969 | name_size, name, |
3970 | seg, |
3971 | dps0, |
3972 | dps1); |
3973 | } |
3974 | GRN_IO_SEG_UNREF(ii->seg, dps0); |
3975 | } else { |
3976 | DEFINE_NAME(ii); |
3977 | MERR("[ii][buffer][split] failed to allocate a destination segment1: " |
3978 | "<%.*s>: " |
3979 | "segment:<%u>, " |
3980 | "destination-segment1:<%u>, " |
3981 | "destination-segment2:<%u>" , |
3982 | name_size, name, |
3983 | seg, |
3984 | dps0, |
3985 | dps1); |
3986 | } |
3987 | buffer_close(ctx, ii, sps); |
3988 | } |
3989 | return ctx->rc; |
3990 | } |
3991 | |
#define SCALE_FACTOR 2048
#define MAX_NTERMS 8192
/*
 * True when a full buffer should be split instead of flushed: either it
 * holds more than 1024 terms, or it holds more than one term and its chunk
 * size times 100 exceeds the total chunk size of the index.
 *
 * The product is computed in 64 bits so that it cannot wrap around when
 * chunk_size is larger than UINT32_MAX / 100.
 */
#define SPLIT_COND(ii, buffer)\
  ((buffer)->header.nterms > 1024 ||\
   ((buffer)->header.nterms > 1 &&\
    (uint64_t)(buffer)->header.chunk_size * 100 > (ii)->header->total_chunk_size))
3998 | |
3999 | inline static void |
4000 | buffer_new_find_segment(grn_ctx *ctx, |
4001 | grn_ii *ii, |
4002 | int size, |
4003 | grn_id tid, |
4004 | grn_hash *h, |
4005 | buffer **b, |
4006 | uint32_t *lseg, |
4007 | uint32_t *pseg) |
4008 | { |
4009 | uint32_t *a; |
4010 | |
4011 | a = array_at(ctx, ii, tid); |
4012 | if (!a) { |
4013 | return; |
4014 | } |
4015 | |
4016 | for (;;) { |
4017 | uint32_t pos = a[0]; |
4018 | if (!pos || (pos & 1)) { break; } |
4019 | *pseg = buffer_open(ctx, ii, pos, NULL, b); |
4020 | if (*pseg == GRN_II_PSEG_NOT_ASSIGNED) { break; } |
4021 | if ((*b)->header.buffer_free >= size + sizeof(buffer_term)) { |
4022 | *lseg = LSEG(pos); |
4023 | break; |
4024 | } |
4025 | buffer_close(ctx, ii, *pseg); |
4026 | if (SPLIT_COND(ii, (*b))) { |
4027 | /* ((S_SEGMENT - sizeof(buffer_header) + ii->header->bmax - |
4028 | (*b)->header.nterms * sizeof(buffer_term)) * 4 < |
4029 | (*b)->header.chunk_size) */ |
4030 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
4031 | "nterms=%d chunk=%d total=%" GRN_FMT_INT64U, |
4032 | (*b)->header.nterms, |
4033 | (*b)->header.chunk_size, |
4034 | ii->header->total_chunk_size >> 10); |
4035 | if (buffer_split(ctx, ii, LSEG(pos), h)) { break; } |
4036 | } else { |
4037 | if (S_SEGMENT - sizeof(buffer_header) |
4038 | - (*b)->header.nterms * sizeof(buffer_term) |
4039 | < size + sizeof(buffer_term)) { |
4040 | break; |
4041 | } |
4042 | if (buffer_flush(ctx, ii, LSEG(pos), h)) { break; } |
4043 | } |
4044 | } |
4045 | |
4046 | array_unref(ii, tid); |
4047 | } |
4048 | |
4049 | inline static void |
4050 | buffer_new_lexicon_pat(grn_ctx *ctx, |
4051 | grn_ii *ii, |
4052 | int size, |
4053 | grn_id id, |
4054 | grn_hash *h, |
4055 | buffer **b, |
4056 | uint32_t *lseg, |
4057 | uint32_t *pseg) |
4058 | { |
4059 | grn_pat_cursor *cursor; |
4060 | char key[GRN_TABLE_MAX_KEY_SIZE]; |
4061 | int key_size; |
4062 | |
4063 | key_size = grn_table_get_key(ctx, ii->lexicon, id, key, |
4064 | GRN_TABLE_MAX_KEY_SIZE); |
4065 | if (ii->lexicon->header.flags & GRN_OBJ_KEY_VAR_SIZE) { |
4066 | grn_obj *tokenizer = NULL; |
4067 | |
4068 | grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL); |
4069 | if (tokenizer) { |
4070 | /* For natural language */ |
4071 | cursor = grn_pat_cursor_open(ctx, |
4072 | (grn_pat *)(ii->lexicon), |
4073 | key, |
4074 | key_size, |
4075 | NULL, |
4076 | 0, |
4077 | 0, |
4078 | -1, |
4079 | GRN_CURSOR_ASCENDING|GRN_CURSOR_GT); |
4080 | if (cursor) { |
4081 | grn_id tid; |
4082 | while (ctx->rc == GRN_SUCCESS && |
4083 | *lseg == GRN_II_PSEG_NOT_ASSIGNED && |
4084 | (tid = grn_pat_cursor_next(ctx, cursor))) { |
4085 | buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); |
4086 | } |
4087 | grn_pat_cursor_close(ctx, cursor); |
4088 | } |
4089 | } else { |
4090 | /* For text data */ |
4091 | int target_key_size = key_size; |
4092 | int reduced_key_size = 0; |
4093 | |
4094 | while (*lseg == GRN_II_PSEG_NOT_ASSIGNED && target_key_size > 0) { |
4095 | grn_id tid; |
4096 | |
4097 | cursor = grn_pat_cursor_open(ctx, |
4098 | (grn_pat *)(ii->lexicon), |
4099 | key, target_key_size, |
4100 | NULL, 0, 0, -1, |
4101 | GRN_CURSOR_PREFIX); |
4102 | if (!cursor) { |
4103 | break; |
4104 | } |
4105 | |
4106 | if (reduced_key_size == 0) { |
4107 | while (ctx->rc == GRN_SUCCESS && |
4108 | *lseg == GRN_II_PSEG_NOT_ASSIGNED && |
4109 | (tid = grn_pat_cursor_next(ctx, cursor))) { |
4110 | buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); |
4111 | } |
4112 | } else { |
4113 | while (ctx->rc == GRN_SUCCESS && |
4114 | *lseg == GRN_II_PSEG_NOT_ASSIGNED && |
4115 | (tid = grn_pat_cursor_next(ctx, cursor))) { |
4116 | void *current_key; |
4117 | int current_key_size; |
4118 | |
4119 | current_key_size = grn_pat_cursor_get_key(ctx, cursor, ¤t_key); |
4120 | if (memcmp(((char *)current_key) + target_key_size, |
4121 | key + target_key_size, |
4122 | reduced_key_size) == 0) { |
4123 | continue; |
4124 | } |
4125 | buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); |
4126 | } |
4127 | } |
4128 | grn_pat_cursor_close(ctx, cursor); |
4129 | |
4130 | if (reduced_key_size == 0) { |
4131 | reduced_key_size = 1; |
4132 | } else { |
4133 | reduced_key_size *= 2; |
4134 | } |
4135 | target_key_size -= reduced_key_size; |
4136 | } |
4137 | } |
4138 | } else { |
4139 | /* For other data */ |
4140 | cursor = grn_pat_cursor_open(ctx, |
4141 | (grn_pat *)(ii->lexicon), |
4142 | NULL, 0, key, key_size, 0, -1, |
4143 | GRN_CURSOR_PREFIX); |
4144 | if (cursor) { |
4145 | grn_id tid; |
4146 | while (ctx->rc == GRN_SUCCESS && |
4147 | *lseg == GRN_II_PSEG_NOT_ASSIGNED && |
4148 | (tid = grn_pat_cursor_next(ctx, cursor))) { |
4149 | buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); |
4150 | } |
4151 | grn_pat_cursor_close(ctx, cursor); |
4152 | } |
4153 | } |
4154 | } |
4155 | |
4156 | inline static void |
4157 | buffer_new_lexicon_other(grn_ctx *ctx, |
4158 | grn_ii *ii, |
4159 | int size, |
4160 | grn_id id, |
4161 | grn_hash *h, |
4162 | buffer **b, |
4163 | uint32_t *lseg, |
4164 | uint32_t *pseg) |
4165 | { |
4166 | GRN_TABLE_EACH_BEGIN(ctx, ii->lexicon, cursor, tid) { |
4167 | if (ctx->rc != GRN_SUCCESS || *lseg != GRN_II_PSEG_NOT_ASSIGNED) { |
4168 | break; |
4169 | } |
4170 | buffer_new_find_segment(ctx, ii, size, tid, h, b, lseg, pseg); |
4171 | } GRN_TABLE_EACH_END(ctx, cursor); |
4172 | } |
4173 | |
4174 | |
4175 | inline static uint32_t |
4176 | buffer_new(grn_ctx *ctx, grn_ii *ii, int size, uint32_t *pos, |
4177 | buffer_term **bt, buffer_rec **br, buffer **bp, grn_id id, grn_hash *h) |
4178 | { |
4179 | buffer *b = NULL; |
4180 | uint16_t offset; |
4181 | uint32_t lseg = GRN_II_PSEG_NOT_ASSIGNED, pseg = GRN_II_PSEG_NOT_ASSIGNED; |
4182 | if (S_SEGMENT - sizeof(buffer_header) < size + sizeof(buffer_term)) { |
4183 | DEFINE_NAME(ii); |
4184 | MERR("[ii][buffer][new] requested size is too large: " |
4185 | "<%.*s> :" |
4186 | "requested:<%" GRN_FMT_SIZE ">, max:<%" GRN_FMT_SIZE ">" , |
4187 | name_size, name, |
4188 | (size_t)(size + sizeof(buffer_term)), |
4189 | (size_t)(S_SEGMENT - sizeof(buffer_header))); |
4190 | return GRN_II_PSEG_NOT_ASSIGNED; |
4191 | } |
4192 | if (ii->lexicon->header.type == GRN_TABLE_PAT_KEY) { |
4193 | buffer_new_lexicon_pat(ctx, ii, size, id, h, &b, &lseg, &pseg); |
4194 | } else { |
4195 | buffer_new_lexicon_other(ctx, ii, size, id, h, &b, &lseg, &pseg); |
4196 | } |
4197 | if (lseg == GRN_II_PSEG_NOT_ASSIGNED) { |
4198 | if (buffer_segment_new(ctx, ii, &lseg) || |
4199 | (pseg = buffer_open(ctx, ii, SEG2POS(lseg, 0), NULL, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { |
4200 | return GRN_II_PSEG_NOT_ASSIGNED; |
4201 | } |
4202 | memset(b, 0, S_SEGMENT); |
4203 | b->header.buffer_free = S_SEGMENT - sizeof(buffer_header); |
4204 | b->header.chunk = GRN_II_PSEG_NOT_ASSIGNED; |
4205 | } |
4206 | if (b->header.nterms_void) { |
4207 | for (offset = 0; offset < b->header.nterms; offset++) { |
4208 | if (!b->terms[offset].tid) { break; } |
4209 | } |
4210 | if (offset == b->header.nterms) { |
4211 | GRN_LOG(ctx, GRN_LOG_DEBUG, "inconsistent buffer(%d)" , lseg); |
4212 | b->header.nterms_void = 0; |
4213 | b->header.nterms++; |
4214 | b->header.buffer_free -= size + sizeof(buffer_term); |
4215 | } else { |
4216 | b->header.nterms_void--; |
4217 | b->header.buffer_free -= size; |
4218 | } |
4219 | } else { |
4220 | offset = b->header.nterms++; |
4221 | b->header.buffer_free -= size + sizeof(buffer_term); |
4222 | } |
4223 | *pos = SEG2POS(lseg, (sizeof(buffer_header) + sizeof(buffer_term) * offset)); |
4224 | *bt = &b->terms[offset]; |
4225 | *br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free); |
4226 | *bp = b; |
4227 | return pseg; |
4228 | } |
4229 | |
4230 | /* ii */ |
4231 | |
4232 | static grn_ii * |
4233 | _grn_ii_create(grn_ctx *ctx, grn_ii *ii, const char *path, grn_obj *lexicon, uint32_t flags) |
4234 | { |
4235 | int i; |
4236 | uint32_t max_n_segments; |
4237 | uint32_t max_n_chunks; |
4238 | grn_io *seg, *chunk; |
4239 | char path2[PATH_MAX]; |
4240 | struct grn_ii_header *; |
4241 | grn_table_flags lflags; |
4242 | grn_encoding encoding; |
4243 | grn_obj *tokenizer; |
4244 | /* |
4245 | for (i = 0; i < 32; i++) { |
4246 | new_histogram[i] = 0; |
4247 | free_histogram[i] = 0; |
4248 | } |
4249 | */ |
4250 | if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, |
4251 | NULL, NULL)) { |
4252 | return NULL; |
4253 | } |
4254 | if (path && strlen(path) + 6 >= PATH_MAX) { return NULL; } |
4255 | |
4256 | if (flags & GRN_OBJ_INDEX_SMALL) { |
4257 | max_n_segments = grn_ii_max_n_segments_small; |
4258 | max_n_chunks = grn_ii_max_n_chunks_small; |
4259 | } else if (flags & GRN_OBJ_INDEX_MEDIUM) { |
4260 | max_n_segments = MAX_PSEG_MEDIUM; |
4261 | max_n_chunks = GRN_II_MAX_CHUNK_MEDIUM; |
4262 | } else { |
4263 | max_n_segments = MAX_PSEG; |
4264 | max_n_chunks = GRN_II_MAX_CHUNK; |
4265 | } |
4266 | |
4267 | seg = grn_io_create(ctx, |
4268 | path, |
4269 | sizeof(struct grn_ii_header), |
4270 | S_SEGMENT, |
4271 | max_n_segments, |
4272 | grn_io_auto, |
4273 | GRN_IO_EXPIRE_SEGMENT); |
4274 | if (!seg) { return NULL; } |
4275 | if (path) { |
4276 | grn_strcpy(path2, PATH_MAX, path); |
4277 | grn_strcat(path2, PATH_MAX, ".c" ); |
4278 | chunk = grn_io_create(ctx, path2, 0, S_CHUNK, max_n_chunks, grn_io_auto, |
4279 | GRN_IO_EXPIRE_SEGMENT); |
4280 | } else { |
4281 | chunk = grn_io_create(ctx, NULL, 0, S_CHUNK, max_n_chunks, grn_io_auto, 0); |
4282 | } |
4283 | if (!chunk) { |
4284 | grn_io_close(ctx, seg); |
4285 | grn_io_remove(ctx, path); |
4286 | return NULL; |
4287 | } |
4288 | header = grn_io_header(seg); |
4289 | grn_io_set_type(seg, GRN_COLUMN_INDEX); |
4290 | for (i = 0; i < GRN_II_MAX_LSEG; i++) { |
4291 | header->ainfo[i] = GRN_II_PSEG_NOT_ASSIGNED; |
4292 | header->binfo[i] = GRN_II_PSEG_NOT_ASSIGNED; |
4293 | } |
4294 | for (i = 0; i <= GRN_II_N_CHUNK_VARIATION; i++) { |
4295 | header->free_chunks[i] = GRN_II_PSEG_NOT_ASSIGNED; |
4296 | header->garbages[i] = GRN_II_PSEG_NOT_ASSIGNED; |
4297 | } |
4298 | header->flags = flags; |
4299 | ii->seg = seg; |
4300 | ii->chunk = chunk; |
4301 | ii->lexicon = lexicon; |
4302 | ii->lflags = lflags; |
4303 | ii->encoding = encoding; |
4304 | ii->header = header; |
4305 | ii->n_elements = 2; |
4306 | if ((flags & GRN_OBJ_WITH_SECTION)) { ii->n_elements++; } |
4307 | if ((flags & GRN_OBJ_WITH_WEIGHT)) { ii->n_elements++; } |
4308 | if ((flags & GRN_OBJ_WITH_POSITION)) { ii->n_elements++; } |
4309 | return ii; |
4310 | } |
4311 | |
4312 | grn_ii * |
4313 | grn_ii_create(grn_ctx *ctx, const char *path, grn_obj *lexicon, uint32_t flags) |
4314 | { |
4315 | grn_ii *ii = NULL; |
4316 | if (!(ii = GRN_MALLOCN(grn_ii, 1))) { |
4317 | return NULL; |
4318 | } |
4319 | GRN_DB_OBJ_SET_TYPE(ii, GRN_COLUMN_INDEX); |
4320 | if (!_grn_ii_create(ctx, ii, path, lexicon, flags)) { |
4321 | GRN_FREE(ii); |
4322 | return NULL; |
4323 | } |
4324 | return ii; |
4325 | } |
4326 | |
4327 | grn_rc |
4328 | grn_ii_remove(grn_ctx *ctx, const char *path) |
4329 | { |
4330 | grn_rc rc; |
4331 | char buffer[PATH_MAX]; |
4332 | if (!path || strlen(path) > PATH_MAX - 4) { return GRN_INVALID_ARGUMENT; } |
4333 | if ((rc = grn_io_remove(ctx, path))) { goto exit; } |
4334 | grn_snprintf(buffer, PATH_MAX, PATH_MAX, |
4335 | "%s.c" , path); |
4336 | rc = grn_io_remove(ctx, buffer); |
4337 | exit : |
4338 | return rc; |
4339 | } |
4340 | |
4341 | grn_rc |
4342 | grn_ii_truncate(grn_ctx *ctx, grn_ii *ii) |
4343 | { |
4344 | grn_rc rc; |
4345 | const char *io_segpath, *io_chunkpath; |
4346 | char *segpath, *chunkpath = NULL; |
4347 | grn_obj *lexicon; |
4348 | uint32_t flags; |
4349 | if ((io_segpath = grn_io_path(ii->seg)) && *io_segpath != '\0') { |
4350 | if (!(segpath = GRN_STRDUP(io_segpath))) { |
4351 | ERR(GRN_NO_MEMORY_AVAILABLE, "cannot duplicate path: <%s>" , io_segpath); |
4352 | return GRN_NO_MEMORY_AVAILABLE; |
4353 | } |
4354 | if ((io_chunkpath = grn_io_path(ii->chunk)) && *io_chunkpath != '\0') { |
4355 | if (!(chunkpath = GRN_STRDUP(io_chunkpath))) { |
4356 | ERR(GRN_NO_MEMORY_AVAILABLE, "cannot duplicate path: <%s>" , io_chunkpath); |
4357 | return GRN_NO_MEMORY_AVAILABLE; |
4358 | } |
4359 | } else { |
4360 | chunkpath = NULL; |
4361 | } |
4362 | } else { |
4363 | segpath = NULL; |
4364 | } |
4365 | lexicon = ii->lexicon; |
4366 | flags = ii->header->flags; |
4367 | if ((rc = grn_io_close(ctx, ii->seg))) { goto exit; } |
4368 | if ((rc = grn_io_close(ctx, ii->chunk))) { goto exit; } |
4369 | ii->seg = NULL; |
4370 | ii->chunk = NULL; |
4371 | if (segpath && (rc = grn_io_remove(ctx, segpath))) { goto exit; } |
4372 | if (chunkpath && (rc = grn_io_remove(ctx, chunkpath))) { goto exit; } |
4373 | if (!_grn_ii_create(ctx, ii, segpath, lexicon, flags)) { |
4374 | rc = GRN_UNKNOWN_ERROR; |
4375 | } |
4376 | exit: |
4377 | if (segpath) { GRN_FREE(segpath); } |
4378 | if (chunkpath) { GRN_FREE(chunkpath); } |
4379 | return rc; |
4380 | } |
4381 | |
4382 | grn_ii * |
4383 | grn_ii_open(grn_ctx *ctx, const char *path, grn_obj *lexicon) |
4384 | { |
4385 | grn_io *seg, *chunk; |
4386 | grn_ii *ii; |
4387 | char path2[PATH_MAX]; |
4388 | struct grn_ii_header *; |
4389 | uint32_t io_type; |
4390 | grn_table_flags lflags; |
4391 | grn_encoding encoding; |
4392 | grn_obj *tokenizer; |
4393 | if (grn_table_get_info(ctx, lexicon, &lflags, &encoding, &tokenizer, |
4394 | NULL, NULL)) { |
4395 | return NULL; |
4396 | } |
4397 | if (strlen(path) + 6 >= PATH_MAX) { return NULL; } |
4398 | grn_strcpy(path2, PATH_MAX, path); |
4399 | grn_strcat(path2, PATH_MAX, ".c" ); |
4400 | seg = grn_io_open(ctx, path, grn_io_auto); |
4401 | if (!seg) { return NULL; } |
4402 | chunk = grn_io_open(ctx, path2, grn_io_auto); |
4403 | if (!chunk) { |
4404 | grn_io_close(ctx, seg); |
4405 | return NULL; |
4406 | } |
4407 | header = grn_io_header(seg); |
4408 | io_type = grn_io_get_type(seg); |
4409 | if (io_type != GRN_COLUMN_INDEX) { |
4410 | ERR(GRN_INVALID_FORMAT, |
4411 | "[column][index] file type must be %#04x: <%#04x>" , |
4412 | GRN_COLUMN_INDEX, io_type); |
4413 | grn_io_close(ctx, seg); |
4414 | grn_io_close(ctx, chunk); |
4415 | return NULL; |
4416 | } |
4417 | if (!(ii = GRN_MALLOCN(grn_ii, 1))) { |
4418 | grn_io_close(ctx, seg); |
4419 | grn_io_close(ctx, chunk); |
4420 | return NULL; |
4421 | } |
4422 | GRN_DB_OBJ_SET_TYPE(ii, GRN_COLUMN_INDEX); |
4423 | ii->seg = seg; |
4424 | ii->chunk = chunk; |
4425 | ii->lexicon = lexicon; |
4426 | ii->lflags = lflags; |
4427 | ii->encoding = encoding; |
4428 | ii->header = header; |
4429 | ii->n_elements = 2; |
4430 | if ((header->flags & GRN_OBJ_WITH_SECTION)) { ii->n_elements++; } |
4431 | if ((header->flags & GRN_OBJ_WITH_WEIGHT)) { ii->n_elements++; } |
4432 | if ((header->flags & GRN_OBJ_WITH_POSITION)) { ii->n_elements++; } |
4433 | return ii; |
4434 | } |
4435 | |
4436 | grn_rc |
4437 | grn_ii_close(grn_ctx *ctx, grn_ii *ii) |
4438 | { |
4439 | grn_rc rc; |
4440 | if (!ii) { return GRN_INVALID_ARGUMENT; } |
4441 | if ((rc = grn_io_close(ctx, ii->seg))) { return rc; } |
4442 | if ((rc = grn_io_close(ctx, ii->chunk))) { return rc; } |
4443 | GRN_FREE(ii); |
4444 | /* |
4445 | { |
4446 | int i; |
4447 | for (i = 0; i < 32; i++) { |
4448 | GRN_LOG(ctx, GRN_LOG_DEBUG, "new[%d]=%d free[%d]=%d", |
4449 | i, new_histogram[i], |
4450 | i, free_histogram[i]); |
4451 | } |
4452 | } |
4453 | */ |
4454 | return rc; |
4455 | } |
4456 | |
4457 | grn_rc |
4458 | grn_ii_info(grn_ctx *ctx, grn_ii *ii, uint64_t *seg_size, uint64_t *chunk_size) |
4459 | { |
4460 | grn_rc rc; |
4461 | |
4462 | if (seg_size) { |
4463 | if ((rc = grn_io_size(ctx, ii->seg, seg_size))) { |
4464 | return rc; |
4465 | } |
4466 | } |
4467 | |
4468 | if (chunk_size) { |
4469 | if ((rc = grn_io_size(ctx, ii->chunk, chunk_size))) { |
4470 | return rc; |
4471 | } |
4472 | } |
4473 | |
4474 | return GRN_SUCCESS; |
4475 | } |
4476 | |
4477 | grn_column_flags |
4478 | grn_ii_get_flags(grn_ctx *ctx, grn_ii *ii) |
4479 | { |
4480 | if (!ii) { |
4481 | return 0; |
4482 | } |
4483 | |
4484 | return ii->header->flags; |
4485 | } |
4486 | |
4487 | uint32_t |
4488 | grn_ii_get_n_elements(grn_ctx *ctx, grn_ii *ii) |
4489 | { |
4490 | if (!ii) { |
4491 | return 0; |
4492 | } |
4493 | |
4494 | return ii->n_elements; |
4495 | } |
4496 | |
/*
 * Calls grn_io_expire() on the chunk io of ii.
 *
 * Only ii->chunk is passed on; the corresponding call for ii->seg is kept
 * commented out below.
 */
void
grn_ii_expire(grn_ctx *ctx, grn_ii *ii)
{
  /*
  grn_io_expire(ctx, ii->seg, 128, 1000000);
  */
  /* NOTE(review): the meaning of the arguments 0 and 1000000 is defined by
   * grn_io_expire() -- confirm there before changing them. */
  grn_io_expire(ctx, ii->chunk, 0, 1000000);
}
4505 | |
4506 | grn_rc |
4507 | grn_ii_flush(grn_ctx *ctx, grn_ii *ii) |
4508 | { |
4509 | grn_rc rc; |
4510 | |
4511 | rc = grn_io_flush(ctx, ii->seg); |
4512 | if (rc == GRN_SUCCESS) { |
4513 | rc = grn_io_flush(ctx, ii->chunk); |
4514 | } |
4515 | |
4516 | return rc; |
4517 | } |
4518 | |
4519 | size_t |
4520 | grn_ii_get_disk_usage(grn_ctx *ctx, grn_ii *ii) |
4521 | { |
4522 | size_t usage; |
4523 | |
4524 | usage = grn_io_get_disk_usage(ctx, ii->seg); |
4525 | usage += grn_io_get_disk_usage(ctx, ii->chunk); |
4526 | |
4527 | return usage; |
4528 | } |
4529 | |
/* Bits 1..11 of x (x shifted right by one, masked to 11 bits). */
#define BIT11_01(x) (((x) >> 1) & 0x7ff)
/* x shifted right by 12, i.e. bits 12 and above. */
#define BIT31_12(x) ((x) >> 12)
4532 | |
4533 | grn_rc |
4534 | grn_ii_update_one(grn_ctx *ctx, grn_ii *ii, grn_id tid, grn_ii_updspec *u, grn_hash *h) |
4535 | { |
4536 | buffer *b; |
4537 | uint8_t *bs; |
4538 | buffer_rec *br = NULL; |
4539 | buffer_term *bt; |
4540 | uint32_t pseg = 0, pos = 0, size, *a; |
4541 | if (!tid) { return ctx->rc; } |
4542 | if (!u->tf || !u->sid) { return grn_ii_delete_one(ctx, ii, tid, u, h); } |
4543 | if (u->sid > ii->header->smax) { ii->header->smax = u->sid; } |
4544 | if (!(a = array_get(ctx, ii, tid))) { |
4545 | DEFINE_NAME(ii); |
4546 | MERR("[ii][update][one] failed to allocate an array: " |
4547 | "<%.*s>: " |
4548 | "<%u>:<%u>:<%u>" , |
4549 | name_size, name, |
4550 | u->rid, u->sid, tid); |
4551 | return ctx->rc; |
4552 | } |
4553 | if (!(bs = encode_rec(ctx, ii, u, &size, 0))) { |
4554 | DEFINE_NAME(ii); |
4555 | MERR("[ii][update][one] failed to encode a record: " |
4556 | "<%.*s>: " |
4557 | "<%u>:<%u>:<%u>" , |
4558 | name_size, name, |
4559 | u->rid, u->sid, tid); |
4560 | goto exit; |
4561 | } |
4562 | for (;;) { |
4563 | if (a[0]) { |
4564 | if (!(a[0] & 1)) { |
4565 | pos = a[0]; |
4566 | if ((pseg = buffer_open(ctx, ii, pos, &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { |
4567 | DEFINE_NAME(ii); |
4568 | MERR("[ii][update][one] failed to allocate a buffer: " |
4569 | "<%.*s>: " |
4570 | "<%u>:<%u>:<%u>: " |
4571 | "segment:<%u>" , |
4572 | name_size, name, |
4573 | u->rid, u->sid, tid, |
4574 | pos); |
4575 | goto exit; |
4576 | } |
4577 | if (b->header.buffer_free < size) { |
4578 | int bfb = b->header.buffer_free; |
4579 | GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing a[0]=%d seg=%d(%p) free=%d" , |
4580 | a[0], LSEG(a[0]), b, b->header.buffer_free); |
4581 | buffer_close(ctx, ii, pseg); |
4582 | if (SPLIT_COND(ii, b)) { |
4583 | /*((S_SEGMENT - sizeof(buffer_header) + ii->header->bmax - |
4584 | b->header.nterms * sizeof(buffer_term)) * 4 < |
4585 | b->header.chunk_size)*/ |
4586 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
4587 | "nterms=%d chunk=%d total=%" GRN_FMT_INT64U, |
4588 | b->header.nterms, |
4589 | b->header.chunk_size, |
4590 | ii->header->total_chunk_size >> 10); |
4591 | buffer_split(ctx, ii, LSEG(pos), h); |
4592 | if (ctx->rc != GRN_SUCCESS) { |
4593 | DEFINE_NAME(ii); |
4594 | ERR(ctx->rc, |
4595 | "[ii][update][one] failed to split a buffer: " |
4596 | "<%.*s>: " |
4597 | "<%u>:<%u><%u>: " |
4598 | "segment:<%u>" , |
4599 | name_size, name, |
4600 | u->rid, u->sid, tid, |
4601 | pos); |
4602 | goto exit; |
4603 | } |
4604 | continue; |
4605 | } |
4606 | buffer_flush(ctx, ii, LSEG(pos), h); |
4607 | if (ctx->rc != GRN_SUCCESS) { |
4608 | DEFINE_NAME(ii); |
4609 | ERR(ctx->rc, |
4610 | "[ii][update][one] failed to flush a buffer: " |
4611 | "<%.*s>: " |
4612 | "<%u>:<%u><%u>: " |
4613 | "segment:<%u>" , |
4614 | name_size, name, |
4615 | u->rid, u->sid, tid, |
4616 | pos); |
4617 | goto exit; |
4618 | } |
4619 | if (a[0] != pos) { |
4620 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
4621 | "grn_ii_update_one: a[0] changed %d->%d" , a[0], pos); |
4622 | continue; |
4623 | } |
4624 | if ((pseg = buffer_open(ctx, ii, pos, &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { |
4625 | GRN_LOG(ctx, GRN_LOG_CRIT, "buffer not found a[0]=%d" , a[0]); |
4626 | { |
4627 | DEFINE_NAME(ii); |
4628 | MERR("[ii][update][one] failed to reallocate a buffer: " |
4629 | "<%.*s>: " |
4630 | "<%u>:<%u>:<%u>: " |
4631 | "segment:<%u>, new-segment:<%u>" , |
4632 | name_size, name, |
4633 | u->rid, u->sid, tid, |
4634 | pos, a[0]); |
4635 | } |
4636 | goto exit; |
4637 | } |
4638 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
4639 | "flushed a[0]=%d seg=%d(%p) free=%d->%d nterms=%d v=%d" , |
4640 | a[0], LSEG(a[0]), b, bfb, b->header.buffer_free, |
4641 | b->header.nterms, b->header.nterms_void); |
4642 | if (b->header.buffer_free < size) { |
4643 | DEFINE_NAME(ii); |
4644 | MERR("[ii][update][one] buffer is full: " |
4645 | "<%.*s>: " |
4646 | "<%u>:<%u><%u>: " |
4647 | "segment:<%u>, new-segment:<%u>, free:<%u>, required:<%u>" , |
4648 | name_size, name, |
4649 | u->rid, u->sid, tid, |
4650 | pos, a[0], b->header.buffer_free, size); |
4651 | buffer_close(ctx, ii, pseg); |
4652 | /* todo: direct merge */ |
4653 | goto exit; |
4654 | } |
4655 | } |
4656 | b->header.buffer_free -= size; |
4657 | br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) |
4658 | + b->header.buffer_free); |
4659 | } else { |
4660 | grn_ii_updspec u2; |
4661 | uint32_t size2 = 0, v = a[0]; |
4662 | struct _grn_ii_pos pos2; |
4663 | pos2.pos = a[1]; |
4664 | pos2.next = NULL; |
4665 | u2.pos = &pos2; |
4666 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
4667 | u2.rid = BIT31_12(v); |
4668 | u2.sid = BIT11_01(v); |
4669 | } else { |
4670 | u2.rid = v >> 1; |
4671 | u2.sid = 1; |
4672 | } |
4673 | u2.tf = 1; |
4674 | u2.weight = 0; |
4675 | if (u2.rid != u->rid || u2.sid != u->sid) { |
4676 | uint8_t *bs2 = encode_rec(ctx, ii, &u2, &size2, 0); |
4677 | if (!bs2) { |
4678 | DEFINE_NAME(ii); |
4679 | MERR("[ii][update][one] failed to encode a record2: " |
4680 | "<%.*s>: " |
4681 | "<%u>:<%u>:<%u>" , |
4682 | name_size, name, |
4683 | u2.rid, u2.sid, tid); |
4684 | goto exit; |
4685 | } |
4686 | pseg = buffer_new(ctx, ii, size + size2, &pos, &bt, &br, &b, tid, h); |
4687 | if (pseg == GRN_II_PSEG_NOT_ASSIGNED) { |
4688 | GRN_FREE(bs2); |
4689 | { |
4690 | DEFINE_NAME(ii); |
4691 | MERR("[ii][update][one] failed to create a buffer2: " |
4692 | "<%.*s>: " |
4693 | "<%u>:<%u>:<%u>: " |
4694 | "size:<%u>" , |
4695 | name_size, name, |
4696 | u2.rid, u2.sid, tid, |
4697 | size + size2); |
4698 | } |
4699 | goto exit; |
4700 | } |
4701 | bt->tid = tid; |
4702 | bt->size_in_chunk = 0; |
4703 | bt->pos_in_chunk = 0; |
4704 | bt->size_in_buffer = 0; |
4705 | bt->pos_in_buffer = 0; |
4706 | buffer_put(ctx, ii, b, bt, br, bs2, &u2, size2); |
4707 | if (ctx->rc != GRN_SUCCESS) { |
4708 | GRN_FREE(bs2); |
4709 | buffer_close(ctx, ii, pseg); |
4710 | { |
4711 | DEFINE_NAME(ii); |
4712 | MERR("[ii][update][one] failed to put to buffer: " |
4713 | "<%.*s>: " |
4714 | "<%u>:<%u>:<%u>" , |
4715 | name_size, name, |
4716 | u2.rid, u2.sid, tid); |
4717 | } |
4718 | goto exit; |
4719 | } |
4720 | br = (buffer_rec *)(((byte *)br) + size2); |
4721 | GRN_FREE(bs2); |
4722 | } |
4723 | } |
4724 | } |
4725 | break; |
4726 | } |
4727 | if (!br) { |
4728 | if (u->tf == 1 && u->weight == 0) { |
4729 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
4730 | if (u->rid < 0x100000 && u->sid < 0x800) { |
4731 | a[0] = (u->rid << 12) + (u->sid << 1) + 1; |
4732 | a[1] = u->pos->pos; |
4733 | goto exit; |
4734 | } |
4735 | } else { |
4736 | a[0] = (u->rid << 1) + 1; |
4737 | a[1] = u->pos->pos; |
4738 | goto exit; |
4739 | } |
4740 | } |
4741 | pseg = buffer_new(ctx, ii, size, &pos, &bt, &br, &b, tid, h); |
4742 | if (pseg == GRN_II_PSEG_NOT_ASSIGNED) { |
4743 | DEFINE_NAME(ii); |
4744 | MERR("[ii][update][one] failed to create a buffer: " |
4745 | "<%.*s>: " |
4746 | "<%u>:<%u>:<%u>: " |
4747 | "size:<%u>" , |
4748 | name_size, name, |
4749 | u->rid, u->sid, tid, |
4750 | size); |
4751 | goto exit; |
4752 | } |
4753 | bt->tid = tid; |
4754 | bt->size_in_chunk = 0; |
4755 | bt->pos_in_chunk = 0; |
4756 | bt->size_in_buffer = 0; |
4757 | bt->pos_in_buffer = 0; |
4758 | } |
4759 | buffer_put(ctx, ii, b, bt, br, bs, u, size); |
4760 | buffer_close(ctx, ii, pseg); |
4761 | if (!a[0] || (a[0] & 1)) { a[0] = pos; } |
4762 | exit : |
4763 | array_unref(ii, tid); |
4764 | if (bs) { GRN_FREE(bs); } |
4765 | if (u->tf != u->atf) { |
4766 | grn_obj *source_table; |
4767 | char source_table_name[GRN_TABLE_MAX_KEY_SIZE]; |
4768 | int source_table_name_size; |
4769 | char term[GRN_TABLE_MAX_KEY_SIZE]; |
4770 | int term_size; |
4771 | |
4772 | source_table = grn_ctx_at(ctx, DB_OBJ(ii)->range); |
4773 | if (source_table) { |
4774 | source_table_name_size = grn_obj_name(ctx, |
4775 | source_table, |
4776 | source_table_name, |
4777 | GRN_TABLE_MAX_KEY_SIZE); |
4778 | } else { |
4779 | grn_strcpy(source_table_name, GRN_TABLE_MAX_KEY_SIZE, "(null)" ); |
4780 | source_table_name_size = strlen(source_table_name); |
4781 | } |
4782 | term_size = grn_table_get_key(ctx, ii->lexicon, tid, |
4783 | term, GRN_TABLE_MAX_KEY_SIZE); |
4784 | { |
4785 | DEFINE_NAME(ii); |
4786 | GRN_LOG(ctx, GRN_LOG_WARNING, |
4787 | "[ii][update][one] too many postings: " |
4788 | "<%.*s>: " |
4789 | "record:<%.*s>(%d), " |
4790 | "n-postings:<%d>, " |
4791 | "n-discarded-postings:<%d>, " |
4792 | "term:<%d>(<%.*s>)" , |
4793 | name_size, name, |
4794 | source_table_name_size, source_table_name, |
4795 | u->rid, |
4796 | u->atf, |
4797 | u->atf - u->tf, |
4798 | tid, term_size, term); |
4799 | } |
4800 | } |
4801 | grn_ii_expire(ctx, ii); |
4802 | return ctx->rc; |
4803 | } |
4804 | |
4805 | grn_rc |
4806 | grn_ii_delete_one(grn_ctx *ctx, grn_ii *ii, grn_id tid, grn_ii_updspec *u, grn_hash *h) |
4807 | { |
4808 | buffer *b; |
4809 | uint8_t *bs = NULL; |
4810 | buffer_rec *br; |
4811 | buffer_term *bt; |
4812 | uint32_t pseg, size, *a; |
4813 | if (!tid) { return ctx->rc; } |
4814 | if (!(a = array_at(ctx, ii, tid))) { |
4815 | return ctx->rc; |
4816 | } |
4817 | for (;;) { |
4818 | if (!a[0]) { goto exit; } |
4819 | if (a[0] & 1) { |
4820 | if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
4821 | uint32_t rid = BIT31_12(a[0]); |
4822 | uint32_t sid = BIT11_01(a[0]); |
4823 | if (u->rid == rid && (!u->sid || u->sid == sid)) { |
4824 | a[0] = 0; |
4825 | lexicon_delete(ctx, ii, tid, h); |
4826 | } |
4827 | } else { |
4828 | uint32_t rid = a[0] >> 1; |
4829 | if (u->rid == rid) { |
4830 | a[0] = 0; |
4831 | lexicon_delete(ctx, ii, tid, h); |
4832 | } |
4833 | } |
4834 | goto exit; |
4835 | } |
4836 | if (!(bs = encode_rec(ctx, ii, u, &size, 1))) { |
4837 | DEFINE_NAME(ii); |
4838 | MERR("[ii][delete][one] failed to encode a record: " |
4839 | "<%.*s>: " |
4840 | "<%u>:<%u>:<%u>" , |
4841 | name_size, name, |
4842 | u->rid, u->sid, tid); |
4843 | goto exit; |
4844 | } |
4845 | if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { |
4846 | DEFINE_NAME(ii); |
4847 | MERR("[ii][delete][one] failed to allocate a buffer: " |
4848 | "<%.*s>: " |
4849 | "<%u>:<%u><%u>: " |
4850 | "position:<%u>" , |
4851 | name_size, name, |
4852 | u->rid, u->sid, tid, |
4853 | a[0]); |
4854 | goto exit; |
4855 | } |
4856 | if (b->header.buffer_free < size) { |
4857 | uint32_t _a = a[0]; |
4858 | GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing! b=%p free=%d, seg(%d)" , |
4859 | b, b->header.buffer_free, LSEG(a[0])); |
4860 | buffer_close(ctx, ii, pseg); |
4861 | buffer_flush(ctx, ii, LSEG(a[0]), h); |
4862 | if (ctx->rc != GRN_SUCCESS) { |
4863 | DEFINE_NAME(ii); |
4864 | ERR(ctx->rc, |
4865 | "[ii][delete][one] failed to flush a buffer: " |
4866 | "<%.*s>: " |
4867 | "<%u>:<%u><%u>: " |
4868 | "position:<%u>" , |
4869 | name_size, name, |
4870 | u->rid, u->sid, tid, |
4871 | a[0]); |
4872 | goto exit; |
4873 | } |
4874 | if (a[0] != _a) { |
4875 | GRN_LOG(ctx, GRN_LOG_DEBUG, "grn_ii_delete_one: a[0] changed %d->%d)" , |
4876 | a[0], _a); |
4877 | continue; |
4878 | } |
4879 | if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { |
4880 | DEFINE_NAME(ii); |
4881 | MERR("[ii][delete][one] failed to reallocate a buffer: " |
4882 | "<%.*s>: " |
4883 | "<%u>:<%u><%u>: " |
4884 | "position:<%u>" , |
4885 | name_size, name, |
4886 | u->rid, u->sid, tid, |
4887 | a[0]); |
4888 | goto exit; |
4889 | } |
4890 | GRN_LOG(ctx, GRN_LOG_DEBUG, "flushed! b=%p free=%d, seg(%d)" , |
4891 | b, b->header.buffer_free, LSEG(a[0])); |
4892 | if (b->header.buffer_free < size) { |
4893 | DEFINE_NAME(ii); |
4894 | MERR("[ii][delete][one] buffer is full: " |
4895 | "<%.*s>: " |
4896 | "<%u>:<%u><%u>: " |
4897 | "segment:<%u>, free:<%u>, required:<%u>" , |
4898 | name_size, name, |
4899 | u->rid, u->sid, tid, |
4900 | a[0], b->header.buffer_free, size); |
4901 | buffer_close(ctx, ii, pseg); |
4902 | goto exit; |
4903 | } |
4904 | } |
4905 | |
4906 | b->header.buffer_free -= size; |
4907 | br = (buffer_rec *)(((byte *)&b->terms[b->header.nterms]) + b->header.buffer_free); |
4908 | buffer_put(ctx, ii, b, bt, br, bs, u, size); |
4909 | buffer_close(ctx, ii, pseg); |
4910 | break; |
4911 | } |
4912 | exit : |
4913 | array_unref(ii, tid); |
4914 | if (bs) { GRN_FREE(bs); } |
4915 | return ctx->rc; |
4916 | } |
4917 | |
/*
 * Bit flags kept in the `stat` member of struct _grn_ii_cursor.
 */
/* The chunk-side posting was consumed; read the next one from chunk data. */
#define CHUNK_USED 1
/* The buffer-side posting was consumed; read the next one from the buffer. */
#define BUFFER_USED 2
/* A cursor without buffer has already returned its only posting. */
#define SOLE_DOC_USED 4
/* grn_ii_cursor_next_pos() has already returned the only position. */
#define SOLE_POS_USED 8
4922 | |
/*
 * Cursor over the posting list of one term.
 *
 * Postings can come from two places that the cursor merges in
 * (record ID, section ID) order: encoded chunk data (`pc` and the `c*`
 * members) and records stored in the term's buffer (`pb`, `bp`, `nextb`).
 * When the term holds only one inline posting, `buf` stays NULL and `pb`
 * carries that posting.
 */
struct _grn_ii_cursor {
  grn_db_obj obj;
  grn_ctx *ctx;           /* Context given to grn_ii_cursor_open() */
  grn_ii *ii;             /* Index this cursor reads from */
  grn_id id;              /* Term ID */
  grn_posting *post;      /* Posting last returned (&pc or &pb), or NULL */

  grn_id min;             /* Minimum record ID */
  grn_id max;             /* Upper bound given to grn_ii_cursor_open() */
  grn_posting pc;         /* Current posting decoded from chunk data */
  grn_posting pb;         /* Current posting read from the buffer,
                             or the single inline posting */

  uint32_t cdf;           /* Document frequency */
  uint32_t *cdp;          /* Start of the decoded record ID gaps */
  uint32_t *crp;          /* Record ID */
  uint32_t *csp;          /* Section ID */
  uint32_t *ctp;          /* Term frequency */
  uint32_t *cwp;          /* Weight */
  uint32_t *cpp;          /* Position */

  uint8_t *bp;            /* Read pointer inside the current buffer record */

  int nelements;          /* Value given to grn_ii_cursor_open() */
  uint32_t nchunks;       /* Number of entries in cinfo */
  uint32_t curr_chunk;    /* Index of the next chunk to decode */
  chunk_info *cinfo;      /* Per-chunk info of a CHUNK_SPLIT term, or NULL */
  grn_io_win iw;          /* Window that maps the term's chunk data */
  uint8_t *cp;            /* Current read pointer in the mapped chunk data */
  uint8_t *cpe;           /* End of the mapped chunk data */
  datavec rdv[MAX_N_ELEMENTS + 1]; /* Decode targets of grn_p_decv() */

  struct grn_ii_buffer *buf; /* Opened buffer, NULL for an inline posting */
  uint16_t stat;          /* CHUNK_USED / BUFFER_USED / SOLE_*_USED bits */
  uint16_t nextb;         /* Next buffer record to read, 0 at the end */
  uint32_t buffer_pseg;   /* Segment returned by buffer_open() */
  int flags;              /* Value given to grn_ii_cursor_open() */
  uint32_t *ppseg;        /* Points at ii->header->binfo[] entry of buf */

  int weight;

  uint32_t prev_chunk_rid; /* pc.rid saved just before the latest chunk
                              was decoded */
};
4965 | |
4966 | static grn_bool |
4967 | buffer_is_reused(grn_ctx *ctx, grn_ii *ii, grn_ii_cursor *c) |
4968 | { |
4969 | if (*c->ppseg != c->buffer_pseg) { |
4970 | uint32_t i; |
4971 | for (i = ii->header->bgqtail; i != ii->header->bgqhead; |
4972 | i = (i + 1) & (GRN_II_BGQSIZE - 1)) { |
4973 | if (ii->header->bgqbody[i] == c->buffer_pseg) { return GRN_FALSE; } |
4974 | } |
4975 | return GRN_TRUE; |
4976 | } |
4977 | return GRN_FALSE; |
4978 | } |
4979 | |
4980 | static int |
4981 | chunk_is_reused(grn_ctx *ctx, grn_ii *ii, grn_ii_cursor *c, uint32_t offset, uint32_t size) |
4982 | { |
4983 | if (*c->ppseg != c->buffer_pseg) { |
4984 | uint32_t i, m, gseg; |
4985 | if (size > S_CHUNK) { return 1; } |
4986 | if (size > (1 << GRN_II_W_LEAST_CHUNK)) { |
4987 | int es = size - 1; |
4988 | GRN_BIT_SCAN_REV(es, m); |
4989 | m++; |
4990 | } else { |
4991 | m = GRN_II_W_LEAST_CHUNK; |
4992 | } |
4993 | gseg = ii->header->garbages[m - GRN_II_W_LEAST_CHUNK]; |
4994 | while (gseg != GRN_II_PSEG_NOT_ASSIGNED) { |
4995 | grn_io_win iw; |
4996 | grn_ii_ginfo *ginfo = WIN_MAP(ii->chunk, ctx, &iw, gseg, 0, S_GARBAGE, |
4997 | grn_io_rdwr); |
4998 | if (!ginfo) { break; } |
4999 | for (i = 0; i < ginfo->nrecs; i++) { |
5000 | if (ginfo->recs[i] == offset) { |
5001 | grn_io_win_unmap(&iw); |
5002 | return 0; |
5003 | } |
5004 | } |
5005 | gseg = ginfo->next; |
5006 | grn_io_win_unmap(&iw); |
5007 | } |
5008 | return 1; |
5009 | } |
5010 | return 0; |
5011 | } |
5012 | |
/*
 * Evaluates to true when the current posting of cursor c1 sorts after the
 * current posting of cursor c2, comparing record ID first, then section ID,
 * then position.  Both cursors must have a non-NULL `post`; each argument
 * is evaluated several times.
 */
#define GRN_II_CURSOR_CMP(c1,c2) \
  (((c1)->post->rid > (c2)->post->rid) || \
   (((c1)->post->rid == (c2)->post->rid) && \
    (((c1)->post->sid > (c2)->post->sid) || \
     (((c1)->post->sid == (c2)->post->sid) && \
      ((c1)->post->pos > (c2)->post->pos)))))
5019 | |
/*
 * Open a cursor over the posting list of term `tid`.
 *
 * ctx:       execution context.
 * ii:        inverted index to read.
 * tid:       term ID.
 * min, max:  record ID bounds stored in the cursor; chunks whose last
 *            record ID is below `min` are skipped right away.
 * nelements: stored in the cursor (compared with ii->n_elements by
 *            grn_ii_cursor_next_pos()).
 * flags:     stored in the cursor.
 *
 * Returns the new cursor, or NULL when the term has no entry, has no
 * posting, or a resource could not be obtained.  The whole setup is retried
 * when the term entry (a[0]) changes, or the buffer/chunk is detected as
 * reused, while the cursor is being built.
 */
grn_ii_cursor *
grn_ii_cursor_open(grn_ctx *ctx, grn_ii *ii, grn_id tid,
                   grn_id min, grn_id max, int nelements, int flags)
{
  grn_ii_cursor *c = NULL;
  uint32_t pos, *a;
  if (!(a = array_at(ctx, ii, tid))) { return NULL; }
  for (;;) {
    c = NULL;
    /* a[0] == 0: the term has no posting. */
    if (!(pos = a[0])) { goto exit; }
    if (!(c = GRN_MALLOC(sizeof(grn_ii_cursor)))) { goto exit; }
    memset(c, 0, sizeof(grn_ii_cursor));
    c->ctx = ctx;
    c->ii = ii;
    c->id = tid;
    c->min = min;
    c->max = max;
    c->nelements = nelements;
    c->flags = flags;
    c->weight = 0;
    if (pos & 1) {
      /* Single posting stored inline in a[0]/a[1]; no buffer is opened. */
      c->stat = 0;
      if ((ii->header->flags & GRN_OBJ_WITH_SECTION)) {
        c->pb.rid = BIT31_12(pos);
        c->pb.sid = BIT11_01(pos);
      } else {
        c->pb.rid = pos >> 1;
        c->pb.sid = 1;
      }
      c->pb.tf = 1;
      c->pb.weight = 0;
      c->pb.pos = a[1];
    } else {
      uint32_t chunk;
      buffer_term *bt;
      c->buffer_pseg = buffer_open(ctx, ii, pos, &bt, &c->buf);
      if (c->buffer_pseg == GRN_II_PSEG_NOT_ASSIGNED) {
        GRN_FREE(c);
        c = NULL;
        goto exit;
      }
      c->ppseg = &ii->header->binfo[LSEG(pos)];
      if (bt->size_in_chunk && (chunk = c->buf->header.chunk) != GRN_II_PSEG_NOT_ASSIGNED) {
        /* Map the part of the chunk that belongs to this term. */
        if (!(c->cp = WIN_MAP(ii->chunk, ctx, &c->iw, chunk, bt->pos_in_chunk,
                              bt->size_in_chunk, grn_io_rdonly))) {
          buffer_close(ctx, ii, c->buffer_pseg);
          GRN_FREE(c);
          c = NULL;
          goto exit;
        }
        if (buffer_is_reused(ctx, ii, c)) {
          grn_ii_cursor_close(ctx, c);
          continue;
        }
        c->cpe = c->cp + bt->size_in_chunk;
        if ((bt->tid & CHUNK_SPLIT)) {
          /*
           * The term's data is split over several chunks: the mapped data
           * starts with the chunk count followed by (segno, size, dgap)
           * for each chunk.
           */
          int i;
          grn_id crid;
          GRN_B_DEC(c->nchunks, c->cp);
          if (chunk_is_reused(ctx, ii, c, chunk, c->buf->header.chunk_size)) {
            grn_ii_cursor_close(ctx, c);
            continue;
          }
          if (!(c->cinfo = GRN_MALLOCN(chunk_info, c->nchunks))) {
            buffer_close(ctx, ii, c->buffer_pseg);
            grn_io_win_unmap(&c->iw);
            GRN_FREE(c);
            c = NULL;
            goto exit;
          }
          for (i = 0, crid = GRN_ID_NIL; i < c->nchunks; i++) {
            GRN_B_DEC(c->cinfo[i].segno, c->cp);
            GRN_B_DEC(c->cinfo[i].size, c->cp);
            GRN_B_DEC(c->cinfo[i].dgap, c->cp);
            crid += c->cinfo[i].dgap;
            /* Chunks that end before `min` do not need to be decoded. */
            if (crid < min) {
              c->pc.rid = crid;
              c->curr_chunk = i + 1;
            }
          }
          /* Check again: the chunk may have been reused while decoding. */
          if (chunk_is_reused(ctx, ii, c, chunk, c->buf->header.chunk_size)) {
            grn_ii_cursor_close(ctx, c);
            continue;
          }
        }
        if ((ii->header->flags & GRN_OBJ_WITH_POSITION)) {
          c->rdv[ii->n_elements - 1].flags = ODD;
        }
      }
      c->nextb = bt->pos_in_buffer;
      c->stat = CHUNK_USED|BUFFER_USED;
    }
    /* Accept the cursor only if the term entry did not change meanwhile. */
    if (pos == a[0]) { break; }
    grn_ii_cursor_close(ctx, c);
  }
exit :
  array_unref(ii, tid);
  return c;
}
5119 | |
5120 | static inline void |
5121 | grn_ii_cursor_set_min(grn_ctx *ctx, grn_ii_cursor *c, grn_id min) |
5122 | { |
5123 | if (c->min >= min) { |
5124 | return; |
5125 | } |
5126 | |
5127 | if (grn_ii_cursor_set_min_enable) { |
5128 | grn_id old_min = c->min; |
5129 | c->min = min; |
5130 | if (c->buf && |
5131 | c->pc.rid != GRN_ID_NIL && |
5132 | c->pc.rid < c->min && |
5133 | c->prev_chunk_rid < c->min && |
5134 | c->curr_chunk < c->nchunks) { |
5135 | uint32_t i; |
5136 | uint32_t skip_chunk = 0; |
5137 | grn_id rid = c->prev_chunk_rid; |
5138 | |
5139 | if (c->curr_chunk > 0) { |
5140 | i = c->curr_chunk - 1; |
5141 | } else { |
5142 | i = 0; |
5143 | } |
5144 | for (; i < c->nchunks; i++) { |
5145 | rid += c->cinfo[i].dgap; |
5146 | if (rid < c->min) { |
5147 | skip_chunk = i + 1; |
5148 | } else { |
5149 | rid -= c->cinfo[i].dgap; |
5150 | break; |
5151 | } |
5152 | } |
5153 | if (skip_chunk > c->curr_chunk) { |
5154 | uint32_t old_chunk = c->curr_chunk; |
5155 | grn_bool old_chunk_used = (c->stat & CHUNK_USED); |
5156 | c->pc.rid = rid; |
5157 | c->pc.rest = 0; |
5158 | c->prev_chunk_rid = rid - c->cinfo[skip_chunk - 1].dgap; |
5159 | c->curr_chunk = skip_chunk; |
5160 | c->crp = c->cdp + c->cdf; |
5161 | c->stat |= CHUNK_USED; |
5162 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
5163 | "[ii][cursor][min] skip: %p: min(%u->%u): chunk(%u->%u): " |
5164 | "chunk-used(%s->%s)" , |
5165 | c, |
5166 | old_min, min, |
5167 | old_chunk, c->curr_chunk, |
5168 | old_chunk_used ? "true" : "false" , |
5169 | (c->stat & CHUNK_USED) ? "true" : "false" ); |
5170 | } |
5171 | } |
5172 | } |
5173 | } |
5174 | |
/* Options for grn_ii_cursor_next_internal(). */
typedef struct {
  /* When true, postings whose tf or sid is 0 are returned as well. */
  grn_bool include_garbage;
} grn_ii_cursor_next_options;
5178 | |
5179 | static inline grn_posting * |
5180 | grn_ii_cursor_next_internal(grn_ctx *ctx, grn_ii_cursor *c, |
5181 | grn_ii_cursor_next_options *options) |
5182 | { |
5183 | const grn_bool include_garbage = options->include_garbage; |
5184 | if (c->buf) { |
5185 | for (;;) { |
5186 | if (c->stat & CHUNK_USED) { |
5187 | for (;;) { |
5188 | if (c->crp < c->cdp + c->cdf) { |
5189 | uint32_t dgap = *c->crp++; |
5190 | c->pc.rid += dgap; |
5191 | if (dgap) { c->pc.sid = 0; } |
5192 | if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
5193 | c->pc.sid += 1 + *c->csp++; |
5194 | } else { |
5195 | c->pc.sid = 1; |
5196 | } |
5197 | c->cpp += c->pc.rest; |
5198 | c->pc.rest = c->pc.tf = 1 + *c->ctp++; |
5199 | if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { |
5200 | c->pc.weight = *c->cwp++; |
5201 | } else { |
5202 | c->pc.weight = 0; |
5203 | } |
5204 | c->pc.pos = 0; |
5205 | /* |
5206 | { |
5207 | static int count = 0; |
5208 | int tf = c->pc.tf, pos = 0, *pp = (int *)c->cpp; |
5209 | grn_obj buf; |
5210 | GRN_TEXT_INIT(&buf, 0); |
5211 | grn_text_itoa(ctx, &buf, c->pc.rid); |
5212 | GRN_TEXT_PUTC(ctx, &buf, ':'); |
5213 | grn_text_itoa(ctx, &buf, c->pc.sid); |
5214 | GRN_TEXT_PUTC(ctx, &buf, ':'); |
5215 | grn_text_itoa(ctx, &buf, c->pc.tf); |
5216 | GRN_TEXT_PUTC(ctx, &buf, '('); |
5217 | while (tf--) { |
5218 | pos += *pp++; |
5219 | count++; |
5220 | grn_text_itoa(ctx, &buf, pos); |
5221 | if (tf) { GRN_TEXT_PUTC(ctx, &buf, ':'); } |
5222 | } |
5223 | GRN_TEXT_PUTC(ctx, &buf, ')'); |
5224 | GRN_TEXT_PUTC(ctx, &buf, '\0'); |
5225 | GRN_LOG(ctx, GRN_LOG_DEBUG, "posting(%d):%s", count, GRN_TEXT_VALUE(&buf)); |
5226 | GRN_OBJ_FIN(ctx, &buf); |
5227 | } |
5228 | */ |
5229 | } else { |
5230 | if (c->curr_chunk <= c->nchunks) { |
5231 | if (c->curr_chunk == c->nchunks) { |
5232 | if (c->cp < c->cpe) { |
5233 | int decoded_size; |
5234 | decoded_size = |
5235 | grn_p_decv(ctx, c->cp, c->cpe - c->cp, |
5236 | c->rdv, c->ii->n_elements); |
5237 | if (decoded_size == 0) { |
5238 | GRN_LOG(ctx, GRN_LOG_WARNING, |
5239 | "[ii][cursor][next][chunk][last] " |
5240 | "chunk(%d) is changed by another thread " |
5241 | "while decoding: %p" , |
5242 | c->cinfo[c->curr_chunk].segno, |
5243 | c); |
5244 | c->pc.rid = GRN_ID_NIL; |
5245 | break; |
5246 | } |
5247 | if (buffer_is_reused(ctx, c->ii, c)) { |
5248 | GRN_LOG(ctx, GRN_LOG_WARNING, |
5249 | "[ii][cursor][next][chunk][last] " |
5250 | "buffer is reused by another thread: %p" , |
5251 | c); |
5252 | c->pc.rid = GRN_ID_NIL; |
5253 | break; |
5254 | } |
5255 | if (chunk_is_reused(ctx, c->ii, c, |
5256 | c->buf->header.chunk, |
5257 | c->buf->header.chunk_size)) { |
5258 | GRN_LOG(ctx, GRN_LOG_WARNING, |
5259 | "[ii][cursor][next][chunk][last] " |
5260 | "chunk(%d) is reused by another thread: %p" , |
5261 | c->buf->header.chunk, |
5262 | c); |
5263 | c->pc.rid = GRN_ID_NIL; |
5264 | break; |
5265 | } |
5266 | } else { |
5267 | c->pc.rid = GRN_ID_NIL; |
5268 | break; |
5269 | } |
5270 | } else { |
5271 | uint8_t *cp; |
5272 | grn_io_win iw; |
5273 | uint32_t size = c->cinfo[c->curr_chunk].size; |
5274 | if (size && (cp = WIN_MAP(c->ii->chunk, ctx, &iw, |
5275 | c->cinfo[c->curr_chunk].segno, 0, |
5276 | size, grn_io_rdonly))) { |
5277 | int decoded_size; |
5278 | decoded_size = |
5279 | grn_p_decv(ctx, cp, size, c->rdv, c->ii->n_elements); |
5280 | grn_io_win_unmap(&iw); |
5281 | if (decoded_size == 0) { |
5282 | GRN_LOG(ctx, GRN_LOG_WARNING, |
5283 | "[ii][cursor][next][chunk] " |
5284 | "chunk(%d) is changed by another thread " |
5285 | "while decoding: %p" , |
5286 | c->cinfo[c->curr_chunk].segno, |
5287 | c); |
5288 | c->pc.rid = GRN_ID_NIL; |
5289 | break; |
5290 | } |
5291 | if (chunk_is_reused(ctx, c->ii, c, |
5292 | c->cinfo[c->curr_chunk].segno, size)) { |
5293 | GRN_LOG(ctx, GRN_LOG_WARNING, |
5294 | "[ii][cursor][next][chunk] " |
5295 | "chunk(%d) is reused by another thread: %p" , |
5296 | c->cinfo[c->curr_chunk].segno, |
5297 | c); |
5298 | c->pc.rid = GRN_ID_NIL; |
5299 | break; |
5300 | } |
5301 | } else { |
5302 | c->pc.rid = GRN_ID_NIL; |
5303 | break; |
5304 | } |
5305 | } |
5306 | { |
5307 | int j = 0; |
5308 | c->cdf = c->rdv[j].data_size; |
5309 | c->crp = c->cdp = c->rdv[j++].data; |
5310 | if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
5311 | c->csp = c->rdv[j++].data; |
5312 | } |
5313 | c->ctp = c->rdv[j++].data; |
5314 | if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { |
5315 | c->cwp = c->rdv[j++].data; |
5316 | } |
5317 | if ((c->ii->header->flags & GRN_OBJ_WITH_POSITION)) { |
5318 | c->cpp = c->rdv[j].data; |
5319 | } |
5320 | } |
5321 | c->prev_chunk_rid = c->pc.rid; |
5322 | c->pc.rid = GRN_ID_NIL; |
5323 | c->pc.sid = 0; |
5324 | c->pc.rest = 0; |
5325 | c->curr_chunk++; |
5326 | continue; |
5327 | } else { |
5328 | c->pc.rid = GRN_ID_NIL; |
5329 | } |
5330 | } |
5331 | break; |
5332 | } |
5333 | } |
5334 | if (c->stat & BUFFER_USED) { |
5335 | for (;;) { |
5336 | if (c->nextb) { |
5337 | uint32_t lrid = c->pb.rid, lsid = c->pb.sid; /* for check */ |
5338 | buffer_rec *br = BUFFER_REC_AT(c->buf, c->nextb); |
5339 | if (buffer_is_reused(ctx, c->ii, c)) { |
5340 | GRN_LOG(ctx, GRN_LOG_WARNING, |
5341 | "[ii][cursor][next][buffer] " |
5342 | "buffer(%d,%d) is reused by another thread: %p" , |
5343 | c->buffer_pseg, *c->ppseg, |
5344 | c); |
5345 | c->pb.rid = GRN_ID_NIL; |
5346 | break; |
5347 | } |
5348 | c->bp = GRN_NEXT_ADDR(br); |
5349 | GRN_B_DEC(c->pb.rid, c->bp); |
5350 | if ((c->ii->header->flags & GRN_OBJ_WITH_SECTION)) { |
5351 | GRN_B_DEC(c->pb.sid, c->bp); |
5352 | } else { |
5353 | c->pb.sid = 1; |
5354 | } |
5355 | if (lrid > c->pb.rid || (lrid == c->pb.rid && lsid >= c->pb.sid)) { |
5356 | DEFINE_NAME(c->ii); |
5357 | ERR(GRN_FILE_CORRUPT, |
5358 | "[ii][broken][cursor][next][buffer] " |
5359 | "posting in list in buffer isn't sorted: " |
5360 | "<%.*s>: (%d:%d) -> (%d:%d) (%d->%d)" , |
5361 | name_size, name, |
5362 | lrid, lsid, |
5363 | c->pb.rid, c->pb.sid, |
5364 | c->buffer_pseg, *c->ppseg); |
5365 | c->pb.rid = GRN_ID_NIL; |
5366 | break; |
5367 | } |
5368 | if (c->pb.rid < c->min) { |
5369 | c->pb.rid = 0; |
5370 | if (br->jump > 0 && !BUFFER_REC_DELETED(br)) { |
5371 | buffer_rec *jump_br = BUFFER_REC_AT(c->buf, br->jump); |
5372 | if (BUFFER_REC_DELETED(jump_br)) { |
5373 | c->nextb = br->step; |
5374 | } else { |
5375 | uint8_t *jump_bp; |
5376 | uint32_t jump_rid; |
5377 | jump_bp = GRN_NEXT_ADDR(jump_br); |
5378 | GRN_B_DEC(jump_rid, jump_bp); |
5379 | if (jump_rid < c->min) { |
5380 | c->nextb = br->jump; |
5381 | } else { |
5382 | c->nextb = br->step; |
5383 | } |
5384 | } |
5385 | } else { |
5386 | c->nextb = br->step; |
5387 | } |
5388 | continue; |
5389 | } |
5390 | c->nextb = br->step; |
5391 | GRN_B_DEC(c->pb.tf, c->bp); |
5392 | if ((c->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { |
5393 | GRN_B_DEC(c->pb.weight, c->bp); |
5394 | } else { |
5395 | c->pb.weight = 0; |
5396 | } |
5397 | c->pb.rest = c->pb.tf; |
5398 | c->pb.pos = 0; |
5399 | } else { |
5400 | c->pb.rid = 0; |
5401 | } |
5402 | break; |
5403 | } |
5404 | } |
5405 | if (c->pb.rid) { |
5406 | if (c->pc.rid) { |
5407 | if (c->pc.rid < c->pb.rid) { |
5408 | c->stat = CHUNK_USED; |
5409 | if (include_garbage || (c->pc.tf && c->pc.sid)) { |
5410 | c->post = &c->pc; |
5411 | break; |
5412 | } |
5413 | } else { |
5414 | if (c->pb.rid < c->pc.rid) { |
5415 | c->stat = BUFFER_USED; |
5416 | if (include_garbage || (c->pb.tf && c->pb.sid)) { |
5417 | c->post = &c->pb; |
5418 | break; |
5419 | } |
5420 | } else { |
5421 | if (c->pb.sid) { |
5422 | if (c->pc.sid < c->pb.sid) { |
5423 | c->stat = CHUNK_USED; |
5424 | if (include_garbage || (c->pc.tf && c->pc.sid)) { |
5425 | c->post = &c->pc; |
5426 | break; |
5427 | } |
5428 | } else { |
5429 | c->stat = BUFFER_USED; |
5430 | if (c->pb.sid == c->pc.sid) { c->stat |= CHUNK_USED; } |
5431 | if (include_garbage || (c->pb.tf)) { |
5432 | c->post = &c->pb; |
5433 | break; |
5434 | } |
5435 | } |
5436 | } else { |
5437 | c->stat = CHUNK_USED; |
5438 | } |
5439 | } |
5440 | } |
5441 | } else { |
5442 | c->stat = BUFFER_USED; |
5443 | if (include_garbage || (c->pb.tf && c->pb.sid)) { |
5444 | c->post = &c->pb; |
5445 | break; |
5446 | } |
5447 | } |
5448 | } else { |
5449 | if (c->pc.rid) { |
5450 | c->stat = CHUNK_USED; |
5451 | if (include_garbage || (c->pc.tf && c->pc.sid)) { |
5452 | c->post = &c->pc; |
5453 | break; |
5454 | } |
5455 | } else { |
5456 | c->post = NULL; |
5457 | return NULL; |
5458 | } |
5459 | } |
5460 | } |
5461 | } else { |
5462 | if (c->stat & SOLE_DOC_USED) { |
5463 | c->post = NULL; |
5464 | return NULL; |
5465 | } else { |
5466 | c->post = &c->pb; |
5467 | c->stat |= SOLE_DOC_USED; |
5468 | if (c->post->rid < c->min) { |
5469 | c->post = NULL; |
5470 | return NULL; |
5471 | } |
5472 | } |
5473 | } |
5474 | return c->post; |
5475 | } |
5476 | |
5477 | grn_posting * |
5478 | grn_ii_cursor_next(grn_ctx *ctx, grn_ii_cursor *c) |
5479 | { |
5480 | grn_ii_cursor_next_options options = { |
5481 | .include_garbage = GRN_FALSE |
5482 | }; |
5483 | return grn_ii_cursor_next_internal(ctx, c, &options); |
5484 | } |
5485 | |
5486 | grn_posting * |
5487 | grn_ii_cursor_next_pos(grn_ctx *ctx, grn_ii_cursor *c) |
5488 | { |
5489 | uint32_t gap; |
5490 | if ((c->ii->header->flags & GRN_OBJ_WITH_POSITION)) { |
5491 | if (c->nelements == c->ii->n_elements) { |
5492 | if (c->buf) { |
5493 | if (c->post == &c->pc) { |
5494 | if (c->pc.rest) { |
5495 | c->pc.rest--; |
5496 | c->pc.pos += *c->cpp++; |
5497 | } else { |
5498 | return NULL; |
5499 | } |
5500 | } else if (c->post == &c->pb) { |
5501 | if (buffer_is_reused(ctx, c->ii, c)) { |
5502 | GRN_LOG(ctx, GRN_LOG_WARNING, |
5503 | "[ii][cursor][next][pos][buffer] " |
5504 | "buffer(%d,%d) is reused by another thread: %p" , |
5505 | c->buffer_pseg, *c->ppseg, |
5506 | c); |
5507 | return NULL; |
5508 | } |
5509 | if (c->pb.rest) { |
5510 | c->pb.rest--; |
5511 | GRN_B_DEC(gap, c->bp); |
5512 | c->pb.pos += gap; |
5513 | } else { |
5514 | return NULL; |
5515 | } |
5516 | } else { |
5517 | return NULL; |
5518 | } |
5519 | } else { |
5520 | if (c->stat & SOLE_POS_USED) { |
5521 | return NULL; |
5522 | } else { |
5523 | c->stat |= SOLE_POS_USED; |
5524 | } |
5525 | } |
5526 | } |
5527 | } else { |
5528 | if (c->stat & SOLE_POS_USED) { |
5529 | return NULL; |
5530 | } else { |
5531 | c->stat |= SOLE_POS_USED; |
5532 | } |
5533 | } |
5534 | return c->post; |
5535 | } |
5536 | |
5537 | grn_rc |
5538 | grn_ii_cursor_close(grn_ctx *ctx, grn_ii_cursor *c) |
5539 | { |
5540 | if (!c) { return GRN_INVALID_ARGUMENT; } |
5541 | datavec_fin(ctx, c->rdv); |
5542 | if (c->cinfo) { GRN_FREE(c->cinfo); } |
5543 | if (c->buf) { buffer_close(ctx, c->ii, c->buffer_pseg); } |
5544 | if (c->cp) { grn_io_win_unmap(&c->iw); } |
5545 | GRN_FREE(c); |
5546 | return GRN_SUCCESS; |
5547 | } |
5548 | |
5549 | uint32_t |
5550 | grn_ii_get_chunksize(grn_ctx *ctx, grn_ii *ii, grn_id tid) |
5551 | { |
5552 | uint32_t res, pos, *a; |
5553 | a = array_at(ctx, ii, tid); |
5554 | if (!a) { return 0; } |
5555 | if ((pos = a[0])) { |
5556 | if (pos & 1) { |
5557 | res = 0; |
5558 | } else { |
5559 | buffer *buf; |
5560 | uint32_t pseg; |
5561 | buffer_term *bt; |
5562 | if ((pseg = buffer_open(ctx, ii, pos, &bt, &buf)) == GRN_II_PSEG_NOT_ASSIGNED) { |
5563 | res = 0; |
5564 | } else { |
5565 | res = bt->size_in_chunk; |
5566 | buffer_close(ctx, ii, pseg); |
5567 | } |
5568 | } |
5569 | } else { |
5570 | res = 0; |
5571 | } |
5572 | array_unref(ii, tid); |
5573 | return res; |
5574 | } |
5575 | |
5576 | uint32_t |
5577 | grn_ii_estimate_size(grn_ctx *ctx, grn_ii *ii, grn_id tid) |
5578 | { |
5579 | uint32_t res, pos, *a; |
5580 | a = array_at(ctx, ii, tid); |
5581 | if (!a) { return 0; } |
5582 | if ((pos = a[0])) { |
5583 | if (pos & 1) { |
5584 | res = 1; |
5585 | } else { |
5586 | buffer *buf; |
5587 | uint32_t pseg; |
5588 | buffer_term *bt; |
5589 | if ((pseg = buffer_open(ctx, ii, pos, &bt, &buf)) == GRN_II_PSEG_NOT_ASSIGNED) { |
5590 | res = 0; |
5591 | } else { |
5592 | res = a[1] + bt->size_in_buffer + 2; |
5593 | buffer_close(ctx, ii, pseg); |
5594 | } |
5595 | } |
5596 | } else { |
5597 | res = 0; |
5598 | } |
5599 | array_unref(ii, tid); |
5600 | return res; |
5601 | } |
5602 | |
5603 | int |
5604 | grn_ii_entry_info(grn_ctx *ctx, grn_ii *ii, grn_id tid, unsigned int *a, |
5605 | unsigned int *chunk, unsigned int *chunk_size, |
5606 | unsigned int *buffer_free, |
5607 | unsigned int *nterms, unsigned int *nterms_void, |
5608 | unsigned int *bt_tid, |
5609 | unsigned int *size_in_chunk, unsigned int *pos_in_chunk, |
5610 | unsigned int *size_in_buffer, unsigned int *pos_in_buffer) |
5611 | { |
5612 | buffer *b; |
5613 | buffer_term *bt; |
5614 | uint32_t pseg, *ap; |
5615 | ERRCLR(NULL); |
5616 | ap = array_at(ctx, ii, tid); |
5617 | if (!ap) { return 0; } |
5618 | a[0] = *ap; |
5619 | array_unref(ii, tid); |
5620 | if (!a[0]) { return 1; } |
5621 | if (a[0] & 1) { return 2; } |
5622 | if ((pseg = buffer_open(ctx, ii, a[0], &bt, &b)) == GRN_II_PSEG_NOT_ASSIGNED) { return 3; } |
5623 | *chunk = b->header.chunk; |
5624 | *chunk_size = b->header.chunk_size; |
5625 | *buffer_free = b->header.buffer_free; |
5626 | *nterms = b->header.nterms; |
5627 | *bt_tid = bt->tid; |
5628 | *size_in_chunk = bt->size_in_chunk; |
5629 | *pos_in_chunk = bt->pos_in_chunk; |
5630 | *size_in_buffer = bt->size_in_buffer; |
5631 | *pos_in_buffer = bt->pos_in_buffer; |
5632 | buffer_close(ctx, ii, pseg); |
5633 | return 4; |
5634 | } |
5635 | |
5636 | const char * |
5637 | grn_ii_path(grn_ii *ii) |
5638 | { |
5639 | return grn_io_path(ii->seg); |
5640 | } |
5641 | |
5642 | uint32_t |
5643 | grn_ii_max_section(grn_ii *ii) |
5644 | { |
5645 | return ii->header->smax; |
5646 | } |
5647 | |
5648 | grn_obj * |
5649 | grn_ii_lexicon(grn_ii *ii) |
5650 | { |
5651 | return ii->lexicon; |
5652 | } |
5653 | |
5654 | /* private classes */ |
5655 | |
5656 | /* b-heap */ |
5657 | |
/* Array-backed binary heap of inverted-index cursors. */
typedef struct {
  int n_entries;        /* number of cursors currently stored in bins */
  int n_bins;           /* allocated capacity of bins, in elements */
  grn_ii_cursor **bins; /* heap storage; bins[0] is the root */
} cursor_heap;
5663 | |
5664 | static inline cursor_heap * |
5665 | cursor_heap_open(grn_ctx *ctx, int max) |
5666 | { |
5667 | cursor_heap *h = GRN_MALLOC(sizeof(cursor_heap)); |
5668 | if (!h) { return NULL; } |
5669 | h->bins = GRN_MALLOC(sizeof(grn_ii_cursor *) * max); |
5670 | if (!h->bins) { |
5671 | GRN_FREE(h); |
5672 | return NULL; |
5673 | } |
5674 | h->n_entries = 0; |
5675 | h->n_bins = max; |
5676 | return h; |
5677 | } |
5678 | |
5679 | static inline grn_rc |
5680 | cursor_heap_push(grn_ctx *ctx, cursor_heap *h, grn_ii *ii, grn_id tid, uint32_t offset2, |
5681 | int weight, grn_id min) |
5682 | { |
5683 | int n, n2; |
5684 | grn_ii_cursor *c, *c2; |
5685 | if (h->n_entries >= h->n_bins) { |
5686 | int max = h->n_bins * 2; |
5687 | grn_ii_cursor **bins = GRN_REALLOC(h->bins, sizeof(grn_ii_cursor *) * max); |
5688 | GRN_LOG(ctx, GRN_LOG_DEBUG, "expanded cursor_heap to %d,%p" , max, bins); |
5689 | if (!bins) { return GRN_NO_MEMORY_AVAILABLE; } |
5690 | h->n_bins = max; |
5691 | h->bins = bins; |
5692 | } |
5693 | { |
5694 | if (!(c = grn_ii_cursor_open(ctx, ii, tid, min, GRN_ID_MAX, |
5695 | ii->n_elements, 0))) { |
5696 | GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed" ); |
5697 | return ctx->rc; |
5698 | } |
5699 | if (!grn_ii_cursor_next(ctx, c)) { |
5700 | grn_ii_cursor_close(ctx, c); |
5701 | return GRN_END_OF_DATA; |
5702 | } |
5703 | if (!grn_ii_cursor_next_pos(ctx, c)) { |
5704 | if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { |
5705 | char token[GRN_TABLE_MAX_KEY_SIZE]; |
5706 | int token_size; |
5707 | token_size = grn_table_get_key(ctx, |
5708 | c->ii->lexicon, |
5709 | c->id, |
5710 | &token, |
5711 | GRN_TABLE_MAX_KEY_SIZE); |
5712 | GRN_LOG(ctx, GRN_LOG_ERROR, |
5713 | "[ii][cursor][heap][push] invalid cursor: " |
5714 | "%p: token:<%.*s>(%u)" , |
5715 | c, token_size, token, c->id); |
5716 | } |
5717 | grn_ii_cursor_close(ctx, c); |
5718 | return GRN_END_OF_DATA; |
5719 | } |
5720 | if (weight) { |
5721 | c->weight = weight; |
5722 | } |
5723 | n = h->n_entries++; |
5724 | while (n) { |
5725 | n2 = (n - 1) >> 1; |
5726 | c2 = h->bins[n2]; |
5727 | if (GRN_II_CURSOR_CMP(c, c2)) { break; } |
5728 | h->bins[n] = c2; |
5729 | n = n2; |
5730 | } |
5731 | h->bins[n] = c; |
5732 | } |
5733 | return GRN_SUCCESS; |
5734 | } |
5735 | |
5736 | static inline grn_rc |
5737 | cursor_heap_push2(cursor_heap *h) |
5738 | { |
5739 | grn_rc rc = GRN_SUCCESS; |
5740 | return rc; |
5741 | } |
5742 | |
5743 | static inline grn_ii_cursor * |
5744 | cursor_heap_min(cursor_heap *h) |
5745 | { |
5746 | return h->n_entries ? h->bins[0] : NULL; |
5747 | } |
5748 | |
5749 | static inline void |
5750 | cursor_heap_recalc_min(cursor_heap *h) |
5751 | { |
5752 | int n = 0, n1, n2, m; |
5753 | if ((m = h->n_entries) > 1) { |
5754 | grn_ii_cursor *c = h->bins[0], *c1, *c2; |
5755 | for (;;) { |
5756 | n1 = n * 2 + 1; |
5757 | n2 = n1 + 1; |
5758 | c1 = n1 < m ? h->bins[n1] : NULL; |
5759 | c2 = n2 < m ? h->bins[n2] : NULL; |
5760 | if (c1 && GRN_II_CURSOR_CMP(c, c1)) { |
5761 | if (c2 && GRN_II_CURSOR_CMP(c, c2) && GRN_II_CURSOR_CMP(c1, c2)) { |
5762 | h->bins[n] = c2; |
5763 | n = n2; |
5764 | } else { |
5765 | h->bins[n] = c1; |
5766 | n = n1; |
5767 | } |
5768 | } else { |
5769 | if (c2 && GRN_II_CURSOR_CMP(c, c2)) { |
5770 | h->bins[n] = c2; |
5771 | n = n2; |
5772 | } else { |
5773 | h->bins[n] = c; |
5774 | break; |
5775 | } |
5776 | } |
5777 | } |
5778 | } |
5779 | } |
5780 | |
5781 | static inline void |
5782 | cursor_heap_pop(grn_ctx *ctx, cursor_heap *h, grn_id min) |
5783 | { |
5784 | if (h->n_entries) { |
5785 | grn_ii_cursor *c = h->bins[0]; |
5786 | grn_ii_cursor_set_min(ctx, c, min); |
5787 | if (!grn_ii_cursor_next(ctx, c)) { |
5788 | grn_ii_cursor_close(ctx, c); |
5789 | h->bins[0] = h->bins[--h->n_entries]; |
5790 | } else if (!grn_ii_cursor_next_pos(ctx, c)) { |
5791 | if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { |
5792 | char token[GRN_TABLE_MAX_KEY_SIZE]; |
5793 | int token_size; |
5794 | token_size = grn_table_get_key(ctx, |
5795 | c->ii->lexicon, |
5796 | c->id, |
5797 | &token, |
5798 | GRN_TABLE_MAX_KEY_SIZE); |
5799 | GRN_LOG(ctx, GRN_LOG_ERROR, |
5800 | "[ii][cursor][heap][pop] invalid cursor: " |
5801 | "%p: token:<%.*s>(%u)" , |
5802 | c, token_size, token, c->id); |
5803 | } |
5804 | grn_ii_cursor_close(ctx, c); |
5805 | h->bins[0] = h->bins[--h->n_entries]; |
5806 | } |
5807 | if (h->n_entries > 1) { cursor_heap_recalc_min(h); } |
5808 | } |
5809 | } |
5810 | |
5811 | static inline void |
5812 | cursor_heap_pop_pos(grn_ctx *ctx, cursor_heap *h) |
5813 | { |
5814 | if (h->n_entries) { |
5815 | grn_ii_cursor *c = h->bins[0]; |
5816 | if (!grn_ii_cursor_next_pos(ctx, c)) { |
5817 | if (!grn_ii_cursor_next(ctx, c)) { |
5818 | grn_ii_cursor_close(ctx, c); |
5819 | h->bins[0] = h->bins[--h->n_entries]; |
5820 | } else if (!grn_ii_cursor_next_pos(ctx, c)) { |
5821 | if (grn_logger_pass(ctx, GRN_LOG_ERROR)) { |
5822 | char token[GRN_TABLE_MAX_KEY_SIZE]; |
5823 | int token_size; |
5824 | token_size = grn_table_get_key(ctx, |
5825 | c->ii->lexicon, |
5826 | c->id, |
5827 | &token, |
5828 | GRN_TABLE_MAX_KEY_SIZE); |
5829 | GRN_LOG(ctx, GRN_LOG_ERROR, |
5830 | "[ii][cursor][heap][pop][position] invalid cursor: " |
5831 | "%p: token:<%.*s>(%u)" , |
5832 | c, token_size, token, c->id); |
5833 | } |
5834 | grn_ii_cursor_close(ctx, c); |
5835 | h->bins[0] = h->bins[--h->n_entries]; |
5836 | } |
5837 | } |
5838 | if (h->n_entries > 1) { cursor_heap_recalc_min(h); } |
5839 | } |
5840 | } |
5841 | |
5842 | static inline void |
5843 | cursor_heap_close(grn_ctx *ctx, cursor_heap *h) |
5844 | { |
5845 | int i; |
5846 | if (!h) { return; } |
5847 | for (i = h->n_entries; i--;) { grn_ii_cursor_close(ctx, h->bins[i]); } |
5848 | GRN_FREE(h->bins); |
5849 | GRN_FREE(h); |
5850 | } |
5851 | |
5852 | /* update */ |
5853 | #ifdef USE_VGRAM |
5854 | |
5855 | inline static grn_rc |
5856 | index_add(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgram, |
5857 | const char *value, size_t value_len) |
5858 | { |
5859 | grn_hash *h; |
5860 | unsigned int token_flags = 0; |
5861 | grn_token_cursor *token_cursor; |
5862 | grn_ii_updspec **u; |
5863 | grn_id tid, *tp; |
5864 | grn_rc r, rc = GRN_SUCCESS; |
5865 | grn_vgram_buf *sbuf = NULL; |
5866 | if (!rid) { return GRN_INVALID_ARGUMENT; } |
5867 | if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, value, value_len, |
5868 | GRN_TOKEN_ADD, token_flags))) { |
5869 | return GRN_NO_MEMORY_AVAILABLE; |
5870 | } |
5871 | if (vgram) { sbuf = grn_vgram_buf_open(value_len); } |
5872 | h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), |
5873 | GRN_HASH_TINY); |
5874 | if (!h) { |
5875 | GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on index_add failed !" ); |
5876 | grn_token_cursor_close(ctx, token_cursor); |
5877 | if (sbuf) { grn_vgram_buf_close(sbuf); } |
5878 | return GRN_NO_MEMORY_AVAILABLE; |
5879 | } |
5880 | while (!token_cursor->status) { |
5881 | (tid = grn_token_cursor_next(ctx, token_cursor)); |
5882 | if (tid) { |
5883 | if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, NULL)) { |
5884 | break; |
5885 | } |
5886 | if (!*u) { |
5887 | if (!(*u = grn_ii_updspec_open(ctx, rid, 1))) { |
5888 | GRN_LOG(ctx, GRN_LOG_ERROR, |
5889 | "grn_ii_updspec_open on index_add failed!" ); |
5890 | goto exit; |
5891 | } |
5892 | } |
5893 | if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, 0)) { |
5894 | GRN_LOG(ctx, GRN_LOG_ERROR, |
5895 | "grn_ii_updspec_add on index_add failed!" ); |
5896 | goto exit; |
5897 | } |
5898 | if (sbuf) { grn_vgram_buf_add(sbuf, tid); } |
5899 | } |
5900 | } |
5901 | grn_token_cursor_close(ctx, token_cursor); |
5902 | // todo : support vgram |
5903 | // if (sbuf) { grn_vgram_update(vgram, rid, sbuf, (grn_set *)h); } |
5904 | GRN_HASH_EACH(ctx, h, id, &tp, NULL, &u, { |
5905 | if ((r = grn_ii_update_one(ctx, ii, *tp, *u, h))) { rc = r; } |
5906 | grn_ii_updspec_close(ctx, *u); |
5907 | }); |
5908 | grn_hash_close(ctx, h); |
5909 | if (sbuf) { grn_vgram_buf_close(sbuf); } |
5910 | return rc; |
5911 | exit: |
5912 | grn_hash_close(ctx, h); |
5913 | grn_token_cursor_close(ctx, token_cursor); |
5914 | if (sbuf) { grn_vgram_buf_close(sbuf); } |
5915 | return GRN_NO_MEMORY_AVAILABLE; |
5916 | } |
5917 | |
5918 | inline static grn_rc |
5919 | index_del(grn_ctx *ctx, grn_id rid, grn_obj *lexicon, grn_ii *ii, grn_vgram *vgram, |
5920 | const char *value, size_t value_len) |
5921 | { |
5922 | grn_rc rc = GRN_SUCCESS; |
5923 | grn_hash *h; |
5924 | unsigned int token_flags = 0; |
5925 | grn_token_cursor *token_cursor; |
5926 | grn_ii_updspec **u; |
5927 | grn_id tid, *tp; |
5928 | if (!rid) { return GRN_INVALID_ARGUMENT; } |
5929 | if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, value, value_len, |
5930 | GRN_TOKEN_DEL, token_flags))) { |
5931 | return GRN_NO_MEMORY_AVAILABLE; |
5932 | } |
5933 | h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), |
5934 | GRN_HASH_TINY); |
5935 | if (!h) { |
5936 | GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on index_del failed !" ); |
5937 | grn_token_cursor_close(ctx, token_cursor); |
5938 | return GRN_NO_MEMORY_AVAILABLE; |
5939 | } |
5940 | while (!token_cursor->status) { |
5941 | if ((tid = grn_token_cursor_next(ctx, token_cursor))) { |
5942 | if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, NULL)) { |
5943 | break; |
5944 | } |
5945 | if (!*u) { |
5946 | if (!(*u = grn_ii_updspec_open(ctx, rid, 0))) { |
5947 | GRN_LOG(ctx, GRN_LOG_ALERT, |
5948 | "grn_ii_updspec_open on index_del failed !" ); |
5949 | grn_hash_close(ctx, h); |
5950 | grn_token_cursor_close(ctx, token_cursor); |
5951 | return GRN_NO_MEMORY_AVAILABLE; |
5952 | } |
5953 | } |
5954 | } |
5955 | } |
5956 | grn_token_cursor_close(ctx, token_cursor); |
5957 | GRN_HASH_EACH(ctx, h, id, &tp, NULL, &u, { |
5958 | if (*tp) { |
5959 | grn_rc r; |
5960 | r = grn_ii_delete_one(ctx, ii, *tp, *u, NULL); |
5961 | if (r) { |
5962 | rc = r; |
5963 | } |
5964 | } |
5965 | grn_ii_updspec_close(ctx, *u); |
5966 | }); |
5967 | grn_hash_close(ctx, h); |
5968 | return rc; |
5969 | } |
5970 | |
5971 | grn_rc |
5972 | grn_ii_upd(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, |
5973 | const char *oldvalue, unsigned int oldvalue_len, |
5974 | const char *newvalue, unsigned int newvalue_len) |
5975 | { |
5976 | grn_rc rc; |
5977 | grn_obj *lexicon = ii->lexicon; |
5978 | if (!rid) { return GRN_INVALID_ARGUMENT; } |
5979 | if (oldvalue && *oldvalue) { |
5980 | if ((rc = index_del(ctx, rid, lexicon, ii, vgram, oldvalue, oldvalue_len))) { |
5981 | GRN_LOG(ctx, GRN_LOG_ERROR, "index_del on grn_ii_upd failed !" ); |
5982 | goto exit; |
5983 | } |
5984 | } |
5985 | if (newvalue && *newvalue) { |
5986 | rc = index_add(ctx, rid, lexicon, ii, vgram, newvalue, newvalue_len); |
5987 | } |
5988 | exit : |
5989 | return rc; |
5990 | } |
5991 | |
5992 | grn_rc |
5993 | grn_ii_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, grn_vgram *vgram, unsigned int section, |
5994 | grn_values *oldvalues, grn_values *newvalues) |
5995 | { |
5996 | int j; |
5997 | grn_value *v; |
5998 | unsigned int token_flags = 0; |
5999 | grn_token_cursor *token_cursor; |
6000 | grn_rc rc = GRN_SUCCESS; |
6001 | grn_hash *old, *new; |
6002 | grn_id tid, *tp; |
6003 | grn_ii_updspec **u, **un; |
6004 | grn_obj *lexicon = ii->lexicon; |
6005 | if (!lexicon || !ii || !rid) { |
6006 | GRN_LOG(ctx, GRN_LOG_WARNING, "grn_ii_update: invalid argument" ); |
6007 | return GRN_INVALID_ARGUMENT; |
6008 | } |
6009 | if (newvalues) { |
6010 | new = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), |
6011 | GRN_HASH_TINY); |
6012 | if (!new) { |
6013 | GRN_LOG(ctx, GRN_LOG_ALERT, "grn_hash_create on grn_ii_update failed !" ); |
6014 | rc = GRN_NO_MEMORY_AVAILABLE; |
6015 | goto exit; |
6016 | } |
6017 | for (j = newvalues->n_values, v = newvalues->values; j; j--, v++) { |
6018 | if ((token_cursor = grn_token_cursor_open(ctx, lexicon, v->str, |
6019 | v->str_len, GRN_TOKEN_ADD, |
6020 | token_flags))) { |
6021 | while (!token_cursor->status) { |
6022 | if ((tid = grn_token_cursor_next(ctx, token_cursor))) { |
6023 | if (!grn_hash_add(ctx, new, &tid, sizeof(grn_id), (void **) &u, |
6024 | NULL)) { |
6025 | break; |
6026 | } |
6027 | if (!*u) { |
6028 | if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { |
6029 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6030 | "grn_ii_updspec_open on grn_ii_update failed!" ); |
6031 | grn_token_cursor_close(ctx, token_cursor); |
6032 | grn_hash_close(ctx, new); |
6033 | rc = GRN_NO_MEMORY_AVAILABLE; |
6034 | goto exit; |
6035 | } |
6036 | } |
6037 | if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { |
6038 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6039 | "grn_ii_updspec_add on grn_ii_update failed!" ); |
6040 | grn_token_cursor_close(ctx, token_cursor); |
6041 | grn_hash_close(ctx, new); |
6042 | rc = GRN_NO_MEMORY_AVAILABLE; |
6043 | goto exit; |
6044 | } |
6045 | } |
6046 | } |
6047 | grn_token_cursor_close(ctx, token_cursor); |
6048 | } |
6049 | } |
6050 | if (!GRN_HASH_SIZE(new)) { |
6051 | grn_hash_close(ctx, new); |
6052 | new = NULL; |
6053 | } |
6054 | } else { |
6055 | new = NULL; |
6056 | } |
6057 | if (oldvalues) { |
6058 | old = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(grn_ii_updspec *), |
6059 | GRN_HASH_TINY); |
6060 | if (!old) { |
6061 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6062 | "grn_hash_create(ctx, NULL, old) on grn_ii_update failed!" ); |
6063 | if (new) { grn_hash_close(ctx, new); } |
6064 | rc = GRN_NO_MEMORY_AVAILABLE; |
6065 | goto exit; |
6066 | } |
6067 | for (j = oldvalues->n_values, v = oldvalues->values; j; j--, v++) { |
6068 | if ((token_cursor = grn_token_cursor_open(ctx, lexicon, v->str, |
6069 | v->str_len, GRN_TOKEN_DEL, |
6070 | token_flags))) { |
6071 | while (!token_cursor->status) { |
6072 | if ((tid = grn_token_cursor_next(ctx, token_cursor))) { |
6073 | if (!grn_hash_add(ctx, old, &tid, sizeof(grn_id), (void **) &u, |
6074 | NULL)) { |
6075 | break; |
6076 | } |
6077 | if (!*u) { |
6078 | if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { |
6079 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6080 | "grn_ii_updspec_open on grn_ii_update failed!" ); |
6081 | grn_token_cursor_close(ctx, token_cursor); |
6082 | if (new) { grn_hash_close(ctx, new); }; |
6083 | grn_hash_close(ctx, old); |
6084 | rc = GRN_NO_MEMORY_AVAILABLE; |
6085 | goto exit; |
6086 | } |
6087 | } |
6088 | if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { |
6089 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6090 | "grn_ii_updspec_add on grn_ii_update failed!" ); |
6091 | grn_token_cursor_close(ctx, token_cursor); |
6092 | if (new) { grn_hash_close(ctx, new); }; |
6093 | grn_hash_close(ctx, old); |
6094 | rc = GRN_NO_MEMORY_AVAILABLE; |
6095 | goto exit; |
6096 | } |
6097 | } |
6098 | } |
6099 | grn_token_cursor_close(ctx, token_cursor); |
6100 | } |
6101 | } |
6102 | } else { |
6103 | old = NULL; |
6104 | } |
6105 | if (old) { |
6106 | grn_id eid; |
6107 | GRN_HASH_EACH(ctx, old, id, &tp, NULL, &u, { |
6108 | if (new && (eid = grn_hash_get(ctx, new, tp, sizeof(grn_id), |
6109 | (void **) &un))) { |
6110 | if (!grn_ii_updspec_cmp(*u, *un)) { |
6111 | grn_ii_updspec_close(ctx, *un); |
6112 | grn_hash_delete_by_id(ctx, new, eid, NULL); |
6113 | } |
6114 | } else { |
6115 | grn_rc r; |
6116 | r = grn_ii_delete_one(ctx, ii, *tp, *u, new); |
6117 | if (r) { |
6118 | rc = r; |
6119 | } |
6120 | } |
6121 | grn_ii_updspec_close(ctx, *u); |
6122 | }); |
6123 | grn_hash_close(ctx, old); |
6124 | } |
6125 | if (new) { |
6126 | GRN_HASH_EACH(ctx, new, id, &tp, NULL, &u, { |
6127 | grn_rc r; |
6128 | if ((r = grn_ii_update_one(ctx, ii, *tp, *u, new))) { rc = r; } |
6129 | grn_ii_updspec_close(ctx, *u); |
6130 | }); |
6131 | grn_hash_close(ctx, new); |
6132 | } else { |
6133 | if (!section) { |
6134 | /* todo: delete key when all sections deleted */ |
6135 | } |
6136 | } |
6137 | exit : |
6138 | return rc; |
6139 | } |
6140 | #endif /* USE_VGRAM */ |
6141 | |
6142 | static grn_rc |
6143 | grn_vector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, |
6144 | grn_obj *in, grn_obj *out, grn_tokenize_mode mode, |
6145 | grn_obj *posting) |
6146 | { |
6147 | int j; |
6148 | grn_id tid; |
6149 | grn_section *v; |
6150 | grn_token_cursor *token_cursor; |
6151 | grn_ii_updspec **u; |
6152 | grn_hash *h = (grn_hash *)out; |
6153 | grn_obj *lexicon = ii->lexicon; |
6154 | if (in->u.v.body) { |
6155 | const char *head = GRN_BULK_HEAD(in->u.v.body); |
6156 | for (j = in->u.v.n_sections, v = in->u.v.sections; j; j--, v++) { |
6157 | unsigned int token_flags = 0; |
6158 | if (v->length && |
6159 | (token_cursor = grn_token_cursor_open(ctx, lexicon, head + v->offset, |
6160 | v->length, mode, |
6161 | token_flags))) { |
6162 | while (!token_cursor->status) { |
6163 | if ((tid = grn_token_cursor_next(ctx, token_cursor))) { |
6164 | if (posting) { GRN_RECORD_PUT(ctx, posting, tid); } |
6165 | if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **) &u, |
6166 | NULL)) { |
6167 | break; |
6168 | } |
6169 | if (!*u) { |
6170 | if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { |
6171 | DEFINE_NAME(ii); |
6172 | MERR("[ii][update][spec] failed to create an update spec: " |
6173 | "<%.*s>: " |
6174 | "record:<%u>:<%u>, token:<%u>:<%d>:<%u>" , |
6175 | name_size, name, |
6176 | rid, section, |
6177 | tid, token_cursor->pos, v->weight); |
6178 | grn_token_cursor_close(ctx, token_cursor); |
6179 | return ctx->rc; |
6180 | } |
6181 | } |
6182 | if (grn_ii_updspec_add(ctx, *u, token_cursor->pos, v->weight)) { |
6183 | DEFINE_NAME(ii); |
6184 | MERR("[ii][update][spec] failed to add to update spec: " |
6185 | "<%.*s>: " |
6186 | "record:<%u>:<%u>, token:<%u>:<%d>:<%u>" , |
6187 | name_size, name, |
6188 | rid, section, |
6189 | tid, token_cursor->pos, v->weight); |
6190 | grn_token_cursor_close(ctx, token_cursor); |
6191 | return ctx->rc; |
6192 | } |
6193 | } |
6194 | } |
6195 | grn_token_cursor_close(ctx, token_cursor); |
6196 | } |
6197 | } |
6198 | } |
6199 | return ctx->rc; |
6200 | } |
6201 | |
6202 | static grn_rc |
6203 | grn_uvector2updspecs_data(grn_ctx *ctx, grn_ii *ii, grn_id rid, |
6204 | unsigned int section, grn_obj *in, grn_obj *out, |
6205 | grn_tokenize_mode mode, grn_obj *posting) |
6206 | { |
6207 | int i, n; |
6208 | grn_hash *h = (grn_hash *)out; |
6209 | grn_obj *lexicon = ii->lexicon; |
6210 | unsigned int element_size; |
6211 | |
6212 | n = grn_uvector_size(ctx, in); |
6213 | element_size = grn_uvector_element_size(ctx, in); |
6214 | for (i = 0; i < n; i++) { |
6215 | grn_obj *tokenizer; |
6216 | grn_token_cursor *token_cursor; |
6217 | unsigned int token_flags = 0; |
6218 | const char *element; |
6219 | |
6220 | tokenizer = grn_obj_get_info(ctx, lexicon, GRN_INFO_DEFAULT_TOKENIZER, |
6221 | NULL); |
6222 | |
6223 | element = GRN_BULK_HEAD(in) + (element_size * i); |
6224 | token_cursor = grn_token_cursor_open(ctx, lexicon, |
6225 | element, element_size, |
6226 | mode, token_flags); |
6227 | if (!token_cursor) { |
6228 | continue; |
6229 | } |
6230 | |
6231 | while (!token_cursor->status) { |
6232 | grn_id tid; |
6233 | if ((tid = grn_token_cursor_next(ctx, token_cursor))) { |
6234 | grn_ii_updspec **u; |
6235 | int pos; |
6236 | |
6237 | if (posting) { GRN_RECORD_PUT(ctx, posting, tid); } |
6238 | if (!grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&u, NULL)) { |
6239 | break; |
6240 | } |
6241 | if (!*u) { |
6242 | if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { |
6243 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6244 | "grn_ii_updspec_open on grn_uvector2updspecs_data failed!" ); |
6245 | grn_token_cursor_close(ctx, token_cursor); |
6246 | return GRN_NO_MEMORY_AVAILABLE; |
6247 | } |
6248 | } |
6249 | if (tokenizer) { |
6250 | pos = token_cursor->pos; |
6251 | } else { |
6252 | pos = i; |
6253 | } |
6254 | if (grn_ii_updspec_add(ctx, *u, pos, 0)) { |
6255 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6256 | "grn_ii_updspec_add on grn_uvector2updspecs failed!" ); |
6257 | grn_token_cursor_close(ctx, token_cursor); |
6258 | return GRN_NO_MEMORY_AVAILABLE; |
6259 | } |
6260 | } |
6261 | } |
6262 | |
6263 | grn_token_cursor_close(ctx, token_cursor); |
6264 | } |
6265 | |
6266 | return GRN_SUCCESS; |
6267 | } |
6268 | |
6269 | static grn_rc |
6270 | grn_uvector2updspecs_id(grn_ctx *ctx, grn_ii *ii, grn_id rid, |
6271 | unsigned int section, grn_obj *in, grn_obj *out) |
6272 | { |
6273 | int i, n; |
6274 | grn_ii_updspec **u; |
6275 | grn_hash *h = (grn_hash *)out; |
6276 | |
6277 | n = grn_vector_size(ctx, in); |
6278 | for (i = 0; i < n; i++) { |
6279 | grn_id id; |
6280 | unsigned int weight; |
6281 | |
6282 | id = grn_uvector_get_element(ctx, in, i, &weight); |
6283 | if (!grn_hash_add(ctx, h, &id, sizeof(grn_id), (void **)&u, NULL)) { |
6284 | break; |
6285 | } |
6286 | if (!*u) { |
6287 | if (!(*u = grn_ii_updspec_open(ctx, rid, section))) { |
6288 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6289 | "grn_ii_updspec_open on grn_ii_update failed!" ); |
6290 | return GRN_NO_MEMORY_AVAILABLE; |
6291 | } |
6292 | } |
6293 | if (grn_ii_updspec_add(ctx, *u, i, weight)) { |
6294 | GRN_LOG(ctx, GRN_LOG_ALERT, |
6295 | "grn_ii_updspec_add on grn_ii_update failed!" ); |
6296 | return GRN_NO_MEMORY_AVAILABLE; |
6297 | } |
6298 | } |
6299 | return GRN_SUCCESS; |
6300 | } |
6301 | |
6302 | static grn_rc |
6303 | grn_uvector2updspecs(grn_ctx *ctx, grn_ii *ii, grn_id rid, |
6304 | unsigned int section, grn_obj *in, grn_obj *out, |
6305 | grn_tokenize_mode mode, grn_obj *posting) |
6306 | { |
6307 | if (in->header.domain < GRN_N_RESERVED_TYPES) { |
6308 | return grn_uvector2updspecs_data(ctx, ii, rid, section, in, out, |
6309 | mode, posting); |
6310 | } else { |
6311 | return grn_uvector2updspecs_id(ctx, ii, rid, section, in, out); |
6312 | } |
6313 | } |
6314 | |
6315 | grn_rc |
6316 | grn_ii_column_update(grn_ctx *ctx, grn_ii *ii, grn_id rid, unsigned int section, |
6317 | grn_obj *oldvalue, grn_obj *newvalue, grn_obj *posting) |
6318 | { |
6319 | grn_id *tp; |
6320 | grn_bool do_grn_ii_updspec_cmp = GRN_TRUE; |
6321 | grn_ii_updspec **u, **un; |
6322 | grn_obj *old_, *old = oldvalue, *new_, *new = newvalue, oldv, newv; |
6323 | grn_obj buf, *post = NULL; |
6324 | |
6325 | if (!ii) { |
6326 | ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] ii is NULL" ); |
6327 | return ctx->rc; |
6328 | } |
6329 | if (!ii->lexicon) { |
6330 | ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] lexicon is NULL" ); |
6331 | return ctx->rc; |
6332 | } |
6333 | if (rid == GRN_ID_NIL) { |
6334 | ERR(GRN_INVALID_ARGUMENT, "[ii][column][update] record ID is nil" ); |
6335 | return ctx->rc; |
6336 | } |
6337 | if (old || new) { |
6338 | unsigned char type = GRN_VOID; |
6339 | if (old) { |
6340 | type = (ii->obj.header.domain == old->header.domain) |
6341 | ? GRN_UVECTOR |
6342 | : old->header.type; |
6343 | } |
6344 | if (new) { |
6345 | type = (ii->obj.header.domain == new->header.domain) |
6346 | ? GRN_UVECTOR |
6347 | : new->header.type; |
6348 | } |
6349 | if (type == GRN_VECTOR) { |
6350 | grn_obj *tokenizer; |
6351 | grn_table_get_info(ctx, ii->lexicon, NULL, NULL, &tokenizer, NULL, NULL); |
6352 | if (tokenizer) { |
6353 | grn_obj old_elem, new_elem; |
6354 | unsigned int i, max_n; |
6355 | unsigned int old_n = 0, new_n = 0; |
6356 | if (old) { |
6357 | old_n = grn_vector_size(ctx, old); |
6358 | } |
6359 | if (new) { |
6360 | new_n = grn_vector_size(ctx, new); |
6361 | } |
6362 | max_n = (old_n > new_n) ? old_n : new_n; |
6363 | GRN_OBJ_INIT(&old_elem, GRN_BULK, GRN_OBJ_DO_SHALLOW_COPY, old->header.domain); |
6364 | GRN_OBJ_INIT(&new_elem, GRN_BULK, GRN_OBJ_DO_SHALLOW_COPY, new->header.domain); |
6365 | for (i = 0; i < max_n; i++) { |
6366 | grn_rc rc; |
6367 | grn_obj *old_p = NULL, *new_p = NULL; |
6368 | if (i < old_n) { |
6369 | const char *str; |
6370 | unsigned int size = grn_vector_get_element(ctx, old, i, &str, NULL, NULL); |
6371 | GRN_TEXT_SET_REF(&old_elem, str, size); |
6372 | old_p = &old_elem; |
6373 | } |
6374 | if (i < new_n) { |
6375 | const char *str; |
6376 | unsigned int size = grn_vector_get_element(ctx, new, i, &str, NULL, NULL); |
6377 | GRN_TEXT_SET_REF(&new_elem, str, size); |
6378 | new_p = &new_elem; |
6379 | } |
6380 | rc = grn_ii_column_update(ctx, ii, rid, section + i, old_p, new_p, posting); |
6381 | if (rc != GRN_SUCCESS) { |
6382 | break; |
6383 | } |
6384 | } |
6385 | GRN_OBJ_FIN(ctx, &old_elem); |
6386 | GRN_OBJ_FIN(ctx, &new_elem); |
6387 | return ctx->rc; |
6388 | } |
6389 | } |
6390 | } |
6391 | if (posting) { |
6392 | GRN_RECORD_INIT(&buf, GRN_OBJ_VECTOR, grn_obj_id(ctx, ii->lexicon)); |
6393 | post = &buf; |
6394 | } |
6395 | if (grn_io_lock(ctx, ii->seg, grn_lock_timeout)) { return ctx->rc; } |
6396 | if (new) { |
6397 | unsigned char type = (ii->obj.header.domain == new->header.domain) |
6398 | ? GRN_UVECTOR |
6399 | : new->header.type; |
6400 | switch (type) { |
6401 | case GRN_BULK : |
6402 | { |
6403 | if (grn_bulk_is_zero(ctx, new)) { |
6404 | do_grn_ii_updspec_cmp = GRN_FALSE; |
6405 | } |
6406 | new_ = new; |
6407 | GRN_OBJ_INIT(&newv, GRN_VECTOR, GRN_OBJ_DO_SHALLOW_COPY, GRN_DB_TEXT); |
6408 | newv.u.v.body = new; |
6409 | new = &newv; |
6410 | grn_vector_delimit(ctx, new, 0, GRN_ID_NIL); |
6411 | if (new_ != newvalue) { grn_obj_close(ctx, new_); } |
6412 | } |
6413 | /* fallthru */ |
6414 | case GRN_VECTOR : |
6415 | new_ = new; |
6416 | new = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), |
6417 | sizeof(grn_ii_updspec *), |
6418 | GRN_HASH_TINY); |
6419 | if (!new) { |
6420 | DEFINE_NAME(ii); |
6421 | MERR("[ii][column][update][new][vector] failed to create a hash table: " |
6422 | "<%.*s>: " , |
6423 | name_size, name); |
6424 | } else { |
6425 | grn_vector2updspecs(ctx, ii, rid, section, new_, new, |
6426 | GRN_TOKEN_ADD, post); |
6427 | } |
6428 | if (new_ != newvalue) { grn_obj_close(ctx, new_); } |
6429 | if (ctx->rc != GRN_SUCCESS) { goto exit; } |
6430 | break; |
6431 | case GRN_UVECTOR : |
6432 | new_ = new; |
6433 | new = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), |
6434 | sizeof(grn_ii_updspec *), |
6435 | GRN_HASH_TINY); |
6436 | if (!new) { |
6437 | DEFINE_NAME(ii); |
6438 | MERR("[ii][column][update][new][uvector] failed to create a hash table: " |
6439 | "<%.*s>: " , |
6440 | name_size, name); |
6441 | } else { |
6442 | if (new_->header.type == GRN_UVECTOR) { |
6443 | grn_uvector2updspecs(ctx, ii, rid, section, new_, new, |
6444 | GRN_TOKEN_ADD, post); |
6445 | } else { |
6446 | grn_obj uvector; |
6447 | unsigned int weight = 0; |
6448 | GRN_VALUE_FIX_SIZE_INIT(&uvector, GRN_OBJ_VECTOR, |
6449 | new_->header.domain); |
6450 | if (new_->header.impl_flags & GRN_OBJ_WITH_WEIGHT) { |
6451 | uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; |
6452 | } |
6453 | grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(new_), |
6454 | weight); |
6455 | grn_uvector2updspecs(ctx, ii, rid, section, &uvector, new, |
6456 | GRN_TOKEN_ADD, post); |
6457 | GRN_OBJ_FIN(ctx, &uvector); |
6458 | } |
6459 | } |
6460 | if (new_ != newvalue) { grn_obj_close(ctx, new_); } |
6461 | if (ctx->rc != GRN_SUCCESS) { goto exit; } |
6462 | break; |
6463 | case GRN_TABLE_HASH_KEY : |
6464 | break; |
6465 | default : |
6466 | { |
6467 | DEFINE_NAME(ii); |
6468 | ERR(GRN_INVALID_ARGUMENT, |
6469 | "[ii][column][update][new] invalid object: " |
6470 | "<%.*s>: " |
6471 | "<%s>(%#x)" , |
6472 | name_size, name, |
6473 | grn_obj_type_to_string(type), |
6474 | type); |
6475 | } |
6476 | goto exit; |
6477 | } |
6478 | } |
6479 | if (posting) { |
6480 | grn_ii_updspec *u_; |
6481 | uint32_t offset = 0; |
6482 | grn_id tid_ = 0, gap, tid, *tpe; |
6483 | grn_table_sort_optarg arg = {GRN_TABLE_SORT_ASC| |
6484 | GRN_TABLE_SORT_AS_NUMBER| |
6485 | GRN_TABLE_SORT_AS_UNSIGNED, NULL, NULL,0 }; |
6486 | grn_array *sorted = grn_array_create(ctx, NULL, sizeof(grn_id), 0); |
6487 | grn_hash_sort(ctx, (grn_hash *)new, -1, sorted, &arg); |
6488 | GRN_TEXT_PUT(ctx, posting, ((grn_hash *)new)->n_entries, sizeof(uint32_t)); |
6489 | GRN_ARRAY_EACH(ctx, sorted, 0, 0, id, &tp, { |
6490 | grn_hash_get_key(ctx, (grn_hash *)new, *tp, &tid, sizeof(grn_id)); |
6491 | gap = tid - tid_; |
6492 | GRN_TEXT_PUT(ctx, posting, &gap, sizeof(grn_id)); |
6493 | tid_ = tid; |
6494 | }); |
6495 | GRN_ARRAY_EACH(ctx, sorted, 0, 0, id, &tp, { |
6496 | grn_hash_get_value(ctx, (grn_hash *)new, *tp, &u_); |
6497 | u_->offset = offset++; |
6498 | GRN_TEXT_PUT(ctx, posting, &u_->tf, sizeof(int32_t)); |
6499 | }); |
6500 | tpe = (grn_id *)GRN_BULK_CURR(post); |
6501 | for (tp = (grn_id *)GRN_BULK_HEAD(post); tp < tpe; tp++) { |
6502 | grn_hash_get(ctx, (grn_hash *)new, (void *)tp, sizeof(grn_id), |
6503 | (void **)&u); |
6504 | GRN_TEXT_PUT(ctx, posting, &(*u)->offset, sizeof(int32_t)); |
6505 | } |
6506 | GRN_OBJ_FIN(ctx, post); |
6507 | grn_array_close(ctx, sorted); |
6508 | } |
6509 | |
6510 | if (old) { |
6511 | unsigned char type = (ii->obj.header.domain == old->header.domain) |
6512 | ? GRN_UVECTOR |
6513 | : old->header.type; |
6514 | switch (type) { |
6515 | case GRN_BULK : |
6516 | { |
6517 | // const char *str = GRN_BULK_HEAD(old); |
6518 | // unsigned int str_len = GRN_BULK_VSIZE(old); |
6519 | old_ = old; |
6520 | GRN_OBJ_INIT(&oldv, GRN_VECTOR, GRN_OBJ_DO_SHALLOW_COPY, GRN_DB_TEXT); |
6521 | oldv.u.v.body = old; |
6522 | old = &oldv; |
6523 | grn_vector_delimit(ctx, old, 0, GRN_ID_NIL); |
6524 | if (old_ != oldvalue) { grn_obj_close(ctx, old_); } |
6525 | } |
6526 | /* fallthru */ |
6527 | case GRN_VECTOR : |
6528 | old_ = old; |
6529 | old = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), |
6530 | sizeof(grn_ii_updspec *), |
6531 | GRN_HASH_TINY); |
6532 | if (!old) { |
6533 | DEFINE_NAME(ii); |
6534 | MERR("[ii][column][update][old][vector] failed to create a hash table: " |
6535 | "<%.*s>: " , |
6536 | name_size, name); |
6537 | } else { |
6538 | grn_vector2updspecs(ctx, ii, rid, section, old_, old, |
6539 | GRN_TOKEN_DEL, NULL); |
6540 | } |
6541 | if (old_ != oldvalue) { grn_obj_close(ctx, old_); } |
6542 | if (ctx->rc != GRN_SUCCESS) { goto exit; } |
6543 | break; |
6544 | case GRN_UVECTOR : |
6545 | old_ = old; |
6546 | old = (grn_obj *)grn_hash_create(ctx, NULL, sizeof(grn_id), |
6547 | sizeof(grn_ii_updspec *), |
6548 | GRN_HASH_TINY); |
6549 | if (!old) { |
6550 | DEFINE_NAME(ii); |
6551 | MERR("[ii][column][update][old][uvector] failed to create a hash table: " |
6552 | "<%.*s>: " , |
6553 | name_size, name); |
6554 | } else { |
6555 | if (old_->header.type == GRN_UVECTOR) { |
6556 | grn_uvector2updspecs(ctx, ii, rid, section, old_, old, |
6557 | GRN_TOKEN_DEL, NULL); |
6558 | } else { |
6559 | grn_obj uvector; |
6560 | unsigned int weight = 0; |
6561 | GRN_VALUE_FIX_SIZE_INIT(&uvector, GRN_OBJ_VECTOR, |
6562 | old_->header.domain); |
6563 | if (old_->header.impl_flags & GRN_OBJ_WITH_WEIGHT) { |
6564 | uvector.header.impl_flags |= GRN_OBJ_WITH_WEIGHT; |
6565 | } |
6566 | grn_uvector_add_element(ctx, &uvector, GRN_RECORD_VALUE(old_), |
6567 | weight); |
6568 | grn_uvector2updspecs(ctx, ii, rid, section, &uvector, old, |
6569 | GRN_TOKEN_DEL, NULL); |
6570 | GRN_OBJ_FIN(ctx, &uvector); |
6571 | } |
6572 | } |
6573 | if (old_ != oldvalue) { grn_obj_close(ctx, old_); } |
6574 | if (ctx->rc != GRN_SUCCESS) { goto exit; } |
6575 | break; |
6576 | case GRN_TABLE_HASH_KEY : |
6577 | break; |
6578 | default : |
6579 | { |
6580 | DEFINE_NAME(ii); |
6581 | ERR(GRN_INVALID_ARGUMENT, |
6582 | "[ii][column][update][old] invalid object: " |
6583 | "<%.*s>: " |
6584 | "<%s>(%#x)" , |
6585 | name_size, name, |
6586 | grn_obj_type_to_string(type), |
6587 | type); |
6588 | } |
6589 | goto exit; |
6590 | } |
6591 | } |
6592 | |
6593 | if (old) { |
6594 | grn_id eid; |
6595 | grn_hash *o = (grn_hash *)old; |
6596 | grn_hash *n = (grn_hash *)new; |
6597 | GRN_HASH_EACH(ctx, o, id, &tp, NULL, &u, { |
6598 | if (n && (eid = grn_hash_get(ctx, n, tp, sizeof(grn_id), |
6599 | (void **) &un))) { |
6600 | if (do_grn_ii_updspec_cmp && !grn_ii_updspec_cmp(*u, *un)) { |
6601 | grn_ii_updspec_close(ctx, *un); |
6602 | grn_hash_delete_by_id(ctx, n, eid, NULL); |
6603 | } |
6604 | } else { |
6605 | grn_ii_delete_one(ctx, ii, *tp, *u, n); |
6606 | } |
6607 | grn_ii_updspec_close(ctx, *u); |
6608 | if (ctx->rc != GRN_SUCCESS) { |
6609 | break; |
6610 | } |
6611 | }); |
6612 | } |
6613 | if (new) { |
6614 | grn_hash *n = (grn_hash *)new; |
6615 | GRN_HASH_EACH(ctx, n, id, &tp, NULL, &u, { |
6616 | grn_ii_update_one(ctx, ii, *tp, *u, n); |
6617 | grn_ii_updspec_close(ctx, *u); |
6618 | if (ctx->rc != GRN_SUCCESS) { |
6619 | break; |
6620 | } |
6621 | }); |
6622 | } else { |
6623 | if (!section) { |
6624 | /* todo: delete key when all sections deleted */ |
6625 | } |
6626 | } |
6627 | exit : |
6628 | grn_io_unlock(ii->seg); |
6629 | if (old && old != oldvalue) { grn_obj_close(ctx, old); } |
6630 | if (new && new != newvalue) { grn_obj_close(ctx, new); } |
6631 | return ctx->rc; |
6632 | } |
6633 | |
6634 | /* token_info */ |
6635 | |
/*
 * Search state for one query token: the heap of inverted-index cursors
 * opened for it and the posting currently at the top of that heap.
 */
typedef struct {
  cursor_heap *cursors; /* heap of ii cursors; NULL until one is opened */
  int offset;           /* value subtracted from posting positions (see pos) */
  int pos;              /* position of the current posting minus offset */
  int size;             /* sum of grn_ii_estimate_size() of pushed cursors */
  int ntoken;           /* number of cursors pushed into the heap */
  grn_posting *p;       /* posting of the cursor returned by cursor_heap_min() */
} token_info;
6644 | |
/*
 * Key expansion modes accepted by token_info_open().
 * EX_PREFIX and EX_SUFFIX are also used as bit masks on the `ef` flags
 * (e.g. `ef & EX_PREFIX`), and EX_BOTH equals EX_PREFIX | EX_SUFFIX.
 */
#define EX_NONE   0 /* exact lookup with grn_table_get() */
#define EX_PREFIX 1 /* GRN_OP_PREFIX search on the lexicon */
#define EX_SUFFIX 2 /* GRN_OP_SUFFIX search on the lexicon */
#define EX_BOTH   3 /* handled by token_info_expand_both() */
#define EX_FUZZY  4 /* grn_table_fuzzy_search() on the lexicon */
6650 | |
6651 | inline static void |
6652 | token_info_expand_both(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, |
6653 | const char *key, unsigned int key_size, token_info *ti) |
6654 | { |
6655 | int s = 0; |
6656 | grn_hash *h, *g; |
6657 | uint32_t *offset2; |
6658 | grn_hash_cursor *c; |
6659 | grn_id *tp, *tq; |
6660 | if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { |
6661 | grn_table_search(ctx, lexicon, key, key_size, |
6662 | GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); |
6663 | if (GRN_HASH_SIZE(h)) { |
6664 | if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h) + 256))) { |
6665 | if ((c = grn_hash_cursor_open(ctx, h, NULL, 0, NULL, 0, 0, -1, 0))) { |
6666 | uint32_t key2_size; |
6667 | const char *key2; |
6668 | while (grn_hash_cursor_next(ctx, c)) { |
6669 | grn_hash_cursor_get_key(ctx, c, (void **) &tp); |
6670 | key2 = _grn_table_key(ctx, lexicon, *tp, &key2_size); |
6671 | if (!key2) { break; } |
6672 | if ((lexicon->header.type != GRN_TABLE_PAT_KEY) || |
6673 | !(lexicon->header.flags & GRN_OBJ_KEY_WITH_SIS) || |
6674 | key2_size <= 2) { // todo: refine |
6675 | if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { |
6676 | cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, 0, GRN_ID_NIL); |
6677 | ti->ntoken++; |
6678 | ti->size += s; |
6679 | } |
6680 | } else { |
6681 | if ((g = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, |
6682 | GRN_HASH_TINY))) { |
6683 | grn_pat_suffix_search(ctx, (grn_pat *)lexicon, key2, key2_size, |
6684 | g); |
6685 | GRN_HASH_EACH(ctx, g, id, &tq, NULL, &offset2, { |
6686 | if ((s = grn_ii_estimate_size(ctx, ii, *tq))) { |
6687 | cursor_heap_push(ctx, ti->cursors, ii, *tq, |
6688 | /* *offset2 */ 0, 0, GRN_ID_NIL); |
6689 | ti->ntoken++; |
6690 | ti->size += s; |
6691 | } |
6692 | }); |
6693 | grn_hash_close(ctx, g); |
6694 | } |
6695 | } |
6696 | } |
6697 | grn_hash_cursor_close(ctx, c); |
6698 | } |
6699 | } |
6700 | } |
6701 | grn_hash_close(ctx, h); |
6702 | } |
6703 | } |
6704 | |
6705 | inline static grn_rc |
6706 | token_info_close(grn_ctx *ctx, token_info *ti) |
6707 | { |
6708 | cursor_heap_close(ctx, ti->cursors); |
6709 | GRN_FREE(ti); |
6710 | return GRN_SUCCESS; |
6711 | } |
6712 | |
6713 | inline static token_info * |
6714 | token_info_open(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, |
6715 | const char *key, unsigned int key_size, uint32_t offset, |
6716 | int mode, grn_fuzzy_search_optarg *args, grn_id min) |
6717 | { |
6718 | int s = 0; |
6719 | grn_hash *h; |
6720 | token_info *ti; |
6721 | grn_id tid; |
6722 | grn_id *tp; |
6723 | if (!key) { return NULL; } |
6724 | if (!(ti = GRN_MALLOC(sizeof(token_info)))) { return NULL; } |
6725 | ti->cursors = NULL; |
6726 | ti->size = 0; |
6727 | ti->ntoken = 0; |
6728 | ti->offset = offset; |
6729 | switch (mode) { |
6730 | case EX_BOTH : |
6731 | token_info_expand_both(ctx, lexicon, ii, key, key_size, ti); |
6732 | break; |
6733 | case EX_NONE : |
6734 | if ((tid = grn_table_get(ctx, lexicon, key, key_size)) && |
6735 | (s = grn_ii_estimate_size(ctx, ii, tid)) && |
6736 | (ti->cursors = cursor_heap_open(ctx, 1))) { |
6737 | cursor_heap_push(ctx, ti->cursors, ii, tid, 0, 0, min); |
6738 | ti->ntoken++; |
6739 | ti->size = s; |
6740 | } |
6741 | break; |
6742 | case EX_PREFIX : |
6743 | if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { |
6744 | grn_table_search(ctx, lexicon, key, key_size, |
6745 | GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); |
6746 | if (GRN_HASH_SIZE(h)) { |
6747 | if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { |
6748 | GRN_HASH_EACH(ctx, h, id, &tp, NULL, NULL, { |
6749 | if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { |
6750 | cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, 0, min); |
6751 | ti->ntoken++; |
6752 | ti->size += s; |
6753 | } |
6754 | }); |
6755 | } |
6756 | } |
6757 | grn_hash_close(ctx, h); |
6758 | } |
6759 | break; |
6760 | case EX_SUFFIX : |
6761 | if ((h = grn_hash_create(ctx, NULL, sizeof(grn_id), 0, 0))) { |
6762 | grn_table_search(ctx, lexicon, key, key_size, |
6763 | GRN_OP_SUFFIX, (grn_obj *)h, GRN_OP_OR); |
6764 | if (GRN_HASH_SIZE(h)) { |
6765 | if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { |
6766 | uint32_t *offset2; |
6767 | GRN_HASH_EACH(ctx, h, id, &tp, NULL, &offset2, { |
6768 | if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { |
6769 | cursor_heap_push(ctx, ti->cursors, ii, *tp, /* *offset2 */ 0, 0, min); |
6770 | ti->ntoken++; |
6771 | ti->size += s; |
6772 | } |
6773 | }); |
6774 | } |
6775 | } |
6776 | grn_hash_close(ctx, h); |
6777 | } |
6778 | break; |
6779 | case EX_FUZZY : |
6780 | if ((h = (grn_hash *)grn_table_create(ctx, NULL, 0, NULL, |
6781 | GRN_OBJ_TABLE_HASH_KEY|GRN_OBJ_WITH_SUBREC, |
6782 | grn_ctx_at(ctx, GRN_DB_UINT32), NULL))) { |
6783 | grn_table_fuzzy_search(ctx, lexicon, key, key_size, |
6784 | args, (grn_obj *)h, GRN_OP_OR); |
6785 | if (GRN_HASH_SIZE(h)) { |
6786 | if ((ti->cursors = cursor_heap_open(ctx, GRN_HASH_SIZE(h)))) { |
6787 | grn_rset_recinfo *ri; |
6788 | GRN_HASH_EACH(ctx, h, id, &tp, NULL, (void **)&ri, { |
6789 | if ((s = grn_ii_estimate_size(ctx, ii, *tp))) { |
6790 | cursor_heap_push(ctx, ti->cursors, ii, *tp, 0, ri->score - 1, min); |
6791 | ti->ntoken++; |
6792 | ti->size += s; |
6793 | } |
6794 | }); |
6795 | } |
6796 | } |
6797 | grn_obj_close(ctx, (grn_obj *)h); |
6798 | } |
6799 | break; |
6800 | } |
6801 | if (cursor_heap_push2(ti->cursors)) { |
6802 | token_info_close(ctx, ti); |
6803 | return NULL; |
6804 | } |
6805 | { |
6806 | grn_ii_cursor *ic; |
6807 | if (ti->cursors && (ic = cursor_heap_min(ti->cursors))) { |
6808 | grn_posting *p = ic->post; |
6809 | ti->pos = p->pos - ti->offset; |
6810 | ti->p = p; |
6811 | } else { |
6812 | token_info_close(ctx, ti); |
6813 | ti = NULL; |
6814 | } |
6815 | } |
6816 | return ti; |
6817 | } |
6818 | |
6819 | static inline grn_rc |
6820 | token_info_skip(grn_ctx *ctx, token_info *ti, uint32_t rid, uint32_t sid) |
6821 | { |
6822 | grn_ii_cursor *c; |
6823 | grn_posting *p; |
6824 | for (;;) { |
6825 | if (!(c = cursor_heap_min(ti->cursors))) { return GRN_END_OF_DATA; } |
6826 | p = c->post; |
6827 | if (p->rid > rid || (p->rid == rid && p->sid >= sid)) { break; } |
6828 | cursor_heap_pop(ctx, ti->cursors, rid); |
6829 | } |
6830 | ti->pos = p->pos - ti->offset; |
6831 | ti->p = p; |
6832 | return GRN_SUCCESS; |
6833 | } |
6834 | |
6835 | static inline grn_rc |
6836 | token_info_skip_pos(grn_ctx *ctx, token_info *ti, uint32_t rid, uint32_t sid, uint32_t pos) |
6837 | { |
6838 | grn_ii_cursor *c; |
6839 | grn_posting *p; |
6840 | pos += ti->offset; |
6841 | for (;;) { |
6842 | if (!(c = cursor_heap_min(ti->cursors))) { return GRN_END_OF_DATA; } |
6843 | p = c->post; |
6844 | if (p->rid != rid || p->sid != sid || p->pos >= pos) { break; } |
6845 | cursor_heap_pop_pos(ctx, ti->cursors); |
6846 | } |
6847 | ti->pos = p->pos - ti->offset; |
6848 | ti->p = p; |
6849 | return GRN_SUCCESS; |
6850 | } |
6851 | |
6852 | inline static int |
6853 | token_compare(const void *a, const void *b) |
6854 | { |
6855 | const token_info *t1 = *((token_info **)a), *t2 = *((token_info **)b); |
6856 | return t1->size - t2->size; |
6857 | } |
6858 | |
/* Initial capacity and growth step of the token_candidate_node array. */
#define TOKEN_CANDIDATE_NODE_SIZE 32
/* Capacity of token_candidate_node.adjacent[]. */
#define TOKEN_CANDIDATE_ADJACENT_MAX_SIZE 16
/* Initial capacity and growth step of token_candidate_queue.candidates. */
#define TOKEN_CANDIDATE_QUEUE_SIZE 64
/* Window size used by token_info_build_skipping_overlap() (limit is SIZE - 1). */
#define TOKEN_CANDIDATE_SIZE 16
6863 | |
/*
 * One token produced by the token cursor, recorded by token_candidate_init()
 * together with the later tokens that may follow it in a candidate path.
 */
typedef struct {
  grn_id tid;                     /* id returned by grn_token_cursor_next() (initial tid for the first node) */
  const unsigned char *token;     /* token_cursor->curr when the node was recorded */
  uint32_t token_size;            /* token_cursor->curr_size when the node was recorded */
  int32_t pos;                    /* token_cursor->pos when the node was recorded */
  grn_token_cursor_status status; /* token_cursor->status when the node was recorded */
  int ef;                         /* EX_* expansion flags in effect for the node */
  uint32_t estimated_size;        /* grn_ii_estimate_size() of tid */
  uint8_t adjacent[TOKEN_CANDIDATE_ADJACENT_MAX_SIZE]; /* Index of adjacent node from top */
  uint8_t n_adjacent;             /* number of used entries in adjacent[] */
} token_candidate_node;
6875 | |
/*
 * Growable FIFO of candidate bit masks.  Entries are appended at `rear` and
 * consumed from `top`; consumed slots are not reused.
 */
typedef struct {
  uint32_t *candidates; /* Standing bits indicate index of token_candidate_node */
  int top;              /* index of the next entry to dequeue */
  int rear;             /* index of the next free slot */
  int size;             /* capacity tracked for growing `candidates` */
} token_candidate_queue;
6882 | |
6883 | inline static void |
6884 | token_candidate_adjacent_set(grn_ctx *ctx, grn_token_cursor *token_cursor, |
6885 | token_candidate_node *top, token_candidate_node *curr) |
6886 | { |
6887 | grn_bool exists_adjacent = GRN_FALSE; |
6888 | token_candidate_node *adj; |
6889 | for (adj = top; adj < curr; adj++) { |
6890 | if (token_cursor->curr <= adj->token + adj->token_size) { |
6891 | if (adj->n_adjacent < TOKEN_CANDIDATE_ADJACENT_MAX_SIZE) { |
6892 | adj->adjacent[adj->n_adjacent] = curr - top; |
6893 | adj->n_adjacent++; |
6894 | exists_adjacent = GRN_TRUE; |
6895 | } |
6896 | } |
6897 | } |
6898 | if (!exists_adjacent) { |
6899 | adj = curr - 1; |
6900 | if (adj->n_adjacent < TOKEN_CANDIDATE_ADJACENT_MAX_SIZE) { |
6901 | adj->adjacent[adj->n_adjacent] = curr - top; |
6902 | adj->n_adjacent++; |
6903 | } |
6904 | } |
6905 | } |
6906 | |
6907 | inline static grn_rc |
6908 | token_candidate_init(grn_ctx *ctx, grn_ii *ii, grn_token_cursor *token_cursor, |
6909 | grn_id tid, int ef, token_candidate_node **nodes, int *n_nodes, |
6910 | uint32_t *max_estimated_size) |
6911 | { |
6912 | grn_rc rc; |
6913 | token_candidate_node *top, *curr; |
6914 | int size = TOKEN_CANDIDATE_NODE_SIZE; |
6915 | |
6916 | *nodes = GRN_MALLOC(TOKEN_CANDIDATE_NODE_SIZE * sizeof(token_candidate_node)); |
6917 | if (!*nodes) { |
6918 | return GRN_NO_MEMORY_AVAILABLE; |
6919 | } |
6920 | top = *nodes; |
6921 | curr = top; |
6922 | |
6923 | #define TOKEN_CANDIDATE_NODE_SET() { \ |
6924 | curr->tid = tid; \ |
6925 | curr->token = token_cursor->curr; \ |
6926 | curr->token_size = token_cursor->curr_size; \ |
6927 | curr->pos = token_cursor->pos; \ |
6928 | curr->status = token_cursor->status; \ |
6929 | curr->ef = ef; \ |
6930 | curr->estimated_size = grn_ii_estimate_size(ctx, ii, tid); \ |
6931 | curr->n_adjacent = 0; \ |
6932 | } |
6933 | TOKEN_CANDIDATE_NODE_SET(); |
6934 | GRN_LOG(ctx, GRN_LOG_DEBUG, "[ii][overlap_token_skip] tid=%u pos=%d estimated_size=%u" , |
6935 | curr->tid, curr->pos, curr->estimated_size); |
6936 | *max_estimated_size = curr->estimated_size; |
6937 | curr++; |
6938 | |
6939 | while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { |
6940 | if (curr - top >= size) { |
6941 | if (!(*nodes = GRN_REALLOC(*nodes, |
6942 | (curr - top + TOKEN_CANDIDATE_NODE_SIZE) * sizeof(token_candidate_node)))) { |
6943 | return GRN_NO_MEMORY_AVAILABLE; |
6944 | } |
6945 | top = *nodes; |
6946 | curr = top + size; |
6947 | size += TOKEN_CANDIDATE_NODE_SIZE; |
6948 | } |
6949 | tid = grn_token_cursor_next(ctx, token_cursor); |
6950 | if (token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) { |
6951 | if (token_cursor->force_prefix) { ef |= EX_PREFIX; } |
6952 | TOKEN_CANDIDATE_NODE_SET(); |
6953 | token_candidate_adjacent_set(ctx, token_cursor, top, curr); |
6954 | if (curr->estimated_size > *max_estimated_size) { |
6955 | *max_estimated_size = curr->estimated_size; |
6956 | } |
6957 | curr++; |
6958 | } |
6959 | } |
6960 | *n_nodes = curr - top; |
6961 | rc = GRN_SUCCESS; |
6962 | return rc; |
6963 | #undef TOKEN_CANDIDATE_NODE_SET |
6964 | } |
6965 | |
6966 | inline static grn_rc |
6967 | token_candidate_queue_init(grn_ctx *ctx, token_candidate_queue *q) |
6968 | { |
6969 | q->top = 0; |
6970 | q->rear = 0; |
6971 | q->size = TOKEN_CANDIDATE_QUEUE_SIZE; |
6972 | |
6973 | q->candidates = GRN_MALLOC(TOKEN_CANDIDATE_QUEUE_SIZE * sizeof(uint32_t)); |
6974 | if (!q->candidates) { |
6975 | q->size = 0; |
6976 | return GRN_NO_MEMORY_AVAILABLE; |
6977 | } |
6978 | return GRN_SUCCESS; |
6979 | } |
6980 | |
6981 | inline static grn_rc |
6982 | token_candidate_enqueue(grn_ctx *ctx, token_candidate_queue *q, uint32_t candidate) |
6983 | { |
6984 | if (q->rear >= q->size) { |
6985 | if (!(q->candidates = |
6986 | GRN_REALLOC(q->candidates, |
6987 | (q->rear + TOKEN_CANDIDATE_QUEUE_SIZE) * sizeof(uint32_t)))) { |
6988 | q->size = 0; |
6989 | return GRN_NO_MEMORY_AVAILABLE; |
6990 | } |
6991 | q->size += TOKEN_CANDIDATE_QUEUE_SIZE; |
6992 | } |
6993 | *(q->candidates + q->rear) = candidate; |
6994 | q->rear++; |
6995 | return GRN_SUCCESS; |
6996 | } |
6997 | |
6998 | inline static grn_rc |
6999 | token_candidate_dequeue(grn_ctx *ctx, token_candidate_queue *q, uint32_t *candidate) |
7000 | { |
7001 | if (q->top == q->rear) { |
7002 | return GRN_END_OF_DATA; |
7003 | } |
7004 | *candidate = *(q->candidates + q->top); |
7005 | q->top++; |
7006 | return GRN_SUCCESS; |
7007 | } |
7008 | |
7009 | inline static void |
7010 | token_candidate_queue_fin(grn_ctx *ctx, token_candidate_queue *q) |
7011 | { |
7012 | GRN_FREE(q->candidates); |
7013 | } |
7014 | |
7015 | inline static token_candidate_node* |
7016 | token_candidate_last_node(grn_ctx *ctx, token_candidate_node *nodes, uint32_t candidate, int offset) |
7017 | { |
7018 | int i; |
7019 | GRN_BIT_SCAN_REV(candidate, i); |
7020 | return nodes + i + offset; |
7021 | } |
7022 | |
7023 | inline static uint64_t |
7024 | token_candidate_score(grn_ctx *ctx, token_candidate_node *nodes, uint32_t candidate, |
7025 | int offset, uint32_t max_estimated_size) |
7026 | { |
7027 | int i, last; |
7028 | uint64_t score = 0; |
7029 | GRN_BIT_SCAN_REV(candidate, last); |
7030 | for (i = 0; i <= last; i++) { |
7031 | if (candidate & (1 << i)) { |
7032 | token_candidate_node *node = nodes + i + offset; |
7033 | if (node->estimated_size > 0) { |
7034 | score += max_estimated_size / node->estimated_size; |
7035 | } |
7036 | } |
7037 | } |
7038 | return score; |
7039 | } |
7040 | |
7041 | inline static grn_rc |
7042 | token_candidate_select(grn_ctx *ctx, token_candidate_node *nodes, |
7043 | int offset, int limit, int end, |
7044 | uint32_t *selected_candidate, uint32_t max_estimated_size) |
7045 | { |
7046 | grn_rc rc; |
7047 | token_candidate_queue q; |
7048 | uint32_t candidate; |
7049 | uint64_t max_score = 0; |
7050 | int i, min_n_nodes = 0; |
7051 | |
7052 | if (offset + limit > end) { |
7053 | limit = end - offset; |
7054 | } |
7055 | rc = token_candidate_queue_init(ctx, &q); |
7056 | if (rc != GRN_SUCCESS) { |
7057 | return rc; |
7058 | } |
7059 | rc = token_candidate_enqueue(ctx, &q, 1); |
7060 | if (rc != GRN_SUCCESS) { |
7061 | goto exit; |
7062 | } |
7063 | while (token_candidate_dequeue(ctx, &q, &candidate) != GRN_END_OF_DATA) { |
7064 | token_candidate_node *candidate_last_node = |
7065 | token_candidate_last_node(ctx, nodes, candidate, offset); |
7066 | for (i = 0; i < candidate_last_node->n_adjacent; i++) { |
7067 | int adjacent, n_nodes = 0; |
7068 | uint32_t new_candidate; |
7069 | adjacent = candidate_last_node->adjacent[i] - offset; |
7070 | if (adjacent > limit) { |
7071 | break; |
7072 | } |
7073 | new_candidate = candidate | (1 << adjacent); |
7074 | GET_NUM_BITS(new_candidate, n_nodes); |
7075 | if (min_n_nodes > 0 && n_nodes > min_n_nodes + 1) { |
7076 | goto exit; |
7077 | } |
7078 | rc = token_candidate_enqueue(ctx, &q, new_candidate); |
7079 | if (rc != GRN_SUCCESS) { |
7080 | goto exit; |
7081 | } |
7082 | if (adjacent == limit) { |
7083 | if (min_n_nodes == 0) { |
7084 | min_n_nodes = n_nodes; |
7085 | } |
7086 | if (n_nodes >= min_n_nodes && n_nodes <= min_n_nodes + 1) { |
7087 | uint64_t score; |
7088 | score = token_candidate_score(ctx, nodes, new_candidate, offset, max_estimated_size); |
7089 | if (score > max_score) { |
7090 | max_score = score; |
7091 | *selected_candidate = new_candidate; |
7092 | } |
7093 | } |
7094 | } |
7095 | } |
7096 | } |
7097 | rc = GRN_SUCCESS; |
7098 | exit : |
7099 | token_candidate_queue_fin(ctx, &q); |
7100 | return rc; |
7101 | } |
7102 | |
7103 | inline static grn_rc |
7104 | token_candidate_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, |
7105 | token_info **tis, uint32_t *n, |
7106 | token_candidate_node *nodes, uint32_t selected_candidate, |
7107 | int offset, grn_id min) |
7108 | { |
7109 | grn_rc rc = GRN_END_OF_DATA; |
7110 | token_info *ti; |
7111 | const char *key; |
7112 | uint32_t size; |
7113 | int i, last = 0; |
7114 | GRN_BIT_SCAN_REV(selected_candidate, last); |
7115 | for (i = 1; i <= last; i++) { |
7116 | if (selected_candidate & (1 << i)) { |
7117 | token_candidate_node *node = nodes + i + offset; |
7118 | switch (node->status) { |
7119 | case GRN_TOKEN_CURSOR_DOING : |
7120 | key = _grn_table_key(ctx, lexicon, node->tid, &size); |
7121 | ti = token_info_open(ctx, lexicon, ii, key, size, node->pos, |
7122 | EX_NONE, NULL, min); |
7123 | break; |
7124 | case GRN_TOKEN_CURSOR_DONE : |
7125 | if (node->tid) { |
7126 | key = _grn_table_key(ctx, lexicon, node->tid, &size); |
7127 | ti = token_info_open(ctx, lexicon, ii, key, size, node->pos, |
7128 | node->ef & EX_PREFIX, NULL, min); |
7129 | break; |
7130 | } /* else fallthru */ |
7131 | default : |
7132 | ti = token_info_open(ctx, lexicon, ii, (char *)node->token, |
7133 | node->token_size, node->pos, |
7134 | node->ef & EX_PREFIX, NULL, min); |
7135 | break; |
7136 | } |
7137 | if (!ti) { |
7138 | goto exit; |
7139 | } |
7140 | tis[(*n)++] = ti; |
7141 | GRN_LOG(ctx, GRN_LOG_DEBUG, "[ii][overlap_token_skip] tid=%u pos=%d estimated_size=%u" , |
7142 | node->tid, node->pos, node->estimated_size); |
7143 | } |
7144 | } |
7145 | rc = GRN_SUCCESS; |
7146 | exit : |
7147 | return rc; |
7148 | } |
7149 | |
7150 | inline static grn_rc |
7151 | token_info_build_skipping_overlap(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, |
7152 | token_info **tis, uint32_t *n, |
7153 | grn_token_cursor *token_cursor, |
7154 | grn_id tid, int ef, grn_id min) |
7155 | { |
7156 | grn_rc rc; |
7157 | token_candidate_node *nodes = NULL; |
7158 | int n_nodes = 0, offset = 0, limit = TOKEN_CANDIDATE_SIZE - 1; |
7159 | uint32_t max_estimated_size; |
7160 | |
7161 | rc = token_candidate_init(ctx, ii, token_cursor, tid, ef, &nodes, &n_nodes, &max_estimated_size); |
7162 | if (rc != GRN_SUCCESS) { |
7163 | return rc; |
7164 | } |
7165 | while (offset < n_nodes - 1) { |
7166 | uint32_t selected_candidate = 0; |
7167 | rc = token_candidate_select(ctx, nodes, offset, limit, n_nodes - 1, |
7168 | &selected_candidate, max_estimated_size); |
7169 | if (rc != GRN_SUCCESS) { |
7170 | goto exit; |
7171 | } |
7172 | rc = token_candidate_build(ctx, lexicon, ii, tis, n, nodes, selected_candidate, offset, min); |
7173 | if (rc != GRN_SUCCESS) { |
7174 | goto exit; |
7175 | } |
7176 | offset += limit; |
7177 | } |
7178 | rc = GRN_SUCCESS; |
7179 | exit : |
7180 | if (nodes) { |
7181 | GRN_FREE(nodes); |
7182 | } |
7183 | return rc; |
7184 | } |
7185 | |
7186 | inline static grn_rc |
7187 | token_info_build(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, const char *string, unsigned int string_len, |
7188 | token_info **tis, uint32_t *n, grn_bool *only_skip_token, grn_id min, |
7189 | grn_operator mode) |
7190 | { |
7191 | token_info *ti; |
7192 | const char *key; |
7193 | uint32_t size; |
7194 | grn_rc rc = GRN_END_OF_DATA; |
7195 | unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; |
7196 | grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon, |
7197 | string, string_len, |
7198 | GRN_TOKEN_GET, |
7199 | token_flags); |
7200 | *only_skip_token = GRN_FALSE; |
7201 | if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; } |
7202 | if (mode == GRN_OP_UNSPLIT) { |
7203 | if ((ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, |
7204 | token_cursor->orig_blen, 0, EX_BOTH, NULL, min))) { |
7205 | tis[(*n)++] = ti; |
7206 | rc = GRN_SUCCESS; |
7207 | } |
7208 | } else { |
7209 | grn_id tid; |
7210 | int ef; |
7211 | switch (mode) { |
7212 | case GRN_OP_PREFIX : |
7213 | ef = EX_PREFIX; |
7214 | break; |
7215 | case GRN_OP_SUFFIX : |
7216 | ef = EX_SUFFIX; |
7217 | break; |
7218 | case GRN_OP_PARTIAL : |
7219 | ef = EX_BOTH; |
7220 | break; |
7221 | default : |
7222 | ef = EX_NONE; |
7223 | break; |
7224 | } |
7225 | tid = grn_token_cursor_next(ctx, token_cursor); |
7226 | if (token_cursor->force_prefix) { ef |= EX_PREFIX; } |
7227 | switch (token_cursor->status) { |
7228 | case GRN_TOKEN_CURSOR_DOING : |
7229 | key = _grn_table_key(ctx, lexicon, tid, &size); |
7230 | ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, |
7231 | ef & EX_SUFFIX, NULL, min); |
7232 | break; |
7233 | case GRN_TOKEN_CURSOR_DONE : |
7234 | ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, |
7235 | token_cursor->curr_size, 0, ef, NULL, min); |
7236 | /* |
7237 | key = _grn_table_key(ctx, lexicon, tid, &size); |
7238 | ti = token_info_open(ctx, lexicon, ii, token_cursor->curr, token_cursor->curr_size, token_cursor->pos, ef, NULL, GRN_ID_NIL); |
7239 | ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, |
7240 | token_cursor->orig_blen, token_cursor->pos, ef, NULL, GRN_ID_NIL); |
7241 | */ |
7242 | break; |
7243 | case GRN_TOKEN_CURSOR_NOT_FOUND : |
7244 | ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->orig, |
7245 | token_cursor->orig_blen, 0, ef, NULL, min); |
7246 | break; |
7247 | case GRN_TOKEN_CURSOR_DONE_SKIP : |
7248 | *only_skip_token = GRN_TRUE; |
7249 | goto exit; |
7250 | default : |
7251 | goto exit; |
7252 | } |
7253 | if (!ti) { goto exit ; } |
7254 | tis[(*n)++] = ti; |
7255 | |
7256 | if (grn_ii_overlap_token_skip_enable) { |
7257 | rc = token_info_build_skipping_overlap(ctx, lexicon, ii, tis, n, token_cursor, tid, ef, min); |
7258 | goto exit; |
7259 | } |
7260 | |
7261 | while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { |
7262 | tid = grn_token_cursor_next(ctx, token_cursor); |
7263 | if (token_cursor->force_prefix) { ef |= EX_PREFIX; } |
7264 | switch (token_cursor->status) { |
7265 | case GRN_TOKEN_CURSOR_DONE_SKIP : |
7266 | continue; |
7267 | case GRN_TOKEN_CURSOR_DOING : |
7268 | key = _grn_table_key(ctx, lexicon, tid, &size); |
7269 | ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, |
7270 | EX_NONE, NULL, min); |
7271 | break; |
7272 | case GRN_TOKEN_CURSOR_DONE : |
7273 | if (tid) { |
7274 | key = _grn_table_key(ctx, lexicon, tid, &size); |
7275 | ti = token_info_open(ctx, lexicon, ii, key, size, token_cursor->pos, |
7276 | ef & EX_PREFIX, NULL, min); |
7277 | break; |
7278 | } /* else fallthru */ |
7279 | default : |
7280 | ti = token_info_open(ctx, lexicon, ii, (char *)token_cursor->curr, |
7281 | token_cursor->curr_size, token_cursor->pos, |
7282 | ef & EX_PREFIX, NULL, min); |
7283 | break; |
7284 | } |
7285 | if (!ti) { |
7286 | goto exit; |
7287 | } |
7288 | tis[(*n)++] = ti; |
7289 | } |
7290 | rc = GRN_SUCCESS; |
7291 | } |
7292 | exit : |
7293 | grn_token_cursor_close(ctx, token_cursor); |
7294 | return rc; |
7295 | } |
7296 | |
7297 | inline static grn_rc |
7298 | token_info_build_fuzzy(grn_ctx *ctx, grn_obj *lexicon, grn_ii *ii, |
7299 | const char *string, unsigned int string_len, |
7300 | token_info **tis, uint32_t *n, grn_bool *only_skip_token, |
7301 | grn_id min, grn_operator mode, grn_fuzzy_search_optarg *args) |
7302 | { |
7303 | token_info *ti; |
7304 | grn_rc rc = GRN_END_OF_DATA; |
7305 | unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; |
7306 | grn_token_cursor *token_cursor = grn_token_cursor_open(ctx, lexicon, |
7307 | string, string_len, |
7308 | GRN_TOKENIZE_ONLY, |
7309 | token_flags); |
7310 | *only_skip_token = GRN_FALSE; |
7311 | if (!token_cursor) { return GRN_NO_MEMORY_AVAILABLE; } |
7312 | grn_token_cursor_next(ctx, token_cursor); |
7313 | switch (token_cursor->status) { |
7314 | case GRN_TOKEN_CURSOR_DONE_SKIP : |
7315 | *only_skip_token = GRN_TRUE; |
7316 | goto exit; |
7317 | case GRN_TOKEN_CURSOR_DOING : |
7318 | case GRN_TOKEN_CURSOR_DONE : |
7319 | ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, |
7320 | token_cursor->curr_size, token_cursor->pos, EX_FUZZY, |
7321 | args, min); |
7322 | break; |
7323 | default : |
7324 | ti = NULL; |
7325 | break; |
7326 | } |
7327 | if (!ti) { |
7328 | goto exit ; |
7329 | } |
7330 | tis[(*n)++] = ti; |
7331 | while (token_cursor->status == GRN_TOKEN_CURSOR_DOING) { |
7332 | grn_token_cursor_next(ctx, token_cursor); |
7333 | switch (token_cursor->status) { |
7334 | case GRN_TOKEN_CURSOR_DONE_SKIP : |
7335 | continue; |
7336 | case GRN_TOKEN_CURSOR_DOING : |
7337 | case GRN_TOKEN_CURSOR_DONE : |
7338 | ti = token_info_open(ctx, lexicon, ii, (const char *)token_cursor->curr, |
7339 | token_cursor->curr_size, token_cursor->pos, EX_FUZZY, |
7340 | args, min); |
7341 | break; |
7342 | default : |
7343 | break; |
7344 | } |
7345 | if (!ti) { |
7346 | goto exit; |
7347 | } |
7348 | tis[(*n)++] = ti; |
7349 | } |
7350 | rc = GRN_SUCCESS; |
7351 | exit : |
7352 | grn_token_cursor_close(ctx, token_cursor); |
7353 | return rc; |
7354 | } |
7355 | |
7356 | static void |
7357 | token_info_clear_offset(token_info **tis, uint32_t n) |
7358 | { |
7359 | token_info **tie; |
7360 | for (tie = tis + n; tis < tie; tis++) { (*tis)->offset = 0; } |
7361 | } |
7362 | |
7363 | /* select */ |
7364 | |
7365 | inline static void |
7366 | res_add(grn_ctx *ctx, grn_hash *s, grn_rset_posinfo *pi, double score, |
7367 | grn_operator op) |
7368 | { |
7369 | grn_rset_recinfo *ri; |
7370 | switch (op) { |
7371 | case GRN_OP_OR : |
7372 | if (grn_hash_add(ctx, s, pi, s->key_size, (void **)&ri, NULL)) { |
7373 | if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { |
7374 | grn_table_add_subrec((grn_obj *)s, ri, score, pi, 1); |
7375 | } |
7376 | } |
7377 | break; |
7378 | case GRN_OP_AND : |
7379 | if (grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri)) { |
7380 | if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { |
7381 | ri->n_subrecs |= GRN_RSET_UTIL_BIT; |
7382 | grn_table_add_subrec((grn_obj *)s, ri, score, pi, 1); |
7383 | } |
7384 | } |
7385 | break; |
7386 | case GRN_OP_AND_NOT : |
7387 | { |
7388 | grn_id id; |
7389 | if ((id = grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri))) { |
7390 | grn_hash_delete_by_id(ctx, s, id, NULL); |
7391 | } |
7392 | } |
7393 | break; |
7394 | case GRN_OP_ADJUST : |
7395 | if (grn_hash_get(ctx, s, pi, s->key_size, (void **)&ri)) { |
7396 | if (s->obj.header.flags & GRN_OBJ_WITH_SUBREC) { |
7397 | ri->score += score; |
7398 | } |
7399 | } |
7400 | break; |
7401 | default : |
7402 | break; |
7403 | } |
7404 | } |
7405 | |
7406 | grn_rc |
7407 | grn_ii_posting_add(grn_ctx *ctx, grn_posting *pos, grn_hash *s, grn_operator op) |
7408 | { |
7409 | res_add(ctx, s, (grn_rset_posinfo *)(pos), (1 + pos->weight), op); |
7410 | return ctx->rc; |
7411 | } |
7412 | |
7413 | #ifdef USE_BHEAP |
7414 | |
7415 | /* todo */ |
7416 | |
7417 | #else /* USE_BHEAP */ |
7418 | |
/* Node of the binary tree that orders token_infos by their `pos`. */
struct _btr_node {
  struct _btr_node *car; /* subtree with smaller pos */
  struct _btr_node *cdr; /* subtree with equal or larger pos */
  token_info *ti;        /* token_info held by this node */
};

typedef struct _btr_node btr_node;
7426 | |
/* Binary tree of token_infos keyed by `pos`, with cached extremes. */
typedef struct {
  int n;           /* number of nodes handed out from `nodes` */
  token_info *min; /* token_info with the smallest pos */
  token_info *max; /* token_info with the largest pos */
  btr_node *root;  /* root of the tree; NULL when empty */
  btr_node *nodes; /* node storage allocated by bt_open() */
} btr;
7434 | |
7435 | inline static void |
7436 | bt_zap(btr *bt) |
7437 | { |
7438 | bt->n = 0; |
7439 | bt->min = NULL; |
7440 | bt->max = NULL; |
7441 | bt->root = NULL; |
7442 | } |
7443 | |
7444 | inline static btr * |
7445 | bt_open(grn_ctx *ctx, int size) |
7446 | { |
7447 | btr *bt = GRN_MALLOC(sizeof(btr)); |
7448 | if (bt) { |
7449 | bt_zap(bt); |
7450 | if (!(bt->nodes = GRN_MALLOC(sizeof(btr_node) * size))) { |
7451 | GRN_FREE(bt); |
7452 | bt = NULL; |
7453 | } |
7454 | } |
7455 | return bt; |
7456 | } |
7457 | |
7458 | inline static void |
7459 | bt_close(grn_ctx *ctx, btr *bt) |
7460 | { |
7461 | if (!bt) { return; } |
7462 | GRN_FREE(bt->nodes); |
7463 | GRN_FREE(bt); |
7464 | } |
7465 | |
7466 | inline static void |
7467 | bt_push(btr *bt, token_info *ti) |
7468 | { |
7469 | int pos = ti->pos, minp = 1, maxp = 1; |
7470 | btr_node *node, *new, **last; |
7471 | new = bt->nodes + bt->n++; |
7472 | new->ti = ti; |
7473 | new->car = NULL; |
7474 | new->cdr = NULL; |
7475 | for (last = &bt->root; (node = *last);) { |
7476 | if (pos < node->ti->pos) { |
7477 | last = &node->car; |
7478 | maxp = 0; |
7479 | } else { |
7480 | last = &node->cdr; |
7481 | minp = 0; |
7482 | } |
7483 | } |
7484 | *last = new; |
7485 | if (minp) { bt->min = ti; } |
7486 | if (maxp) { bt->max = ti; } |
7487 | } |
7488 | |
7489 | inline static void |
7490 | bt_pop(btr *bt) |
7491 | { |
7492 | btr_node *node, *min, *newmin, **last; |
7493 | for (last = &bt->root; (min = *last) && min->car; last = &min->car) ; |
7494 | if (min) { |
7495 | int pos = min->ti->pos, minp = 1, maxp = 1; |
7496 | *last = min->cdr; |
7497 | min->cdr = NULL; |
7498 | for (last = &bt->root; (node = *last);) { |
7499 | if (pos < node->ti->pos) { |
7500 | last = &node->car; |
7501 | maxp = 0; |
7502 | } else { |
7503 | last = &node->cdr; |
7504 | minp = 0; |
7505 | } |
7506 | } |
7507 | *last = min; |
7508 | if (maxp) { bt->max = min->ti; } |
7509 | if (!minp) { |
7510 | for (newmin = bt->root; newmin->car; newmin = newmin->car) ; |
7511 | bt->min = newmin->ti; |
7512 | } |
7513 | } |
7514 | } |
7515 | |
7516 | #endif /* USE_BHEAP */ |
7517 | |
/* How get_weight() derives the weight of a hit. */
typedef enum {
  grn_wv_none = 0, /* every hit weighs 1 */
  grn_wv_static,   /* weight taken from optarg->weight_vector, indexed by section */
  grn_wv_dynamic,  /* weight computed by the optarg->func callback */
  grn_wv_constant  /* optarg->vector_size itself is used as the weight */
} grn_wv_mode;
7524 | |
7525 | inline static double |
7526 | get_weight(grn_ctx *ctx, grn_hash *s, grn_id rid, int sid, |
7527 | grn_wv_mode wvm, grn_select_optarg *optarg) |
7528 | { |
7529 | switch (wvm) { |
7530 | case grn_wv_none : |
7531 | return 1; |
7532 | case grn_wv_static : |
7533 | return sid <= optarg->vector_size ? optarg->weight_vector[sid - 1] : 0; |
7534 | case grn_wv_dynamic : |
7535 | /* todo : support hash with keys |
7536 | if (s->keys) { |
7537 | uint32_t key_size; |
7538 | const char *key = _grn_table_key(ctx, s->keys, rid, &key_size); |
7539 | // todo : change grn_select_optarg |
7540 | return key ? optarg->func(s, key, key_size, sid, optarg->func_arg) : 0; |
7541 | } |
7542 | */ |
7543 | /* todo : cast */ |
7544 | return optarg->func(ctx, (void *)s, (void *)(intptr_t)rid, sid, |
7545 | optarg->func_arg); |
7546 | case grn_wv_constant : |
7547 | return optarg->vector_size; |
7548 | default : |
7549 | return 1; |
7550 | } |
7551 | } |
7552 | |
7553 | grn_rc |
7554 | grn_ii_similar_search(grn_ctx *ctx, grn_ii *ii, |
7555 | const char *string, unsigned int string_len, |
7556 | grn_hash *s, grn_operator op, grn_select_optarg *optarg) |
7557 | { |
7558 | int *w1, limit; |
7559 | grn_id tid, *tp, max_size; |
7560 | grn_rc rc = GRN_SUCCESS; |
7561 | grn_hash *h; |
7562 | grn_token_cursor *token_cursor; |
7563 | unsigned int token_flags = GRN_TOKEN_CURSOR_ENABLE_TOKENIZED_DELIMITER; |
7564 | grn_obj *lexicon = ii->lexicon; |
7565 | if (!lexicon || !ii || !string || !string_len || !s || !optarg) { |
7566 | return GRN_INVALID_ARGUMENT; |
7567 | } |
7568 | if (!(h = grn_hash_create(ctx, NULL, sizeof(grn_id), sizeof(int), 0))) { |
7569 | return GRN_NO_MEMORY_AVAILABLE; |
7570 | } |
7571 | if (!(token_cursor = grn_token_cursor_open(ctx, lexicon, string, string_len, |
7572 | GRN_TOKEN_GET, token_flags))) { |
7573 | grn_hash_close(ctx, h); |
7574 | return GRN_NO_MEMORY_AVAILABLE; |
7575 | } |
7576 | if (!(max_size = optarg->max_size)) { max_size = 1048576; } |
7577 | while (token_cursor->status != GRN_TOKEN_CURSOR_DONE && |
7578 | token_cursor->status != GRN_TOKEN_CURSOR_DONE_SKIP) { |
7579 | if ((tid = grn_token_cursor_next(ctx, token_cursor))) { |
7580 | if (grn_hash_add(ctx, h, &tid, sizeof(grn_id), (void **)&w1, NULL)) { |
7581 | (*w1)++; |
7582 | } |
7583 | } |
7584 | if (tid && token_cursor->curr_size) { |
7585 | if (optarg->mode == GRN_OP_UNSPLIT) { |
7586 | grn_table_search(ctx, lexicon, token_cursor->curr, |
7587 | token_cursor->curr_size, |
7588 | GRN_OP_PREFIX, (grn_obj *)h, GRN_OP_OR); |
7589 | } |
7590 | if (optarg->mode == GRN_OP_PARTIAL) { |
7591 | grn_table_search(ctx, lexicon, token_cursor->curr, |
7592 | token_cursor->curr_size, |
7593 | GRN_OP_SUFFIX, (grn_obj *)h, GRN_OP_OR); |
7594 | } |
7595 | } |
7596 | } |
7597 | grn_token_cursor_close(ctx, token_cursor); |
7598 | { |
7599 | grn_hash_cursor *c = grn_hash_cursor_open(ctx, h, NULL, 0, NULL, 0, |
7600 | 0, -1, 0); |
7601 | if (!c) { |
7602 | GRN_LOG(ctx, GRN_LOG_ALERT, |
7603 | "grn_hash_cursor_open on grn_ii_similar_search failed !" ); |
7604 | grn_hash_close(ctx, h); |
7605 | return GRN_NO_MEMORY_AVAILABLE; |
7606 | } |
7607 | while (grn_hash_cursor_next(ctx, c)) { |
7608 | uint32_t es; |
7609 | grn_hash_cursor_get_key_value(ctx, c, (void **) &tp, NULL, (void **) &w1); |
7610 | if ((es = grn_ii_estimate_size(ctx, ii, *tp))) { |
7611 | *w1 += max_size / es; |
7612 | } else { |
7613 | grn_hash_cursor_delete(ctx, c, NULL); |
7614 | } |
7615 | } |
7616 | grn_hash_cursor_close(ctx, c); |
7617 | } |
7618 | limit = optarg->similarity_threshold |
7619 | ? (optarg->similarity_threshold > GRN_HASH_SIZE(h) |
7620 | ? GRN_HASH_SIZE(h) |
7621 | : optarg->similarity_threshold) |
7622 | : (GRN_HASH_SIZE(h) >> 3) + 1; |
7623 | if (GRN_HASH_SIZE(h)) { |
7624 | grn_id j, id; |
7625 | int w2, rep; |
7626 | grn_ii_cursor *c; |
7627 | grn_posting *pos; |
7628 | grn_wv_mode wvm = grn_wv_none; |
7629 | grn_table_sort_optarg arg = { |
7630 | GRN_TABLE_SORT_DESC|GRN_TABLE_SORT_BY_VALUE|GRN_TABLE_SORT_AS_NUMBER, |
7631 | NULL, |
7632 | NULL, |
7633 | 0 |
7634 | }; |
7635 | grn_array *sorted = grn_array_create(ctx, NULL, sizeof(grn_id), 0); |
7636 | if (!sorted) { |
7637 | GRN_LOG(ctx, GRN_LOG_ALERT, |
7638 | "grn_hash_sort on grn_ii_similar_search failed !" ); |
7639 | grn_hash_close(ctx, h); |
7640 | return GRN_NO_MEMORY_AVAILABLE; |
7641 | } |
7642 | grn_hash_sort(ctx, h, limit, sorted, &arg); |
7643 | /* todo support subrec |
7644 | rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position); |
7645 | */ |
7646 | rep = 0; |
7647 | if (optarg->func) { |
7648 | wvm = grn_wv_dynamic; |
7649 | } else if (optarg->vector_size) { |
7650 | wvm = optarg->weight_vector ? grn_wv_static : grn_wv_constant; |
7651 | } |
7652 | for (j = 1; j <= limit; j++) { |
7653 | grn_array_get_value(ctx, sorted, j, &id); |
7654 | _grn_hash_get_key_value(ctx, h, id, (void **) &tp, (void **) &w1); |
7655 | if (!*tp || !(c = grn_ii_cursor_open(ctx, ii, *tp, GRN_ID_NIL, GRN_ID_MAX, |
7656 | rep |
7657 | ? ii->n_elements |
7658 | : ii->n_elements - 1, 0))) { |
7659 | GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed (%d)" , *tp); |
7660 | continue; |
7661 | } |
7662 | if (rep) { |
7663 | while (grn_ii_cursor_next(ctx, c)) { |
7664 | pos = c->post; |
7665 | if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) { |
7666 | while (grn_ii_cursor_next_pos(ctx, c)) { |
7667 | res_add(ctx, s, (grn_rset_posinfo *) pos, |
7668 | *w1 * w2 * (1 + pos->weight), op); |
7669 | } |
7670 | } |
7671 | } |
7672 | } else { |
7673 | while (grn_ii_cursor_next(ctx, c)) { |
7674 | pos = c->post; |
7675 | if ((w2 = get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg)) > 0) { |
7676 | res_add(ctx, s, (grn_rset_posinfo *) pos, |
7677 | *w1 * w2 * (pos->tf + pos->weight), op); |
7678 | } |
7679 | } |
7680 | } |
7681 | grn_ii_cursor_close(ctx, c); |
7682 | } |
7683 | grn_array_close(ctx, sorted); |
7684 | } |
7685 | grn_hash_close(ctx, h); |
7686 | grn_ii_resolve_sel_and(ctx, s, op); |
7687 | // grn_hash_cursor_clear(r); |
7688 | return rc; |
7689 | } |
7690 | |
7691 | #define 0 |
7692 | #define 1 |
7693 | |
7694 | grn_rc |
7695 | (grn_ctx *ctx, grn_ii *ii, const char *string, |
7696 | unsigned int string_len, grn_hash *s, |
7697 | grn_operator op, grn_select_optarg *optarg) |
7698 | { |
7699 | grn_rset_posinfo pi; |
7700 | grn_id tid; |
7701 | const char *p, *pe; |
7702 | grn_obj *nstr; |
7703 | const char *normalized; |
7704 | unsigned int normalized_length_in_bytes; |
7705 | grn_ii_cursor *c; |
7706 | grn_posting *pos; |
7707 | int skip, rep, policy; |
7708 | grn_rc rc = GRN_SUCCESS; |
7709 | grn_wv_mode wvm = grn_wv_none; |
7710 | if (!ii || !string || !string_len || !s || !optarg) { |
7711 | return GRN_INVALID_ARGUMENT; |
7712 | } |
7713 | if (!(nstr = grn_string_open(ctx, string, string_len, NULL, 0))) { |
7714 | return GRN_INVALID_ARGUMENT; |
7715 | } |
7716 | policy = optarg->max_interval; |
7717 | if (optarg->func) { |
7718 | wvm = grn_wv_dynamic; |
7719 | } else if (optarg->vector_size) { |
7720 | wvm = optarg->weight_vector ? grn_wv_static : grn_wv_constant; |
7721 | } |
7722 | /* todo support subrec |
7723 | if (policy == TERM_EXTRACT_EACH_POST) { |
7724 | if ((rc = grn_records_reopen(s, grn_rec_section, grn_rec_none, 0))) { goto exit; } |
7725 | } |
7726 | rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position); |
7727 | */ |
7728 | rep = 0; |
7729 | grn_string_get_normalized(ctx, nstr, &normalized, &normalized_length_in_bytes, |
7730 | NULL); |
7731 | for (p = normalized, pe = p + normalized_length_in_bytes; p < pe; p += skip) { |
7732 | if ((tid = grn_table_lcp_search(ctx, ii->lexicon, p, pe - p))) { |
7733 | if (policy == TERM_EXTRACT_EACH_POST) { |
7734 | if (!(skip = grn_table_get_key(ctx, ii->lexicon, tid, NULL, 0))) { break; } |
7735 | } else { |
7736 | if (!(skip = (int)grn_charlen(ctx, p, pe))) { break; } |
7737 | } |
7738 | if (!(c = grn_ii_cursor_open(ctx, ii, tid, GRN_ID_NIL, GRN_ID_MAX, |
7739 | rep |
7740 | ? ii->n_elements |
7741 | : ii->n_elements - 1, 0))) { |
7742 | GRN_LOG(ctx, GRN_LOG_ERROR, "cursor open failed (%d)" , tid); |
7743 | continue; |
7744 | } |
7745 | if (rep) { |
7746 | while (grn_ii_cursor_next(ctx, c)) { |
7747 | pos = c->post; |
7748 | while (grn_ii_cursor_next_pos(ctx, c)) { |
7749 | res_add(ctx, s, (grn_rset_posinfo *) pos, |
7750 | get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg), op); |
7751 | } |
7752 | } |
7753 | } else { |
7754 | while (grn_ii_cursor_next(ctx, c)) { |
7755 | if (policy == TERM_EXTRACT_EACH_POST) { |
7756 | pi.rid = c->post->rid; |
7757 | pi.sid = p - normalized; |
7758 | res_add(ctx, s, &pi, pi.sid + 1, op); |
7759 | } else { |
7760 | pos = c->post; |
7761 | res_add(ctx, s, (grn_rset_posinfo *) pos, |
7762 | get_weight(ctx, s, pos->rid, pos->sid, wvm, optarg), op); |
7763 | } |
7764 | } |
7765 | } |
7766 | grn_ii_cursor_close(ctx, c); |
7767 | } else { |
7768 | if (!(skip = (int)grn_charlen(ctx, p, pe))) { |
7769 | break; |
7770 | } |
7771 | } |
7772 | } |
7773 | grn_obj_close(ctx, nstr); |
7774 | return rc; |
7775 | } |
7776 | |
/* One match reported by grn_ii_select_cursor_next(). */
typedef struct {
  grn_id rid;         /* record ID of the match */
  uint32_t sid;       /* section ID of the match */
  uint32_t start_pos; /* position where the match starts */
  uint32_t end_pos;   /* position where the match ends */
  uint32_t tf;        /* term frequency reported for the match */
  uint32_t weight;    /* accumulated score of the match */
} grn_ii_select_cursor_posting;
7785 | |
/* State of an incremental search over one query string. */
typedef struct {
  btr *bt;                     /* tree opened with bt_open() for NEAR modes */
  grn_ii *ii;                  /* index being searched */
  token_info **tis;            /* token infos built from the query */
  uint32_t n_tis;              /* number of valid entries in tis */
  int max_interval;            /* copied from optarg->max_interval for NEAR */
  grn_operator mode;           /* search mode (NEAR2 is stored as NEAR) */
  grn_ii_select_cursor_posting posting; /* storage for the returned match */
  const char *string;          /* query string; the pointer is not copied */
  unsigned int string_len;     /* length of string */
  grn_bool done;               /* GRN_TRUE once no further match can follow */
  grn_ii_select_cursor_posting unshifted_posting; /* match pushed back */
  grn_bool have_unshifted_posting; /* GRN_TRUE when unshifted_posting is set */
} grn_ii_select_cursor;
7800 | |
7801 | static grn_rc |
7802 | grn_ii_select_cursor_close(grn_ctx *ctx, |
7803 | grn_ii_select_cursor *cursor) |
7804 | { |
7805 | token_info **tip; |
7806 | |
7807 | if (!cursor) { |
7808 | return GRN_SUCCESS; |
7809 | } |
7810 | |
7811 | for (tip = cursor->tis; tip < cursor->tis + cursor->n_tis; tip++) { |
7812 | if (*tip) { |
7813 | token_info_close(ctx, *tip); |
7814 | } |
7815 | } |
7816 | if (cursor->tis) { |
7817 | GRN_FREE(cursor->tis); |
7818 | } |
7819 | bt_close(ctx, cursor->bt); |
7820 | GRN_FREE(cursor); |
7821 | |
7822 | return GRN_SUCCESS; |
7823 | } |
7824 | |
7825 | static grn_ii_select_cursor * |
7826 | grn_ii_select_cursor_open(grn_ctx *ctx, |
7827 | grn_ii *ii, |
7828 | const char *string, |
7829 | unsigned int string_len, |
7830 | grn_select_optarg *optarg) |
7831 | { |
7832 | grn_operator mode = GRN_OP_EXACT; |
7833 | grn_ii_select_cursor *cursor; |
7834 | |
7835 | if (string_len == 0) { |
7836 | ERR(GRN_INVALID_ARGUMENT, |
7837 | "[ii][select][cursor][open] empty string" ); |
7838 | return NULL; |
7839 | } |
7840 | |
7841 | if (optarg) { |
7842 | mode = optarg->mode; |
7843 | } |
7844 | switch (mode) { |
7845 | case GRN_OP_EXACT : |
7846 | case GRN_OP_FUZZY : |
7847 | case GRN_OP_NEAR : |
7848 | case GRN_OP_NEAR2 : |
7849 | break; |
7850 | default : |
7851 | ERR(GRN_INVALID_ARGUMENT, |
7852 | "[ii][select][cursor][open] " |
7853 | "EXACT, FUZZY, NEAR and NEAR2 are only supported mode: %s" , |
7854 | grn_operator_to_string(mode)); |
7855 | break; |
7856 | } |
7857 | |
7858 | cursor = GRN_CALLOC(sizeof(grn_ii_select_cursor)); |
7859 | if (!cursor) { |
7860 | ERR(ctx->rc, |
7861 | "[ii][select][cursor][open] failed to allocate cursor: %s" , |
7862 | ctx->errbuf); |
7863 | return NULL; |
7864 | } |
7865 | |
7866 | cursor->ii = ii; |
7867 | cursor->mode = mode; |
7868 | |
7869 | if (!(cursor->tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) { |
7870 | ERR(ctx->rc, |
7871 | "[ii][select][cursor][open] failed to allocate token info container: %s" , |
7872 | ctx->errbuf); |
7873 | GRN_FREE(cursor); |
7874 | return NULL; |
7875 | } |
7876 | cursor->n_tis = 0; |
7877 | if (cursor->mode == GRN_OP_FUZZY) { |
7878 | grn_bool only_skip_token = GRN_FALSE; |
7879 | grn_id previous_min = GRN_ID_NIL; |
7880 | if (token_info_build_fuzzy(ctx, ii->lexicon, ii, string, string_len, |
7881 | cursor->tis, &(cursor->n_tis), |
7882 | &only_skip_token, previous_min, |
7883 | cursor->mode, &(optarg->fuzzy)) != GRN_SUCCESS) { |
7884 | grn_ii_select_cursor_close(ctx, cursor); |
7885 | return NULL; |
7886 | } |
7887 | } else { |
7888 | grn_bool only_skip_token = GRN_FALSE; |
7889 | grn_id previous_min = GRN_ID_NIL; |
7890 | if (token_info_build(ctx, ii->lexicon, ii, string, string_len, |
7891 | cursor->tis, &(cursor->n_tis), |
7892 | &only_skip_token, previous_min, |
7893 | cursor->mode) != GRN_SUCCESS) { |
7894 | grn_ii_select_cursor_close(ctx, cursor); |
7895 | return NULL; |
7896 | } |
7897 | } |
7898 | if (cursor->n_tis == 0) { |
7899 | grn_ii_select_cursor_close(ctx, cursor); |
7900 | return NULL; |
7901 | } |
7902 | |
7903 | switch (cursor->mode) { |
7904 | case GRN_OP_NEAR2 : |
7905 | token_info_clear_offset(cursor->tis, cursor->n_tis); |
7906 | cursor->mode = GRN_OP_NEAR; |
7907 | /* fallthru */ |
7908 | case GRN_OP_NEAR : |
7909 | if (!(cursor->bt = bt_open(ctx, cursor->n_tis))) { |
7910 | ERR(ctx->rc, |
7911 | "[ii][select][cursor][open] failed to allocate btree: %s" , |
7912 | ctx->errbuf); |
7913 | grn_ii_select_cursor_close(ctx, cursor); |
7914 | return NULL; |
7915 | } |
7916 | cursor->max_interval = optarg->max_interval; |
7917 | break; |
7918 | default : |
7919 | break; |
7920 | } |
7921 | qsort(cursor->tis, cursor->n_tis, sizeof(token_info *), token_compare); |
7922 | GRN_LOG(ctx, GRN_LOG_INFO, |
7923 | "[ii][select][cursor][open] n=%d <%.*s>" , |
7924 | cursor->n_tis, |
7925 | string_len, string); |
7926 | |
7927 | cursor->string = string; |
7928 | cursor->string_len = string_len; |
7929 | |
7930 | cursor->done = GRN_FALSE; |
7931 | |
7932 | cursor->have_unshifted_posting = GRN_FALSE; |
7933 | |
7934 | return cursor; |
7935 | } |
7936 | |
/*
 * Returns the next match of the cursor, or NULL when there is none.
 *
 * A posting pushed back with grn_ii_select_cursor_unshift() is returned
 * first.  Otherwise the token infos are advanced until all of them are on
 * the same record/section, and then their positions are compared according
 * to the cursor mode.
 *
 * The returned pointer refers to storage inside `cursor`
 * (cursor->posting or cursor->unshifted_posting), so it is overwritten by
 * later calls.
 */
static grn_ii_select_cursor_posting *
grn_ii_select_cursor_next(grn_ctx *ctx,
                          grn_ii_select_cursor *cursor)
{
  btr *bt = cursor->bt;
  token_info **tis = cursor->tis;
  token_info **tie = tis + cursor->n_tis;
  uint32_t n_tis = cursor->n_tis;
  int max_interval = cursor->max_interval;
  grn_operator mode = cursor->mode;

  /* Hand back a pushed-back posting before reading anything new. */
  if (cursor->have_unshifted_posting) {
    cursor->have_unshifted_posting = GRN_FALSE;
    return &(cursor->unshifted_posting);
  }

  if (cursor->done) {
    return NULL;
  }

  for (;;) {
    grn_id rid;
    grn_id sid;
    grn_id next_rid;
    grn_id next_sid;
    token_info **tip;

    /* The first token info provides the candidate record/section. */
    rid = (*tis)->p->rid;
    sid = (*tis)->p->sid;
    /* Move every other token info to (rid, sid); stop at the first one
     * that lands elsewhere and remember where it landed. */
    for (tip = tis + 1, next_rid = rid, next_sid = sid + 1;
         tip < tie;
         tip++) {
      token_info *ti = *tip;
      if (token_info_skip(ctx, ti, rid, sid)) { return NULL; }
      if (ti->p->rid != rid || ti->p->sid != sid) {
        next_rid = ti->p->rid;
        next_sid = ti->p->sid;
        break;
      }
    }

    /* tip == tie: all token infos are on the same record/section. */
    if (tip == tie) {
      int start_pos = 0;
      int pos = 0;
      int end_pos = 0;
      int score = 0;
      int tf = 0;
      int tscore = 0;

/* Advances `ti` to `pos` inside (rid, sid).  Breaks out of the enclosing
 * loop when that fails or when `ti` leaves (rid, sid); in the latter case
 * the new location is stored in next_rid/next_sid. */
#define SKIP_OR_BREAK(pos) {\
  if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; }    \
  if (ti->p->rid != rid || ti->p->sid != sid) {                  \
    next_rid = ti->p->rid;                                       \
    next_sid = ti->p->sid;                                       \
    break;                                                       \
  }                                                              \
}

/* Fills cursor->posting from the local variables, advances the first token
 * info for the next call (marking the cursor done when that fails) and
 * returns from the function.
 * NOTE(review): this macro is not #undef'ed after use, unlike
 * SKIP_OR_BREAK. */
#define RETURN_POSTING() do {                                           \
  cursor->posting.rid = rid;                                            \
  cursor->posting.sid = sid;                                            \
  cursor->posting.start_pos = start_pos;                                \
  cursor->posting.end_pos = end_pos;                                    \
  cursor->posting.tf = tf;                                              \
  cursor->posting.weight = tscore;                                      \
  if (token_info_skip_pos(ctx, *tis, rid, sid, pos) != GRN_SUCCESS) {   \
    if (token_info_skip(ctx, *tis, next_rid, next_sid) != GRN_SUCCESS) { \
      cursor->done = GRN_TRUE;                                          \
    }                                                                   \
  }                                                                     \
  return &(cursor->posting);                                            \
} while (GRN_FALSE)

      if (n_tis == 1) {
        /* Single token: every position of the token is a match. */
        start_pos = pos = end_pos = (*tis)->p->pos;
        pos++;
        tf = (*tis)->p->tf;
        tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight;
        RETURN_POSTING();
      } else if (mode == GRN_OP_NEAR) {
        /* Reset the tree and push every token info at its first position. */
        bt_zap(bt);
        for (tip = tis; tip < tie; tip++) {
          token_info *ti = *tip;
          SKIP_OR_BREAK(pos);
          bt_push(bt, ti);
        }
        if (tip == tie) {
          for (;;) {
            token_info *ti;
            int min;
            int max;

            ti = bt->min;
            min = ti->pos;
            max = bt->max->pos;
            if (min > max) {
              char ii_name[GRN_TABLE_MAX_KEY_SIZE];
              int ii_name_size;
              ii_name_size = grn_obj_name(ctx,
                                          (grn_obj *)(cursor->ii),
                                          ii_name,
                                          GRN_TABLE_MAX_KEY_SIZE);
              ERR(GRN_FILE_CORRUPT,
                  "[ii][select][cursor][near] "
                  "max position must be larger than min position: "
                  "min:<%d> max:<%d> ii:<%.*s> string:<%.*s>",
                  min, max,
                  ii_name_size, ii_name,
                  cursor->string_len,
                  cursor->string);
              return NULL;
            }
            /* A negative max_interval accepts any distance. */
            if ((max_interval < 0) || (max - min <= max_interval)) {
              /* TODO: Set start_pos, pos, end_pos, tf and tscore */
              /* NOTE(review): RETURN_POSTING() returns, so the two
               * statements that follow it in this branch never run. */
              RETURN_POSTING();
              if (ti->pos == max + 1) {
                break;
              }
              SKIP_OR_BREAK(max + 1);
            } else {
              if (ti->pos == max - max_interval) {
                break;
              }
              SKIP_OR_BREAK(max - max_interval);
            }
            bt_pop(bt);
          }
        }
      } else {
        /* Cycle over the token infos until `count` of them agree on the
         * same `pos`; a disagreement restarts the count at that token. */
        int count = 0;
        for (tip = tis; ; tip++) {
          token_info *ti;

          if (tip == tie) { tip = tis; }
          ti = *tip;
          SKIP_OR_BREAK(pos);
          if (ti->pos == pos) {
            score += ti->p->weight + ti->cursors->bins[0]->weight;
            count++;
            if (ti->p->pos > end_pos) {
              end_pos = ti->p->pos;
            }
          } else {
            score = ti->p->weight + ti->cursors->bins[0]->weight;
            count = 1;
            start_pos = pos = ti->pos;
            end_pos = ti->p->pos;
          }
          if (count == n_tis) {
            pos++;
            if (ti->p->pos > end_pos) {
              end_pos = ti->p->pos;
            }
            tf = 1;
            tscore += score;
            RETURN_POSTING();
          }
        }
      }
#undef SKIP_OR_BREAK
    }
    /* No match in (rid, sid): move the first token info forward. */
    if (token_info_skip(ctx, *tis, next_rid, next_sid)) {
      return NULL;
    }
  }
}
8103 | |
8104 | static void |
8105 | grn_ii_select_cursor_unshift(grn_ctx *ctx, |
8106 | grn_ii_select_cursor *cursor, |
8107 | grn_ii_select_cursor_posting *posting) |
8108 | { |
8109 | cursor->unshifted_posting = *posting; |
8110 | cursor->have_unshifted_posting = GRN_TRUE; |
8111 | } |
8112 | |
8113 | static grn_rc |
8114 | grn_ii_parse_regexp_query(grn_ctx *ctx, |
8115 | const char *log_tag, |
8116 | const char *string, unsigned int string_len, |
8117 | grn_obj *parsed_strings) |
8118 | { |
8119 | grn_bool escaping = GRN_FALSE; |
8120 | int nth_char = 0; |
8121 | const char *current = string; |
8122 | const char *string_end = string + string_len; |
8123 | grn_obj buffer; |
8124 | |
8125 | GRN_TEXT_INIT(&buffer, 0); |
8126 | while (current < string_end) { |
8127 | const char *target; |
8128 | int char_len; |
8129 | |
8130 | char_len = grn_charlen(ctx, current, string_end); |
8131 | if (char_len == 0) { |
8132 | GRN_OBJ_FIN(ctx, &buffer); |
8133 | ERR(GRN_INVALID_ARGUMENT, |
8134 | "%s invalid encoding character: <%.*s|%#x|>" , |
8135 | log_tag, |
8136 | (int)(current - string), string, |
8137 | *current); |
8138 | return ctx->rc; |
8139 | } |
8140 | target = current; |
8141 | current += char_len; |
8142 | |
8143 | if (escaping) { |
8144 | escaping = GRN_FALSE; |
8145 | if (char_len == 1) { |
8146 | switch (*target) { |
8147 | case 'A' : |
8148 | if (nth_char == 0) { |
8149 | target = GRN_TOKENIZER_BEGIN_MARK_UTF8; |
8150 | char_len = GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN; |
8151 | } |
8152 | break; |
8153 | case 'z' : |
8154 | if (current == string_end) { |
8155 | target = GRN_TOKENIZER_END_MARK_UTF8; |
8156 | char_len = GRN_TOKENIZER_END_MARK_UTF8_LEN; |
8157 | } |
8158 | break; |
8159 | default : |
8160 | break; |
8161 | } |
8162 | } |
8163 | } else { |
8164 | if (char_len == 1) { |
8165 | if (*target == '\\') { |
8166 | escaping = GRN_TRUE; |
8167 | continue; |
8168 | } else if (*target == '.' && |
8169 | grn_charlen(ctx, current, string_end) == 1 && |
8170 | *current == '*') { |
8171 | if (GRN_TEXT_LEN(&buffer) > 0) { |
8172 | grn_vector_add_element(ctx, |
8173 | parsed_strings, |
8174 | GRN_TEXT_VALUE(&buffer), |
8175 | GRN_TEXT_LEN(&buffer), |
8176 | 0, |
8177 | GRN_DB_TEXT); |
8178 | GRN_BULK_REWIND(&buffer); |
8179 | } |
8180 | current++; |
8181 | nth_char++; |
8182 | continue; |
8183 | } |
8184 | } |
8185 | } |
8186 | |
8187 | GRN_TEXT_PUT(ctx, &buffer, target, char_len); |
8188 | nth_char++; |
8189 | } |
8190 | if (GRN_TEXT_LEN(&buffer) > 0) { |
8191 | grn_vector_add_element(ctx, |
8192 | parsed_strings, |
8193 | GRN_TEXT_VALUE(&buffer), |
8194 | GRN_TEXT_LEN(&buffer), |
8195 | 0, |
8196 | GRN_DB_TEXT); |
8197 | } |
8198 | GRN_OBJ_FIN(ctx, &buffer); |
8199 | |
8200 | return GRN_SUCCESS; |
8201 | } |
8202 | |
8203 | static grn_rc |
8204 | grn_ii_select_regexp(grn_ctx *ctx, grn_ii *ii, |
8205 | const char *string, unsigned int string_len, |
8206 | grn_hash *s, grn_operator op, grn_select_optarg *optarg) |
8207 | { |
8208 | grn_rc rc; |
8209 | grn_obj parsed_strings; |
8210 | unsigned int n_parsed_strings; |
8211 | |
8212 | GRN_TEXT_INIT(&parsed_strings, GRN_OBJ_VECTOR); |
8213 | rc = grn_ii_parse_regexp_query(ctx, "[ii][select][regexp]" , |
8214 | string, string_len, &parsed_strings); |
8215 | if (rc != GRN_SUCCESS) { |
8216 | GRN_OBJ_FIN(ctx, &parsed_strings); |
8217 | return rc; |
8218 | } |
8219 | |
8220 | if (optarg) { |
8221 | optarg->mode = GRN_OP_EXACT; |
8222 | } |
8223 | |
8224 | n_parsed_strings = grn_vector_size(ctx, &parsed_strings); |
8225 | if (n_parsed_strings == 1) { |
8226 | const char *parsed_string; |
8227 | unsigned int parsed_string_len; |
8228 | parsed_string_len = grn_vector_get_element(ctx, |
8229 | &parsed_strings, |
8230 | 0, |
8231 | &parsed_string, |
8232 | NULL, |
8233 | NULL); |
8234 | rc = grn_ii_select(ctx, ii, |
8235 | parsed_string, |
8236 | parsed_string_len, |
8237 | s, op, optarg); |
8238 | } else { |
8239 | int i; |
8240 | grn_ii_select_cursor **cursors; |
8241 | grn_bool have_error = GRN_FALSE; |
8242 | |
8243 | cursors = GRN_CALLOC(sizeof(grn_ii_select_cursor *) * n_parsed_strings); |
8244 | for (i = 0; i < n_parsed_strings; i++) { |
8245 | const char *parsed_string; |
8246 | unsigned int parsed_string_len; |
8247 | parsed_string_len = grn_vector_get_element(ctx, |
8248 | &parsed_strings, |
8249 | i, |
8250 | &parsed_string, |
8251 | NULL, |
8252 | NULL); |
8253 | cursors[i] = grn_ii_select_cursor_open(ctx, |
8254 | ii, |
8255 | parsed_string, |
8256 | parsed_string_len, |
8257 | optarg); |
8258 | if (!cursors[i]) { |
8259 | have_error = GRN_TRUE; |
8260 | break; |
8261 | } |
8262 | } |
8263 | |
8264 | while (!have_error) { |
8265 | grn_ii_select_cursor_posting *posting; |
8266 | uint32_t pos; |
8267 | |
8268 | posting = grn_ii_select_cursor_next(ctx, cursors[0]); |
8269 | if (!posting) { |
8270 | break; |
8271 | } |
8272 | |
8273 | pos = posting->end_pos; |
8274 | for (i = 1; i < n_parsed_strings; i++) { |
8275 | grn_ii_select_cursor_posting *posting_i; |
8276 | |
8277 | for (;;) { |
8278 | posting_i = grn_ii_select_cursor_next(ctx, cursors[i]); |
8279 | if (!posting_i) { |
8280 | break; |
8281 | } |
8282 | |
8283 | if (posting_i->rid == posting->rid && |
8284 | posting_i->sid == posting->sid && |
8285 | posting_i->start_pos > pos) { |
8286 | grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); |
8287 | break; |
8288 | } |
8289 | if (posting_i->rid > posting->rid) { |
8290 | grn_ii_select_cursor_unshift(ctx, cursors[i], posting_i); |
8291 | break; |
8292 | } |
8293 | } |
8294 | |
8295 | if (!posting_i) { |
8296 | break; |
8297 | } |
8298 | |
8299 | if (posting_i->rid != posting->rid || posting_i->sid != posting->sid) { |
8300 | break; |
8301 | } |
8302 | |
8303 | pos = posting_i->end_pos; |
8304 | } |
8305 | |
8306 | if (i == n_parsed_strings) { |
8307 | grn_rset_posinfo pi = {posting->rid, posting->sid, pos}; |
8308 | double record_score = 1.0; |
8309 | res_add(ctx, s, &pi, record_score, op); |
8310 | } |
8311 | } |
8312 | |
8313 | for (i = 0; i < n_parsed_strings; i++) { |
8314 | if (cursors[i]) { |
8315 | grn_ii_select_cursor_close(ctx, cursors[i]); |
8316 | } |
8317 | } |
8318 | GRN_FREE(cursors); |
8319 | } |
8320 | GRN_OBJ_FIN(ctx, &parsed_strings); |
8321 | |
8322 | if (optarg) { |
8323 | optarg->mode = GRN_OP_REGEXP; |
8324 | } |
8325 | |
8326 | return rc; |
8327 | } |
8328 | |
8329 | #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH |
8330 | static grn_bool |
8331 | grn_ii_select_sequential_search_should_use(grn_ctx *ctx, |
8332 | grn_ii *ii, |
8333 | const char *raw_query, |
8334 | unsigned int raw_query_len, |
8335 | grn_hash *result, |
8336 | grn_operator op, |
8337 | grn_wv_mode wvm, |
8338 | grn_select_optarg *optarg, |
8339 | token_info **token_infos, |
8340 | uint32_t n_token_infos, |
8341 | double too_many_index_match_ratio) |
8342 | { |
8343 | int n_sources; |
8344 | |
8345 | if (too_many_index_match_ratio < 0.0) { |
8346 | return GRN_FALSE; |
8347 | } |
8348 | |
8349 | if (op != GRN_OP_AND) { |
8350 | return GRN_FALSE; |
8351 | } |
8352 | |
8353 | if (optarg->mode != GRN_OP_EXACT) { |
8354 | return GRN_FALSE; |
8355 | } |
8356 | |
8357 | n_sources = ii->obj.source_size / sizeof(grn_id); |
8358 | if (n_sources == 0) { |
8359 | return GRN_FALSE; |
8360 | } |
8361 | |
8362 | { |
8363 | uint32_t i; |
8364 | int n_existing_records; |
8365 | |
8366 | n_existing_records = GRN_HASH_SIZE(result); |
8367 | for (i = 0; i < n_token_infos; i++) { |
8368 | token_info *info = token_infos[i]; |
8369 | if (n_existing_records <= (info->size * too_many_index_match_ratio)) { |
8370 | return GRN_TRUE; |
8371 | } |
8372 | } |
8373 | return GRN_FALSE; |
8374 | } |
8375 | } |
8376 | |
8377 | static void |
8378 | grn_ii_select_sequential_search_body(grn_ctx *ctx, |
8379 | grn_ii *ii, |
8380 | grn_obj *normalizer, |
8381 | grn_encoding encoding, |
8382 | OnigRegex regex, |
8383 | grn_hash *result, |
8384 | grn_operator op, |
8385 | grn_wv_mode wvm, |
8386 | grn_select_optarg *optarg) |
8387 | { |
8388 | int i, n_sources; |
8389 | grn_id *source_ids = ii->obj.source; |
8390 | grn_obj buffer; |
8391 | |
8392 | GRN_TEXT_INIT(&buffer, 0); |
8393 | n_sources = ii->obj.source_size / sizeof(grn_id); |
8394 | for (i = 0; i < n_sources; i++) { |
8395 | grn_id source_id = source_ids[i]; |
8396 | grn_obj *source; |
8397 | grn_obj *accessor; |
8398 | |
8399 | source = grn_ctx_at(ctx, source_id); |
8400 | switch (source->header.type) { |
8401 | case GRN_TABLE_HASH_KEY : |
8402 | case GRN_TABLE_PAT_KEY : |
8403 | case GRN_TABLE_DAT_KEY : |
8404 | accessor = grn_obj_column(ctx, |
8405 | (grn_obj *)result, |
8406 | GRN_COLUMN_NAME_KEY, |
8407 | GRN_COLUMN_NAME_KEY_LEN); |
8408 | break; |
8409 | default : |
8410 | { |
8411 | char column_name[GRN_TABLE_MAX_KEY_SIZE]; |
8412 | int column_name_size; |
8413 | column_name_size = grn_column_name(ctx, source, |
8414 | column_name, |
8415 | GRN_TABLE_MAX_KEY_SIZE); |
8416 | accessor = grn_obj_column(ctx, (grn_obj *)result, column_name, |
8417 | column_name_size); |
8418 | } |
8419 | break; |
8420 | } |
8421 | |
8422 | { |
8423 | grn_hash_cursor *cursor; |
8424 | grn_id id; |
8425 | cursor = grn_hash_cursor_open(ctx, result, NULL, 0, NULL, 0, 0, -1, 0); |
8426 | while ((id = grn_hash_cursor_next(ctx, cursor)) != GRN_ID_NIL) { |
8427 | OnigPosition position; |
8428 | grn_obj *value; |
8429 | const char *normalized_value; |
8430 | unsigned int normalized_value_length; |
8431 | |
8432 | GRN_BULK_REWIND(&buffer); |
8433 | grn_obj_get_value(ctx, accessor, id, &buffer); |
8434 | value = grn_string_open_(ctx, |
8435 | GRN_TEXT_VALUE(&buffer), |
8436 | GRN_TEXT_LEN(&buffer), |
8437 | normalizer, 0, encoding); |
8438 | grn_string_get_normalized(ctx, value, |
8439 | &normalized_value, &normalized_value_length, |
8440 | NULL); |
8441 | position = onig_search(regex, |
8442 | normalized_value, |
8443 | normalized_value + normalized_value_length, |
8444 | normalized_value, |
8445 | normalized_value + normalized_value_length, |
8446 | NULL, |
8447 | 0); |
8448 | if (position != ONIG_MISMATCH) { |
8449 | grn_id *record_id; |
8450 | grn_rset_posinfo info; |
8451 | double score; |
8452 | |
8453 | grn_hash_cursor_get_key(ctx, cursor, (void **)&record_id); |
8454 | |
8455 | info.rid = *record_id; |
8456 | info.sid = i + 1; |
8457 | info.pos = 0; |
8458 | score = get_weight(ctx, result, info.rid, info.sid, wvm, optarg); |
8459 | res_add(ctx, result, &info, score, op); |
8460 | } |
8461 | grn_obj_unlink(ctx, value); |
8462 | } |
8463 | grn_hash_cursor_close(ctx, cursor); |
8464 | } |
8465 | grn_obj_unlink(ctx, accessor); |
8466 | } |
8467 | grn_obj_unlink(ctx, &buffer); |
8468 | } |
8469 | |
8470 | static grn_bool |
8471 | grn_ii_select_sequential_search(grn_ctx *ctx, |
8472 | grn_ii *ii, |
8473 | const char *raw_query, |
8474 | unsigned int raw_query_len, |
8475 | grn_hash *result, |
8476 | grn_operator op, |
8477 | grn_wv_mode wvm, |
8478 | grn_select_optarg *optarg, |
8479 | token_info **token_infos, |
8480 | uint32_t n_token_infos) |
8481 | { |
8482 | grn_bool processed = GRN_TRUE; |
8483 | |
8484 | { |
8485 | if (!grn_ii_select_sequential_search_should_use(ctx, |
8486 | ii, |
8487 | raw_query, |
8488 | raw_query_len, |
8489 | result, |
8490 | op, |
8491 | wvm, |
8492 | optarg, |
8493 | token_infos, |
8494 | n_token_infos, |
8495 | grn_ii_select_too_many_index_match_ratio)) { |
8496 | return GRN_FALSE; |
8497 | } |
8498 | } |
8499 | |
8500 | { |
8501 | grn_encoding encoding; |
8502 | grn_obj *normalizer; |
8503 | int nflags = 0; |
8504 | grn_obj *query; |
8505 | const char *normalized_query; |
8506 | unsigned int normalized_query_length; |
8507 | |
8508 | grn_table_get_info(ctx, ii->lexicon, |
8509 | NULL, &encoding, NULL, &normalizer, NULL); |
8510 | query = grn_string_open_(ctx, raw_query, raw_query_len, |
8511 | normalizer, nflags, encoding); |
8512 | grn_string_get_normalized(ctx, query, |
8513 | &normalized_query, &normalized_query_length, |
8514 | NULL); |
8515 | { |
8516 | OnigRegex regex; |
8517 | int onig_result; |
8518 | OnigErrorInfo error_info; |
8519 | onig_result = onig_new(®ex, |
8520 | normalized_query, |
8521 | normalized_query + normalized_query_length, |
8522 | ONIG_OPTION_NONE, |
8523 | ONIG_ENCODING_UTF8, |
8524 | ONIG_SYNTAX_ASIS, |
8525 | &error_info); |
8526 | if (onig_result == ONIG_NORMAL) { |
8527 | grn_ii_select_sequential_search_body(ctx, ii, normalizer, encoding, |
8528 | regex, result, op, wvm, optarg); |
8529 | onig_free(regex); |
8530 | } else { |
8531 | char message[ONIG_MAX_ERROR_MESSAGE_LEN]; |
8532 | onig_error_code_to_str(message, onig_result, error_info); |
8533 | GRN_LOG(ctx, GRN_LOG_WARNING, |
8534 | "[ii][select][sequential] " |
8535 | "failed to create regular expression object: %s" , |
8536 | message); |
8537 | processed = GRN_FALSE; |
8538 | } |
8539 | } |
8540 | grn_obj_unlink(ctx, query); |
8541 | } |
8542 | |
8543 | return processed; |
8544 | } |
8545 | #endif |
8546 | |
8547 | grn_rc |
8548 | grn_ii_select(grn_ctx *ctx, grn_ii *ii, |
8549 | const char *string, unsigned int string_len, |
8550 | grn_hash *s, grn_operator op, grn_select_optarg *optarg) |
8551 | { |
8552 | btr *bt = NULL; |
8553 | grn_rc rc = GRN_SUCCESS; |
8554 | int rep, orp, weight, max_interval = 0; |
8555 | token_info *ti, **tis = NULL, **tip, **tie; |
8556 | uint32_t n = 0, rid, sid, nrid, nsid; |
8557 | grn_bool only_skip_token = GRN_FALSE; |
8558 | grn_operator mode = GRN_OP_EXACT; |
8559 | grn_wv_mode wvm = grn_wv_none; |
8560 | grn_obj *lexicon = ii->lexicon; |
8561 | grn_scorer_score_func *score_func = NULL; |
8562 | grn_scorer_matched_record record; |
8563 | grn_id previous_min = GRN_ID_NIL; |
8564 | grn_id current_min = GRN_ID_NIL; |
8565 | grn_bool set_min_enable_for_and_query = GRN_FALSE; |
8566 | |
8567 | if (!lexicon || !ii || !s) { return GRN_INVALID_ARGUMENT; } |
8568 | if (optarg) { |
8569 | mode = optarg->mode; |
8570 | if (optarg->func) { |
8571 | wvm = grn_wv_dynamic; |
8572 | } else if (optarg->vector_size) { |
8573 | wvm = optarg->weight_vector ? grn_wv_static : grn_wv_constant; |
8574 | } |
8575 | if (optarg->match_info) { |
8576 | if (optarg->match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { |
8577 | previous_min = optarg->match_info->min; |
8578 | set_min_enable_for_and_query = GRN_TRUE; |
8579 | } |
8580 | } |
8581 | } |
8582 | if (mode == GRN_OP_SIMILAR) { |
8583 | return grn_ii_similar_search(ctx, ii, string, string_len, s, op, optarg); |
8584 | } |
8585 | if (mode == GRN_OP_TERM_EXTRACT) { |
8586 | return grn_ii_term_extract(ctx, ii, string, string_len, s, op, optarg); |
8587 | } |
8588 | if (mode == GRN_OP_REGEXP) { |
8589 | return grn_ii_select_regexp(ctx, ii, string, string_len, s, op, optarg); |
8590 | } |
8591 | /* todo : support subrec |
8592 | rep = (s->record_unit == grn_rec_position || s->subrec_unit == grn_rec_position); |
8593 | orp = (s->record_unit == grn_rec_position || op == GRN_OP_OR); |
8594 | */ |
8595 | rep = 0; |
8596 | orp = op == GRN_OP_OR; |
8597 | if (!string_len) { goto exit; } |
8598 | if (!(tis = GRN_MALLOC(sizeof(token_info *) * string_len * 2))) { |
8599 | return GRN_NO_MEMORY_AVAILABLE; |
8600 | } |
8601 | if (mode == GRN_OP_FUZZY) { |
8602 | if (token_info_build_fuzzy(ctx, lexicon, ii, string, string_len, |
8603 | tis, &n, &only_skip_token, previous_min, |
8604 | mode, &(optarg->fuzzy)) || |
8605 | !n) { |
8606 | goto exit; |
8607 | } |
8608 | } else { |
8609 | if (token_info_build(ctx, lexicon, ii, string, string_len, |
8610 | tis, &n, &only_skip_token, previous_min, mode) || |
8611 | !n) { |
8612 | goto exit; |
8613 | } |
8614 | } |
8615 | switch (mode) { |
8616 | case GRN_OP_NEAR2 : |
8617 | token_info_clear_offset(tis, n); |
8618 | mode = GRN_OP_NEAR; |
8619 | /* fallthru */ |
8620 | case GRN_OP_NEAR : |
8621 | if (!(bt = bt_open(ctx, n))) { rc = GRN_NO_MEMORY_AVAILABLE; goto exit; } |
8622 | max_interval = optarg->max_interval; |
8623 | break; |
8624 | default : |
8625 | break; |
8626 | } |
8627 | qsort(tis, n, sizeof(token_info *), token_compare); |
8628 | tie = tis + n; |
8629 | /* |
8630 | for (tip = tis; tip < tie; tip++) { |
8631 | ti = *tip; |
8632 | grn_log("o=%d n=%d s=%d r=%d", ti->offset, ti->ntoken, ti->size, ti->rid); |
8633 | } |
8634 | */ |
8635 | GRN_LOG(ctx, GRN_LOG_INFO, "n=%d (%.*s)" , n, string_len, string); |
8636 | /* todo : array as result |
8637 | if (n == 1 && (*tis)->cursors->n_entries == 1 && op == GRN_OP_OR |
8638 | && !GRN_HASH_SIZE(s) && !s->garbages |
8639 | && s->record_unit == grn_rec_document && !s->max_n_subrecs |
8640 | && grn_ii_max_section(ii) == 1) { |
8641 | grn_ii_cursor *c = (*tis)->cursors->bins[0]; |
8642 | if ((rc = grn_hash_array_init(s, (*tis)->size + 32768))) { goto exit; } |
8643 | do { |
8644 | grn_rset_recinfo *ri; |
8645 | grn_posting *p = c->post; |
8646 | if ((weight = get_weight(ctx, s, p->rid, p->sid, wvm, optarg))) { |
8647 | GRN_HASH_INT_ADD(s, p, ri); |
8648 | ri->score = (p->tf + p->score) * weight; |
8649 | ri->n_subrecs = 1; |
8650 | } |
8651 | } while (grn_ii_cursor_next(ctx, c)); |
8652 | goto exit; |
8653 | } |
8654 | */ |
8655 | #ifdef GRN_II_SELECT_ENABLE_SEQUENTIAL_SEARCH |
8656 | if (grn_ii_select_sequential_search(ctx, ii, string, string_len, |
8657 | s, op, wvm, optarg, tis, n)) { |
8658 | goto exit; |
8659 | } |
8660 | #endif |
8661 | |
8662 | if (optarg && optarg->scorer) { |
8663 | grn_proc *scorer = (grn_proc *)(optarg->scorer); |
8664 | score_func = scorer->callbacks.scorer.score; |
8665 | record.table = grn_ctx_at(ctx, s->obj.header.domain); |
8666 | record.lexicon = lexicon; |
8667 | record.id = GRN_ID_NIL; |
8668 | GRN_RECORD_INIT(&(record.terms), GRN_OBJ_VECTOR, lexicon->header.domain); |
8669 | GRN_UINT32_INIT(&(record.term_weights), GRN_OBJ_VECTOR); |
8670 | record.total_term_weights = 0; |
8671 | record.n_documents = grn_table_size(ctx, record.table); |
8672 | record.n_occurrences = 0; |
8673 | record.n_candidates = 0; |
8674 | record.n_tokens = 0; |
8675 | record.weight = 0; |
8676 | record.args_expr = optarg->scorer_args_expr; |
8677 | record.args_expr_offset = optarg->scorer_args_expr_offset; |
8678 | } |
8679 | |
8680 | for (;;) { |
8681 | rid = (*tis)->p->rid; |
8682 | sid = (*tis)->p->sid; |
8683 | for (tip = tis + 1, nrid = rid, nsid = sid + 1; tip < tie; tip++) { |
8684 | ti = *tip; |
8685 | if (token_info_skip(ctx, ti, rid, sid)) { goto exit; } |
8686 | if (ti->p->rid != rid || ti->p->sid != sid) { |
8687 | nrid = ti->p->rid; |
8688 | nsid = ti->p->sid; |
8689 | break; |
8690 | } |
8691 | } |
8692 | weight = get_weight(ctx, s, rid, sid, wvm, optarg); |
8693 | if (tip == tie && weight != 0) { |
8694 | grn_rset_posinfo pi = {rid, sid, 0}; |
8695 | if (orp || grn_hash_get(ctx, s, &pi, s->key_size, NULL)) { |
8696 | int count = 0, noccur = 0, pos = 0, score = 0, tscore = 0, min, max; |
8697 | |
8698 | if (score_func) { |
8699 | GRN_BULK_REWIND(&(record.terms)); |
8700 | GRN_BULK_REWIND(&(record.term_weights)); |
8701 | record.n_candidates = 0; |
8702 | record.n_tokens = 0; |
8703 | } |
8704 | |
8705 | #define SKIP_OR_BREAK(pos) {\ |
8706 | if (token_info_skip_pos(ctx, ti, rid, sid, pos)) { break; } \ |
8707 | if (ti->p->rid != rid || ti->p->sid != sid) { \ |
8708 | nrid = ti->p->rid; \ |
8709 | nsid = ti->p->sid; \ |
8710 | break; \ |
8711 | } \ |
8712 | } |
8713 | if (n == 1 && !rep) { |
8714 | noccur = (*tis)->p->tf; |
8715 | tscore = (*tis)->p->weight + (*tis)->cursors->bins[0]->weight; |
8716 | if (score_func) { |
8717 | GRN_RECORD_PUT(ctx, &(record.terms), (*tis)->cursors->bins[0]->id); |
8718 | GRN_UINT32_PUT(ctx, &(record.term_weights), tscore); |
8719 | record.n_occurrences = noccur; |
8720 | record.n_candidates = (*tis)->size; |
8721 | record.n_tokens = (*tis)->ntoken; |
8722 | } |
8723 | } else if (mode == GRN_OP_NEAR) { |
8724 | bt_zap(bt); |
8725 | for (tip = tis; tip < tie; tip++) { |
8726 | ti = *tip; |
8727 | SKIP_OR_BREAK(pos); |
8728 | bt_push(bt, ti); |
8729 | } |
8730 | if (tip == tie) { |
8731 | for (;;) { |
8732 | ti = bt->min; min = ti->pos; max = bt->max->pos; |
8733 | if (min > max) { |
8734 | char ii_name[GRN_TABLE_MAX_KEY_SIZE]; |
8735 | int ii_name_size; |
8736 | ii_name_size = grn_obj_name(ctx, (grn_obj *)ii, ii_name, |
8737 | GRN_TABLE_MAX_KEY_SIZE); |
8738 | ERR(GRN_FILE_CORRUPT, |
8739 | "[ii][select][near] " |
8740 | "max position must be larger than min position: " |
8741 | "min:<%d> max:<%d> ii:<%.*s> string:<%.*s>" , |
8742 | min, max, |
8743 | ii_name_size, ii_name, |
8744 | string_len, string); |
8745 | rc = ctx->rc; |
8746 | goto exit; |
8747 | } |
8748 | if ((max_interval < 0) || (max - min <= max_interval)) { |
8749 | if (rep) { pi.pos = min; res_add(ctx, s, &pi, weight, op); } |
8750 | noccur++; |
8751 | if (ti->pos == max + 1) { |
8752 | break; |
8753 | } |
8754 | SKIP_OR_BREAK(max + 1); |
8755 | } else { |
8756 | if (ti->pos == max - max_interval) { |
8757 | break; |
8758 | } |
8759 | SKIP_OR_BREAK(max - max_interval); |
8760 | } |
8761 | bt_pop(bt); |
8762 | } |
8763 | } |
8764 | } else { |
8765 | for (tip = tis; ; tip++) { |
8766 | if (tip == tie) { tip = tis; } |
8767 | ti = *tip; |
8768 | SKIP_OR_BREAK(pos); |
8769 | if (ti->pos == pos) { |
8770 | score += ti->p->weight + ti->cursors->bins[0]->weight; count++; |
8771 | } else { |
8772 | score = ti->p->weight + ti->cursors->bins[0]->weight; count = 1; |
8773 | pos = ti->pos; |
8774 | if (noccur == 0 && score_func) { |
8775 | GRN_BULK_REWIND(&(record.terms)); |
8776 | GRN_BULK_REWIND(&(record.term_weights)); |
8777 | record.n_candidates = 0; |
8778 | record.n_tokens = 0; |
8779 | } |
8780 | } |
8781 | if (noccur == 0 && score_func) { |
8782 | GRN_RECORD_PUT(ctx, &(record.terms), ti->cursors->bins[0]->id); |
8783 | GRN_UINT32_PUT(ctx, &(record.term_weights), |
8784 | ti->p->weight + ti->cursors->bins[0]->weight); |
8785 | record.n_candidates += ti->size; |
8786 | record.n_tokens += ti->ntoken; |
8787 | } |
8788 | if (count == n) { |
8789 | if (rep) { |
8790 | pi.pos = pos; res_add(ctx, s, &pi, (score + 1) * weight, op); |
8791 | } |
8792 | tscore += score; |
8793 | score = 0; count = 0; pos++; |
8794 | noccur++; |
8795 | } |
8796 | } |
8797 | } |
8798 | if (noccur && !rep) { |
8799 | double record_score; |
8800 | if (score_func) { |
8801 | record.id = rid; |
8802 | record.weight = weight; |
8803 | record.n_occurrences = noccur; |
8804 | record.total_term_weights = tscore; |
8805 | record_score = score_func(ctx, &record) * weight; |
8806 | } else { |
8807 | record_score = (noccur + tscore) * weight; |
8808 | } |
8809 | if (set_min_enable_for_and_query) { |
8810 | if (current_min == GRN_ID_NIL) { |
8811 | current_min = rid; |
8812 | } |
8813 | } |
8814 | res_add(ctx, s, &pi, record_score, op); |
8815 | } |
8816 | #undef SKIP_OR_BREAK |
8817 | } |
8818 | } |
8819 | if (token_info_skip(ctx, *tis, nrid, nsid)) { goto exit; } |
8820 | } |
8821 | exit : |
8822 | if (score_func) { |
8823 | GRN_OBJ_FIN(ctx, &(record.terms)); |
8824 | GRN_OBJ_FIN(ctx, &(record.term_weights)); |
8825 | } |
8826 | |
8827 | if (set_min_enable_for_and_query) { |
8828 | if (current_min > previous_min) { |
8829 | optarg->match_info->min = current_min; |
8830 | } |
8831 | } |
8832 | |
8833 | for (tip = tis; tip < tis + n; tip++) { |
8834 | if (*tip) { token_info_close(ctx, *tip); } |
8835 | } |
8836 | if (tis) { GRN_FREE(tis); } |
8837 | if (!only_skip_token) { |
8838 | grn_ii_resolve_sel_and(ctx, s, op); |
8839 | } |
8840 | // grn_hash_cursor_clear(r); |
8841 | bt_close(ctx, bt); |
8842 | #ifdef DEBUG |
8843 | { |
8844 | uint32_t segno = GRN_II_MAX_LSEG, nnref = 0; |
8845 | grn_io_mapinfo *info = ii->seg->maps; |
8846 | for (; segno; segno--, info++) { if (info->nref) { nnref++; } } |
8847 | GRN_LOG(ctx, GRN_LOG_INFO, "nnref=%d" , nnref); |
8848 | } |
8849 | #endif /* DEBUG */ |
8850 | return rc; |
8851 | } |
8852 | |
8853 | static uint32_t |
8854 | grn_ii_estimate_size_for_query_regexp(grn_ctx *ctx, grn_ii *ii, |
8855 | const char *query, unsigned int query_len, |
8856 | grn_search_optarg *optarg) |
8857 | { |
8858 | grn_rc rc; |
8859 | grn_obj parsed_query; |
8860 | uint32_t size; |
8861 | |
8862 | GRN_TEXT_INIT(&parsed_query, 0); |
8863 | rc = grn_ii_parse_regexp_query(ctx, "[ii][estimate-size][query][regexp]" , |
8864 | query, query_len, &parsed_query); |
8865 | if (rc != GRN_SUCCESS) { |
8866 | GRN_OBJ_FIN(ctx, &parsed_query); |
8867 | return 0; |
8868 | } |
8869 | |
8870 | if (optarg) { |
8871 | optarg->mode = GRN_OP_EXACT; |
8872 | } |
8873 | |
8874 | size = grn_ii_estimate_size_for_query(ctx, ii, |
8875 | GRN_TEXT_VALUE(&parsed_query), |
8876 | GRN_TEXT_LEN(&parsed_query), |
8877 | optarg); |
8878 | GRN_OBJ_FIN(ctx, &parsed_query); |
8879 | |
8880 | if (optarg) { |
8881 | optarg->mode = GRN_OP_REGEXP; |
8882 | } |
8883 | |
8884 | return size; |
8885 | } |
8886 | |
8887 | uint32_t |
8888 | grn_ii_estimate_size_for_query(grn_ctx *ctx, grn_ii *ii, |
8889 | const char *query, unsigned int query_len, |
8890 | grn_search_optarg *optarg) |
8891 | { |
8892 | grn_rc rc; |
8893 | grn_obj *lexicon = ii->lexicon; |
8894 | token_info **tis = NULL; |
8895 | uint32_t i; |
8896 | uint32_t n_tis = 0; |
8897 | grn_bool only_skip_token = GRN_FALSE; |
8898 | grn_operator mode = GRN_OP_EXACT; |
8899 | double estimated_size = 0; |
8900 | double normalized_ratio = 1.0; |
8901 | grn_id min = GRN_ID_NIL; |
8902 | |
8903 | if (query_len == 0) { |
8904 | return 0; |
8905 | } |
8906 | |
8907 | if (optarg) { |
8908 | switch (optarg->mode) { |
8909 | case GRN_OP_NEAR : |
8910 | case GRN_OP_NEAR2 : |
8911 | mode = optarg->mode; |
8912 | break; |
8913 | case GRN_OP_SIMILAR : |
8914 | mode = optarg->mode; |
8915 | break; |
8916 | case GRN_OP_REGEXP : |
8917 | mode = optarg->mode; |
8918 | break; |
8919 | case GRN_OP_FUZZY : |
8920 | mode = optarg->mode; |
8921 | default : |
8922 | break; |
8923 | } |
8924 | if (optarg->match_info.flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { |
8925 | min = optarg->match_info.min; |
8926 | } |
8927 | } |
8928 | |
8929 | if (mode == GRN_OP_REGEXP) { |
8930 | return grn_ii_estimate_size_for_query_regexp(ctx, ii, query, query_len, |
8931 | optarg); |
8932 | } |
8933 | |
8934 | tis = GRN_MALLOC(sizeof(token_info *) * query_len * 2); |
8935 | if (!tis) { |
8936 | return 0; |
8937 | } |
8938 | |
8939 | switch (mode) { |
8940 | case GRN_OP_FUZZY : |
8941 | rc = token_info_build_fuzzy(ctx, lexicon, ii, query, query_len, |
8942 | tis, &n_tis, &only_skip_token, min, |
8943 | mode, &(optarg->fuzzy)); |
8944 | break; |
8945 | default : |
8946 | rc = token_info_build(ctx, lexicon, ii, query, query_len, |
8947 | tis, &n_tis, &only_skip_token, min, mode); |
8948 | break; |
8949 | } |
8950 | |
8951 | if (rc != GRN_SUCCESS) { |
8952 | goto exit; |
8953 | } |
8954 | |
8955 | for (i = 0; i < n_tis; i++) { |
8956 | token_info *ti = tis[i]; |
8957 | double term_estimated_size; |
8958 | term_estimated_size = ((double)ti->size / ti->ntoken); |
8959 | if (i == 0) { |
8960 | estimated_size = term_estimated_size; |
8961 | } else { |
8962 | if (term_estimated_size < estimated_size) { |
8963 | estimated_size = term_estimated_size; |
8964 | } |
8965 | normalized_ratio *= grn_ii_estimate_size_for_query_reduce_ratio; |
8966 | } |
8967 | } |
8968 | |
8969 | estimated_size *= normalized_ratio; |
8970 | if (estimated_size > 0.0 && estimated_size < 1.0) { |
8971 | estimated_size = 1.0; |
8972 | } |
8973 | |
8974 | exit : |
8975 | for (i = 0; i < n_tis; i++) { |
8976 | token_info *ti = tis[i]; |
8977 | if (ti) { |
8978 | token_info_close(ctx, ti); |
8979 | } |
8980 | } |
8981 | if (tis) { |
8982 | GRN_FREE(tis); |
8983 | } |
8984 | |
8985 | return estimated_size; |
8986 | } |
8987 | |
8988 | uint32_t |
8989 | grn_ii_estimate_size_for_lexicon_cursor(grn_ctx *ctx, grn_ii *ii, |
8990 | grn_table_cursor *lexicon_cursor) |
8991 | { |
8992 | grn_id term_id; |
8993 | uint32_t estimated_size = 0; |
8994 | |
8995 | while ((term_id = grn_table_cursor_next(ctx, lexicon_cursor)) != GRN_ID_NIL) { |
8996 | uint32_t term_estimated_size; |
8997 | term_estimated_size = grn_ii_estimate_size(ctx, ii, term_id); |
8998 | estimated_size += term_estimated_size; |
8999 | } |
9000 | |
9001 | return estimated_size; |
9002 | } |
9003 | |
9004 | grn_rc |
9005 | grn_ii_sel(grn_ctx *ctx, grn_ii *ii, const char *string, unsigned int string_len, |
9006 | grn_hash *s, grn_operator op, grn_search_optarg *optarg) |
9007 | { |
9008 | ERRCLR(ctx); |
9009 | GRN_LOG(ctx, GRN_LOG_INFO, "grn_ii_sel > (%.*s)" , string_len, string); |
9010 | { |
9011 | grn_select_optarg arg; |
9012 | if (!s) { return GRN_INVALID_ARGUMENT; } |
9013 | memset(&arg, 0, sizeof(grn_select_optarg)); |
9014 | arg.mode = GRN_OP_EXACT; |
9015 | if (optarg) { |
9016 | switch (optarg->mode) { |
9017 | case GRN_OP_NEAR : |
9018 | case GRN_OP_NEAR2 : |
9019 | arg.mode = optarg->mode; |
9020 | arg.max_interval = optarg->max_interval; |
9021 | break; |
9022 | case GRN_OP_SIMILAR : |
9023 | arg.mode = optarg->mode; |
9024 | arg.similarity_threshold = optarg->similarity_threshold; |
9025 | break; |
9026 | case GRN_OP_REGEXP : |
9027 | arg.mode = optarg->mode; |
9028 | break; |
9029 | case GRN_OP_FUZZY : |
9030 | arg.mode = optarg->mode; |
9031 | arg.fuzzy = optarg->fuzzy; |
9032 | break; |
9033 | default : |
9034 | break; |
9035 | } |
9036 | if (optarg->vector_size != 0) { |
9037 | arg.weight_vector = optarg->weight_vector; |
9038 | arg.vector_size = optarg->vector_size; |
9039 | } |
9040 | arg.scorer = optarg->scorer; |
9041 | arg.scorer_args_expr = optarg->scorer_args_expr; |
9042 | arg.scorer_args_expr_offset = optarg->scorer_args_expr_offset; |
9043 | arg.match_info = &(optarg->match_info); |
9044 | } |
9045 | /* todo : support subrec |
9046 | grn_rset_init(ctx, s, grn_rec_document, 0, grn_rec_none, 0, 0); |
9047 | */ |
9048 | if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { |
9049 | GRN_LOG(ctx, GRN_LOG_ERROR, "grn_ii_select on grn_ii_sel(1) failed !" ); |
9050 | return ctx->rc; |
9051 | } |
9052 | GRN_LOG(ctx, GRN_LOG_INFO, "exact: %d" , GRN_HASH_SIZE(s)); |
9053 | if (op == GRN_OP_OR) { |
9054 | grn_id min = GRN_ID_NIL; |
9055 | if ((int64_t)GRN_HASH_SIZE(s) <= ctx->impl->match_escalation_threshold) { |
9056 | arg.mode = GRN_OP_UNSPLIT; |
9057 | if (arg.match_info) { |
9058 | if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { |
9059 | min = arg.match_info->min; |
9060 | arg.match_info->min = GRN_ID_NIL; |
9061 | } |
9062 | } |
9063 | if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { |
9064 | GRN_LOG(ctx, GRN_LOG_ERROR, |
9065 | "grn_ii_select on grn_ii_sel(2) failed !" ); |
9066 | return ctx->rc; |
9067 | } |
9068 | GRN_LOG(ctx, GRN_LOG_INFO, "unsplit: %d" , GRN_HASH_SIZE(s)); |
9069 | if (arg.match_info) { |
9070 | if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { |
9071 | if (min > GRN_ID_NIL && min < arg.match_info->min) { |
9072 | arg.match_info->min = min; |
9073 | } |
9074 | } |
9075 | } |
9076 | } |
9077 | if ((int64_t)GRN_HASH_SIZE(s) <= ctx->impl->match_escalation_threshold) { |
9078 | arg.mode = GRN_OP_PARTIAL; |
9079 | if (arg.match_info) { |
9080 | if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { |
9081 | min = arg.match_info->min; |
9082 | arg.match_info->min = GRN_ID_NIL; |
9083 | } |
9084 | } |
9085 | if (grn_ii_select(ctx, ii, string, string_len, s, op, &arg)) { |
9086 | GRN_LOG(ctx, GRN_LOG_ERROR, |
9087 | "grn_ii_select on grn_ii_sel(3) failed !" ); |
9088 | return ctx->rc; |
9089 | } |
9090 | GRN_LOG(ctx, GRN_LOG_INFO, "partial: %d" , GRN_HASH_SIZE(s)); |
9091 | if (arg.match_info) { |
9092 | if (arg.match_info->flags & GRN_MATCH_INFO_GET_MIN_RECORD_ID) { |
9093 | if (min > GRN_ID_NIL && min < arg.match_info->min) { |
9094 | arg.match_info->min = min; |
9095 | } |
9096 | } |
9097 | } |
9098 | } |
9099 | } |
9100 | GRN_LOG(ctx, GRN_LOG_INFO, "hits=%d" , GRN_HASH_SIZE(s)); |
9101 | return GRN_SUCCESS; |
9102 | } |
9103 | } |
9104 | |
9105 | grn_rc |
9106 | grn_ii_at(grn_ctx *ctx, grn_ii *ii, grn_id id, grn_hash *s, grn_operator op) |
9107 | { |
9108 | int rep = 0; |
9109 | grn_ii_cursor *c; |
9110 | grn_posting *pos; |
9111 | if ((c = grn_ii_cursor_open(ctx, ii, id, GRN_ID_NIL, GRN_ID_MAX, |
9112 | rep ? ii->n_elements : ii->n_elements - 1, 0))) { |
9113 | while ((pos = grn_ii_cursor_next(ctx, c))) { |
9114 | res_add(ctx, s, (grn_rset_posinfo *) pos, (1 + pos->weight), op); |
9115 | } |
9116 | grn_ii_cursor_close(ctx, c); |
9117 | } |
9118 | return ctx->rc; |
9119 | } |
9120 | |
9121 | void |
9122 | grn_ii_resolve_sel_and(grn_ctx *ctx, grn_hash *s, grn_operator op) |
9123 | { |
9124 | if (op == GRN_OP_AND |
9125 | && !(ctx->flags & GRN_CTX_TEMPORARY_DISABLE_II_RESOLVE_SEL_AND)) { |
9126 | grn_id eid; |
9127 | grn_rset_recinfo *ri; |
9128 | grn_hash_cursor *c = grn_hash_cursor_open(ctx, s, NULL, 0, NULL, 0, |
9129 | 0, -1, 0); |
9130 | if (c) { |
9131 | while ((eid = grn_hash_cursor_next(ctx, c))) { |
9132 | grn_hash_cursor_get_value(ctx, c, (void **) &ri); |
9133 | if ((ri->n_subrecs & GRN_RSET_UTIL_BIT)) { |
9134 | ri->n_subrecs &= ~GRN_RSET_UTIL_BIT; |
9135 | } else { |
9136 | grn_hash_delete_by_id(ctx, s, eid, NULL); |
9137 | } |
9138 | } |
9139 | grn_hash_cursor_close(ctx, c); |
9140 | } |
9141 | } |
9142 | } |
9143 | |
9144 | void |
9145 | grn_ii_cursor_inspect(grn_ctx *ctx, grn_ii_cursor *c, grn_obj *buf) |
9146 | { |
9147 | grn_obj key_buf; |
9148 | char key[GRN_TABLE_MAX_KEY_SIZE]; |
9149 | int key_size; |
9150 | int i = 0; |
9151 | grn_ii_cursor_next_options options = { |
9152 | .include_garbage = GRN_TRUE |
9153 | }; |
9154 | |
9155 | GRN_TEXT_PUTS(ctx, buf, " #<" ); |
9156 | key_size = grn_table_get_key(ctx, c->ii->lexicon, c->id, |
9157 | key, GRN_TABLE_MAX_KEY_SIZE); |
9158 | GRN_OBJ_INIT(&key_buf, GRN_BULK, 0, c->ii->lexicon->header.domain); |
9159 | GRN_TEXT_SET(ctx, &key_buf, key, key_size); |
9160 | grn_inspect(ctx, buf, &key_buf); |
9161 | GRN_OBJ_FIN(ctx, &key_buf); |
9162 | |
9163 | GRN_TEXT_PUTS(ctx, buf, "\n elements:[\n " ); |
9164 | while (grn_ii_cursor_next_internal(ctx, c, &options)) { |
9165 | grn_posting *pos = c->post; |
9166 | if (i > 0) { |
9167 | GRN_TEXT_PUTS(ctx, buf, ",\n " ); |
9168 | } |
9169 | i++; |
9170 | GRN_TEXT_PUTS(ctx, buf, "{status:" ); |
9171 | if (pos->tf && pos->sid) { |
9172 | GRN_TEXT_PUTS(ctx, buf, "available" ); |
9173 | } else { |
9174 | GRN_TEXT_PUTS(ctx, buf, "garbage" ); |
9175 | } |
9176 | GRN_TEXT_PUTS(ctx, buf, ", rid:" ); |
9177 | grn_text_lltoa(ctx, buf, pos->rid); |
9178 | GRN_TEXT_PUTS(ctx, buf, ", sid:" ); |
9179 | grn_text_lltoa(ctx, buf, pos->sid); |
9180 | GRN_TEXT_PUTS(ctx, buf, ", pos:" ); |
9181 | grn_text_lltoa(ctx, buf, pos->pos); |
9182 | GRN_TEXT_PUTS(ctx, buf, ", tf:" ); |
9183 | grn_text_lltoa(ctx, buf, pos->tf); |
9184 | GRN_TEXT_PUTS(ctx, buf, ", weight:" ); |
9185 | grn_text_lltoa(ctx, buf, pos->weight); |
9186 | GRN_TEXT_PUTS(ctx, buf, ", rest:" ); |
9187 | grn_text_lltoa(ctx, buf, pos->rest); |
9188 | GRN_TEXT_PUTS(ctx, buf, "}" ); |
9189 | } |
9190 | GRN_TEXT_PUTS(ctx, buf, "\n ]\n >" ); |
9191 | } |
9192 | |
9193 | void |
9194 | grn_ii_inspect_values(grn_ctx *ctx, grn_ii *ii, grn_obj *buf) |
9195 | { |
9196 | grn_table_cursor *tc; |
9197 | GRN_TEXT_PUTS(ctx, buf, "[" ); |
9198 | if ((tc = grn_table_cursor_open(ctx, ii->lexicon, NULL, 0, NULL, 0, 0, -1, |
9199 | GRN_CURSOR_ASCENDING))) { |
9200 | int i = 0; |
9201 | grn_id tid; |
9202 | grn_ii_cursor *c; |
9203 | while ((tid = grn_table_cursor_next(ctx, tc))) { |
9204 | if (i > 0) { |
9205 | GRN_TEXT_PUTS(ctx, buf, "," ); |
9206 | } |
9207 | i++; |
9208 | GRN_TEXT_PUTS(ctx, buf, "\n" ); |
9209 | if ((c = grn_ii_cursor_open(ctx, ii, tid, GRN_ID_NIL, GRN_ID_MAX, |
9210 | ii->n_elements, |
9211 | GRN_OBJ_WITH_POSITION|GRN_OBJ_WITH_SECTION))) { |
9212 | grn_ii_cursor_inspect(ctx, c, buf); |
9213 | grn_ii_cursor_close(ctx, c); |
9214 | } |
9215 | } |
9216 | grn_table_cursor_close(ctx, tc); |
9217 | } |
9218 | GRN_TEXT_PUTS(ctx, buf, "]" ); |
9219 | } |
9220 | |
9221 | /********************** buffered index builder ***********************/ |
9222 | |
/*
 * Entries stored in the block buffer are tagged by their two highest bits
 * (II_BUFFER_TYPE_MASK). As decoded by encode_postings():
 *   II_BUFFER_TYPE_RID    - the entry carries a record ID
 *   II_BUFFER_TYPE_WEIGHT - the entry carries a weight
 *   neither bit set       - the entry is a term ID of the temporary lexicon
 */
const grn_id II_BUFFER_TYPE_MASK = 0xc0000000;
#define II_BUFFER_TYPE_RID 0x80000000
#define II_BUFFER_TYPE_WEIGHT 0x40000000
/* Extracts the type tag of an entry. */
#define II_BUFFER_TYPE(id) (((id) & II_BUFFER_TYPE_MASK))
/* Adds a type tag to a value. */
#define II_BUFFER_PACK(value, type) ((value) | (type))
/* Removes a type tag from an entry. */
#define II_BUFFER_UNPACK(id, type) ((id) & ~(type))
/* Cursor order used when encode_terms() walks the temporary lexicon. */
#define II_BUFFER_ORDER GRN_CURSOR_BY_KEY
/* NOTE(review): the users of the next four constants are outside this
 * part of the file; their meaning is presumably what the names say -
 * confirm at the use sites. */
const uint16_t II_BUFFER_NTERMS_PER_BUFFER = 16380;
const uint32_t II_BUFFER_PACKED_BUF_SIZE = 0x4000000;
const char *TMPFILE_PATH = "grn_ii_buffer_tmp";
const uint32_t II_BUFFER_NCOUNTERS_MARGIN = 0x100000;
/* NOTE(review): presumably the size of ii_buffer->block_buf - confirm. */
const size_t II_BUFFER_BLOCK_SIZE = 0x1000000;
/* Size threshold after which allocate_outbuf() and encode_terms() start a
 * new unit inside a block. */
const uint32_t II_BUFFER_BLOCK_READ_UNIT_SIZE = 0x200000;
9236 | |
/*
 * ii_buffer_value describes one value kept by the index builder: its
 * section ID and weight, the address/length of the value, and a buffer
 * with its capacity.
 * NOTE(review): presumably buf is owned storage that p may point into -
 * confirm against the code that fills these fields.
 */
typedef struct {
  unsigned int sid; /* Section ID */
  unsigned int weight; /* Weight */
  const char *p; /* Value address */
  uint32_t len; /* Value length */
  char *buf; /* Buffer address */
  uint32_t cap; /* Buffer size */
} ii_buffer_value;
9245 | |
/*
 * ii_buffer_counter is associated with a combination of a block and a term.
 * The counter of term ID tid of the temporary lexicon is stored at
 * ii_buffer->counters[tid - 1].
 */
typedef struct {
  uint32_t nrecs; /* Number of records or sections */
  uint32_t nposts; /* Number of occurrences */

  /* Information of the last value */
  grn_id last_rid; /* Record ID */
  uint32_t last_sid; /* Section ID */
  uint32_t last_tf; /* Term frequency */
  uint32_t last_weight; /* Total weight */
  uint32_t last_pos; /* Token position */

  /* Meaning of offset_* is different before/after encoding. */
  /* Before encoding: size in encoded sequence */
  /* After encoding: Offset in encoded sequence */
  /* encode_terms() performs this switch from sizes to offsets in outbuf. */
  uint32_t offset_rid; /* Record ID */
  uint32_t offset_sid; /* Section ID */
  uint32_t offset_tf; /* Term frequency */
  uint32_t offset_weight; /* Weight */
  uint32_t offset_pos; /* Token position */
} ii_buffer_counter;
9267 | |
/*
 * ii_buffer_block holds the state of one block of the temporary file.
 * block_new() initializes head, rest, buffer and buffersize;
 * encode_terms() stores the size of the block's first unit in nextsize.
 * NOTE(review): the remaining fields are used outside this part of the
 * file; the descriptions marked "presumably" are inferred from the names.
 */
typedef struct {
  off64_t head; /* File position where the block starts */
  off64_t tail; /* presumably the file position where the block ends */
  uint32_t nextsize; /* Size of the next unit to read */
  uint8_t *buffer; /* Read buffer (NULL until allocated) */
  uint32_t buffersize; /* Size of buffer */
  uint8_t *bufcur; /* presumably the current read position in buffer */
  uint32_t rest; /* presumably the number of unread bytes in buffer */
  grn_id tid; /* presumably the ID of the current term */
  uint32_t nrecs; /* presumably the number of records of the current term */
  uint32_t nposts; /* presumably the number of postings of the current term */
  grn_id *recs; /* presumably decoded record IDs */
  uint32_t *tfs; /* presumably decoded term frequencies */
  uint32_t *posts; /* presumably decoded positions */
} ii_buffer_block;
9283 | |
/*
 * _grn_ii_buffer is the state of the buffered index builder. The first
 * group of members describes the lexicons, the blocks and the temporary
 * file, the second group is used while parsing values into blocks, and
 * the third group while merging the blocks into the index.
 */
struct _grn_ii_buffer {
  grn_obj *lexicon; /* Global lexicon */
  grn_obj *tmp_lexicon; /* Temporary lexicon for each block */
  ii_buffer_block *blocks; /* Blocks */
  uint32_t nblocks; /* Number of blocks */
  int tmpfd; /* Descriptor of temporary file */
  char tmpfpath[PATH_MAX]; /* Path of temporary file */
  uint64_t update_buffer_size;

  // stuff for parsing
  off64_t filepos; /* Write position of temporary file */
  grn_id *block_buf; /* Buffer for the current block */
  size_t block_buf_size; /* Size of block_buf */
  size_t block_pos; /* Write position of block_buf */
  ii_buffer_counter *counters; /* Status of terms */
  uint32_t ncounters; /* Number of counters */
  size_t total_size; /* Running sum of nrecs + nposts (see encode_terms) */
  size_t curr_size;
  ii_buffer_value *values; /* Values in block */
  unsigned int nvalues; /* Number of values in block */
  unsigned int max_nvalues; /* Size of values */
  grn_id last_rid;

  // stuff for merging
  grn_ii *ii; /* Target index; its header flags select the encoded fields */
  uint32_t lseg;
  uint32_t dseg;
  buffer *term_buffer;
  datavec data_vectors[MAX_N_ELEMENTS + 1];
  uint8_t *packed_buf;
  size_t packed_buf_size;
  size_t packed_len;
  size_t total_chunk_size;
};
9318 | |
9319 | /* block_new returns a new ii_buffer_block to store block information. */ |
9320 | static ii_buffer_block * |
9321 | block_new(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
9322 | { |
9323 | ii_buffer_block *block; |
9324 | if (!(ii_buffer->nblocks & 0x3ff)) { |
9325 | ii_buffer_block *blocks; |
9326 | if (!(blocks = GRN_REALLOC(ii_buffer->blocks, |
9327 | (ii_buffer->nblocks + 0x400) * |
9328 | sizeof(ii_buffer_block)))) { |
9329 | return NULL; |
9330 | } |
9331 | ii_buffer->blocks = blocks; |
9332 | } |
9333 | block = &ii_buffer->blocks[ii_buffer->nblocks]; |
9334 | block->head = ii_buffer->filepos; |
9335 | block->rest = 0; |
9336 | block->buffer = NULL; |
9337 | block->buffersize = 0; |
9338 | return block; |
9339 | } |
9340 | |
9341 | /* allocate_outbuf allocates memory to flush a block. */ |
9342 | static uint8_t * |
9343 | allocate_outbuf(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
9344 | { |
9345 | size_t bufsize = 0, bufsize_ = 0; |
9346 | uint32_t flags = ii_buffer->ii->header->flags; |
9347 | ii_buffer_counter *counter = ii_buffer->counters; |
9348 | grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); |
9349 | for (tid = 1; tid <= tid_max; counter++, tid++) { |
9350 | counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); |
9351 | counter->last_rid = 0; |
9352 | counter->last_tf = 0; |
9353 | bufsize += 5; |
9354 | bufsize += GRN_B_ENC_SIZE(counter->nrecs); |
9355 | bufsize += GRN_B_ENC_SIZE(counter->nposts); |
9356 | bufsize += counter->offset_rid; |
9357 | if ((flags & GRN_OBJ_WITH_SECTION)) { |
9358 | bufsize += counter->offset_sid; |
9359 | } |
9360 | bufsize += counter->offset_tf; |
9361 | if ((flags & GRN_OBJ_WITH_WEIGHT)) { |
9362 | bufsize += counter->offset_weight; |
9363 | } |
9364 | if ((flags & GRN_OBJ_WITH_POSITION)) { |
9365 | bufsize += counter->offset_pos; |
9366 | } |
9367 | if (bufsize_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < bufsize) { |
9368 | bufsize += sizeof(uint32_t); |
9369 | bufsize_ = bufsize; |
9370 | } |
9371 | } |
9372 | GRN_LOG(ctx, GRN_LOG_INFO, "flushing:%d bufsize:%" GRN_FMT_SIZE, |
9373 | ii_buffer->nblocks, bufsize); |
9374 | return (uint8_t *)GRN_MALLOC(bufsize); |
9375 | } |
9376 | |
9377 | /* |
9378 | * The temporary file format is roughly as follows: |
9379 | * |
9380 | * File = Block... |
9381 | * Block = Unit... |
9382 | * Unit = TermChunk (key order) |
9383 | * NextUnitSize (The first unit size is kept on memory) |
9384 | * Chunk = Term... |
9385 | * Term = ID (gtid) |
9386 | * NumRecordsOrSections (nrecs), NumOccurrences (nposts) |
9387 | * RecordID... (rid, diff) |
9388 | * [SectionID... (sid, diff)] |
9389 | * TermFrequency... (tf, diff) |
9390 | * [Weight... (weight, diff)] |
9391 | * [Position... (pos, diff)] |
9392 | */ |
9393 | |
9394 | /* |
9395 | * encode_terms encodes terms in ii_buffer->tmp_lexicon and returns the |
9396 | * expected temporary file size. |
9397 | */ |
9398 | static size_t |
9399 | encode_terms(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
9400 | uint8_t *outbuf, ii_buffer_block *block) |
9401 | { |
9402 | grn_id tid; |
9403 | uint8_t *outbufp = outbuf; |
9404 | uint8_t *outbufp_ = outbuf; |
9405 | grn_table_cursor *tc; |
9406 | /* The first size is written into block->nextsize. */ |
9407 | uint8_t *pnext = (uint8_t *)&block->nextsize; |
9408 | uint32_t flags = ii_buffer->ii->header->flags; |
9409 | tc = grn_table_cursor_open(ctx, ii_buffer->tmp_lexicon, |
9410 | NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); |
9411 | while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { |
9412 | char key[GRN_TABLE_MAX_KEY_SIZE]; |
9413 | int key_size = grn_table_get_key(ctx, ii_buffer->tmp_lexicon, tid, |
9414 | key, GRN_TABLE_MAX_KEY_SIZE); |
9415 | /* gtid is a global term ID, not in a temporary lexicon. */ |
9416 | grn_id gtid = grn_table_add(ctx, ii_buffer->lexicon, key, key_size, NULL); |
9417 | ii_buffer_counter *counter = &ii_buffer->counters[tid - 1]; |
9418 | if (counter->nrecs) { |
9419 | uint32_t offset_rid = counter->offset_rid; |
9420 | uint32_t offset_sid = counter->offset_sid; |
9421 | uint32_t offset_tf = counter->offset_tf; |
9422 | uint32_t offset_weight = counter->offset_weight; |
9423 | uint32_t offset_pos = counter->offset_pos; |
9424 | GRN_B_ENC(gtid, outbufp); |
9425 | GRN_B_ENC(counter->nrecs, outbufp); |
9426 | GRN_B_ENC(counter->nposts, outbufp); |
9427 | ii_buffer->total_size += counter->nrecs + counter->nposts; |
9428 | counter->offset_rid = outbufp - outbuf; |
9429 | outbufp += offset_rid; |
9430 | if ((flags & GRN_OBJ_WITH_SECTION)) { |
9431 | counter->offset_sid = outbufp - outbuf; |
9432 | outbufp += offset_sid; |
9433 | } |
9434 | counter->offset_tf = outbufp - outbuf; |
9435 | outbufp += offset_tf; |
9436 | if ((flags & GRN_OBJ_WITH_WEIGHT)) { |
9437 | counter->offset_weight = outbufp - outbuf; |
9438 | outbufp += offset_weight; |
9439 | } |
9440 | if ((flags & GRN_OBJ_WITH_POSITION)) { |
9441 | counter->offset_pos = outbufp - outbuf; |
9442 | outbufp += offset_pos; |
9443 | } |
9444 | } |
9445 | if (outbufp_ + II_BUFFER_BLOCK_READ_UNIT_SIZE < outbufp) { |
9446 | uint32_t size = outbufp - outbufp_ + sizeof(uint32_t); |
9447 | grn_memcpy(pnext, &size, sizeof(uint32_t)); |
9448 | pnext = outbufp; |
9449 | outbufp += sizeof(uint32_t); |
9450 | outbufp_ = outbufp; |
9451 | } |
9452 | } |
9453 | grn_table_cursor_close(ctx, tc); |
9454 | if (outbufp_ < outbufp) { |
9455 | uint32_t size = outbufp - outbufp_; |
9456 | grn_memcpy(pnext, &size, sizeof(uint32_t)); |
9457 | } |
9458 | return outbufp - outbuf; |
9459 | } |
9460 | |
/*
 * encode_postings encodes data in ii_buffer->block_buf.
 *
 * block_buf is read as a stream of ii_buffer->block_pos grn_id entries and
 * each entry is dispatched on II_BUFFER_TYPE():
 *   - II_BUFFER_TYPE_RID: starts a new record. If the index has sections,
 *     the next entry is consumed as the section ID.
 *   - II_BUFFER_TYPE_WEIGHT: sets the weight applied to the following terms.
 *   - anything else: a term ID. If the index has positions, the next entry
 *     is consumed as the position of the term.
 *
 * Encoded values are written into outbuf at the per-term write offsets kept
 * in ii_buffer->counters (offset_rid, offset_sid, offset_tf, offset_weight,
 * offset_pos); each offset is advanced past what has been written.
 *
 * The tf and weight of the last (rid, sid) pair of each term are only
 * accumulated in the counter here (last_tf, last_weight); this function
 * writes them only when a later (rid, sid) pair shows up for the same term.
 */
static void
encode_postings(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf)
{
  grn_id rid = 0;
  unsigned int sid = 1;
  unsigned int weight = 0;
  uint32_t pos = 0;
  uint32_t rest;
  grn_id *bp = ii_buffer->block_buf;
  uint32_t flags = ii_buffer->ii->header->flags;
  for (rest = ii_buffer->block_pos; rest; bp++, rest--) {
    grn_id id = *bp;
    switch (II_BUFFER_TYPE(id)) {
    case II_BUFFER_TYPE_RID :
      rid = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_RID);
      /*
       * NOTE(review): rest is always non-zero inside the loop body, so this
       * check alone does not prove that a following entry exists -- confirm
       * that the writer always emits the section ID right after the rid.
       */
      if ((flags & GRN_OBJ_WITH_SECTION) && rest) {
        sid = *++bp;
        rest--;
      }
      /* A new record starts without weight and at position 0. */
      weight = 0;
      pos = 0;
      break;
    case II_BUFFER_TYPE_WEIGHT :
      weight = II_BUFFER_UNPACK(id, II_BUFFER_TYPE_WEIGHT);
      break;
    default :
      {
        /* id is a term ID; counters are indexed by (term ID - 1). */
        ii_buffer_counter *counter = &ii_buffer->counters[id - 1];
        if (counter->last_rid == rid && counter->last_sid == sid) {
          /* Same (rid, sid) as the previous posting of this term. */
          counter->last_tf++;
          counter->last_weight += weight;
        } else {
          if (counter->last_tf) {
            /* Write out tf (stored as tf - 1) and weight of the previous
             * (rid, sid) pair of this term. */
            uint8_t *p = outbuf + counter->offset_tf;
            GRN_B_ENC(counter->last_tf - 1, p);
            counter->offset_tf = p - outbuf;
            if (flags & GRN_OBJ_WITH_WEIGHT) {
              p = outbuf + counter->offset_weight;
              GRN_B_ENC(counter->last_weight, p);
              counter->offset_weight = p - outbuf;
            }
          }
          {
            /* The record ID is stored as a delta from the previous one. */
            uint8_t *p = outbuf + counter->offset_rid;
            GRN_B_ENC(rid - counter->last_rid, p);
            counter->offset_rid = p - outbuf;
          }
          if (flags & GRN_OBJ_WITH_SECTION) {
            uint8_t *p = outbuf + counter->offset_sid;
            if (counter->last_rid != rid) {
              /* First section of a new record: stored as sid - 1. */
              GRN_B_ENC(sid - 1, p);
            } else {
              /* Same record: stored as the gap from the last section - 1. */
              GRN_B_ENC(sid - counter->last_sid - 1, p);
            }
            counter->offset_sid = p - outbuf;
          }
          counter->last_rid = rid;
          counter->last_sid = sid;
          counter->last_tf = 1;
          counter->last_weight = weight;
          counter->last_pos = 0;
        }
        /*
         * NOTE(review): as above, rest is always non-zero here -- confirm
         * that a position entry always follows a term ID when the index has
         * positions.
         */
        if ((flags & GRN_OBJ_WITH_POSITION) && rest) {
          /* The position is stored as a delta from the previous position. */
          uint8_t *p = outbuf + counter->offset_pos;
          pos = *++bp;
          rest--;
          GRN_B_ENC(pos - counter->last_pos, p);
          counter->offset_pos = p - outbuf;
          counter->last_pos = pos;
        }
      }
      break;
    }
  }
}
9537 | |
9538 | /* encode_last_tf encodes last_tf and last_weight in counters. */ |
9539 | static void |
9540 | encode_last_tf(grn_ctx *ctx, grn_ii_buffer *ii_buffer, uint8_t *outbuf) |
9541 | { |
9542 | ii_buffer_counter *counter = ii_buffer->counters; |
9543 | grn_id tid, tid_max = grn_table_size(ctx, ii_buffer->tmp_lexicon); |
9544 | for (tid = 1; tid <= tid_max; counter++, tid++) { |
9545 | uint8_t *p = outbuf + counter->offset_tf; |
9546 | GRN_B_ENC(counter->last_tf - 1, p); |
9547 | } |
9548 | if ((ii_buffer->ii->header->flags & GRN_OBJ_WITH_WEIGHT)) { |
9549 | for (tid = 1; tid <= tid_max; counter++, tid++) { |
9550 | uint8_t *p = outbuf + counter->offset_weight; |
9551 | GRN_B_ENC(counter->last_weight, p); |
9552 | } |
9553 | } |
9554 | } |
9555 | |
9556 | /* |
9557 | * grn_ii_buffer_flush flushes the current block (ii_buffer->block_buf, |
9558 | * counters and tmp_lexicon) to a temporary file (ii_buffer->tmpfd). |
9559 | * Also, block information is stored into ii_buffer->blocks. |
9560 | */ |
9561 | static void |
9562 | grn_ii_buffer_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
9563 | { |
9564 | size_t encsize; |
9565 | uint8_t *outbuf; |
9566 | ii_buffer_block *block; |
9567 | GRN_LOG(ctx, GRN_LOG_DEBUG, "flushing:%d npostings:%" GRN_FMT_SIZE, |
9568 | ii_buffer->nblocks, ii_buffer->block_pos); |
9569 | if (!(block = block_new(ctx, ii_buffer))) { return; } |
9570 | if (!(outbuf = allocate_outbuf(ctx, ii_buffer))) { return; } |
9571 | encsize = encode_terms(ctx, ii_buffer, outbuf, block); |
9572 | encode_postings(ctx, ii_buffer, outbuf); |
9573 | encode_last_tf(ctx, ii_buffer, outbuf); |
9574 | { |
9575 | ssize_t r = grn_write(ii_buffer->tmpfd, outbuf, encsize); |
9576 | if (r != encsize) { |
9577 | ERR(GRN_INPUT_OUTPUT_ERROR, |
9578 | "write returned %" GRN_FMT_LLD " != %" GRN_FMT_LLU, |
9579 | (long long int)r, (unsigned long long int)encsize); |
9580 | GRN_FREE(outbuf); |
9581 | return; |
9582 | } |
9583 | ii_buffer->filepos += r; |
9584 | block->tail = ii_buffer->filepos; |
9585 | } |
9586 | GRN_FREE(outbuf); |
9587 | memset(ii_buffer->counters, 0, |
9588 | grn_table_size(ctx, ii_buffer->tmp_lexicon) * |
9589 | sizeof(ii_buffer_counter)); |
9590 | grn_obj_close(ctx, ii_buffer->tmp_lexicon); |
9591 | GRN_LOG(ctx, GRN_LOG_DEBUG, "flushed: %d encsize:%" GRN_FMT_SIZE, |
9592 | ii_buffer->nblocks, encsize); |
9593 | ii_buffer->tmp_lexicon = NULL; |
9594 | ii_buffer->nblocks++; |
9595 | ii_buffer->block_pos = 0; |
9596 | } |
9597 | |
/* Cache size passed to grn_pat_cache_enable() for patricia trie lexicons. */
const uint32_t PAT_CACHE_SIZE = 1<<20;
9599 | |
9600 | /* |
9601 | * get_tmp_lexicon returns a temporary lexicon. |
9602 | * |
9603 | * Note that a lexicon is created for each block and ii_buffer->tmp_lexicon is |
9604 | * closed in grn_ii_buffer_flush. |
9605 | */ |
9606 | static grn_obj * |
9607 | get_tmp_lexicon(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
9608 | { |
9609 | grn_obj *tmp_lexicon = ii_buffer->tmp_lexicon; |
9610 | if (!tmp_lexicon) { |
9611 | grn_obj *domain = grn_ctx_at(ctx, ii_buffer->lexicon->header.domain); |
9612 | grn_obj *range = grn_ctx_at(ctx, DB_OBJ(ii_buffer->lexicon)->range); |
9613 | grn_obj *tokenizer; |
9614 | grn_obj *normalizer; |
9615 | grn_obj *token_filters; |
9616 | grn_table_flags flags; |
9617 | grn_table_get_info(ctx, ii_buffer->lexicon, &flags, NULL, |
9618 | &tokenizer, &normalizer, &token_filters); |
9619 | flags &= ~GRN_OBJ_PERSISTENT; |
9620 | tmp_lexicon = grn_table_create(ctx, NULL, 0, NULL, flags, domain, range); |
9621 | if (tmp_lexicon) { |
9622 | ii_buffer->tmp_lexicon = tmp_lexicon; |
9623 | grn_obj_set_info(ctx, tmp_lexicon, |
9624 | GRN_INFO_DEFAULT_TOKENIZER, tokenizer); |
9625 | grn_obj_set_info(ctx, tmp_lexicon, |
9626 | GRN_INFO_NORMALIZER, normalizer); |
9627 | grn_obj_set_info(ctx, tmp_lexicon, |
9628 | GRN_INFO_TOKEN_FILTERS, token_filters); |
9629 | if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { |
9630 | grn_pat_cache_enable(ctx, (grn_pat *)tmp_lexicon, PAT_CACHE_SIZE); |
9631 | } |
9632 | } |
9633 | } |
9634 | return tmp_lexicon; |
9635 | } |
9636 | |
9637 | /* get_buffer_counter returns a counter associated with tid. */ |
9638 | static ii_buffer_counter * |
9639 | get_buffer_counter(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
9640 | grn_obj *tmp_lexicon, grn_id tid) |
9641 | { |
9642 | if (tid > ii_buffer->ncounters) { |
9643 | ii_buffer_counter *counters; |
9644 | uint32_t ncounters = |
9645 | grn_table_size(ctx, tmp_lexicon) + II_BUFFER_NCOUNTERS_MARGIN; |
9646 | counters = GRN_REALLOC(ii_buffer->counters, |
9647 | ncounters * sizeof(ii_buffer_counter)); |
9648 | if (!counters) { return NULL; } |
9649 | memset(&counters[ii_buffer->ncounters], 0, |
9650 | (ncounters - ii_buffer->ncounters) * sizeof(ii_buffer_counter)); |
9651 | ii_buffer->ncounters = ncounters; |
9652 | ii_buffer->counters = counters; |
9653 | } |
9654 | return &ii_buffer->counters[tid - 1]; |
9655 | } |
9656 | |
9657 | /* |
9658 | * grn_ii_buffer_tokenize_value tokenizes a value. |
9659 | * |
9660 | * The result is written into the current block (ii_buffer->tmp_lexicon, |
9661 | * ii_buffer->block_buf, ii_buffer->counters, etc.). |
9662 | */ |
9663 | static void |
9664 | grn_ii_buffer_tokenize_value(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
9665 | grn_id rid, const ii_buffer_value *value) |
9666 | { |
9667 | grn_obj *tmp_lexicon; |
9668 | if ((tmp_lexicon = get_tmp_lexicon(ctx, ii_buffer))) { |
9669 | unsigned int token_flags = 0; |
9670 | grn_token_cursor *token_cursor; |
9671 | grn_id *buffer = ii_buffer->block_buf; |
9672 | uint32_t block_pos = ii_buffer->block_pos; |
9673 | uint32_t ii_flags = ii_buffer->ii->header->flags; |
9674 | buffer[block_pos++] = II_BUFFER_PACK(rid, II_BUFFER_TYPE_RID); |
9675 | if (ii_flags & GRN_OBJ_WITH_SECTION) { |
9676 | buffer[block_pos++] = value->sid; |
9677 | } |
9678 | if (value->weight) { |
9679 | buffer[block_pos++] = II_BUFFER_PACK(value->weight, |
9680 | II_BUFFER_TYPE_WEIGHT); |
9681 | } |
9682 | if ((token_cursor = grn_token_cursor_open(ctx, tmp_lexicon, |
9683 | value->p, value->len, |
9684 | GRN_TOKEN_ADD, token_flags))) { |
9685 | while (!token_cursor->status) { |
9686 | grn_id tid; |
9687 | if ((tid = grn_token_cursor_next(ctx, token_cursor))) { |
9688 | ii_buffer_counter *counter; |
9689 | counter = get_buffer_counter(ctx, ii_buffer, tmp_lexicon, tid); |
9690 | if (!counter) { return; } |
9691 | buffer[block_pos++] = tid; |
9692 | if (ii_flags & GRN_OBJ_WITH_POSITION) { |
9693 | buffer[block_pos++] = token_cursor->pos; |
9694 | } |
9695 | if (counter->last_rid != rid) { |
9696 | counter->offset_rid += GRN_B_ENC_SIZE(rid - counter->last_rid); |
9697 | counter->last_rid = rid; |
9698 | counter->offset_sid += GRN_B_ENC_SIZE(value->sid - 1); |
9699 | counter->last_sid = value->sid; |
9700 | if (counter->last_tf) { |
9701 | counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); |
9702 | counter->last_tf = 0; |
9703 | counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight); |
9704 | counter->last_weight = 0; |
9705 | } |
9706 | counter->last_pos = 0; |
9707 | counter->nrecs++; |
9708 | } else if (counter->last_sid != value->sid) { |
9709 | counter->offset_rid += GRN_B_ENC_SIZE(0); |
9710 | counter->offset_sid += |
9711 | GRN_B_ENC_SIZE(value->sid - counter->last_sid - 1); |
9712 | counter->last_sid = value->sid; |
9713 | if (counter->last_tf) { |
9714 | counter->offset_tf += GRN_B_ENC_SIZE(counter->last_tf - 1); |
9715 | counter->last_tf = 0; |
9716 | counter->offset_weight += GRN_B_ENC_SIZE(counter->last_weight); |
9717 | counter->last_weight = 0; |
9718 | } |
9719 | counter->last_pos = 0; |
9720 | counter->nrecs++; |
9721 | } |
9722 | counter->offset_pos += |
9723 | GRN_B_ENC_SIZE(token_cursor->pos - counter->last_pos); |
9724 | counter->last_pos = token_cursor->pos; |
9725 | counter->last_tf++; |
9726 | counter->last_weight += value->weight; |
9727 | counter->nposts++; |
9728 | } |
9729 | } |
9730 | grn_token_cursor_close(ctx, token_cursor); |
9731 | } |
9732 | ii_buffer->block_pos = block_pos; |
9733 | } |
9734 | } |
9735 | |
9736 | /* |
9737 | * grn_ii_buffer_tokenize tokenizes ii_buffer->values. |
9738 | * |
9739 | * grn_ii_buffer_tokenize estimates the size of tokenized values. |
9740 | * If the remaining space of the current block is not enough to store the new |
9741 | * tokenized values, the current block is flushed. |
9742 | * Then, grn_ii_buffer_tokenize tokenizes values. |
9743 | */ |
9744 | static void |
9745 | grn_ii_buffer_tokenize(grn_ctx *ctx, grn_ii_buffer *ii_buffer, grn_id rid) |
9746 | { |
9747 | unsigned int i; |
9748 | uint32_t est_len = 0; |
9749 | for (i = 0; i < ii_buffer->nvalues; i++) { |
9750 | est_len += ii_buffer->values[i].len * 2 + 2; |
9751 | } |
9752 | if (ii_buffer->block_buf_size < ii_buffer->block_pos + est_len) { |
9753 | grn_ii_buffer_flush(ctx, ii_buffer); |
9754 | } |
9755 | if (ii_buffer->block_buf_size < est_len) { |
9756 | grn_id *block_buf = (grn_id *)GRN_REALLOC(ii_buffer->block_buf, |
9757 | est_len * sizeof(grn_id)); |
9758 | if (block_buf) { |
9759 | ii_buffer->block_buf = block_buf; |
9760 | ii_buffer->block_buf_size = est_len; |
9761 | } |
9762 | } |
9763 | |
9764 | for (i = 0; i < ii_buffer->nvalues; i++) { |
9765 | const ii_buffer_value *value = &ii_buffer->values[i]; |
9766 | if (value->len) { |
9767 | uint32_t est_len = value->len * 2 + 2; |
9768 | if (ii_buffer->block_buf_size >= ii_buffer->block_pos + est_len) { |
9769 | grn_ii_buffer_tokenize_value(ctx, ii_buffer, rid, value); |
9770 | } |
9771 | } |
9772 | } |
9773 | ii_buffer->nvalues = 0; |
9774 | } |
9775 | |
9776 | /* grn_ii_buffer_fetch fetches the next term. */ |
9777 | static void |
9778 | grn_ii_buffer_fetch(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
9779 | ii_buffer_block *block) |
9780 | { |
9781 | if (!block->rest) { |
9782 | /* Read the next unit. */ |
9783 | if (block->head < block->tail) { |
9784 | size_t bytesize = block->nextsize; |
9785 | if (block->buffersize < block->nextsize) { |
9786 | void *r = GRN_REALLOC(block->buffer, bytesize); |
9787 | if (r) { |
9788 | block->buffer = (uint8_t *)r; |
9789 | block->buffersize = block->nextsize; |
9790 | } else { |
9791 | GRN_LOG(ctx, GRN_LOG_WARNING, "realloc: %" GRN_FMT_LLU, |
9792 | (unsigned long long int)bytesize); |
9793 | return; |
9794 | } |
9795 | } |
9796 | { |
9797 | off64_t seeked_position; |
9798 | seeked_position = grn_lseek(ii_buffer->tmpfd, block->head, SEEK_SET); |
9799 | if (seeked_position != block->head) { |
9800 | ERRNO_ERR("failed to " |
9801 | "grn_lseek(%" GRN_FMT_OFF64_T ") -> %" GRN_FMT_OFF64_T, |
9802 | block->head, |
9803 | seeked_position); |
9804 | return; |
9805 | } |
9806 | } |
9807 | { |
9808 | size_t read_bytesize; |
9809 | read_bytesize = grn_read(ii_buffer->tmpfd, block->buffer, bytesize); |
9810 | if (read_bytesize != bytesize) { |
9811 | SERR("failed to grn_read(%" GRN_FMT_SIZE ") -> %" GRN_FMT_SIZE, |
9812 | bytesize, read_bytesize); |
9813 | return; |
9814 | } |
9815 | } |
9816 | block->head += bytesize; |
9817 | block->bufcur = block->buffer; |
9818 | if (block->head >= block->tail) { |
9819 | if (block->head > block->tail) { |
9820 | GRN_LOG(ctx, GRN_LOG_WARNING, |
9821 | "fetch error: %" GRN_FMT_INT64D " > %" GRN_FMT_INT64D, |
9822 | block->head, block->tail); |
9823 | } |
9824 | block->rest = block->nextsize; |
9825 | block->nextsize = 0; |
9826 | } else { |
9827 | block->rest = block->nextsize - sizeof(uint32_t); |
9828 | grn_memcpy(&block->nextsize, |
9829 | &block->buffer[block->rest], sizeof(uint32_t)); |
9830 | } |
9831 | } |
9832 | } |
9833 | if (block->rest) { |
9834 | uint8_t *p = block->bufcur; |
9835 | GRN_B_DEC(block->tid, p); |
9836 | GRN_B_DEC(block->nrecs, p); |
9837 | GRN_B_DEC(block->nposts, p); |
9838 | block->rest -= (p - block->bufcur); |
9839 | block->bufcur = p; |
9840 | } else { |
9841 | block->tid = 0; |
9842 | } |
9843 | } |
9844 | |
9845 | /* grn_ii_buffer_chunk_flush flushes the current buffer for packed postings. */ |
9846 | static void |
9847 | grn_ii_buffer_chunk_flush(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
9848 | { |
9849 | grn_io_win io_win; |
9850 | uint32_t chunk_number; |
9851 | chunk_new(ctx, ii_buffer->ii, &chunk_number, ii_buffer->packed_len); |
9852 | GRN_LOG(ctx, GRN_LOG_INFO, "chunk:%d, packed_len:%" GRN_FMT_SIZE, |
9853 | chunk_number, ii_buffer->packed_len); |
9854 | fake_map(ctx, ii_buffer->ii->chunk, &io_win, ii_buffer->packed_buf, |
9855 | chunk_number, ii_buffer->packed_len); |
9856 | grn_io_win_unmap(&io_win); |
9857 | ii_buffer->term_buffer->header.chunk = chunk_number; |
9858 | ii_buffer->term_buffer->header.chunk_size = ii_buffer->packed_len; |
9859 | ii_buffer->term_buffer->header.buffer_free = |
9860 | S_SEGMENT - sizeof(buffer_header) - |
9861 | ii_buffer->term_buffer->header.nterms * sizeof(buffer_term); |
9862 | ii_buffer->term_buffer->header.nterms_void = 0; |
9863 | buffer_segment_update(ii_buffer->ii, ii_buffer->lseg, ii_buffer->dseg); |
9864 | ii_buffer->ii->header->total_chunk_size += ii_buffer->packed_len; |
9865 | ii_buffer->total_chunk_size += ii_buffer->packed_len; |
9866 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
9867 | "nterms=%d chunk=%d total=%" GRN_FMT_INT64U "KB" , |
9868 | ii_buffer->term_buffer->header.nterms, |
9869 | ii_buffer->term_buffer->header.chunk_size, |
9870 | ii_buffer->ii->header->total_chunk_size >> 10); |
9871 | ii_buffer->term_buffer = NULL; |
9872 | ii_buffer->packed_buf = NULL; |
9873 | ii_buffer->packed_len = 0; |
9874 | ii_buffer->packed_buf_size = 0; |
9875 | ii_buffer->curr_size = 0; |
9876 | } |
9877 | |
9878 | /* |
9879 | * merge_hit_blocks merges hit blocks into ii_buffer->data_vectors. |
9880 | * merge_hit_blocks returns the estimated maximum size in bytes. |
9881 | */ |
9882 | static size_t |
9883 | merge_hit_blocks(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
9884 | ii_buffer_block *hits[], int nhits) |
9885 | { |
9886 | uint64_t nrecs = 0; |
9887 | uint64_t nposts = 0; |
9888 | size_t max_size; |
9889 | uint64_t flags = ii_buffer->ii->header->flags; |
9890 | int i; |
9891 | for (i = 0; i < nhits; i++) { |
9892 | ii_buffer_block *block = hits[i]; |
9893 | nrecs += block->nrecs; |
9894 | nposts += block->nposts; |
9895 | } |
9896 | ii_buffer->curr_size += nrecs + nposts; |
9897 | max_size = nrecs * (ii_buffer->ii->n_elements); |
9898 | if (flags & GRN_OBJ_WITH_POSITION) { max_size += nposts - nrecs; } |
9899 | datavec_reset(ctx, ii_buffer->data_vectors, |
9900 | ii_buffer->ii->n_elements, nrecs, max_size); |
9901 | { |
9902 | int i; |
9903 | uint32_t lr = 0; /* Last rid */ |
9904 | uint64_t spos = 0; |
9905 | uint32_t *ridp, *sidp = NULL, *tfp, *weightp = NULL, *posp = NULL; |
9906 | { |
9907 | /* Get write positions in datavec. */ |
9908 | int j = 0; |
9909 | ridp = ii_buffer->data_vectors[j++].data; |
9910 | if (flags & GRN_OBJ_WITH_SECTION) { |
9911 | sidp = ii_buffer->data_vectors[j++].data; |
9912 | } |
9913 | tfp = ii_buffer->data_vectors[j++].data; |
9914 | if (flags & GRN_OBJ_WITH_WEIGHT) { |
9915 | weightp = ii_buffer->data_vectors[j++].data; |
9916 | } |
9917 | if (flags & GRN_OBJ_WITH_POSITION) { |
9918 | posp = ii_buffer->data_vectors[j++].data; |
9919 | } |
9920 | } |
9921 | for (i = 0; i < nhits; i++) { |
9922 | /* Read postings from hit blocks and join the postings into datavec. */ |
9923 | ii_buffer_block *block = hits[i]; |
9924 | uint8_t *p = block->bufcur; |
9925 | uint32_t n = block->nrecs; |
9926 | if (n) { |
9927 | GRN_B_DEC(*ridp, p); |
9928 | *ridp -= lr; |
9929 | lr += *ridp++; |
9930 | while (--n) { |
9931 | GRN_B_DEC(*ridp, p); |
9932 | lr += *ridp++; |
9933 | } |
9934 | } |
9935 | if ((flags & GRN_OBJ_WITH_SECTION)) { |
9936 | for (n = block->nrecs; n; n--) { |
9937 | GRN_B_DEC(*sidp++, p); |
9938 | } |
9939 | } |
9940 | for (n = block->nrecs; n; n--) { |
9941 | GRN_B_DEC(*tfp++, p); |
9942 | } |
9943 | if ((flags & GRN_OBJ_WITH_WEIGHT)) { |
9944 | for (n = block->nrecs; n; n--) { |
9945 | GRN_B_DEC(*weightp++, p); |
9946 | } |
9947 | } |
9948 | if ((flags & GRN_OBJ_WITH_POSITION)) { |
9949 | for (n = block->nposts; n; n--) { |
9950 | GRN_B_DEC(*posp, p); |
9951 | spos += *posp++; |
9952 | } |
9953 | } |
9954 | block->rest -= (p - block->bufcur); |
9955 | block->bufcur = p; |
9956 | grn_ii_buffer_fetch(ctx, ii_buffer, block); |
9957 | } |
9958 | { |
9959 | /* Set size and flags of datavec. */ |
9960 | int j = 0; |
9961 | uint32_t f_s = (nrecs < 3) ? 0 : USE_P_ENC; |
9962 | uint32_t f_d = ((nrecs < 16) || (nrecs <= (lr >> 8))) ? 0 : USE_P_ENC; |
9963 | ii_buffer->data_vectors[j].data_size = nrecs; |
9964 | ii_buffer->data_vectors[j++].flags = f_d; |
9965 | if ((flags & GRN_OBJ_WITH_SECTION)) { |
9966 | ii_buffer->data_vectors[j].data_size = nrecs; |
9967 | ii_buffer->data_vectors[j++].flags = f_s; |
9968 | } |
9969 | ii_buffer->data_vectors[j].data_size = nrecs; |
9970 | ii_buffer->data_vectors[j++].flags = f_s; |
9971 | if ((flags & GRN_OBJ_WITH_WEIGHT)) { |
9972 | ii_buffer->data_vectors[j].data_size = nrecs; |
9973 | ii_buffer->data_vectors[j++].flags = f_s; |
9974 | } |
9975 | if ((flags & GRN_OBJ_WITH_POSITION)) { |
9976 | uint32_t f_p = (((nposts < 32) || |
9977 | (nposts <= (spos >> 13))) ? 0 : USE_P_ENC); |
9978 | ii_buffer->data_vectors[j].data_size = nposts; |
9979 | ii_buffer->data_vectors[j++].flags = f_p|ODD; |
9980 | } |
9981 | } |
9982 | } |
9983 | return (max_size + ii_buffer->ii->n_elements) * 4; |
9984 | } |
9985 | |
9986 | static buffer * |
9987 | get_term_buffer(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
9988 | { |
9989 | if (!ii_buffer->term_buffer) { |
9990 | uint32_t lseg; |
9991 | void *term_buffer; |
9992 | for (lseg = 0; lseg < GRN_II_MAX_LSEG; lseg++) { |
9993 | if (ii_buffer->ii->header->binfo[lseg] == GRN_II_PSEG_NOT_ASSIGNED) { break; } |
9994 | } |
9995 | if (lseg == GRN_II_MAX_LSEG) { |
9996 | DEFINE_NAME(ii_buffer->ii); |
9997 | MERR("[ii][buffer][term-buffer] couldn't find a free buffer: " |
9998 | "<%.*s>" , |
9999 | name_size, name); |
10000 | return NULL; |
10001 | } |
10002 | ii_buffer->lseg = lseg; |
10003 | ii_buffer->dseg = segment_get(ctx, ii_buffer->ii); |
10004 | GRN_IO_SEG_REF(ii_buffer->ii->seg, ii_buffer->dseg, term_buffer); |
10005 | ii_buffer->term_buffer = (buffer *)term_buffer; |
10006 | } |
10007 | return ii_buffer->term_buffer; |
10008 | } |
10009 | |
10010 | /* |
10011 | * try_in_place_packing tries to pack a posting in an array element. |
10012 | * |
10013 | * The requirements are as follows: |
10014 | * - nposts == 1 |
10015 | * - nhits == 1 && nrecs == 1 && tf == 0 |
10016 | * - weight == 0 |
10017 | * - !(flags & GRN_OBJ_WITH_SECTION) || (rid < 0x100000 && sid < 0x800) |
10018 | */ |
10019 | static grn_bool |
10020 | try_in_place_packing(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
10021 | grn_id tid, ii_buffer_block *hits[], int nhits) |
10022 | { |
10023 | if (nhits == 1 && hits[0]->nrecs == 1 && hits[0]->nposts == 1) { |
10024 | grn_id rid; |
10025 | uint32_t sid = 1, tf, pos = 0, weight = 0; |
10026 | ii_buffer_block *block = hits[0]; |
10027 | uint8_t *p = block->bufcur; |
10028 | uint32_t flags = ii_buffer->ii->header->flags; |
10029 | GRN_B_DEC(rid, p); |
10030 | if (flags & GRN_OBJ_WITH_SECTION) { |
10031 | GRN_B_DEC(sid, p); |
10032 | sid++; |
10033 | } |
10034 | GRN_B_DEC(tf, p); |
10035 | if (tf != 0) { GRN_LOG(ctx, GRN_LOG_WARNING, "tf=%d" , tf); } |
10036 | if (flags & GRN_OBJ_WITH_WEIGHT) { GRN_B_DEC(weight, p); } |
10037 | if (flags & GRN_OBJ_WITH_POSITION) { GRN_B_DEC(pos, p); } |
10038 | if (!weight) { |
10039 | if (flags & GRN_OBJ_WITH_SECTION) { |
10040 | if (rid < 0x100000 && sid < 0x800) { |
10041 | uint32_t *a = array_get(ctx, ii_buffer->ii, tid); |
10042 | a[0] = (rid << 12) + (sid << 1) + 1; |
10043 | a[1] = pos; |
10044 | array_unref(ii_buffer->ii, tid); |
10045 | } else { |
10046 | return GRN_FALSE; |
10047 | } |
10048 | } else { |
10049 | uint32_t *a = array_get(ctx, ii_buffer->ii, tid); |
10050 | a[0] = (rid << 1) + 1; |
10051 | a[1] = pos; |
10052 | array_unref(ii_buffer->ii, tid); |
10053 | } |
10054 | block->rest -= (p - block->bufcur); |
10055 | block->bufcur = p; |
10056 | grn_ii_buffer_fetch(ctx, ii_buffer, block); |
10057 | return GRN_TRUE; |
10058 | } |
10059 | } |
10060 | return GRN_FALSE; |
10061 | } |
10062 | |
10063 | /* grn_ii_buffer_merge merges hit blocks and pack it. */ |
10064 | static void |
10065 | grn_ii_buffer_merge(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
10066 | grn_id tid, ii_buffer_block *hits[], int nhits) |
10067 | { |
10068 | if (!try_in_place_packing(ctx, ii_buffer, tid, hits, nhits)) { |
10069 | /* Merge hit blocks and reserve a buffer for packed data. */ |
10070 | size_t max_size = merge_hit_blocks(ctx, ii_buffer, hits, nhits); |
10071 | if (ii_buffer->packed_buf && |
10072 | ii_buffer->packed_buf_size < ii_buffer->packed_len + max_size) { |
10073 | grn_ii_buffer_chunk_flush(ctx, ii_buffer); |
10074 | } |
10075 | if (!ii_buffer->packed_buf) { |
10076 | size_t buf_size = (max_size > II_BUFFER_PACKED_BUF_SIZE) |
10077 | ? max_size : II_BUFFER_PACKED_BUF_SIZE; |
10078 | if ((ii_buffer->packed_buf = GRN_MALLOC(buf_size))) { |
10079 | ii_buffer->packed_buf_size = buf_size; |
10080 | } |
10081 | } |
10082 | { |
10083 | /* Pack postings into the current buffer. */ |
10084 | uint16_t nterm; |
10085 | size_t packed_len; |
10086 | buffer_term *bt; |
10087 | uint32_t *a; |
10088 | buffer *term_buffer; |
10089 | |
10090 | a = array_get(ctx, ii_buffer->ii, tid); |
10091 | if (!a) { |
10092 | DEFINE_NAME(ii_buffer->ii); |
10093 | MERR("[ii][buffer][merge] failed to allocate an array: " |
10094 | "<%.*s>: " |
10095 | "<%u>" , |
10096 | name_size, name, |
10097 | tid); |
10098 | return; |
10099 | } |
10100 | term_buffer = get_term_buffer(ctx, ii_buffer); |
10101 | if (!term_buffer) { |
10102 | DEFINE_NAME(ii_buffer->ii); |
10103 | MERR("[ii][buffer][merge] failed to allocate a term buffer: " |
10104 | "<%.*s>: " |
10105 | "<%u>" , |
10106 | name_size, name, |
10107 | tid); |
10108 | return; |
10109 | } |
10110 | nterm = term_buffer->header.nterms++; |
10111 | bt = &term_buffer->terms[nterm]; |
10112 | a[0] = SEG2POS(ii_buffer->lseg, |
10113 | (sizeof(buffer_header) + sizeof(buffer_term) * nterm)); |
10114 | packed_len = grn_p_encv(ctx, ii_buffer->data_vectors, |
10115 | ii_buffer->ii->n_elements, |
10116 | ii_buffer->packed_buf + |
10117 | ii_buffer->packed_len); |
10118 | a[1] = ii_buffer->data_vectors[0].data_size; |
10119 | bt->tid = tid; |
10120 | bt->size_in_buffer = 0; |
10121 | bt->pos_in_buffer = 0; |
10122 | bt->size_in_chunk = packed_len; |
10123 | bt->pos_in_chunk = ii_buffer->packed_len; |
10124 | ii_buffer->packed_len += packed_len; |
10125 | if (((ii_buffer->curr_size * ii_buffer->update_buffer_size) + |
10126 | (ii_buffer->total_size * term_buffer->header.nterms * 16)) >= |
10127 | (ii_buffer->total_size * II_BUFFER_NTERMS_PER_BUFFER * 16)) { |
10128 | grn_ii_buffer_chunk_flush(ctx, ii_buffer); |
10129 | } |
10130 | array_unref(ii_buffer->ii, tid); |
10131 | } |
10132 | } |
10133 | } |
10134 | |
10135 | grn_ii_buffer * |
10136 | grn_ii_buffer_open(grn_ctx *ctx, grn_ii *ii, |
10137 | long long unsigned int update_buffer_size) |
10138 | { |
10139 | if (ii && ii->lexicon) { |
10140 | grn_ii_buffer *ii_buffer = GRN_MALLOCN(grn_ii_buffer, 1); |
10141 | if (ii_buffer) { |
10142 | ii_buffer->ii = ii; |
10143 | ii_buffer->lexicon = ii->lexicon; |
10144 | ii_buffer->tmp_lexicon = NULL; |
10145 | ii_buffer->nblocks = 0; |
10146 | ii_buffer->blocks = NULL; |
10147 | ii_buffer->ncounters = II_BUFFER_NCOUNTERS_MARGIN; |
10148 | ii_buffer->block_pos = 0; |
10149 | ii_buffer->filepos = 0; |
10150 | ii_buffer->curr_size = 0; |
10151 | ii_buffer->total_size = 0; |
10152 | ii_buffer->update_buffer_size = update_buffer_size; |
10153 | ii_buffer->counters = GRN_CALLOC(ii_buffer->ncounters * |
10154 | sizeof(ii_buffer_counter)); |
10155 | ii_buffer->term_buffer = NULL; |
10156 | ii_buffer->packed_buf = NULL; |
10157 | ii_buffer->packed_len = 0; |
10158 | ii_buffer->packed_buf_size = 0; |
10159 | ii_buffer->total_chunk_size = 0; |
10160 | ii_buffer->values = NULL; |
10161 | ii_buffer->nvalues = 0; |
10162 | ii_buffer->max_nvalues = 0; |
10163 | ii_buffer->last_rid = 0; |
10164 | if (ii_buffer->counters) { |
10165 | ii_buffer->block_buf = GRN_MALLOCN(grn_id, II_BUFFER_BLOCK_SIZE); |
10166 | if (ii_buffer->block_buf) { |
10167 | grn_snprintf(ii_buffer->tmpfpath, PATH_MAX, PATH_MAX, |
10168 | "%sXXXXXX" , grn_io_path(ii->seg)); |
10169 | ii_buffer->block_buf_size = II_BUFFER_BLOCK_SIZE; |
10170 | ii_buffer->tmpfd = grn_mkstemp(ii_buffer->tmpfpath); |
10171 | if (ii_buffer->tmpfd != -1) { |
10172 | grn_table_flags flags; |
10173 | grn_table_get_info(ctx, ii->lexicon, &flags, NULL, NULL, NULL, |
10174 | NULL); |
10175 | if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { |
10176 | grn_pat_cache_enable(ctx, (grn_pat *)ii->lexicon, |
10177 | PAT_CACHE_SIZE); |
10178 | } |
10179 | return ii_buffer; |
10180 | } else { |
10181 | SERR("failed grn_mkstemp(%s)" , |
10182 | ii_buffer->tmpfpath); |
10183 | } |
10184 | GRN_FREE(ii_buffer->block_buf); |
10185 | } |
10186 | GRN_FREE(ii_buffer->counters); |
10187 | } |
10188 | GRN_FREE(ii_buffer); |
10189 | } |
10190 | } else { |
10191 | ERR(GRN_INVALID_ARGUMENT, "ii or ii->lexicon is NULL" ); |
10192 | } |
10193 | return NULL; |
10194 | } |
10195 | |
10196 | static void |
10197 | ii_buffer_value_init(grn_ctx *ctx, ii_buffer_value *value) |
10198 | { |
10199 | value->sid = 0; |
10200 | value->weight = 0; |
10201 | value->p = NULL; |
10202 | value->len = 0; |
10203 | value->buf = NULL; |
10204 | value->cap = 0; |
10205 | } |
10206 | |
10207 | static void |
10208 | ii_buffer_value_fin(grn_ctx *ctx, ii_buffer_value *value) |
10209 | { |
10210 | if (value->buf) { |
10211 | GRN_FREE(value->buf); |
10212 | } |
10213 | } |
10214 | |
10215 | /* |
10216 | * ii_buffer_values_append appends a value to ii_buffer. |
10217 | * This function deep-copies the value if need_copy == GRN_TRUE. |
10218 | */ |
10219 | static void |
10220 | ii_buffer_values_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
10221 | unsigned int sid, unsigned weight, |
10222 | const char *p, uint32_t len, grn_bool need_copy) |
10223 | { |
10224 | if (ii_buffer->nvalues == ii_buffer->max_nvalues) { |
10225 | unsigned int i; |
10226 | unsigned int new_max_nvalues = ii_buffer->max_nvalues * 2; |
10227 | unsigned int new_size; |
10228 | ii_buffer_value *new_values; |
10229 | if (new_max_nvalues == 0) { |
10230 | new_max_nvalues = 1; |
10231 | } |
10232 | new_size = new_max_nvalues * sizeof(ii_buffer_value); |
10233 | new_values = (ii_buffer_value *)GRN_REALLOC(ii_buffer->values, new_size); |
10234 | if (!new_values) { |
10235 | return; |
10236 | } |
10237 | for (i = ii_buffer->max_nvalues; i < new_max_nvalues; i++) { |
10238 | ii_buffer_value_init(ctx, &new_values[i]); |
10239 | } |
10240 | ii_buffer->values = new_values; |
10241 | ii_buffer->max_nvalues = new_max_nvalues; |
10242 | } |
10243 | |
10244 | { |
10245 | ii_buffer_value *value = &ii_buffer->values[ii_buffer->nvalues]; |
10246 | if (need_copy) { |
10247 | if (len > value->cap) { |
10248 | char *new_buf = (char *)GRN_REALLOC(value->buf, len); |
10249 | if (!new_buf) { |
10250 | return; |
10251 | } |
10252 | value->buf = new_buf; |
10253 | value->cap = len; |
10254 | } |
10255 | grn_memcpy(value->buf, p, len); |
10256 | p = value->buf; |
10257 | } |
10258 | value->sid = sid; |
10259 | value->weight = weight; |
10260 | value->p = p; |
10261 | value->len = len; |
10262 | ii_buffer->nvalues++; |
10263 | } |
10264 | } |
10265 | |
10266 | grn_rc |
10267 | grn_ii_buffer_append(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
10268 | grn_id rid, unsigned int sid, grn_obj *value) |
10269 | { |
10270 | if (rid != ii_buffer->last_rid) { |
10271 | if (ii_buffer->last_rid) { |
10272 | grn_ii_buffer_tokenize(ctx, ii_buffer, ii_buffer->last_rid); |
10273 | } |
10274 | ii_buffer->last_rid = rid; |
10275 | } |
10276 | ii_buffer_values_append(ctx, ii_buffer, sid, 0, |
10277 | GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value), |
10278 | GRN_TRUE); |
10279 | return ctx->rc; |
10280 | } |
10281 | |
10282 | /* |
10283 | * grn_ii_buffer_commit completes tokenization and builds an inverted index |
10284 | * from data in a temporary file. |
10285 | */ |
10286 | grn_rc |
10287 | grn_ii_buffer_commit(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
10288 | { |
10289 | /* Tokenize the remaining values and free resources. */ |
10290 | if (ii_buffer->last_rid && ii_buffer->nvalues) { |
10291 | grn_ii_buffer_tokenize(ctx, ii_buffer, ii_buffer->last_rid); |
10292 | } |
10293 | if (ii_buffer->block_pos) { |
10294 | grn_ii_buffer_flush(ctx, ii_buffer); |
10295 | } |
10296 | if (ii_buffer->tmpfd != -1) { |
10297 | grn_close(ii_buffer->tmpfd); |
10298 | } |
10299 | if (ii_buffer->block_buf) { |
10300 | GRN_FREE(ii_buffer->block_buf); |
10301 | ii_buffer->block_buf = NULL; |
10302 | } |
10303 | if (ii_buffer->counters) { |
10304 | GRN_FREE(ii_buffer->counters); |
10305 | ii_buffer->counters = NULL; |
10306 | } |
10307 | |
10308 | if (ii_buffer->update_buffer_size && |
10309 | ii_buffer->update_buffer_size < 20) { |
10310 | if (ii_buffer->update_buffer_size < 10) { |
10311 | ii_buffer->update_buffer_size = |
10312 | ii_buffer->total_size >> (10 - ii_buffer->update_buffer_size); |
10313 | } else { |
10314 | ii_buffer->update_buffer_size = |
10315 | ii_buffer->total_size << (ii_buffer->update_buffer_size - 10); |
10316 | } |
10317 | } |
10318 | |
10319 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
10320 | "nblocks=%d, update_buffer_size=%" GRN_FMT_INT64U, |
10321 | ii_buffer->nblocks, ii_buffer->update_buffer_size); |
10322 | |
10323 | datavec_init(ctx, ii_buffer->data_vectors, ii_buffer->ii->n_elements, 0, 0); |
10324 | grn_open(ii_buffer->tmpfd, |
10325 | ii_buffer->tmpfpath, |
10326 | O_RDONLY | GRN_OPEN_FLAG_BINARY); |
10327 | if (ii_buffer->tmpfd == -1) { |
10328 | ERRNO_ERR("failed to open path: <%s>" , ii_buffer->tmpfpath); |
10329 | return ctx->rc; |
10330 | } |
10331 | { |
10332 | /* Fetch the first term of each block. */ |
10333 | uint32_t i; |
10334 | for (i = 0; i < ii_buffer->nblocks; i++) { |
10335 | grn_ii_buffer_fetch(ctx, ii_buffer, &ii_buffer->blocks[i]); |
10336 | } |
10337 | } |
10338 | { |
10339 | ii_buffer_block **hits; |
10340 | if ((hits = GRN_MALLOCN(ii_buffer_block *, ii_buffer->nblocks))) { |
10341 | grn_id tid; |
10342 | grn_table_cursor *tc; |
10343 | tc = grn_table_cursor_open(ctx, ii_buffer->lexicon, |
10344 | NULL, 0, NULL, 0, 0, -1, II_BUFFER_ORDER); |
10345 | if (tc) { |
10346 | while ((tid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { |
10347 | /* |
10348 | * Find blocks which contain the current term. |
10349 | * Then, merge the postings. |
10350 | */ |
10351 | int nrests = 0; |
10352 | int nhits = 0; |
10353 | uint32_t i; |
10354 | for (i = 0; i < ii_buffer->nblocks; i++) { |
10355 | if (ii_buffer->blocks[i].tid == tid) { |
10356 | hits[nhits++] = &ii_buffer->blocks[i]; |
10357 | } |
10358 | if (ii_buffer->blocks[i].tid) { nrests++; } |
10359 | } |
10360 | if (nhits) { grn_ii_buffer_merge(ctx, ii_buffer, tid, hits, nhits); } |
10361 | if (!nrests) { break; } |
10362 | } |
10363 | if (ii_buffer->packed_len) { |
10364 | grn_ii_buffer_chunk_flush(ctx, ii_buffer); |
10365 | } |
10366 | grn_table_cursor_close(ctx, tc); |
10367 | } |
10368 | GRN_FREE(hits); |
10369 | } |
10370 | } |
10371 | datavec_fin(ctx, ii_buffer->data_vectors); |
10372 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
10373 | "tmpfile_size:%" GRN_FMT_INT64D " > total_chunk_size:%" GRN_FMT_SIZE, |
10374 | ii_buffer->filepos, ii_buffer->total_chunk_size); |
10375 | grn_close(ii_buffer->tmpfd); |
10376 | if (grn_unlink(ii_buffer->tmpfpath) == 0) { |
10377 | GRN_LOG(ctx, GRN_LOG_INFO, |
10378 | "[ii][buffer][commit] removed temporary path: <%s>" , |
10379 | ii_buffer->tmpfpath); |
10380 | } else { |
10381 | ERRNO_ERR("[ii][buffer][commit] failed to remove temporary path: <%s>" , |
10382 | ii_buffer->tmpfpath); |
10383 | } |
10384 | ii_buffer->tmpfd = -1; |
10385 | return ctx->rc; |
10386 | } |
10387 | |
10388 | grn_rc |
10389 | grn_ii_buffer_close(grn_ctx *ctx, grn_ii_buffer *ii_buffer) |
10390 | { |
10391 | uint32_t i; |
10392 | grn_table_flags flags; |
10393 | grn_table_get_info(ctx, ii_buffer->ii->lexicon, &flags, NULL, NULL, NULL, |
10394 | NULL); |
10395 | if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { |
10396 | grn_pat_cache_disable(ctx, (grn_pat *)ii_buffer->ii->lexicon); |
10397 | } |
10398 | if (ii_buffer->tmp_lexicon) { |
10399 | grn_obj_close(ctx, ii_buffer->tmp_lexicon); |
10400 | } |
10401 | if (ii_buffer->tmpfd != -1) { |
10402 | grn_close(ii_buffer->tmpfd); |
10403 | if (grn_unlink(ii_buffer->tmpfpath) == 0) { |
10404 | GRN_LOG(ctx, GRN_LOG_INFO, |
10405 | "[ii][buffer][close] removed temporary path: <%s>" , |
10406 | ii_buffer->tmpfpath); |
10407 | } else { |
10408 | ERRNO_ERR("[ii][buffer][close] failed to remove temporary path: <%s>" , |
10409 | ii_buffer->tmpfpath); |
10410 | } |
10411 | } |
10412 | if (ii_buffer->block_buf) { |
10413 | GRN_FREE(ii_buffer->block_buf); |
10414 | } |
10415 | if (ii_buffer->counters) { |
10416 | GRN_FREE(ii_buffer->counters); |
10417 | } |
10418 | if (ii_buffer->blocks) { |
10419 | for (i = 0; i < ii_buffer->nblocks; i++) { |
10420 | if (ii_buffer->blocks[i].buffer) { |
10421 | GRN_FREE(ii_buffer->blocks[i].buffer); |
10422 | } |
10423 | } |
10424 | GRN_FREE(ii_buffer->blocks); |
10425 | } |
10426 | if (ii_buffer->values) { |
10427 | for (i = 0; i < ii_buffer->max_nvalues; i++) { |
10428 | ii_buffer_value_fin(ctx, &ii_buffer->values[i]); |
10429 | } |
10430 | GRN_FREE(ii_buffer->values); |
10431 | } |
10432 | GRN_FREE(ii_buffer); |
10433 | return ctx->rc; |
10434 | } |
10435 | |
10436 | /* |
10437 | * grn_ii_buffer_parse tokenizes values to be indexed. |
10438 | * |
10439 | * For each record of the target table, grn_ii_buffer_parse makes a list of |
10440 | * target values and calls grn_ii_buffer_tokenize. To make a list of target |
10441 | * values, ii_buffer_values_append is called for each value. Note that |
10442 | * ii_buffer_values_append is called for each element for a vector. |
10443 | */ |
10444 | static void |
10445 | grn_ii_buffer_parse(grn_ctx *ctx, grn_ii_buffer *ii_buffer, |
10446 | grn_obj *target, int ncols, grn_obj **cols) |
10447 | { |
10448 | grn_table_cursor *tc; |
10449 | grn_obj *vobjs; |
10450 | if ((vobjs = GRN_MALLOCN(grn_obj, ncols))) { |
10451 | int i; |
10452 | for (i = 0; i < ncols; i++) { |
10453 | GRN_TEXT_INIT(&vobjs[i], 0); |
10454 | } |
10455 | if ((tc = grn_table_cursor_open(ctx, target, |
10456 | NULL, 0, NULL, 0, 0, -1, |
10457 | GRN_CURSOR_BY_ID))) { |
10458 | grn_id rid; |
10459 | while ((rid = grn_table_cursor_next(ctx, tc)) != GRN_ID_NIL) { |
10460 | unsigned int j; |
10461 | int sid; |
10462 | grn_obj **col; |
10463 | for (sid = 1, col = cols; sid <= ncols; sid++, col++) { |
10464 | grn_obj *rv = &vobjs[sid - 1]; |
10465 | grn_obj_reinit_for(ctx, rv, *col); |
10466 | if (GRN_OBJ_TABLEP(*col)) { |
10467 | grn_table_get_key2(ctx, *col, rid, rv); |
10468 | } else { |
10469 | grn_obj_get_value(ctx, *col, rid, rv); |
10470 | } |
10471 | switch (rv->header.type) { |
10472 | case GRN_BULK : |
10473 | ii_buffer_values_append(ctx, ii_buffer, sid, 0, |
10474 | GRN_TEXT_VALUE(rv), GRN_TEXT_LEN(rv), |
10475 | GRN_FALSE); |
10476 | break; |
10477 | case GRN_UVECTOR : |
10478 | { |
10479 | unsigned int size; |
10480 | unsigned int elem_size; |
10481 | size = grn_uvector_size(ctx, rv); |
10482 | elem_size = grn_uvector_element_size(ctx, rv); |
10483 | for (j = 0; j < size; j++) { |
10484 | ii_buffer_values_append(ctx, ii_buffer, sid, 0, |
10485 | GRN_BULK_HEAD(rv) + (elem_size * j), |
10486 | elem_size, GRN_FALSE); |
10487 | } |
10488 | } |
10489 | break; |
10490 | case GRN_VECTOR : |
10491 | if (rv->u.v.body) { |
10492 | int j; |
10493 | int n_sections = rv->u.v.n_sections; |
10494 | grn_section *sections = rv->u.v.sections; |
10495 | const char *head = GRN_BULK_HEAD(rv->u.v.body); |
10496 | for (j = 0; j < n_sections; j++) { |
10497 | grn_section *section = sections + j; |
10498 | if (section->length == 0) { |
10499 | continue; |
10500 | } |
10501 | ii_buffer_values_append(ctx, ii_buffer, sid, section->weight, |
10502 | head + section->offset, |
10503 | section->length, GRN_FALSE); |
10504 | } |
10505 | } |
10506 | break; |
10507 | default : |
10508 | ERR(GRN_INVALID_ARGUMENT, |
10509 | "[index] invalid object assigned as value" ); |
10510 | break; |
10511 | } |
10512 | } |
10513 | grn_ii_buffer_tokenize(ctx, ii_buffer, rid); |
10514 | } |
10515 | grn_table_cursor_close(ctx, tc); |
10516 | } |
10517 | for (i = 0; i < ncols; i++) { |
10518 | GRN_OBJ_FIN(ctx, &vobjs[i]); |
10519 | } |
10520 | GRN_FREE(vobjs); |
10521 | } |
10522 | } |
10523 | |
10524 | grn_rc |
10525 | grn_ii_build(grn_ctx *ctx, grn_ii *ii, uint64_t sparsity) |
10526 | { |
10527 | grn_ii_buffer *ii_buffer; |
10528 | |
10529 | { |
10530 | /* Do nothing if there are no targets. */ |
10531 | grn_obj *data_table = grn_ctx_at(ctx, DB_OBJ(ii)->range); |
10532 | if (!data_table) { |
10533 | return ctx->rc; |
10534 | } |
10535 | if (grn_table_size(ctx, data_table) == 0) { |
10536 | return ctx->rc; |
10537 | } |
10538 | } |
10539 | |
10540 | ii_buffer = grn_ii_buffer_open(ctx, ii, sparsity); |
10541 | if (ii_buffer) { |
10542 | grn_id *source = (grn_id *)ii->obj.source; |
10543 | if (ii->obj.source_size && ii->obj.source) { |
10544 | int ncols = ii->obj.source_size / sizeof(grn_id); |
10545 | grn_obj **cols = GRN_MALLOCN(grn_obj *, ncols); |
10546 | if (cols) { |
10547 | int i; |
10548 | for (i = 0; i < ncols; i++) { |
10549 | if (!(cols[i] = grn_ctx_at(ctx, source[i]))) { break; } |
10550 | } |
10551 | if (i == ncols) { /* All the source columns are available. */ |
10552 | grn_obj *target = cols[0]; |
10553 | if (!GRN_OBJ_TABLEP(target)) { |
10554 | target = grn_ctx_at(ctx, target->header.domain); |
10555 | } |
10556 | if (target) { |
10557 | grn_ii_buffer_parse(ctx, ii_buffer, target, ncols, cols); |
10558 | grn_ii_buffer_commit(ctx, ii_buffer); |
10559 | } else { |
10560 | ERR(GRN_INVALID_ARGUMENT, "failed to resolve the target" ); |
10561 | } |
10562 | } else { |
10563 | ERR(GRN_INVALID_ARGUMENT, "failed to resolve a column (%d)" , i); |
10564 | } |
10565 | GRN_FREE(cols); |
10566 | } |
10567 | } else { |
10568 | ERR(GRN_INVALID_ARGUMENT, "ii->obj.source is void" ); |
10569 | } |
10570 | grn_ii_buffer_close(ctx, ii_buffer); |
10571 | } |
10572 | return ctx->rc; |
10573 | } |
10574 | |
10575 | /* |
10576 | * ========================================================================== |
10577 | * The following part provides constants, structures and functions for static |
10578 | * indexing. |
10579 | * ========================================================================== |
10580 | */ |
10581 | |
/*
 * Initial size of the chunk assigned to a builder buffer.
 * grn_ii_builder_buffer_assign doubles this size until it is not less than
 * the requested minimum chunk size.
 */
#define GRN_II_BUILDER_BUFFER_CHUNK_SIZE (S_CHUNK >> 2)

/*
 * The following constants are the bounds applied to grn_ii_builder_options
 * by grn_ii_builder_options_fix.
 */

/* Upper bound of lexicon_cache_size (no lower bound is applied). */
#define GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE (1 << 24)

/* Bounds of block_threshold. */
#define GRN_II_BUILDER_MIN_BLOCK_THRESHOLD 1
#define GRN_II_BUILDER_MAX_BLOCK_THRESHOLD (1 << 28)

/* Bounds of file_buf_size. */
#define GRN_II_BUILDER_MIN_FILE_BUF_SIZE (1 << 12)
#define GRN_II_BUILDER_MAX_FILE_BUF_SIZE (1 << 30)

/* Bounds of block_buf_size. */
#define GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE (1 << 12)
#define GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE (1 << 30)

/* Bounds of chunk_threshold. */
#define GRN_II_BUILDER_MIN_CHUNK_THRESHOLD 1
#define GRN_II_BUILDER_MAX_CHUNK_THRESHOLD (1 << 28)

/*
 * Bounds of buffer_max_n_terms.  The upper bound is the number of
 * buffer_term entries that fit in a segment after its buffer_header.
 */
#define GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS 1
#define GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS \
  ((S_SEGMENT - sizeof(buffer_header)) / sizeof(buffer_term))
10601 | |
/*
 * grn_ii_builder_options holds the tunable parameters of static indexing.
 *
 * grn_ii_builder_options_init fills it with the default values and
 * grn_ii_builder_options_fix clamps out-of-range values to the
 * GRN_II_BUILDER_MIN_xxx and GRN_II_BUILDER_MAX_xxx bounds.
 */
struct grn_ii_builder_options {
  uint32_t lexicon_cache_size; /* Cache size of temporary lexicon */
  /* A block is flushed if builder->n reaches this value. */
  uint32_t block_threshold;
  uint32_t file_buf_size;      /* Buffer size for buffered output */
  uint32_t block_buf_size;     /* Buffer size for buffered input */
  /* A chunk is flushed if chunk->n reaches this value. */
  uint32_t chunk_threshold;
  uint32_t buffer_max_n_terms; /* Maximum number of terms in each buffer */
};
10612 | |
/*
 * Default options copied by grn_ii_builder_options_init.  The initializers
 * are positional: they must follow the field order of
 * struct grn_ii_builder_options.
 */
static const grn_ii_builder_options grn_ii_builder_default_options = {
  0x80000,   /* lexicon_cache_size */
  0x4000000, /* block_threshold */
  0x10000,   /* file_buf_size */
  0x10000,   /* block_buf_size */
  0x1000,    /* chunk_threshold */
  0x3000,    /* buffer_max_n_terms */
};
10621 | |
10622 | /* grn_ii_builder_options_init fills options with the default options. */ |
10623 | void |
10624 | grn_ii_builder_options_init(grn_ii_builder_options *options) |
10625 | { |
10626 | *options = grn_ii_builder_default_options; |
10627 | } |
10628 | |
10629 | /* grn_ii_builder_options_fix fixes out-of-range options. */ |
10630 | static void |
10631 | grn_ii_builder_options_fix(grn_ii_builder_options *options) |
10632 | { |
10633 | if (options->lexicon_cache_size > GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE) { |
10634 | options->lexicon_cache_size = GRN_II_BUILDER_MAX_LEXICON_CACHE_SIZE; |
10635 | } |
10636 | |
10637 | if (options->block_threshold < GRN_II_BUILDER_MIN_BLOCK_THRESHOLD) { |
10638 | options->block_threshold = GRN_II_BUILDER_MIN_BLOCK_THRESHOLD; |
10639 | } |
10640 | if (options->block_threshold > GRN_II_BUILDER_MAX_BLOCK_THRESHOLD) { |
10641 | options->block_threshold = GRN_II_BUILDER_MAX_BLOCK_THRESHOLD; |
10642 | } |
10643 | |
10644 | if (options->file_buf_size < GRN_II_BUILDER_MIN_FILE_BUF_SIZE) { |
10645 | options->file_buf_size = GRN_II_BUILDER_MIN_FILE_BUF_SIZE; |
10646 | } |
10647 | if (options->file_buf_size > GRN_II_BUILDER_MAX_FILE_BUF_SIZE) { |
10648 | options->file_buf_size = GRN_II_BUILDER_MAX_FILE_BUF_SIZE; |
10649 | } |
10650 | |
10651 | if (options->block_buf_size < GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE) { |
10652 | options->block_buf_size = GRN_II_BUILDER_MIN_BLOCK_BUF_SIZE; |
10653 | } |
10654 | if (options->block_buf_size > GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE) { |
10655 | options->block_buf_size = GRN_II_BUILDER_MAX_BLOCK_BUF_SIZE; |
10656 | } |
10657 | |
10658 | if (options->chunk_threshold < GRN_II_BUILDER_MIN_CHUNK_THRESHOLD) { |
10659 | options->chunk_threshold = GRN_II_BUILDER_MIN_CHUNK_THRESHOLD; |
10660 | } |
10661 | if (options->chunk_threshold > GRN_II_BUILDER_MAX_CHUNK_THRESHOLD) { |
10662 | options->chunk_threshold = GRN_II_BUILDER_MAX_CHUNK_THRESHOLD; |
10663 | } |
10664 | |
10665 | if (options->buffer_max_n_terms < GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS) { |
10666 | options->buffer_max_n_terms = GRN_II_BUILDER_MIN_BUFFER_MAX_N_TERMS; |
10667 | } |
10668 | if (options->buffer_max_n_terms > GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS) { |
10669 | options->buffer_max_n_terms = GRN_II_BUILDER_MAX_BUFFER_MAX_N_TERMS; |
10670 | } |
10671 | } |
10672 | |
/*
 * Size in bytes of the in-place term buffer: the part of grn_ii_builder_term
 * that starts at the dummy member and runs to the end of the structure.  The
 * offset of dummy is computed with a null-pointer cast (a hand-written
 * offsetof).  A term whose size equals this value is regarded as in-place.
 */
#define GRN_II_BUILDER_TERM_INPLACE_SIZE\
  (sizeof(grn_ii_builder_term) - (uintptr_t)&((grn_ii_builder_term *)0)->dummy)
10675 | |
/*
 * grn_ii_builder_term is a growable byte buffer of encoded integers together
 * with the last values appended for a term.
 *
 * While size == GRN_II_BUILDER_TERM_INPLACE_SIZE, the data is stored in the
 * structure itself, starting at dummy and overlapping buf, so buf is not a
 * valid pointer in that state.  Once the buffer has been extended, buf points
 * to heap memory that grn_ii_builder_term_fin frees.
 */
typedef struct {
  grn_id   rid;         /* Last record ID */
  uint32_t sid;         /* Last section ID */
  /* Last position (GRN_OBJ_WITH_POSITION) or frequency. */
  uint32_t pos_or_freq;
  uint32_t offset;      /* Buffer write offset */
  uint32_t size;        /* Buffer size */
  uint32_t dummy;       /* Padding; also the first bytes of the in-place buffer */
  uint8_t  *buf;        /* Buffer (to be freed) */
} grn_ii_builder_term;
10686 | |
10687 | /* grn_ii_builder_term_is_inplace returns whether a term buffer is inplace. */ |
10688 | inline static grn_bool |
10689 | grn_ii_builder_term_is_inplace(grn_ii_builder_term *term) |
10690 | { |
10691 | return term->size == GRN_II_BUILDER_TERM_INPLACE_SIZE; |
10692 | } |
10693 | |
10694 | /* grn_ii_builder_term_get_buf returns a term buffer. */ |
10695 | inline static uint8_t * |
10696 | grn_ii_builder_term_get_buf(grn_ii_builder_term *term) |
10697 | { |
10698 | if (grn_ii_builder_term_is_inplace(term)) { |
10699 | return (uint8_t *)&term->dummy; |
10700 | } else { |
10701 | return term->buf; |
10702 | } |
10703 | } |
10704 | |
10705 | /* |
10706 | * grn_ii_builder_term_init initializes a term. Note that an initialized term |
10707 | * must be finalized by grn_ii_builder_term_fin. |
10708 | */ |
10709 | static void |
10710 | grn_ii_builder_term_init(grn_ctx *ctx, grn_ii_builder_term *term) |
10711 | { |
10712 | term->rid = GRN_ID_NIL; |
10713 | term->sid = 0; |
10714 | term->pos_or_freq = 0; |
10715 | term->offset = 0; |
10716 | term->size = GRN_II_BUILDER_TERM_INPLACE_SIZE; |
10717 | } |
10718 | |
10719 | /* grn_ii_builder_term_fin finalizes a term. */ |
10720 | static void |
10721 | grn_ii_builder_term_fin(grn_ctx *ctx, grn_ii_builder_term *term) |
10722 | { |
10723 | if (!grn_ii_builder_term_is_inplace(term)) { |
10724 | GRN_FREE(term->buf); |
10725 | } |
10726 | } |
10727 | |
10728 | /* grn_ii_builder_term_reinit reinitializes a term. */ |
10729 | static void |
10730 | grn_ii_builder_term_reinit(grn_ctx *ctx, grn_ii_builder_term *term) |
10731 | { |
10732 | grn_ii_builder_term_fin(ctx, term); |
10733 | grn_ii_builder_term_init(ctx, term); |
10734 | } |
10735 | |
10736 | /* grn_ii_builder_term_extend extends a term buffer. */ |
10737 | static grn_rc |
10738 | grn_ii_builder_term_extend(grn_ctx *ctx, grn_ii_builder_term *term) |
10739 | { |
10740 | uint8_t *buf; |
10741 | uint32_t size = term->size * 2; |
10742 | if (grn_ii_builder_term_is_inplace(term)) { |
10743 | buf = (uint8_t *)GRN_MALLOC(size); |
10744 | if (!buf) { |
10745 | ERR(GRN_NO_MEMORY_AVAILABLE, |
10746 | "failed to allocate memory for term buffer: size = %u" , size); |
10747 | return ctx->rc; |
10748 | } |
10749 | grn_memcpy(buf, &term->dummy, term->offset); |
10750 | } else { |
10751 | buf = (uint8_t *)GRN_REALLOC(term->buf, size); |
10752 | if (!buf) { |
10753 | ERR(GRN_NO_MEMORY_AVAILABLE, |
10754 | "failed to reallocate memory for term buffer: size = %u" , size); |
10755 | return ctx->rc; |
10756 | } |
10757 | } |
10758 | term->buf = buf; |
10759 | term->size = size; |
10760 | return GRN_SUCCESS; |
10761 | } |
10762 | |
10763 | /* grn_ii_builder_term_append appends an integer to a term buffer. */ |
10764 | inline static grn_rc |
10765 | grn_ii_builder_term_append(grn_ctx *ctx, grn_ii_builder_term *term, |
10766 | uint64_t value) |
10767 | { |
10768 | uint8_t *p; |
10769 | if (value < (uint64_t)1 << 5) { |
10770 | if (term->offset + 1 > term->size) { |
10771 | grn_rc rc = grn_ii_builder_term_extend(ctx, term); |
10772 | if (rc != GRN_SUCCESS) { |
10773 | return rc; |
10774 | } |
10775 | } |
10776 | p = grn_ii_builder_term_get_buf(term) + term->offset; |
10777 | p[0] = (uint8_t)value; |
10778 | term->offset++; |
10779 | return GRN_SUCCESS; |
10780 | } else if (value < (uint64_t)1 << 13) { |
10781 | if (term->offset + 2 > term->size) { |
10782 | grn_rc rc = grn_ii_builder_term_extend(ctx, term); |
10783 | if (rc != GRN_SUCCESS) { |
10784 | return rc; |
10785 | } |
10786 | } |
10787 | p = grn_ii_builder_term_get_buf(term) + term->offset; |
10788 | p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); |
10789 | p[1] = (uint8_t)(value >> 5); |
10790 | term->offset += 2; |
10791 | return GRN_SUCCESS; |
10792 | } else { |
10793 | uint8_t i, n; |
10794 | if (value < (uint64_t)1 << 21) { |
10795 | n = 3; |
10796 | } else if (value < (uint64_t)1 << 29) { |
10797 | n = 4; |
10798 | } else if (value < (uint64_t)1 << 37) { |
10799 | n = 5; |
10800 | } else if (value < (uint64_t)1 << 45) { |
10801 | n = 6; |
10802 | } else if (value < (uint64_t)1 << 53) { |
10803 | n = 7; |
10804 | } else { |
10805 | n = 8; |
10806 | } |
10807 | if (term->offset + n > term->size) { |
10808 | grn_rc rc = grn_ii_builder_term_extend(ctx, term); |
10809 | if (rc != GRN_SUCCESS) { |
10810 | return rc; |
10811 | } |
10812 | } |
10813 | p = grn_ii_builder_term_get_buf(term) + term->offset; |
10814 | p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); |
10815 | value >>= 5; |
10816 | for (i = 1; i < n; i++) { |
10817 | p[i] = (uint8_t)value; |
10818 | value >>= 8; |
10819 | } |
10820 | term->offset += n; |
10821 | return GRN_SUCCESS; |
10822 | } |
10823 | } |
10824 | |
/*
 * grn_ii_builder_block is the read state of a block.  Encoded integers are
 * read from cur up to end by grn_ii_builder_block_next, and buf is freed by
 * grn_ii_builder_block_fin.
 */
typedef struct {
  uint64_t offset; /* File offset */
  uint32_t rest;   /* Remaining size */
  uint8_t  *buf;   /* Buffer (to be freed) */
  uint8_t  *cur;   /* Current pointer */
  uint8_t  *end;   /* End pointer */
  uint32_t tid;    /* Term ID */
} grn_ii_builder_block;
10833 | |
10834 | /* |
10835 | * grn_ii_builder_block_init initializes a block. Note that an initialized |
10836 | * block must be finalized by grn_ii_builder_block_fin. |
10837 | */ |
10838 | static void |
10839 | grn_ii_builder_block_init(grn_ctx *ctx, grn_ii_builder_block *block) |
10840 | { |
10841 | block->offset = 0; |
10842 | block->rest = 0; |
10843 | block->buf = NULL; |
10844 | block->cur = NULL; |
10845 | block->end = NULL; |
10846 | block->tid = GRN_ID_NIL; |
10847 | } |
10848 | |
10849 | /* grn_ii_builder_block_fin finalizes a block. */ |
10850 | static void |
10851 | grn_ii_builder_block_fin(grn_ctx *ctx, grn_ii_builder_block *block) |
10852 | { |
10853 | if (block->buf) { |
10854 | GRN_FREE(block->buf); |
10855 | } |
10856 | } |
10857 | |
10858 | /* |
10859 | * grn_ii_builder_block_next reads the next integer. Note that this function |
10860 | * returns GRN_END_OF_DATA if it reaches the end of a block. |
10861 | */ |
10862 | inline static grn_rc |
10863 | grn_ii_builder_block_next(grn_ctx *ctx, grn_ii_builder_block *block, |
10864 | uint64_t *value) |
10865 | { |
10866 | uint8_t n; |
10867 | if (block->cur == block->end) { |
10868 | return GRN_END_OF_DATA; |
10869 | } |
10870 | n = (*block->cur >> 5) + 1; |
10871 | if (n > block->end - block->cur) { |
10872 | return GRN_END_OF_DATA; |
10873 | } |
10874 | *value = 0; |
10875 | switch (n) { |
10876 | case 8 : |
10877 | *value |= (uint64_t)block->cur[7] << 53; |
10878 | case 7 : |
10879 | *value |= (uint64_t)block->cur[6] << 45; |
10880 | case 6 : |
10881 | *value |= (uint64_t)block->cur[5] << 37; |
10882 | case 5 : |
10883 | *value |= (uint64_t)block->cur[4] << 29; |
10884 | case 4 : |
10885 | *value |= (uint64_t)block->cur[3] << 21; |
10886 | case 3 : |
10887 | *value |= (uint64_t)block->cur[2] << 13; |
10888 | case 2 : |
10889 | *value |= (uint64_t)block->cur[1] << 5; |
10890 | case 1 : |
10891 | *value |= block->cur[0] & 0x1f; |
10892 | break; |
10893 | } |
10894 | block->cur += n; |
10895 | return GRN_SUCCESS; |
10896 | } |
10897 | |
/*
 * grn_ii_builder_buffer keeps the buffer segment and the chunk that are
 * currently assigned for output.  buf and chunk are references obtained with
 * GRN_IO_SEG_REF; grn_ii_builder_buffer_fin releases them.
 */
typedef struct {
  grn_ii   *ii;          /* Inverted index */
  uint32_t buf_id;       /* Buffer ID */
  uint32_t buf_seg_id;   /* Buffer segment ID */
  buffer   *buf;         /* Buffer (to be unreferenced) */
  uint32_t chunk_id;     /* Chunk ID */
  uint32_t chunk_seg_id; /* Chunk segment ID */
  uint8_t  *chunk;       /* Chunk (to be unreferenced) */
  uint32_t chunk_offset; /* Chunk write position */
  uint32_t chunk_size;   /* Chunk size */
} grn_ii_builder_buffer;
10909 | |
10910 | /* |
10911 | * grn_ii_builder_buffer_init initializes a buffer. Note that a buffer must be |
10912 | * finalized by grn_ii_builder_buffer_fin. |
10913 | */ |
10914 | static void |
10915 | grn_ii_builder_buffer_init(grn_ctx *ctx, grn_ii_builder_buffer *buf, |
10916 | grn_ii *ii) |
10917 | { |
10918 | buf->ii = ii; |
10919 | buf->buf_id = 0; |
10920 | buf->buf_seg_id = 0; |
10921 | buf->buf = NULL; |
10922 | buf->chunk_id = 0; |
10923 | buf->chunk_seg_id = 0; |
10924 | buf->chunk = NULL; |
10925 | buf->chunk_offset = 0; |
10926 | buf->chunk_size = 0; |
10927 | } |
10928 | |
10929 | /* grn_ii_builder_buffer_fin finalizes a buffer. */ |
10930 | static void |
10931 | grn_ii_builder_buffer_fin(grn_ctx *ctx, grn_ii_builder_buffer *buf) |
10932 | { |
10933 | if (buf->buf) { |
10934 | GRN_IO_SEG_UNREF(buf->ii->seg, buf->buf_seg_id); |
10935 | } |
10936 | if (buf->chunk) { |
10937 | GRN_IO_SEG_UNREF(buf->ii->chunk, buf->chunk_seg_id); |
10938 | } |
10939 | } |
10940 | |
10941 | /* grn_ii_builder_buffer_is_assigned returns whether a buffer is assigned. */ |
10942 | static grn_bool |
10943 | grn_ii_builder_buffer_is_assigned(grn_ctx *ctx, grn_ii_builder_buffer *buf) |
10944 | { |
10945 | return buf->buf != NULL; |
10946 | } |
10947 | |
10948 | /* grn_ii_builder_buffer_assign assigns a buffer. */ |
10949 | static grn_rc |
10950 | grn_ii_builder_buffer_assign(grn_ctx *ctx, grn_ii_builder_buffer *buf, |
10951 | size_t min_chunk_size) |
10952 | { |
10953 | void *seg; |
10954 | size_t chunk_size; |
10955 | grn_rc rc; |
10956 | |
10957 | /* Create a buffer. */ |
10958 | buf->buf_id = GRN_II_PSEG_NOT_ASSIGNED; |
10959 | rc = buffer_segment_new(ctx, buf->ii, &buf->buf_id); |
10960 | if (rc != GRN_SUCCESS) { |
10961 | if (ctx->rc != GRN_SUCCESS) { |
10962 | ERR(rc, "failed to allocate segment for buffer" ); |
10963 | } |
10964 | return rc; |
10965 | } |
10966 | buf->buf_seg_id = buf->ii->header->binfo[buf->buf_id]; |
10967 | GRN_IO_SEG_REF(buf->ii->seg, buf->buf_seg_id, seg); |
10968 | if (!seg) { |
10969 | if (ctx->rc == GRN_SUCCESS) { |
10970 | ERR(GRN_UNKNOWN_ERROR, |
10971 | "failed access buffer segment: buf_id = %u, seg_id = %u" , |
10972 | buf->buf_id, buf->buf_seg_id); |
10973 | } |
10974 | return ctx->rc; |
10975 | } |
10976 | buf->buf = (buffer *)seg; |
10977 | |
10978 | /* Create a chunk. */ |
10979 | chunk_size = GRN_II_BUILDER_BUFFER_CHUNK_SIZE; |
10980 | while (chunk_size < min_chunk_size) { |
10981 | chunk_size *= 2; |
10982 | } |
10983 | rc = chunk_new(ctx, buf->ii, &buf->chunk_id, chunk_size); |
10984 | if (rc != GRN_SUCCESS) { |
10985 | return rc; |
10986 | } |
10987 | buf->chunk_seg_id = buf->chunk_id >> GRN_II_N_CHUNK_VARIATION; |
10988 | GRN_IO_SEG_REF(buf->ii->chunk, buf->chunk_seg_id, seg); |
10989 | if (!seg) { |
10990 | if (ctx->rc == GRN_SUCCESS) { |
10991 | ERR(GRN_UNKNOWN_ERROR, |
10992 | "failed access chunk segment: chunk_id = %u, seg_id = %u" , |
10993 | buf->chunk_id, buf->chunk_seg_id); |
10994 | } |
10995 | return ctx->rc; |
10996 | } |
10997 | buf->chunk = (uint8_t *)seg; |
10998 | buf->chunk += (buf->chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << |
10999 | GRN_II_W_LEAST_CHUNK; |
11000 | buf->chunk_offset = 0; |
11001 | buf->chunk_size = chunk_size; |
11002 | |
11003 | buf->buf->header.chunk = buf->chunk_id; |
11004 | buf->buf->header.chunk_size = chunk_size; |
11005 | buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header); |
11006 | buf->buf->header.nterms = 0; |
11007 | buf->buf->header.nterms_void = 0; |
11008 | buf->ii->header->total_chunk_size += chunk_size; |
11009 | return GRN_SUCCESS; |
11010 | } |
11011 | |
11012 | /* grn_ii_builder_buffer_flush flushes a buffer. */ |
11013 | static grn_rc |
11014 | grn_ii_builder_buffer_flush(grn_ctx *ctx, grn_ii_builder_buffer *buf) |
11015 | { |
11016 | grn_ii *ii; |
11017 | |
11018 | buf->buf->header.buffer_free = S_SEGMENT - sizeof(buffer_header) - |
11019 | buf->buf->header.nterms * sizeof(buffer_term); |
11020 | GRN_LOG(ctx, GRN_LOG_DEBUG, |
11021 | "n_terms = %u, chunk_offset = %u, chunk_size = %u, total = %" |
11022 | GRN_FMT_INT64U "KB" , |
11023 | buf->buf->header.nterms, |
11024 | buf->chunk_offset, |
11025 | buf->buf->header.chunk_size, |
11026 | buf->ii->header->total_chunk_size >> 10); |
11027 | |
11028 | ii = buf->ii; |
11029 | grn_ii_builder_buffer_fin(ctx, buf); |
11030 | grn_ii_builder_buffer_init(ctx, buf, ii); |
11031 | return GRN_SUCCESS; |
11032 | } |
11033 | |
/*
 * grn_ii_builder_chunk collects the integers to be encoded for a term.
 *
 * rid_buf, sid_buf, freq_buf and weight_buf share offset and size and are
 * extended together by grn_ii_builder_chunk_extend_bufs (sid_buf and
 * weight_buf only when the index has sections and weights respectively).
 * pos_buf and enc_buf are sized on their own.
 */
typedef struct {
  grn_id   tid;         /* Term ID */
  uint32_t n;           /* Number of integers in buffers */
  grn_id   rid;         /* Record ID */
  uint32_t rid_gap;     /* Record ID gap */
  uint64_t pos_sum;     /* Sum of position gaps */

  uint32_t offset;      /* Write offset */
  uint32_t size;        /* Buffer size */
  grn_id   *rid_buf;    /* Buffer for record IDs (to be freed) */
  uint32_t *sid_buf;    /* Buffer for section IDs (to be freed) */
  uint32_t *freq_buf;   /* Buffer for frequencies (to be freed) */
  uint32_t *weight_buf; /* Buffer for weights (to be freed) */

  uint32_t pos_offset;  /* Write offset of pos_buf */
  uint32_t pos_size;    /* Buffer size of pos_buf */
  uint32_t *pos_buf;    /* Buffer for positions (to be freed) */

  size_t   enc_offset;  /* Write offset of enc_buf */
  size_t   enc_size;    /* Buffer size of enc_buf */
  uint8_t  *enc_buf;    /* Buffer for encoded data (to be freed) */
} grn_ii_builder_chunk;
11056 | |
11057 | /* |
11058 | * grn_ii_builder_chunk_init initializes a chunk. Note that an initialized |
11059 | * chunk must be finalized by grn_ii_builder_chunk_fin. |
11060 | */ |
11061 | static void |
11062 | grn_ii_builder_chunk_init(grn_ctx *ctx, grn_ii_builder_chunk *chunk) |
11063 | { |
11064 | chunk->tid = GRN_ID_NIL; |
11065 | chunk->n = 0; |
11066 | chunk->rid = GRN_ID_NIL; |
11067 | chunk->rid_gap = 0; |
11068 | chunk->pos_sum = 0; |
11069 | |
11070 | chunk->offset = 0; |
11071 | chunk->size = 0; |
11072 | chunk->rid_buf = NULL; |
11073 | chunk->sid_buf = NULL; |
11074 | chunk->freq_buf = NULL; |
11075 | chunk->weight_buf = NULL; |
11076 | |
11077 | chunk->pos_offset = 0; |
11078 | chunk->pos_size = 0; |
11079 | chunk->pos_buf = NULL; |
11080 | |
11081 | chunk->enc_offset = 0; |
11082 | chunk->enc_size = 0; |
11083 | chunk->enc_buf = NULL; |
11084 | } |
11085 | |
11086 | /* grn_ii_builder_chunk_fin finalizes a chunk. */ |
11087 | static void |
11088 | grn_ii_builder_chunk_fin(grn_ctx *ctx, grn_ii_builder_chunk *chunk) |
11089 | { |
11090 | if (chunk->enc_buf) { |
11091 | GRN_FREE(chunk->enc_buf); |
11092 | } |
11093 | if (chunk->pos_buf) { |
11094 | GRN_FREE(chunk->pos_buf); |
11095 | } |
11096 | if (chunk->weight_buf) { |
11097 | GRN_FREE(chunk->weight_buf); |
11098 | } |
11099 | if (chunk->freq_buf) { |
11100 | GRN_FREE(chunk->freq_buf); |
11101 | } |
11102 | if (chunk->sid_buf) { |
11103 | GRN_FREE(chunk->sid_buf); |
11104 | } |
11105 | if (chunk->rid_buf) { |
11106 | GRN_FREE(chunk->rid_buf); |
11107 | } |
11108 | } |
11109 | |
11110 | /* |
11111 | * grn_ii_builder_chunk_clear clears stats except rid and buffers except |
11112 | * enc_buf. |
11113 | */ |
11114 | static void |
11115 | grn_ii_builder_chunk_clear(grn_ctx *ctx, grn_ii_builder_chunk *chunk) |
11116 | { |
11117 | chunk->n = 0; |
11118 | chunk->rid_gap = 0; |
11119 | chunk->pos_sum = 0; |
11120 | chunk->offset = 0; |
11121 | chunk->pos_offset = 0; |
11122 | } |
11123 | |
11124 | /* |
11125 | * grn_ii_builder_chunk_extend_bufs extends buffers except pos_buf and enc_buf. |
11126 | */ |
11127 | static grn_rc |
11128 | grn_ii_builder_chunk_extend_bufs(grn_ctx *ctx, grn_ii_builder_chunk *chunk, |
11129 | uint32_t ii_flags) |
11130 | { |
11131 | uint32_t *buf, size = chunk->size ? chunk->size * 2 : 1; |
11132 | size_t n_bytes = size * sizeof(uint32_t); |
11133 | |
11134 | buf = (uint32_t *)GRN_REALLOC(chunk->rid_buf, n_bytes); |
11135 | if (!buf) { |
11136 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11137 | "failed to allocate memory for record IDs: n_bytes = %" GRN_FMT_SIZE, |
11138 | n_bytes); |
11139 | return ctx->rc; |
11140 | } |
11141 | chunk->rid_buf = buf; |
11142 | |
11143 | if (ii_flags & GRN_OBJ_WITH_SECTION) { |
11144 | buf = (uint32_t *)GRN_REALLOC(chunk->sid_buf, n_bytes); |
11145 | if (!buf) { |
11146 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11147 | "failed to allocate memory for section IDs:" |
11148 | " n_bytes = %" GRN_FMT_SIZE, |
11149 | n_bytes); |
11150 | return ctx->rc; |
11151 | } |
11152 | chunk->sid_buf = buf; |
11153 | } |
11154 | |
11155 | buf = (uint32_t *)GRN_REALLOC(chunk->freq_buf, n_bytes); |
11156 | if (!buf) { |
11157 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11158 | "failed to allocate memory for frequencies: n_bytes = %" GRN_FMT_SIZE, |
11159 | n_bytes); |
11160 | return ctx->rc; |
11161 | } |
11162 | chunk->freq_buf = buf; |
11163 | |
11164 | if (ii_flags & GRN_OBJ_WITH_WEIGHT) { |
11165 | buf = (uint32_t *)GRN_REALLOC(chunk->weight_buf, n_bytes); |
11166 | if (!buf) { |
11167 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11168 | "failed to allocate memory for weights: n_bytes = %" GRN_FMT_SIZE, |
11169 | n_bytes); |
11170 | return ctx->rc; |
11171 | } |
11172 | chunk->weight_buf = buf; |
11173 | } |
11174 | |
11175 | chunk->size = size; |
11176 | return GRN_SUCCESS; |
11177 | } |
11178 | |
11179 | /* grn_ii_builder_chunk_extend_pos_buf extends pos_buf. */ |
11180 | static grn_rc |
11181 | grn_ii_builder_chunk_extend_pos_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk) |
11182 | { |
11183 | uint32_t *buf, size = chunk->pos_size ? chunk->pos_size * 2 : 1; |
11184 | size_t n_bytes = size * sizeof(uint32_t); |
11185 | buf = (uint32_t *)GRN_REALLOC(chunk->pos_buf, n_bytes); |
11186 | if (!buf) { |
11187 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11188 | "failed to allocate memory for positions: n_bytes = %" GRN_FMT_SIZE, |
11189 | n_bytes); |
11190 | return ctx->rc; |
11191 | } |
11192 | chunk->pos_buf = buf; |
11193 | chunk->pos_size = size; |
11194 | return GRN_SUCCESS; |
11195 | } |
11196 | |
11197 | /* |
11198 | * grn_ii_builder_chunk_reserve_enc_buf estimates a size that is enough to |
11199 | * store encoded data and allocates memory to enc_buf. |
11200 | */ |
11201 | static grn_rc |
11202 | grn_ii_builder_chunk_reserve_enc_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk, |
11203 | uint32_t n_cinfos) |
11204 | { |
11205 | size_t rich_size = (chunk->n + 4) * sizeof(uint32_t) + |
11206 | n_cinfos * sizeof(chunk_info); |
11207 | if (chunk->enc_size < rich_size) { |
11208 | size_t size = chunk->enc_size ? chunk->enc_size * 2 : 1; |
11209 | uint8_t *buf; |
11210 | while (size < rich_size) { |
11211 | size *= 2; |
11212 | } |
11213 | buf = GRN_REALLOC(chunk->enc_buf, size); |
11214 | if (!buf) { |
11215 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11216 | "failed to allocate memory for encoding: size = %" GRN_FMT_SIZE, |
11217 | size); |
11218 | return ctx->rc; |
11219 | } |
11220 | chunk->enc_buf = buf; |
11221 | chunk->enc_size = size; |
11222 | } |
11223 | chunk->enc_offset = 0; |
11224 | return GRN_SUCCESS; |
11225 | } |
11226 | |
11227 | /* grn_ii_builder_chunk_encode encodes a chunk buffer. */ |
11228 | static void |
11229 | grn_ii_builder_chunk_encode_buf(grn_ctx *ctx, grn_ii_builder_chunk *chunk, |
11230 | uint32_t *values, uint32_t n_values, |
11231 | grn_bool use_p_enc) |
11232 | { |
11233 | uint8_t *p = chunk->enc_buf + chunk->enc_offset; |
11234 | uint32_t i; |
11235 | if (use_p_enc) { |
11236 | uint8_t freq[33]; |
11237 | uint32_t buf[UNIT_SIZE]; |
11238 | while (n_values >= UNIT_SIZE) { |
11239 | memset(freq, 0, 33); |
11240 | for (i = 0; i < UNIT_SIZE; i++) { |
11241 | buf[i] = values[i]; |
11242 | if (buf[i]) { |
11243 | uint32_t w; |
11244 | GRN_BIT_SCAN_REV(buf[i], w); |
11245 | freq[w + 1]++; |
11246 | } else { |
11247 | freq[0]++; |
11248 | } |
11249 | } |
11250 | p = pack(buf, UNIT_SIZE, freq, p); |
11251 | values += UNIT_SIZE; |
11252 | n_values -= UNIT_SIZE; |
11253 | } |
11254 | if (n_values) { |
11255 | memset(freq, 0, 33); |
11256 | for (i = 0; i < n_values; i++) { |
11257 | buf[i] = values[i]; |
11258 | if (buf[i]) { |
11259 | uint32_t w; |
11260 | GRN_BIT_SCAN_REV(buf[i], w); |
11261 | freq[w + 1]++; |
11262 | } else { |
11263 | freq[0]++; |
11264 | } |
11265 | } |
11266 | p = pack(buf, n_values, freq, p); |
11267 | } |
11268 | } else { |
11269 | for (i = 0; i < n_values; i++) { |
11270 | GRN_B_ENC(values[i], p); |
11271 | } |
11272 | } |
11273 | chunk->enc_offset = p - chunk->enc_buf; |
11274 | } |
11275 | |
11276 | /* grn_ii_builder_chunk_encode encodes a chunk. */ |
11277 | static grn_rc |
11278 | grn_ii_builder_chunk_encode(grn_ctx *ctx, grn_ii_builder_chunk *chunk, |
11279 | chunk_info *cinfos, uint32_t n_cinfos) |
11280 | { |
11281 | grn_rc rc; |
11282 | uint8_t *p; |
11283 | uint8_t shift = 0, use_p_enc_flags = 0; |
11284 | uint8_t rid_use_p_enc, rest_use_p_enc, pos_use_p_enc = 0; |
11285 | |
11286 | /* Choose an encoding. */ |
11287 | rid_use_p_enc = chunk->offset >= 16 && chunk->offset > (chunk->rid >> 8); |
11288 | use_p_enc_flags |= rid_use_p_enc << shift++; |
11289 | rest_use_p_enc = chunk->offset >= 3; |
11290 | if (chunk->sid_buf) { |
11291 | use_p_enc_flags |= rest_use_p_enc << shift++; |
11292 | } |
11293 | use_p_enc_flags |= rest_use_p_enc << shift++; |
11294 | if (chunk->weight_buf) { |
11295 | use_p_enc_flags |= rest_use_p_enc << shift++; |
11296 | } |
11297 | if (chunk->pos_buf) { |
11298 | pos_use_p_enc = chunk->pos_offset >= 32 && |
11299 | chunk->pos_offset > (chunk->pos_sum >> 13); |
11300 | use_p_enc_flags |= pos_use_p_enc << shift++; |
11301 | } |
11302 | |
11303 | rc = grn_ii_builder_chunk_reserve_enc_buf(ctx, chunk, n_cinfos); |
11304 | if (rc != GRN_SUCCESS) { |
11305 | return rc; |
11306 | } |
11307 | |
11308 | /* Encode a header. */ |
11309 | p = chunk->enc_buf; |
11310 | if (n_cinfos) { |
11311 | uint32_t i; |
11312 | GRN_B_ENC(n_cinfos, p); |
11313 | for (i = 0; i < n_cinfos; i++) { |
11314 | GRN_B_ENC(cinfos[i].segno, p); |
11315 | GRN_B_ENC(cinfos[i].size, p); |
11316 | GRN_B_ENC(cinfos[i].dgap, p); |
11317 | } |
11318 | } |
11319 | if (use_p_enc_flags) { |
11320 | GRN_B_ENC(use_p_enc_flags << 1, p); |
11321 | GRN_B_ENC(chunk->offset, p); |
11322 | if (chunk->pos_buf) { |
11323 | GRN_B_ENC(chunk->pos_offset - chunk->offset, p); |
11324 | } |
11325 | } else { |
11326 | GRN_B_ENC((chunk->offset << 1) | 1, p); |
11327 | } |
11328 | chunk->enc_offset = p - chunk->enc_buf; |
11329 | |
11330 | /* Encode a body. */ |
11331 | grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->rid_buf, chunk->offset, |
11332 | rid_use_p_enc); |
11333 | if (chunk->sid_buf) { |
11334 | grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->sid_buf, chunk->offset, |
11335 | rest_use_p_enc); |
11336 | } |
11337 | grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->freq_buf, chunk->offset, |
11338 | rest_use_p_enc); |
11339 | if (chunk->weight_buf) { |
11340 | grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->weight_buf, |
11341 | chunk->offset, rest_use_p_enc); |
11342 | } |
11343 | if (chunk->pos_buf) { |
11344 | grn_ii_builder_chunk_encode_buf(ctx, chunk, chunk->pos_buf, |
11345 | chunk->pos_offset, pos_use_p_enc); |
11346 | } |
11347 | |
11348 | return GRN_SUCCESS; |
11349 | } |
11350 | |
/*
 * grn_ii_builder keeps the whole state of an index build: the target index
 * and options, the source table and columns, the block lexicon, the terms
 * collected for the current block, the temporary file that receives flushed
 * blocks, and the buffer/chunk objects used to produce the final output.
 */
typedef struct {
  grn_ii *ii; /* Building inverted index */
  grn_ii_builder_options options; /* Options */

  grn_obj *src_table; /* Source table */
  grn_obj **srcs; /* Source columns (to be freed) */
  uint32_t n_srcs; /* Number of source columns */
  uint8_t sid_bits; /* Number of bits for section ID */
  uint64_t sid_mask; /* Mask bits for section ID */

  grn_obj *lexicon; /* Block lexicon (to be closed) */
  grn_obj *tokenizer; /* Lexicon's tokenizer */
  grn_obj *normalizer; /* Lexicon's normalizer */

  uint32_t n; /* Number of integers appended to the current block */
  grn_id rid; /* Record ID */
  uint32_t sid; /* Section ID */
  uint32_t pos; /* Position */

  grn_ii_builder_term *terms; /* Terms (to be freed) */
  uint32_t n_terms; /* Number of distinct terms */
  uint32_t max_n_terms; /* Maximum number of distinct terms */
  uint32_t terms_size; /* Buffer size of terms */

  /* A temporary file to save blocks. */
  char path[PATH_MAX]; /* File path */
  int fd; /* File descriptor (to be closed), -1 while no file is open */
  uint8_t *file_buf; /* File buffer for buffered output (to be freed) */
  uint32_t file_buf_offset; /* File buffer write offset */

  grn_ii_builder_block *blocks; /* Blocks (to be freed) */
  uint32_t n_blocks; /* Number of blocks */
  uint32_t blocks_size; /* Buffer size of blocks */

  grn_ii_builder_buffer buf; /* Buffer (to be finalized) */
  grn_ii_builder_chunk chunk; /* Chunk (to be finalized) */

  uint32_t df; /* Document frequency (number of sections) */
  chunk_info *cinfos; /* Chunk headers (to be freed) */
  uint32_t n_cinfos; /* Number of chunks */
  uint32_t cinfos_size; /* Size of cinfos */
} grn_ii_builder;
11393 | |
11394 | /* |
11395 | * grn_ii_builder_init initializes a builder. Note that an initialized builder |
11396 | * must be finalized by grn_ii_builder_fin. |
11397 | */ |
11398 | static grn_rc |
11399 | grn_ii_builder_init(grn_ctx *ctx, grn_ii_builder *builder, |
11400 | grn_ii *ii, const grn_ii_builder_options *options) |
11401 | { |
11402 | builder->ii = ii; |
11403 | builder->options = *options; |
11404 | if (grn_ii_builder_block_threshold_force > 0) { |
11405 | builder->options.block_threshold = grn_ii_builder_block_threshold_force; |
11406 | } |
11407 | grn_ii_builder_options_fix(&builder->options); |
11408 | |
11409 | builder->src_table = NULL; |
11410 | builder->srcs = NULL; |
11411 | builder->n_srcs = 0; |
11412 | builder->sid_bits = 0; |
11413 | builder->sid_mask = 0; |
11414 | |
11415 | builder->lexicon = NULL; |
11416 | builder->tokenizer = NULL; |
11417 | builder->normalizer = NULL; |
11418 | |
11419 | builder->n = 0; |
11420 | builder->rid = GRN_ID_NIL; |
11421 | builder->sid = 0; |
11422 | builder->pos = 0; |
11423 | |
11424 | builder->terms = NULL; |
11425 | builder->n_terms = 0; |
11426 | builder->max_n_terms = 0; |
11427 | builder->terms_size = 0; |
11428 | |
11429 | builder->path[0] = '\0'; |
11430 | builder->fd = -1; |
11431 | builder->file_buf = NULL; |
11432 | builder->file_buf_offset = 0; |
11433 | |
11434 | builder->blocks = NULL; |
11435 | builder->n_blocks = 0; |
11436 | builder->blocks_size = 0; |
11437 | |
11438 | grn_ii_builder_buffer_init(ctx, &builder->buf, ii); |
11439 | grn_ii_builder_chunk_init(ctx, &builder->chunk); |
11440 | |
11441 | builder->df = 0; |
11442 | builder->cinfos = NULL; |
11443 | builder->n_cinfos = 0; |
11444 | builder->cinfos_size = 0; |
11445 | |
11446 | return GRN_SUCCESS; |
11447 | } |
11448 | |
11449 | /* grn_ii_builder_fin_terms finalizes terms. */ |
11450 | static void |
11451 | grn_ii_builder_fin_terms(grn_ctx *ctx, grn_ii_builder *builder) |
11452 | { |
11453 | if (builder->terms) { |
11454 | uint32_t i; |
11455 | for (i = 0; i < builder->max_n_terms; i++) { |
11456 | grn_ii_builder_term_fin(ctx, &builder->terms[i]); |
11457 | } |
11458 | GRN_FREE(builder->terms); |
11459 | |
11460 | /* To avoid double finalization. */ |
11461 | builder->terms = NULL; |
11462 | } |
11463 | } |
11464 | |
11465 | /* grn_ii_builder_fin finalizes a builder. */ |
11466 | static grn_rc |
11467 | grn_ii_builder_fin(grn_ctx *ctx, grn_ii_builder *builder) |
11468 | { |
11469 | if (builder->cinfos) { |
11470 | GRN_FREE(builder->cinfos); |
11471 | } |
11472 | grn_ii_builder_chunk_fin(ctx, &builder->chunk); |
11473 | grn_ii_builder_buffer_fin(ctx, &builder->buf); |
11474 | if (builder->blocks) { |
11475 | uint32_t i; |
11476 | for (i = 0; i < builder->n_blocks; i++) { |
11477 | grn_ii_builder_block_fin(ctx, &builder->blocks[i]); |
11478 | } |
11479 | GRN_FREE(builder->blocks); |
11480 | } |
11481 | if (builder->file_buf) { |
11482 | GRN_FREE(builder->file_buf); |
11483 | } |
11484 | if (builder->fd != -1) { |
11485 | grn_close(builder->fd); |
11486 | if (grn_unlink(builder->path) == 0) { |
11487 | GRN_LOG(ctx, GRN_LOG_INFO, |
11488 | "[ii][builder][fin] removed path: <%s>" , |
11489 | builder->path); |
11490 | } else { |
11491 | ERRNO_ERR("[ii][builder][fin] failed to remove path: <%s>" , |
11492 | builder->path); |
11493 | } |
11494 | } |
11495 | grn_ii_builder_fin_terms(ctx, builder); |
11496 | if (builder->lexicon) { |
11497 | grn_obj_close(ctx, builder->lexicon); |
11498 | } |
11499 | if (builder->srcs) { |
11500 | GRN_FREE(builder->srcs); |
11501 | } |
11502 | return GRN_SUCCESS; |
11503 | } |
11504 | |
11505 | /* |
11506 | * grn_ii_builder_open creates a builder. Note that a builder must be closed by |
11507 | * grn_ii_builder_close. |
11508 | */ |
11509 | static grn_rc |
11510 | grn_ii_builder_open(grn_ctx *ctx, grn_ii *ii, |
11511 | const grn_ii_builder_options *options, |
11512 | grn_ii_builder **builder) |
11513 | { |
11514 | grn_rc rc; |
11515 | grn_ii_builder *new_builder = GRN_MALLOCN(grn_ii_builder, 1); |
11516 | if (!new_builder) { |
11517 | return GRN_NO_MEMORY_AVAILABLE; |
11518 | } |
11519 | if (!options) { |
11520 | options = &grn_ii_builder_default_options; |
11521 | } |
11522 | rc = grn_ii_builder_init(ctx, new_builder, ii, options); |
11523 | if (rc != GRN_SUCCESS) { |
11524 | GRN_FREE(new_builder); |
11525 | return rc; |
11526 | } |
11527 | *builder = new_builder; |
11528 | return GRN_SUCCESS; |
11529 | } |
11530 | |
11531 | /* grn_ii_builder_close closes a builder. */ |
11532 | static grn_rc |
11533 | grn_ii_builder_close(grn_ctx *ctx, grn_ii_builder *builder) |
11534 | { |
11535 | grn_rc rc; |
11536 | if (!builder) { |
11537 | ERR(GRN_INVALID_ARGUMENT, "builder is null" ); |
11538 | return ctx->rc; |
11539 | } |
11540 | rc = grn_ii_builder_fin(ctx, builder); |
11541 | GRN_FREE(builder); |
11542 | return rc; |
11543 | } |
11544 | |
11545 | /* grn_ii_builder_create_lexicon creates a block lexicon. */ |
11546 | static grn_rc |
11547 | grn_ii_builder_create_lexicon(grn_ctx *ctx, grn_ii_builder *builder) |
11548 | { |
11549 | grn_table_flags flags; |
11550 | grn_obj *domain = grn_ctx_at(ctx, builder->ii->lexicon->header.domain); |
11551 | grn_obj *range = grn_ctx_at(ctx, DB_OBJ(builder->ii->lexicon)->range); |
11552 | grn_obj *tokenizer, *normalizer, *token_filters; |
11553 | grn_rc rc = grn_table_get_info(ctx, builder->ii->lexicon, &flags, NULL, |
11554 | &tokenizer, &normalizer, &token_filters); |
11555 | if (rc != GRN_SUCCESS) { |
11556 | return rc; |
11557 | } |
11558 | flags &= ~GRN_OBJ_PERSISTENT; |
11559 | builder->lexicon = grn_table_create(ctx, NULL, 0, NULL, |
11560 | flags, domain, range); |
11561 | if (!builder->lexicon) { |
11562 | if (ctx->rc == GRN_SUCCESS) { |
11563 | ERR(GRN_UNKNOWN_ERROR, "[index] failed to create a block lexicon" ); |
11564 | } |
11565 | return ctx->rc; |
11566 | } |
11567 | builder->tokenizer = tokenizer; |
11568 | builder->normalizer = normalizer; |
11569 | rc = grn_obj_set_info(ctx, builder->lexicon, |
11570 | GRN_INFO_DEFAULT_TOKENIZER, tokenizer); |
11571 | if (rc == GRN_SUCCESS) { |
11572 | rc = grn_obj_set_info(ctx, builder->lexicon, |
11573 | GRN_INFO_NORMALIZER, normalizer); |
11574 | if (rc == GRN_SUCCESS) { |
11575 | rc = grn_obj_set_info(ctx, builder->lexicon, |
11576 | GRN_INFO_TOKEN_FILTERS, token_filters); |
11577 | } |
11578 | } |
11579 | if (rc != GRN_SUCCESS) { |
11580 | return rc; |
11581 | } |
11582 | if ((flags & GRN_OBJ_TABLE_TYPE_MASK) == GRN_OBJ_TABLE_PAT_KEY) { |
11583 | if (builder->options.lexicon_cache_size) { |
11584 | rc = grn_pat_cache_enable(ctx, (grn_pat *)builder->lexicon, |
11585 | builder->options.lexicon_cache_size); |
11586 | if (rc != GRN_SUCCESS) { |
11587 | return rc; |
11588 | } |
11589 | } |
11590 | } |
11591 | return GRN_SUCCESS; |
11592 | } |
11593 | |
11594 | /* |
11595 | * grn_ii_builder_extend_terms extends a buffer for terms in order to make |
11596 | * terms[n_terms - 1] available. |
11597 | */ |
11598 | static grn_rc |
11599 | grn_ii_builder_extend_terms(grn_ctx *ctx, grn_ii_builder *builder, |
11600 | uint32_t n_terms) |
11601 | { |
11602 | if (n_terms <= builder->n_terms) { |
11603 | return GRN_SUCCESS; |
11604 | } |
11605 | |
11606 | if (n_terms > builder->max_n_terms) { |
11607 | uint32_t i; |
11608 | if (n_terms > builder->terms_size) { |
11609 | /* Resize builder->terms for new terms. */ |
11610 | size_t n_bytes; |
11611 | uint32_t terms_size = builder->terms_size ? builder->terms_size * 2 : 1; |
11612 | grn_ii_builder_term *terms; |
11613 | while (terms_size < n_terms) { |
11614 | terms_size *= 2; |
11615 | } |
11616 | n_bytes = terms_size * sizeof(grn_ii_builder_term); |
11617 | terms = (grn_ii_builder_term *)GRN_REALLOC(builder->terms, n_bytes); |
11618 | if (!terms) { |
11619 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11620 | "failed to allocate memory for terms: n_bytes = %" GRN_FMT_SIZE, |
11621 | n_bytes); |
11622 | return ctx->rc; |
11623 | } |
11624 | builder->terms = terms; |
11625 | builder->terms_size = terms_size; |
11626 | } |
11627 | /* Initialize new terms. */ |
11628 | for (i = builder->max_n_terms; i < n_terms; i++) { |
11629 | grn_ii_builder_term_init(ctx, &builder->terms[i]); |
11630 | } |
11631 | builder->max_n_terms = n_terms; |
11632 | } |
11633 | |
11634 | builder->n += n_terms - builder->n_terms; |
11635 | builder->n_terms = n_terms; |
11636 | return GRN_SUCCESS; |
11637 | } |
11638 | |
11639 | /* grn_ii_builder_get_term gets a term associated with tid. */ |
11640 | inline static grn_rc |
11641 | grn_ii_builder_get_term(grn_ctx *ctx, grn_ii_builder *builder, grn_id tid, |
11642 | grn_ii_builder_term **term) |
11643 | { |
11644 | uint32_t n_terms = tid; |
11645 | if (n_terms > builder->n_terms) { |
11646 | grn_rc rc = grn_ii_builder_extend_terms(ctx, builder, n_terms); |
11647 | if (rc != GRN_SUCCESS) { |
11648 | return rc; |
11649 | } |
11650 | } |
11651 | *term = &builder->terms[tid - 1]; |
11652 | return GRN_SUCCESS; |
11653 | } |
11654 | |
11655 | /* grn_ii_builder_flush_file_buf flushes buffered data as a block. */ |
11656 | static grn_rc |
11657 | grn_ii_builder_flush_file_buf(grn_ctx *ctx, grn_ii_builder *builder) |
11658 | { |
11659 | if (builder->file_buf_offset) { |
11660 | ssize_t size = grn_write(builder->fd, builder->file_buf, |
11661 | builder->file_buf_offset); |
11662 | if ((uint64_t)size != builder->file_buf_offset) { |
11663 | SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, |
11664 | builder->file_buf_offset, (int64_t)size); |
11665 | } |
11666 | builder->file_buf_offset = 0; |
11667 | } |
11668 | return GRN_SUCCESS; |
11669 | } |
11670 | |
11671 | /* grn_ii_builder_flush_term flushes a term and clears it */ |
11672 | static grn_rc |
11673 | grn_ii_builder_flush_term(grn_ctx *ctx, grn_ii_builder *builder, |
11674 | grn_ii_builder_term *term) |
11675 | { |
11676 | grn_rc rc; |
11677 | uint8_t *term_buf; |
11678 | |
11679 | /* Append sentinels. */ |
11680 | if (term->rid != GRN_ID_NIL) { |
11681 | if (builder->ii->header->flags & GRN_OBJ_WITH_POSITION) { |
11682 | rc = grn_ii_builder_term_append(ctx, term, 0); |
11683 | } else { |
11684 | rc = grn_ii_builder_term_append(ctx, term, term->pos_or_freq); |
11685 | } |
11686 | if (rc != GRN_SUCCESS) { |
11687 | return rc; |
11688 | } |
11689 | } |
11690 | rc = grn_ii_builder_term_append(ctx, term, 0); |
11691 | if (rc != GRN_SUCCESS) { |
11692 | return rc; |
11693 | } |
11694 | |
11695 | { |
11696 | /* Put the global term ID. */ |
11697 | int key_size; |
11698 | char key[GRN_TABLE_MAX_KEY_SIZE]; |
11699 | uint8_t *p; |
11700 | uint32_t rest, value; |
11701 | grn_rc rc; |
11702 | grn_id local_tid = term - builder->terms + 1, global_tid; |
11703 | key_size = grn_table_get_key(ctx, builder->lexicon, local_tid, |
11704 | key, GRN_TABLE_MAX_KEY_SIZE); |
11705 | if (!key_size) { |
11706 | if (ctx->rc == GRN_SUCCESS) { |
11707 | ERR(GRN_UNKNOWN_ERROR, "failed to get key: tid = %u" , local_tid); |
11708 | } |
11709 | return ctx->rc; |
11710 | } |
11711 | global_tid = grn_table_add(ctx, builder->ii->lexicon, key, key_size, NULL); |
11712 | if (global_tid == GRN_ID_NIL) { |
11713 | if (ctx->rc == GRN_SUCCESS) { |
11714 | ERR(GRN_UNKNOWN_ERROR, |
11715 | "failed to get global term ID: tid = %u, key = \"%.*s\"" , |
11716 | local_tid, key_size, key); |
11717 | } |
11718 | return ctx->rc; |
11719 | } |
11720 | |
11721 | rest = builder->options.file_buf_size - builder->file_buf_offset; |
11722 | if (rest < 10) { |
11723 | rc = grn_ii_builder_flush_file_buf(ctx, builder); |
11724 | if (rc != GRN_SUCCESS) { |
11725 | return rc; |
11726 | } |
11727 | } |
11728 | value = global_tid; |
11729 | p = builder->file_buf + builder->file_buf_offset; |
11730 | if (value < 1U << 5) { |
11731 | p[0] = (uint8_t)value; |
11732 | builder->file_buf_offset++; |
11733 | } else if (value < 1U << 13) { |
11734 | p[0] = (uint8_t)((value & 0x1f) | (1 << 5)); |
11735 | p[1] = (uint8_t)(value >> 5); |
11736 | builder->file_buf_offset += 2; |
11737 | } else { |
11738 | uint8_t i, n; |
11739 | if (value < 1U << 21) { |
11740 | n = 3; |
11741 | } else if (value < 1U << 29) { |
11742 | n = 4; |
11743 | } else { |
11744 | n = 5; |
11745 | } |
11746 | p[0] = (uint8_t)(value & 0x1f) | ((n - 1) << 5); |
11747 | value >>= 5; |
11748 | for (i = 1; i < n; i++) { |
11749 | p[i] = (uint8_t)value; |
11750 | value >>= 8; |
11751 | } |
11752 | builder->file_buf_offset += n; |
11753 | } |
11754 | } |
11755 | |
11756 | /* Flush a term buffer. */ |
11757 | term_buf = grn_ii_builder_term_get_buf(term); |
11758 | if (term->offset > builder->options.file_buf_size) { |
11759 | ssize_t size; |
11760 | rc = grn_ii_builder_flush_file_buf(ctx, builder); |
11761 | if (rc != GRN_SUCCESS) { |
11762 | return rc; |
11763 | } |
11764 | size = grn_write(builder->fd, term_buf, term->offset); |
11765 | if ((uint64_t)size != term->offset) { |
11766 | SERR("failed to write data: expected = %u, actual = %" GRN_FMT_INT64D, |
11767 | term->offset, (int64_t)size); |
11768 | } |
11769 | } else { |
11770 | uint32_t rest = builder->options.file_buf_size - builder->file_buf_offset; |
11771 | if (term->offset <= rest) { |
11772 | grn_memcpy(builder->file_buf + builder->file_buf_offset, |
11773 | term_buf, term->offset); |
11774 | builder->file_buf_offset += term->offset; |
11775 | } else { |
11776 | grn_memcpy(builder->file_buf + builder->file_buf_offset, |
11777 | term_buf, rest); |
11778 | builder->file_buf_offset += rest; |
11779 | rc = grn_ii_builder_flush_file_buf(ctx, builder); |
11780 | if (rc != GRN_SUCCESS) { |
11781 | return rc; |
11782 | } |
11783 | builder->file_buf_offset = term->offset - rest; |
11784 | grn_memcpy(builder->file_buf, term_buf + rest, builder->file_buf_offset); |
11785 | } |
11786 | } |
11787 | grn_ii_builder_term_reinit(ctx, term); |
11788 | return GRN_SUCCESS; |
11789 | } |
11790 | |
11791 | /* |
11792 | * grn_ii_builder_create_file creates a temporary file and allocates memory for |
11793 | * buffered output. |
11794 | */ |
11795 | static grn_rc |
11796 | grn_ii_builder_create_file(grn_ctx *ctx, grn_ii_builder *builder) |
11797 | { |
11798 | grn_snprintf(builder->path, PATH_MAX, PATH_MAX, |
11799 | "%sXXXXXX" , grn_io_path(builder->ii->seg)); |
11800 | builder->fd = grn_mkstemp(builder->path); |
11801 | if (builder->fd == -1) { |
11802 | SERR("failed to create a temporary file: path = \"%s\"" , |
11803 | builder->path); |
11804 | return ctx->rc; |
11805 | } |
11806 | builder->file_buf = (uint8_t *)GRN_MALLOC(builder->options.file_buf_size); |
11807 | if (!builder->file_buf) { |
11808 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11809 | "failed to allocate memory for buffered output: size = %u" , |
11810 | builder->options.file_buf_size); |
11811 | return ctx->rc; |
11812 | } |
11813 | return GRN_SUCCESS; |
11814 | } |
11815 | |
11816 | /* grn_ii_builder_register_block registers a block. */ |
11817 | static grn_rc |
11818 | grn_ii_builder_register_block(grn_ctx *ctx, grn_ii_builder *builder) |
11819 | { |
11820 | grn_ii_builder_block *block; |
11821 | uint64_t file_offset = grn_lseek(builder->fd, 0, SEEK_CUR); |
11822 | if (file_offset == (uint64_t)-1) { |
11823 | SERR("failed to get file offset" ); |
11824 | return ctx->rc; |
11825 | } |
11826 | if (builder->n_blocks >= builder->blocks_size) { |
11827 | size_t n_bytes; |
11828 | uint32_t blocks_size = 1; |
11829 | grn_ii_builder_block *blocks; |
11830 | while (blocks_size <= builder->n_blocks) { |
11831 | blocks_size *= 2; |
11832 | } |
11833 | n_bytes = blocks_size * sizeof(grn_ii_builder_block); |
11834 | blocks = (grn_ii_builder_block *)GRN_REALLOC(builder->blocks, n_bytes); |
11835 | if (!blocks) { |
11836 | ERR(GRN_NO_MEMORY_AVAILABLE, |
11837 | "failed to allocate memory for block: n_bytes = %" GRN_FMT_SIZE, |
11838 | n_bytes); |
11839 | return ctx->rc; |
11840 | } |
11841 | builder->blocks = blocks; |
11842 | builder->blocks_size = blocks_size; |
11843 | } |
11844 | block = &builder->blocks[builder->n_blocks]; |
11845 | grn_ii_builder_block_init(ctx, block); |
11846 | if (!builder->n_blocks) { |
11847 | block->offset = 0; |
11848 | } else { |
11849 | grn_ii_builder_block *prev_block = &builder->blocks[builder->n_blocks - 1]; |
11850 | block->offset = prev_block->offset + prev_block->rest; |
11851 | } |
11852 | block->rest = (uint32_t)(file_offset - block->offset); |
11853 | builder->n_blocks++; |
11854 | return GRN_SUCCESS; |
11855 | } |
11856 | |
11857 | /* grn_ii_builder_flush_block flushes a block to a temporary file. */ |
11858 | static grn_rc |
11859 | grn_ii_builder_flush_block(grn_ctx *ctx, grn_ii_builder *builder) |
11860 | { |
11861 | grn_rc rc; |
11862 | grn_table_cursor *cursor; |
11863 | |
11864 | if (!builder->n) { |
11865 | /* Do nothing if there are no output data. */ |
11866 | return GRN_SUCCESS; |
11867 | } |
11868 | if (builder->fd == -1) { |
11869 | rc = grn_ii_builder_create_file(ctx, builder); |
11870 | if (rc != GRN_SUCCESS) { |
11871 | return rc; |
11872 | } |
11873 | } |
11874 | |
11875 | /* Flush terms into a temporary file. */ |
11876 | cursor = grn_table_cursor_open(ctx, builder->lexicon, |
11877 | NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); |
11878 | for (;;) { |
11879 | grn_id tid = grn_table_cursor_next(ctx, cursor); |
11880 | if (tid == GRN_ID_NIL) { |
11881 | break; |
11882 | } |
11883 | rc = grn_ii_builder_flush_term(ctx, builder, &builder->terms[tid - 1]); |
11884 | if (rc != GRN_SUCCESS) { |
11885 | return rc; |
11886 | } |
11887 | } |
11888 | grn_table_cursor_close(ctx, cursor); |
11889 | rc = grn_ii_builder_flush_file_buf(ctx, builder); |
11890 | if (rc != GRN_SUCCESS) { |
11891 | return rc; |
11892 | } |
11893 | |
11894 | /* Register a block and clear the current data. */ |
11895 | rc = grn_ii_builder_register_block(ctx, builder); |
11896 | if (rc != GRN_SUCCESS) { |
11897 | return rc; |
11898 | } |
11899 | rc = grn_table_truncate(ctx, builder->lexicon); |
11900 | if (rc != GRN_SUCCESS) { |
11901 | return rc; |
11902 | } |
11903 | builder->rid = GRN_ID_NIL; |
11904 | builder->n_terms = 0; |
11905 | builder->n = 0; |
11906 | return GRN_SUCCESS; |
11907 | } |
11908 | |
11909 | /* grn_ii_builder_append_token appends a token. */ |
11910 | static grn_rc |
11911 | grn_ii_builder_append_token(grn_ctx *ctx, grn_ii_builder *builder, |
11912 | grn_id rid, uint32_t sid, uint32_t weight, |
11913 | grn_id tid, uint32_t pos) |
11914 | { |
11915 | grn_rc rc; |
11916 | uint32_t ii_flags = builder->ii->header->flags; |
11917 | grn_ii_builder_term *term; |
11918 | rc = grn_ii_builder_get_term(ctx, builder, tid, &term); |
11919 | if (rc != GRN_SUCCESS) { |
11920 | return rc; |
11921 | } |
11922 | if (rid != term->rid || sid != term->sid) { |
11923 | uint64_t rsid; |
11924 | if (term->rid != GRN_ID_NIL) { |
11925 | if (ii_flags & GRN_OBJ_WITH_POSITION) { |
11926 | /* Append the end of positions. */ |
11927 | rc = grn_ii_builder_term_append(ctx, term, 0); |
11928 | if (rc != GRN_SUCCESS) { |
11929 | return rc; |
11930 | } |
11931 | builder->n++; |
11932 | } else { |
11933 | /* Append a frequency if positions are not available. */ |
11934 | rc = grn_ii_builder_term_append(ctx, term, term->pos_or_freq); |
11935 | if (rc != GRN_SUCCESS) { |
11936 | return rc; |
11937 | } |
11938 | builder->n++; |
11939 | } |
11940 | } |
11941 | rsid = ((uint64_t)(rid - term->rid) << builder->sid_bits) | (sid - 1); |
11942 | rc = grn_ii_builder_term_append(ctx, term, rsid); |
11943 | if (rc != GRN_SUCCESS) { |
11944 | return rc; |
11945 | } |
11946 | builder->n++; |
11947 | if (ii_flags & GRN_OBJ_WITH_WEIGHT) { |
11948 | rc = grn_ii_builder_term_append(ctx, term, weight); |
11949 | if (rc != GRN_SUCCESS) { |
11950 | return rc; |
11951 | } |
11952 | builder->n++; |
11953 | } |
11954 | term->rid = rid; |
11955 | term->sid = sid; |
11956 | term->pos_or_freq = 0; |
11957 | } |
11958 | if (ii_flags & GRN_OBJ_WITH_POSITION) { |
11959 | rc = grn_ii_builder_term_append(ctx, term, pos - term->pos_or_freq); |
11960 | if (rc != GRN_SUCCESS) { |
11961 | return rc; |
11962 | } |
11963 | builder->n++; |
11964 | term->pos_or_freq = pos; |
11965 | } else { |
11966 | term->pos_or_freq++; |
11967 | } |
11968 | return GRN_SUCCESS; |
11969 | } |
11970 | |
11971 | /* |
11972 | * grn_ii_builder_append_value appends a value. Note that values must be |
11973 | * appended in ascending rid and sid order. |
11974 | */ |
11975 | static grn_rc |
11976 | grn_ii_builder_append_value(grn_ctx *ctx, grn_ii_builder *builder, |
11977 | grn_id rid, uint32_t sid, uint32_t weight, |
11978 | const char *value, uint32_t value_size) |
11979 | { |
11980 | uint32_t pos = 0; |
11981 | grn_token_cursor *cursor; |
11982 | if (rid != builder->rid) { |
11983 | builder->rid = rid; |
11984 | builder->sid = sid; |
11985 | builder->pos = 1; |
11986 | } else if (sid != builder->sid) { |
11987 | builder->sid = sid; |
11988 | builder->pos = 1; |
11989 | } else { |
11990 | /* Insert a space between values. */ |
11991 | builder->pos++; |
11992 | } |
11993 | if (value_size) { |
11994 | if (!builder->tokenizer && !builder->normalizer) { |
11995 | grn_id tid; |
11996 | switch (builder->lexicon->header.type) { |
11997 | case GRN_TABLE_PAT_KEY : |
11998 | tid = grn_pat_add(ctx, (grn_pat *)builder->lexicon, |
11999 | value, value_size, NULL, NULL); |
12000 | break; |
12001 | case GRN_TABLE_DAT_KEY : |
12002 | tid = grn_dat_add(ctx, (grn_dat *)builder->lexicon, |
12003 | value, value_size, NULL, NULL); |
12004 | break; |
12005 | case GRN_TABLE_HASH_KEY : |
12006 | tid = grn_hash_add(ctx, (grn_hash *)builder->lexicon, |
12007 | value, value_size, NULL, NULL); |
12008 | break; |
12009 | case GRN_TABLE_NO_KEY : |
12010 | tid = *(grn_id *)value; |
12011 | break; |
12012 | default : |
12013 | tid = GRN_ID_NIL; |
12014 | break; |
12015 | } |
12016 | if (tid != GRN_ID_NIL) { |
12017 | grn_rc rc; |
12018 | pos = builder->pos; |
12019 | rc = grn_ii_builder_append_token(ctx, builder, rid, sid, |
12020 | weight, tid, pos); |
12021 | if (rc != GRN_SUCCESS) { |
12022 | return rc; |
12023 | } |
12024 | } |
12025 | } else { |
12026 | cursor = grn_token_cursor_open(ctx, builder->lexicon, value, value_size, |
12027 | GRN_TOKEN_ADD, 0); |
12028 | if (!cursor) { |
12029 | if (ctx->rc == GRN_SUCCESS) { |
12030 | ERR(GRN_UNKNOWN_ERROR, |
12031 | "grn_token_cursor_open failed: value = <%.*s>" , |
12032 | value_size, value); |
12033 | } |
12034 | return ctx->rc; |
12035 | } |
12036 | while (cursor->status == GRN_TOKEN_CURSOR_DOING) { |
12037 | grn_id tid = grn_token_cursor_next(ctx, cursor); |
12038 | if (tid != GRN_ID_NIL) { |
12039 | grn_rc rc; |
12040 | pos = builder->pos + cursor->pos; |
12041 | rc = grn_ii_builder_append_token(ctx, builder, rid, sid, |
12042 | weight, tid, pos); |
12043 | if (rc != GRN_SUCCESS) { |
12044 | break; |
12045 | } |
12046 | } |
12047 | } |
12048 | grn_token_cursor_close(ctx, cursor); |
12049 | } |
12050 | } |
12051 | builder->pos = pos + 1; |
12052 | return ctx->rc; |
12053 | } |
12054 | |
12055 | /* grn_ii_builder_append_obj appends a BULK, UVECTOR or VECTOR object. */ |
12056 | static grn_rc |
12057 | grn_ii_builder_append_obj(grn_ctx *ctx, grn_ii_builder *builder, |
12058 | grn_id rid, uint32_t sid, grn_obj *obj) |
12059 | { |
12060 | switch (obj->header.type) { |
12061 | case GRN_BULK : |
12062 | return grn_ii_builder_append_value(ctx, builder, rid, sid, 0, |
12063 | GRN_TEXT_VALUE(obj), GRN_TEXT_LEN(obj)); |
12064 | case GRN_UVECTOR : |
12065 | { |
12066 | const char *p = GRN_BULK_HEAD(obj); |
12067 | uint32_t i, n_values = grn_uvector_size(ctx, obj); |
12068 | uint32_t value_size = grn_uvector_element_size(ctx, obj); |
12069 | for (i = 0; i < n_values; i++) { |
12070 | grn_rc rc = grn_ii_builder_append_value(ctx, builder, rid, sid, 0, |
12071 | p, value_size); |
12072 | if (rc != GRN_SUCCESS) { |
12073 | return rc; |
12074 | } |
12075 | p += value_size; |
12076 | } |
12077 | } |
12078 | return GRN_SUCCESS; |
12079 | case GRN_VECTOR : |
12080 | if (obj->u.v.body) { |
12081 | /* |
12082 | * Note that the following sections and n_sections don't correspond to |
12083 | * source columns. |
12084 | */ |
12085 | int i, n_secs = obj->u.v.n_sections; |
12086 | grn_section *secs = obj->u.v.sections; |
12087 | const char *head = GRN_BULK_HEAD(obj->u.v.body); |
12088 | for (i = 0; i < n_secs; i++) { |
12089 | grn_rc rc; |
12090 | grn_section *sec = &secs[i]; |
12091 | if (sec->length == 0) { |
12092 | continue; |
12093 | } |
12094 | if (builder->tokenizer) { |
12095 | sid = i + 1; |
12096 | } |
12097 | rc = grn_ii_builder_append_value(ctx, builder, rid, sid, sec->weight, |
12098 | head + sec->offset, sec->length); |
12099 | if (rc != GRN_SUCCESS) { |
12100 | return rc; |
12101 | } |
12102 | } |
12103 | } |
12104 | return GRN_SUCCESS; |
12105 | default : |
12106 | ERR(GRN_INVALID_ARGUMENT, "[index] invalid object assigned as value" ); |
12107 | return ctx->rc; |
12108 | } |
12109 | } |
12110 | |
12111 | /* |
12112 | * grn_ii_builder_append_srcs reads values from source columns and appends the |
12113 | * values. |
12114 | */ |
12115 | static grn_rc |
12116 | grn_ii_builder_append_srcs(grn_ctx *ctx, grn_ii_builder *builder) |
12117 | { |
12118 | size_t i; |
12119 | grn_rc rc = GRN_SUCCESS; |
12120 | grn_obj *objs; |
12121 | grn_table_cursor *cursor; |
12122 | |
12123 | /* Allocate memory for objects to store source values. */ |
12124 | objs = GRN_MALLOCN(grn_obj, builder->n_srcs); |
12125 | if (!objs) { |
12126 | ERR(GRN_NO_MEMORY_AVAILABLE, |
12127 | "failed to allocate memory for objs: n_srcs = %u" , builder->n_srcs); |
12128 | return ctx->rc; |
12129 | } |
12130 | |
12131 | /* Create a cursor to get records in the ID order. */ |
12132 | cursor = grn_table_cursor_open(ctx, builder->src_table, NULL, 0, NULL, 0, |
12133 | 0, -1, GRN_CURSOR_BY_ID); |
12134 | if (!cursor) { |
12135 | if (ctx->rc == GRN_SUCCESS) { |
12136 | ERR(GRN_OBJECT_CORRUPT, "[index] failed to open table cursor" ); |
12137 | } |
12138 | GRN_FREE(objs); |
12139 | return ctx->rc; |
12140 | } |
12141 | |
12142 | /* Read source values and append it. */ |
12143 | for (i = 0; i < builder->n_srcs; i++) { |
12144 | GRN_TEXT_INIT(&objs[i], 0); |
12145 | } |
12146 | while (rc == GRN_SUCCESS) { |
12147 | grn_id rid = grn_table_cursor_next(ctx, cursor); |
12148 | if (rid == GRN_ID_NIL) { |
12149 | break; |
12150 | } |
12151 | for (i = 0; i < builder->n_srcs; i++) { |
12152 | grn_obj *obj = &objs[i]; |
12153 | grn_obj *src = builder->srcs[i]; |
12154 | rc = grn_obj_reinit_for(ctx, obj, src); |
12155 | if (rc == GRN_SUCCESS) { |
12156 | if (GRN_OBJ_TABLEP(src)) { |
12157 | int len = grn_table_get_key2(ctx, src, rid, obj); |
12158 | if (len <= 0) { |
12159 | if (ctx->rc == GRN_SUCCESS) { |
12160 | ERR(GRN_UNKNOWN_ERROR, "failed to get key: rid = %u, len = %d" , |
12161 | rid, len); |
12162 | } |
12163 | rc = ctx->rc; |
12164 | } |
12165 | } else { |
12166 | if (!grn_obj_get_value(ctx, src, rid, obj)) { |
12167 | if (ctx->rc == GRN_SUCCESS) { |
12168 | ERR(GRN_UNKNOWN_ERROR, "failed to get value: rid = %u" , rid); |
12169 | } |
12170 | rc = ctx->rc; |
12171 | } |
12172 | } |
12173 | if (rc == GRN_SUCCESS) { |
12174 | uint32_t sid = (uint32_t)(i + 1); |
12175 | rc = grn_ii_builder_append_obj(ctx, builder, rid, sid, obj); |
12176 | } |
12177 | } |
12178 | } |
12179 | if (rc == GRN_SUCCESS && builder->n >= builder->options.block_threshold) { |
12180 | rc = grn_ii_builder_flush_block(ctx, builder); |
12181 | } |
12182 | } |
12183 | if (rc == GRN_SUCCESS) { |
12184 | rc = grn_ii_builder_flush_block(ctx, builder); |
12185 | } |
12186 | for (i = 0; i < builder->n_srcs; i++) { |
12187 | GRN_OBJ_FIN(ctx, &objs[i]); |
12188 | } |
12189 | grn_table_cursor_close(ctx, cursor); |
12190 | GRN_FREE(objs); |
12191 | return rc; |
12192 | } |
12193 | |
12194 | /* grn_ii_builder_set_src_table sets a source table. */ |
12195 | static grn_rc |
12196 | grn_ii_builder_set_src_table(grn_ctx *ctx, grn_ii_builder *builder) |
12197 | { |
12198 | builder->src_table = grn_ctx_at(ctx, DB_OBJ(builder->ii)->range); |
12199 | if (!builder->src_table) { |
12200 | if (ctx->rc == GRN_SUCCESS) { |
12201 | ERR(GRN_INVALID_ARGUMENT, "source table is null: range = %d" , |
12202 | DB_OBJ(builder->ii)->range); |
12203 | } |
12204 | return ctx->rc; |
12205 | } |
12206 | return GRN_SUCCESS; |
12207 | } |
12208 | |
12209 | /* grn_ii_builder_set_sid_bits calculates sid_bits and sid_mask. */ |
12210 | static grn_rc |
12211 | grn_ii_builder_set_sid_bits(grn_ctx *ctx, grn_ii_builder *builder) |
12212 | { |
12213 | /* Calculate the number of bits required to represent a section ID. */ |
12214 | if (builder->n_srcs == 1 && builder->tokenizer && |
12215 | (builder->srcs[0]->header.flags & GRN_OBJ_COLUMN_VECTOR) != 0) { |
12216 | /* If the source column is a vector column and the index has a tokenizer, */ |
12217 | /* the maximum sid equals to the maximum number of elements. */ |
12218 | size_t max_elems = 0; |
12219 | grn_table_cursor *cursor; |
12220 | grn_obj obj; |
12221 | cursor = grn_table_cursor_open(ctx, builder->src_table, NULL, 0, NULL, 0, |
12222 | 0, -1, GRN_CURSOR_BY_ID); |
12223 | if (!cursor) { |
12224 | if (ctx->rc == GRN_SUCCESS) { |
12225 | ERR(GRN_OBJECT_CORRUPT, "[index] failed to open table cursor" ); |
12226 | } |
12227 | return ctx->rc; |
12228 | } |
12229 | GRN_TEXT_INIT(&obj, 0); |
12230 | for (;;) { |
12231 | grn_id rid = grn_table_cursor_next(ctx, cursor); |
12232 | if (rid == GRN_ID_NIL) { |
12233 | break; |
12234 | } |
12235 | if (!grn_obj_get_value(ctx, builder->srcs[0], rid, &obj)) { |
12236 | continue; |
12237 | } |
12238 | if (obj.u.v.n_sections > max_elems) { |
12239 | max_elems = obj.u.v.n_sections; |
12240 | } |
12241 | } |
12242 | GRN_OBJ_FIN(ctx, &obj); |
12243 | grn_table_cursor_close(ctx, cursor); |
12244 | while (((uint32_t)1 << builder->sid_bits) < max_elems) { |
12245 | builder->sid_bits++; |
12246 | } |
12247 | } |
12248 | if (builder->sid_bits == 0) { |
12249 | while (((uint32_t)1 << builder->sid_bits) < builder->n_srcs) { |
12250 | builder->sid_bits++; |
12251 | } |
12252 | } |
12253 | builder->sid_mask = ((uint64_t)1 << builder->sid_bits) - 1; |
12254 | return GRN_SUCCESS; |
12255 | } |
12256 | |
12257 | /* grn_ii_builder_set_srcs sets source columns. */ |
12258 | static grn_rc |
12259 | grn_ii_builder_set_srcs(grn_ctx *ctx, grn_ii_builder *builder) |
12260 | { |
12261 | size_t i; |
12262 | grn_id *source; |
12263 | builder->n_srcs = builder->ii->obj.source_size / sizeof(grn_id); |
12264 | source = (grn_id *)builder->ii->obj.source; |
12265 | if (!source || !builder->n_srcs) { |
12266 | ERR(GRN_INVALID_ARGUMENT, |
12267 | "source is not available: source = %p, source_size = %u" , |
12268 | builder->ii->obj.source, builder->ii->obj.source_size); |
12269 | return ctx->rc; |
12270 | } |
12271 | builder->srcs = GRN_MALLOCN(grn_obj *, builder->n_srcs); |
12272 | if (!builder->srcs) { |
12273 | return GRN_NO_MEMORY_AVAILABLE; |
12274 | } |
12275 | for (i = 0; i < builder->n_srcs; i++) { |
12276 | builder->srcs[i] = grn_ctx_at(ctx, source[i]); |
12277 | if (!builder->srcs[i]) { |
12278 | if (ctx->rc == GRN_SUCCESS) { |
12279 | ERR(GRN_OBJECT_CORRUPT, "source not found: id = %d" , source[i]); |
12280 | } |
12281 | return ctx->rc; |
12282 | } |
12283 | } |
12284 | return grn_ii_builder_set_sid_bits(ctx, builder); |
12285 | } |
12286 | |
12287 | /* grn_ii_builder_append_source appends values in source columns. */ |
12288 | static grn_rc |
12289 | grn_ii_builder_append_source(grn_ctx *ctx, grn_ii_builder *builder) |
12290 | { |
12291 | grn_rc rc = grn_ii_builder_set_src_table(ctx, builder); |
12292 | if (rc != GRN_SUCCESS) { |
12293 | return rc; |
12294 | } |
12295 | if (grn_table_size(ctx, builder->src_table) == 0) { |
12296 | /* Nothing to do because there are no values. */ |
12297 | return ctx->rc; |
12298 | } |
12299 | /* Create a block lexicon. */ |
12300 | rc = grn_ii_builder_create_lexicon(ctx, builder); |
12301 | if (rc != GRN_SUCCESS) { |
12302 | return rc; |
12303 | } |
12304 | rc = grn_ii_builder_set_srcs(ctx, builder); |
12305 | if (rc != GRN_SUCCESS) { |
12306 | return rc; |
12307 | } |
12308 | rc = grn_ii_builder_append_srcs(ctx, builder); |
12309 | if (rc != GRN_SUCCESS) { |
12310 | return rc; |
12311 | } |
12312 | grn_ii_builder_fin_terms(ctx, builder); |
12313 | return GRN_SUCCESS; |
12314 | } |
12315 | |
12316 | /* |
12317 | * grn_ii_builder_fill_block reads the next data from a temporary file and fill |
12318 | * a block buffer. |
12319 | */ |
12320 | static grn_rc |
12321 | grn_ii_builder_fill_block(grn_ctx *ctx, grn_ii_builder *builder, |
12322 | uint32_t block_id) |
12323 | { |
12324 | ssize_t size; |
12325 | uint32_t buf_rest; |
12326 | uint64_t file_offset; |
12327 | grn_ii_builder_block *block = &builder->blocks[block_id]; |
12328 | if (!block->rest) { |
12329 | return GRN_END_OF_DATA; |
12330 | } |
12331 | if (!block->buf) { |
12332 | block->buf = (uint8_t *)GRN_MALLOC(builder->options.block_buf_size); |
12333 | if (!block->buf) { |
12334 | ERR(GRN_NO_MEMORY_AVAILABLE, |
12335 | "failed to allocate memory for buffered input: size = %u" , |
12336 | builder->options.block_buf_size); |
12337 | return ctx->rc; |
12338 | } |
12339 | } |
12340 | |
12341 | /* Move the remaining data to the head. */ |
12342 | buf_rest = block->end - block->cur; |
12343 | if (buf_rest) { |
12344 | grn_memmove(block->buf, block->cur, buf_rest); |
12345 | } |
12346 | block->cur = block->buf; |
12347 | block->end = block->buf + buf_rest; |
12348 | |
12349 | /* Read the next data. */ |
12350 | file_offset = grn_lseek(builder->fd, block->offset, SEEK_SET); |
12351 | if (file_offset != block->offset) { |
12352 | SERR("failed to seek file: expected = %" GRN_FMT_INT64U |
12353 | ", actual = %" GRN_FMT_INT64D, |
12354 | block->offset, file_offset); |
12355 | return ctx->rc; |
12356 | } |
12357 | buf_rest = builder->options.block_buf_size - buf_rest; |
12358 | if (block->rest < buf_rest) { |
12359 | buf_rest = block->rest; |
12360 | } |
12361 | size = grn_read(builder->fd, block->end, buf_rest); |
12362 | if (size <= 0) { |
12363 | SERR("failed to read data: expected = %u, actual = %" GRN_FMT_INT64D, |
12364 | buf_rest, (int64_t)size); |
12365 | return ctx->rc; |
12366 | } |
12367 | block->offset += size; |
12368 | block->rest -= size; |
12369 | block->end += size; |
12370 | return GRN_SUCCESS; |
12371 | } |
12372 | |
12373 | /* grn_ii_builder_read_from_block reads the next value from a block. */ |
12374 | static grn_rc |
12375 | grn_ii_builder_read_from_block(grn_ctx *ctx, grn_ii_builder *builder, |
12376 | uint32_t block_id, uint64_t *value) |
12377 | { |
12378 | grn_ii_builder_block *block = &builder->blocks[block_id]; |
12379 | grn_rc rc = grn_ii_builder_block_next(ctx, block, value); |
12380 | if (rc == GRN_SUCCESS) { |
12381 | return GRN_SUCCESS; |
12382 | } else if (rc == GRN_END_OF_DATA) { |
12383 | rc = grn_ii_builder_fill_block(ctx, builder, block_id); |
12384 | if (rc != GRN_SUCCESS) { |
12385 | return rc; |
12386 | } |
12387 | return grn_ii_builder_block_next(ctx, block, value); |
12388 | } |
12389 | return rc; |
12390 | } |
12391 | |
12392 | /* grn_ii_builder_pack_chunk tries to pack a chunk. */ |
12393 | static grn_rc |
12394 | grn_ii_builder_pack_chunk(grn_ctx *ctx, grn_ii_builder *builder, |
12395 | grn_bool *packed) |
12396 | { |
12397 | grn_id rid; |
12398 | uint32_t sid, pos, *a; |
12399 | grn_ii_builder_chunk *chunk = &builder->chunk; |
12400 | *packed = GRN_FALSE; |
12401 | if (chunk->offset != 1) { /* df != 1 */ |
12402 | return GRN_SUCCESS; |
12403 | } |
12404 | if (chunk->weight_buf && chunk->weight_buf[0]) { /* weight != 0 */ |
12405 | return GRN_SUCCESS; |
12406 | } |
12407 | if (chunk->freq_buf[0] != 0) { /* freq != 1 */ |
12408 | return GRN_SUCCESS; |
12409 | } |
12410 | rid = chunk->rid_buf[0]; |
12411 | if (chunk->sid_buf) { |
12412 | if (rid >= 0x100000) { |
12413 | return GRN_SUCCESS; |
12414 | } |
12415 | sid = chunk->sid_buf[0] + 1; |
12416 | if (sid >= 0x800) { |
12417 | return GRN_SUCCESS; |
12418 | } |
12419 | a = array_get(ctx, builder->ii, chunk->tid); |
12420 | if (!a) { |
12421 | DEFINE_NAME(builder->ii); |
12422 | MERR("[ii][builder][chunk][pack] failed to allocate an array: " |
12423 | "<%.*s>: " |
12424 | "<%u>:<%u>:<%u>" , |
12425 | name_size, name, |
12426 | rid, sid, chunk->tid); |
12427 | return ctx->rc; |
12428 | } |
12429 | a[0] = ((rid << 12) + (sid << 1)) | 1; |
12430 | } else { |
12431 | a = array_get(ctx, builder->ii, chunk->tid); |
12432 | if (!a) { |
12433 | DEFINE_NAME(builder->ii); |
12434 | MERR("[ii][builder][chunk][pack] failed to allocate an array: " |
12435 | "<%.*s>: " |
12436 | "<%u>:<%u>" , |
12437 | name_size, name, |
12438 | rid, chunk->tid); |
12439 | return ctx->rc; |
12440 | } |
12441 | a[0] = (rid << 1) | 1; |
12442 | } |
12443 | pos = 0; |
12444 | if (chunk->pos_buf) { |
12445 | pos = chunk->pos_buf[0]; |
12446 | } |
12447 | a[1] = pos; |
12448 | array_unref(builder->ii, chunk->tid); |
12449 | *packed = GRN_TRUE; |
12450 | |
12451 | grn_ii_builder_chunk_clear(ctx, chunk); |
12452 | return GRN_SUCCESS; |
12453 | } |
12454 | |
12455 | /* grn_ii_builder_get_cinfo returns a new cinfo. */ |
12456 | static grn_rc |
12457 | grn_ii_builder_get_cinfo(grn_ctx *ctx, grn_ii_builder *builder, |
12458 | chunk_info **cinfo) |
12459 | { |
12460 | if (builder->n_cinfos == builder->cinfos_size) { |
12461 | uint32_t size = builder->cinfos_size ? (builder->cinfos_size * 2) : 1; |
12462 | size_t n_bytes = size * sizeof(chunk_info); |
12463 | chunk_info *cinfos = (chunk_info *)GRN_REALLOC(builder->cinfos, n_bytes); |
12464 | if (!cinfos) { |
12465 | ERR(GRN_NO_MEMORY_AVAILABLE, |
12466 | "failed to allocate memory for cinfos: n_bytes = %" GRN_FMT_SIZE, |
12467 | n_bytes); |
12468 | return ctx->rc; |
12469 | } |
12470 | builder->cinfos = cinfos; |
12471 | builder->cinfos_size = size; |
12472 | } |
12473 | *cinfo = &builder->cinfos[builder->n_cinfos++]; |
12474 | return GRN_SUCCESS; |
12475 | } |
12476 | |
12477 | /* grn_ii_builder_flush_chunk flushes a chunk. */ |
12478 | static grn_rc |
12479 | grn_ii_builder_flush_chunk(grn_ctx *ctx, grn_ii_builder *builder) |
12480 | { |
12481 | grn_rc rc; |
12482 | chunk_info *cinfo = NULL; |
12483 | grn_ii_builder_chunk *chunk = &builder->chunk; |
12484 | void *seg; |
12485 | uint8_t *in; |
12486 | uint32_t in_size, chunk_id, seg_id, seg_offset, seg_rest; |
12487 | |
12488 | rc = grn_ii_builder_chunk_encode(ctx, chunk, NULL, 0); |
12489 | if (rc != GRN_SUCCESS) { |
12490 | return rc; |
12491 | } |
12492 | in = chunk->enc_buf; |
12493 | in_size = chunk->enc_offset; |
12494 | |
12495 | rc = chunk_new(ctx, builder->ii, &chunk_id, chunk->enc_offset); |
12496 | if (rc != GRN_SUCCESS) { |
12497 | return rc; |
12498 | } |
12499 | |
12500 | /* Copy to the first segment. */ |
12501 | seg_id = chunk_id >> GRN_II_N_CHUNK_VARIATION; |
12502 | seg_offset = (chunk_id & ((1 << GRN_II_N_CHUNK_VARIATION) - 1)) << |
12503 | GRN_II_W_LEAST_CHUNK; |
12504 | GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg); |
12505 | if (!seg) { |
12506 | if (ctx->rc == GRN_SUCCESS) { |
12507 | ERR(GRN_UNKNOWN_ERROR, |
12508 | "failed access chunk segment: chunk_id = %u, seg_id = %u" , |
12509 | chunk_id, seg_id); |
12510 | } |
12511 | return ctx->rc; |
12512 | } |
12513 | seg_rest = S_CHUNK - seg_offset; |
12514 | if (in_size <= seg_rest) { |
12515 | grn_memcpy((uint8_t *)seg + seg_offset, in, in_size); |
12516 | in_size = 0; |
12517 | } else { |
12518 | grn_memcpy((uint8_t *)seg + seg_offset, in, seg_rest); |
12519 | in += seg_rest; |
12520 | in_size -= seg_rest; |
12521 | } |
12522 | GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id); |
12523 | |
12524 | /* Copy to the next segments. */ |
12525 | while (in_size) { |
12526 | seg_id++; |
12527 | GRN_IO_SEG_REF(builder->ii->chunk, seg_id, seg); |
12528 | if (!seg) { |
12529 | if (ctx->rc == GRN_SUCCESS) { |
12530 | ERR(GRN_UNKNOWN_ERROR, |
12531 | "failed access chunk segment: chunk_id = %u, seg_id = %u" , |
12532 | chunk_id, seg_id); |
12533 | } |
12534 | return ctx->rc; |
12535 | } |
12536 | if (in_size <= S_CHUNK) { |
12537 | grn_memcpy(seg, in, in_size); |
12538 | in_size = 0; |
12539 | } else { |
12540 | grn_memcpy(seg, in, S_CHUNK); |
12541 | in += S_CHUNK; |
12542 | in_size -= S_CHUNK; |
12543 | } |
12544 | GRN_IO_SEG_UNREF(builder->ii->chunk, seg_id); |
12545 | } |
12546 | |
12547 | /* Append a cinfo. */ |
12548 | rc = grn_ii_builder_get_cinfo(ctx, builder, &cinfo); |
12549 | if (rc != GRN_SUCCESS) { |
12550 | return rc; |
12551 | } |
12552 | cinfo->segno = chunk_id; |
12553 | cinfo->size = chunk->enc_offset; |
12554 | cinfo->dgap = chunk->rid_gap; |
12555 | |
12556 | builder->buf.ii->header->total_chunk_size += chunk->enc_offset; |
12557 | grn_ii_builder_chunk_clear(ctx, chunk); |
12558 | return GRN_SUCCESS; |
12559 | } |
12560 | |
12561 | /* grn_ii_builder_read_to_chunk read values from a block to a chunk. */ |
12562 | static grn_rc |
12563 | grn_ii_builder_read_to_chunk(grn_ctx *ctx, grn_ii_builder *builder, |
12564 | uint32_t block_id) |
12565 | { |
12566 | grn_rc rc; |
12567 | uint64_t value; |
12568 | uint32_t rid = GRN_ID_NIL, last_sid = 0; |
12569 | uint32_t ii_flags = builder->ii->header->flags; |
12570 | grn_ii_builder_chunk *chunk = &builder->chunk; |
12571 | |
12572 | for (;;) { |
12573 | uint32_t gap, freq; |
12574 | uint64_t value; |
12575 | rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); |
12576 | if (rc != GRN_SUCCESS) { |
12577 | return rc; |
12578 | } |
12579 | if (!value) { |
12580 | break; |
12581 | } |
12582 | if (builder->chunk.offset == builder->chunk.size) { |
12583 | rc = grn_ii_builder_chunk_extend_bufs(ctx, chunk, ii_flags); |
12584 | if (rc != GRN_SUCCESS) { |
12585 | return rc; |
12586 | } |
12587 | } |
12588 | |
12589 | /* Read record ID. */ |
12590 | gap = value >> builder->sid_bits; /* In-block gap */ |
12591 | if (gap) { |
12592 | if (chunk->n >= builder->options.chunk_threshold) { |
12593 | rc = grn_ii_builder_flush_chunk(ctx, builder); |
12594 | if (rc != GRN_SUCCESS) { |
12595 | return rc; |
12596 | } |
12597 | } |
12598 | last_sid = 0; |
12599 | } |
12600 | rid += gap; |
12601 | gap = rid - chunk->rid; /* Global gap */ |
12602 | chunk->rid_buf[chunk->offset] = chunk->offset ? gap : rid; |
12603 | chunk->n++; |
12604 | chunk->rid = rid; |
12605 | chunk->rid_gap += gap; |
12606 | builder->df++; |
12607 | |
12608 | /* Read section ID. */ |
12609 | if (ii_flags & GRN_OBJ_WITH_SECTION) { |
12610 | uint32_t sid = (value & builder->sid_mask) + 1; |
12611 | chunk->sid_buf[chunk->offset] = sid - last_sid - 1; |
12612 | chunk->n++; |
12613 | last_sid = sid; |
12614 | } |
12615 | |
12616 | /* Read weight. */ |
12617 | if (ii_flags & GRN_OBJ_WITH_WEIGHT) { |
12618 | uint32_t weight; |
12619 | rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); |
12620 | if (rc != GRN_SUCCESS) { |
12621 | return rc; |
12622 | } |
12623 | weight = value; |
12624 | chunk->weight_buf[chunk->offset] = weight; |
12625 | chunk->n++; |
12626 | } |
12627 | |
12628 | /* Read positions or a frequency. */ |
12629 | if (ii_flags & GRN_OBJ_WITH_POSITION) { |
12630 | uint32_t pos = -1; |
12631 | freq = 0; |
12632 | for (;;) { |
12633 | rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); |
12634 | if (rc != GRN_SUCCESS) { |
12635 | return rc; |
12636 | } |
12637 | if (!value) { |
12638 | break; |
12639 | } |
12640 | if (builder->chunk.pos_offset == builder->chunk.pos_size) { |
12641 | rc = grn_ii_builder_chunk_extend_pos_buf(ctx, chunk); |
12642 | if (rc != GRN_SUCCESS) { |
12643 | return rc; |
12644 | } |
12645 | } |
12646 | if (pos == -1) { |
12647 | chunk->pos_buf[chunk->pos_offset] = value - 1; |
12648 | chunk->pos_sum += value - 1; |
12649 | } else { |
12650 | chunk->pos_buf[chunk->pos_offset] = value; |
12651 | chunk->pos_sum += value; |
12652 | } |
12653 | chunk->n++; |
12654 | pos += value; |
12655 | chunk->pos_offset++; |
12656 | freq++; |
12657 | } |
12658 | } else { |
12659 | rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); |
12660 | if (rc != GRN_SUCCESS) { |
12661 | return rc; |
12662 | } |
12663 | freq = value; |
12664 | } |
12665 | chunk->freq_buf[chunk->offset] = freq - 1; |
12666 | chunk->n++; |
12667 | chunk->offset++; |
12668 | } |
12669 | rc = grn_ii_builder_read_from_block(ctx, builder, block_id, &value); |
12670 | if (rc == GRN_SUCCESS) { |
12671 | builder->blocks[block_id].tid = value; |
12672 | } else if (rc == GRN_END_OF_DATA) { |
12673 | builder->blocks[block_id].tid = GRN_ID_NIL; |
12674 | } else { |
12675 | return rc; |
12676 | } |
12677 | return GRN_SUCCESS; |
12678 | } |
12679 | |
12680 | /* grn_ii_builder_register_chunks registers chunks. */ |
12681 | static grn_rc |
12682 | grn_ii_builder_register_chunks(grn_ctx *ctx, grn_ii_builder *builder) |
12683 | { |
12684 | grn_rc rc; |
12685 | uint32_t buf_tid, *a; |
12686 | buffer_term *buf_term; |
12687 | |
12688 | rc = grn_ii_builder_chunk_encode(ctx, &builder->chunk, builder->cinfos, |
12689 | builder->n_cinfos); |
12690 | if (rc != GRN_SUCCESS) { |
12691 | return rc; |
12692 | } |
12693 | |
12694 | if (!grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { |
12695 | rc = grn_ii_builder_buffer_assign(ctx, &builder->buf, |
12696 | builder->chunk.enc_offset); |
12697 | if (rc != GRN_SUCCESS) { |
12698 | return rc; |
12699 | } |
12700 | } |
12701 | buf_tid = builder->buf.buf->header.nterms; |
12702 | if (buf_tid >= builder->options.buffer_max_n_terms || |
12703 | builder->buf.chunk_size - builder->buf.chunk_offset < |
12704 | builder->chunk.enc_offset) { |
12705 | rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); |
12706 | if (rc != GRN_SUCCESS) { |
12707 | return rc; |
12708 | } |
12709 | rc = grn_ii_builder_buffer_assign(ctx, &builder->buf, |
12710 | builder->chunk.enc_offset); |
12711 | if (rc != GRN_SUCCESS) { |
12712 | return rc; |
12713 | } |
12714 | buf_tid = 0; |
12715 | } |
12716 | buf_term = &builder->buf.buf->terms[buf_tid]; |
12717 | buf_term->tid = builder->chunk.tid; |
12718 | if (builder->n_cinfos) { |
12719 | buf_term->tid |= CHUNK_SPLIT; |
12720 | } |
12721 | buf_term->size_in_buffer = 0; |
12722 | buf_term->pos_in_buffer = 0; |
12723 | buf_term->size_in_chunk = builder->chunk.enc_offset; |
12724 | buf_term->pos_in_chunk = builder->buf.chunk_offset; |
12725 | |
12726 | grn_memcpy(builder->buf.chunk + builder->buf.chunk_offset, |
12727 | builder->chunk.enc_buf, builder->chunk.enc_offset); |
12728 | builder->buf.chunk_offset += builder->chunk.enc_offset; |
12729 | |
12730 | a = array_get(ctx, builder->ii, builder->chunk.tid); |
12731 | if (!a) { |
12732 | DEFINE_NAME(builder->ii); |
12733 | MERR("[ii][builder][chunk][register] " |
12734 | "failed to allocate an array in segment: " |
12735 | "<%.*s>: " |
12736 | "tid=<%u>: max_n_segments=<%u>" , |
12737 | name_size, name, |
12738 | builder->chunk.tid, |
12739 | builder->ii->seg->header->max_segment); |
12740 | return ctx->rc; |
12741 | } |
12742 | a[0] = SEG2POS(builder->buf.buf_id, |
12743 | sizeof(buffer_header) + buf_tid * sizeof(buffer_term)); |
12744 | a[1] = builder->df; |
12745 | array_unref(builder->ii, builder->chunk.tid); |
12746 | |
12747 | builder->buf.buf->header.nterms++; |
12748 | builder->n_cinfos = 0; |
12749 | grn_ii_builder_chunk_clear(ctx, &builder->chunk); |
12750 | return GRN_SUCCESS; |
12751 | } |
12752 | |
12753 | static grn_rc |
12754 | grn_ii_builder_commit(grn_ctx *ctx, grn_ii_builder *builder) |
12755 | { |
12756 | uint32_t i; |
12757 | grn_rc rc; |
12758 | grn_table_cursor *cursor; |
12759 | |
12760 | for (i = 0; i < builder->n_blocks; i++) { |
12761 | uint64_t value; |
12762 | rc = grn_ii_builder_read_from_block(ctx, builder, i, &value); |
12763 | if (rc != GRN_SUCCESS) { |
12764 | return rc; |
12765 | } |
12766 | builder->blocks[i].tid = value; |
12767 | } |
12768 | |
12769 | cursor = grn_table_cursor_open(ctx, builder->ii->lexicon, |
12770 | NULL, 0, NULL, 0, 0, -1, GRN_CURSOR_BY_KEY); |
12771 | for (;;) { |
12772 | grn_id tid = grn_table_cursor_next(ctx, cursor); |
12773 | if (tid == GRN_ID_NIL) { |
12774 | break; |
12775 | } |
12776 | builder->chunk.tid = tid; |
12777 | builder->chunk.rid = GRN_ID_NIL; |
12778 | builder->df = 0; |
12779 | for (i = 0; i < builder->n_blocks; i++) { |
12780 | if (tid == builder->blocks[i].tid) { |
12781 | rc = grn_ii_builder_read_to_chunk(ctx, builder, i); |
12782 | if (rc != GRN_SUCCESS) { |
12783 | return rc; |
12784 | } |
12785 | } |
12786 | } |
12787 | if (!builder->chunk.n) { |
12788 | /* This term does not appear. */ |
12789 | continue; |
12790 | } |
12791 | if (!builder->n_cinfos) { |
12792 | grn_bool packed; |
12793 | rc = grn_ii_builder_pack_chunk(ctx, builder, &packed); |
12794 | if (rc != GRN_SUCCESS) { |
12795 | return rc; |
12796 | } |
12797 | if (packed) { |
12798 | continue; |
12799 | } |
12800 | } |
12801 | rc = grn_ii_builder_register_chunks(ctx, builder); |
12802 | if (rc != GRN_SUCCESS) { |
12803 | return rc; |
12804 | } |
12805 | } |
12806 | grn_table_cursor_close(ctx, cursor); |
12807 | if (grn_ii_builder_buffer_is_assigned(ctx, &builder->buf)) { |
12808 | rc = grn_ii_builder_buffer_flush(ctx, &builder->buf); |
12809 | if (rc != GRN_SUCCESS) { |
12810 | return rc; |
12811 | } |
12812 | } |
12813 | return GRN_SUCCESS; |
12814 | } |
12815 | |
12816 | grn_rc |
12817 | grn_ii_build2(grn_ctx *ctx, grn_ii *ii, const grn_ii_builder_options *options) |
12818 | { |
12819 | grn_rc rc, rc_close; |
12820 | grn_ii_builder *builder; |
12821 | rc = grn_ii_builder_open(ctx, ii, options, &builder); |
12822 | if (rc == GRN_SUCCESS) { |
12823 | rc = grn_ii_builder_append_source(ctx, builder); |
12824 | if (rc == GRN_SUCCESS) { |
12825 | rc = grn_ii_builder_commit(ctx, builder); |
12826 | } |
12827 | rc_close = grn_ii_builder_close(ctx, builder); |
12828 | if (rc == GRN_SUCCESS) { |
12829 | rc = rc_close; |
12830 | } |
12831 | } |
12832 | return rc; |
12833 | } |
12834 | |