1/*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V.
7 */
8
9/*
10 * author Lefteris Sidirourgos
11 * Tokenizer
12 * This module implements a vertical fragmented tokenizer for strings.
13 * It is based on the ideas of the urlbox module by mk.
14 *
15 * The input string is tokenized according to a separator character.
16 * Each token is inserted to the next BAT with the same order of
17 * appearance in the string. We currently support 255 tokens in each
18 * string as this module is intended for use with short and similar
19 * strings such as URLs. In addition we maintain a 2-dimensional index
20 * that points to the depth and height of the last token of each string.
21 * The 2-dimensional index is combined to one BAT where the 8 least
22 * significant bits represent the depth, and the rest bits the height.
23 *
24 * The tokenizer can be accessed in two ways. Given the oid retrieve the
25 * re-constructed string, or given a string return its oid if present,
26 * otherwise nil.
27 *
 * Strings can be added either in batch (from a file or a bat of
 * strings) or by appending a single string. Duplicate elimination is
 * always performed.
31 *
32 * There can be only one tokenizer open at the same time. This is
33 * achieved by setting a TRANSaction bat. This might change in the
34 * future. However there can be more than one tokenizers stored in the
35 * disk, each of which is identified by its name (usually the name of
36 * the active schema of the db). These administrative issues and
37 * security aspects (e.g., opening a tokenizer of a different schema)
38 * should be addressed more thoroughly.
39 */
40#include "monetdb_config.h"
41#include "bat5.h"
42#include "tokenizer.h"
43#include "mal_linker.h"
44
45#define MAX_TKNZR_DEPTH 256
46#define INDEX MAX_TKNZR_DEPTH
47static int tokenDepth = 0;
48struct {
49 BAT *idx, *val;
50} tokenBAT[MAX_TKNZR_DEPTH + 1];
51
52static BAT *TRANS = NULL; /* the catalog of tokenizers */
53static char name[128];
54
55#if SIZEOF_OID == 4 /* 32-bit oid */
56#define MAX_h ((((oid) 1) << 23) - 1)
57#else /* 64-bit oid */
58#define MAX_h ((((oid) 1) << 55) - 1)
59#endif
60
61#define COMP(h, d) ((h << 8) | (d & 255))
62#define GET_d(x) ((sht) ((x) & 255))
63#define GET_h(x) ((x) >> 8)
64
65static int prvlocate(BAT* b, BAT* bidx, oid *prv, str part)
66{
67 BATiter bi = bat_iterator(b);
68 BUN p;
69
70 if (BAThash(b) == GDK_SUCCEED) {
71 HASHloop_str(bi, b->thash, p, part) {
72 if (BUNtoid(bidx, p) == *prv) {
73 *prv = (oid) p;
74 return TRUE;
75 }
76 }
77 } else {
78 /* hash failed, slow scan */
79 BUN q;
80
81 BATloop(b, p, q) {
82 if (BUNtoid(bidx, p) == *prv &&
83 strcmp(BUNtail(bi, p), part) == 0) {
84 *prv = (oid) p;
85 return TRUE;
86 }
87 }
88 }
89 return FALSE;
90}
91
92str
93TKNZRopen(void *ret, str *in)
94{
95 int depth;
96 bat r;
97 bat idx;
98 char batname[134];
99 BAT *b;
100
101 (void) ret;
102 if (strlen(*in) > 127)
103 throw(MAL, "tokenizer.open",
104 ILLEGAL_ARGUMENT " tokenizer name too long");
105
106 MT_lock_set(&mal_contextLock);
107 if (TRANS != NULL) {
108 MT_lock_unset(&mal_contextLock);
109 throw(MAL, "tokenizer.open", "Another tokenizer is already open");
110 }
111
112 for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
113 tokenBAT[depth].idx = 0;
114 tokenBAT[depth].val = 0;
115 }
116 tokenDepth = 0;
117
118 TRANS = COLnew(0, TYPE_str, MAX_TKNZR_DEPTH + 1, TRANSIENT);
119 if (TRANS == NULL) {
120 MT_lock_unset(&mal_contextLock);
121 throw(MAL, "tokenizer.open", SQLSTATE(HY001) MAL_MALLOC_FAIL);
122 }
123 /* now we are sure that none overwrites the tokenizer table*/
124 MT_lock_unset(&mal_contextLock);
125
126 snprintf(name, 128, "%s", *in);
127
128 snprintf(batname, sizeof(batname), "%s_index", name);
129 idx = BBPindex(batname);
130
131 if (idx == 0) { /* new tokenizer */
132 b = COLnew(0, TYPE_oid, 1024, PERSISTENT);
133 if (b == NULL)
134 throw(MAL, "tokenizer.open", SQLSTATE(HY001) MAL_MALLOC_FAIL);
135 if (BKCsetName(&r, &b->batCacheid, &(const char*){batname}) != MAL_SUCCEED ||
136 BKCsetPersistent(&r, &b->batCacheid) != MAL_SUCCEED ||
137 BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
138 BBPreclaim(b);
139 throw(MAL, "tokenizer.open", OPERATION_FAILED);
140 }
141 tokenBAT[INDEX].val = b;
142 } else { /* existing tokenizer */
143 tokenBAT[INDEX].val = BATdescriptor(idx);
144
145 if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
146 BBPunfix(tokenBAT[INDEX].val->batCacheid);
147 tokenBAT[INDEX].val = NULL;
148 throw(MAL, "tokenizer.open", OPERATION_FAILED);
149 }
150
151 for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
152 snprintf(batname, sizeof(batname), "%s_%d", name, depth);
153 idx = BBPindex(batname);
154 if (idx == 0)
155 break;
156 tokenBAT[depth].val = BATdescriptor(idx);
157 if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
158 BBPunfix(tokenBAT[depth].val->batCacheid);
159 tokenBAT[depth].val = NULL;
160 throw(MAL, "tokenizer.open", SQLSTATE(HY001) MAL_MALLOC_FAIL);
161 }
162
163 /* For idx BATs */
164 snprintf(batname, sizeof(batname), "%s_idx_%d", name, depth);
165 idx = BBPindex(batname);
166 if (idx == 0)
167 break;
168 tokenBAT[depth].idx = BATdescriptor(idx);
169 if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
170 BBPunfix(tokenBAT[depth].idx->batCacheid);
171 tokenBAT[depth].idx = NULL;
172 throw(MAL, "tokenizer.open", SQLSTATE(HY001) MAL_MALLOC_FAIL);
173 }
174
175 }
176 tokenDepth = depth;
177 }
178
179 return MAL_SUCCEED;
180}
181
182str
183TKNZRclose(void *r)
184{
185 int i;
186 (void) r;
187
188 if (TRANS == NULL)
189 throw(MAL, "tokenizer", "no tokenizer store open");
190
191 TMsubcommit(TRANS);
192
193 for (i = 0; i < tokenDepth; i++) {
194 BBPunfix(tokenBAT[i].idx->batCacheid);
195 BBPunfix(tokenBAT[i].val->batCacheid);
196 }
197 BBPunfix(tokenBAT[INDEX].val->batCacheid);
198 tokenDepth = 0;
199
200 BBPreclaim(TRANS);
201 TRANS = NULL;
202 return MAL_SUCCEED;
203}
204
205/*
206 * Tokenize operations
207 * The tokenizer operation assumes a private copy to mark the end of the
208 * token separators with a zero byte. Tokens are separated by a single
209 * character for simplicity. Might be a good scheme to assume that
210 * strings to be broken are properly ended with either 0 or nl, not
211 * both. It seems 0 can be assumed.
212 */
213static int
214TKNZRtokenize(str in, str *parts, char tkn)
215{
216 char *s, *t;
217 int depth = 0;
218
219 s = in;
220 while (*s && *s != '\n') {
221 t = s;
222 while (*t != tkn && *t != '\n' && *t)
223 t++;
224 parts[depth++] = s;
225 s = t + (*t != 0);
226 *t = 0;
227 if (depth > MAX_TKNZR_DEPTH)
228 break;
229 }
230 return depth;
231}
232
233str
234TKNZRappend(oid *pos, str *s)
235{
236 str url;
237 char batname[132];
238 str parts[MAX_TKNZR_DEPTH];
239 str msg;
240 int i, new, depth;
241 bat r;
242 BAT *bVal;
243 BAT *bIdx;
244 BUN p;
245 BUN idx = 0;
246 oid prv = 0;
247 oid comp;
248
249 if (TRANS == NULL)
250 throw(MAL, "tokenizer", "no tokenizer store open");
251
252 if ((url = GDKstrdup(*s)) == NULL) {
253 throw(MAL, "tokenizer.append", SQLSTATE(HY001) MAL_MALLOC_FAIL);
254 }
255
256 depth = TKNZRtokenize(url, parts, '/');
257 new = depth;
258
259 if (depth == 0) {
260 GDKfree(url);
261 return MAL_SUCCEED;
262 }
263 if (depth > MAX_TKNZR_DEPTH) {
264 GDKfree(url);
265 throw(MAL, "tokenizer",
266 ILLEGAL_ARGUMENT "input string breaks to too many parts");
267 }
268 if (depth > tokenDepth || tokenBAT[0].val == NULL) {
269 new = tokenDepth;
270 for (i = tokenDepth; i < depth; i++) {
271 /* make new bat for value */
272 snprintf(batname, sizeof(batname), "%s_%d", name, i);
273 bVal = COLnew(0, TYPE_str, 1024, PERSISTENT);
274 if (bVal == NULL) {
275 GDKfree(url);
276 throw(MAL, "tokenizer.append", SQLSTATE(HY001) MAL_MALLOC_FAIL);
277 }
278
279 tokenBAT[i].val = bVal;
280
281 if ((msg = BKCsetName(&r, &bVal->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
282 (msg = BKCsetPersistent(&r, &bVal->batCacheid)) != MAL_SUCCEED ||
283 BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
284 GDKfree(url);
285 return msg ? msg : createException(MAL, "tokenizer.append", SQLSTATE(HY001) MAL_MALLOC_FAIL);
286 }
287
288 /* make new bat for index */
289 snprintf(batname, sizeof(batname), "%s_idx_%d", name, i);
290 bIdx = COLnew(0, TYPE_oid, 1024, PERSISTENT);
291 if (bIdx == NULL) {
292 GDKfree(url);
293 throw(MAL, "tokenizer.append", SQLSTATE(HY001) MAL_MALLOC_FAIL);
294 }
295
296 tokenBAT[i].idx = bIdx;
297
298 if ((msg = BKCsetName(&r, &bIdx->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
299 (msg = BKCsetPersistent(&r, &bIdx->batCacheid)) != MAL_SUCCEED ||
300 BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
301 GDKfree(url);
302 return msg ? msg : createException(MAL, "tokenizer.append", SQLSTATE(HY001) MAL_MALLOC_FAIL);
303 }
304
305 }
306 tokenDepth = depth;
307 }
308
309 /* findcommn */
310 p = BUNfnd(tokenBAT[0].val, parts[0]);
311 if (p != BUN_NONE) {
312 prv = (oid) p;
313 for (i = 1; i < new; i++) {
314 if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, &prv, parts[i]))
315 break;
316 }
317 } else {
318 i = 0;
319 }
320
321 if (i == depth) {
322 comp = COMP(prv, depth);
323 *pos = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp);
324 if (*pos != BUN_NONE) {
325 /* the string is already there */
326 /* printf("The string %s is already there",url); */
327 GDKfree(url);
328 return MAL_SUCCEED;
329 }
330 }
331
332 /* insremainder */
333 for (; i < depth; i++) {
334 idx = BATcount(tokenBAT[i].val);
335 if (idx > MAX_h) {
336 GDKfree(url);
337 throw(MAL, "tokenizer.append",
338 OPERATION_FAILED " no more free oid's");
339 }
340 if (BUNappend(tokenBAT[i].val, parts[i], false) != GDK_SUCCEED) {
341 GDKfree(url);
342 throw(MAL, "tokenizer.append",
343 OPERATION_FAILED " could not append");
344 }
345 if (tokenBAT[i].val->thash == NULL ||
346 tokenBAT[i].val->thash == (Hash *) 1 ||
347 BATcount(tokenBAT[i].val) > 4 * tokenBAT[i].val->thash->mask) {
348 HASHdestroy(tokenBAT[i].val);
349 BAThash(tokenBAT[i].val);
350 }
351
352 if (BUNappend(tokenBAT[i].idx, (ptr) & prv, false) != GDK_SUCCEED) {
353 GDKfree(url);
354 throw(MAL, "tokenizer.append",
355 OPERATION_FAILED " could not append");
356 }
357
358 prv = (oid) idx;
359 }
360
361 *pos = (oid) BATcount(tokenBAT[INDEX].val);
362 comp = COMP(prv, depth);
363 if (BUNappend(tokenBAT[INDEX].val, &comp, false) != GDK_SUCCEED) {
364 GDKfree(url);
365 throw(MAL, "tokenizer.append", SQLSTATE(HY001) MAL_MALLOC_FAIL);
366 }
367 if (tokenBAT[INDEX].val->thash == NULL ||
368 tokenBAT[INDEX].val->thash == (Hash *) 1 ||
369 BATcount(tokenBAT[INDEX].val) > 4 * tokenBAT[INDEX].val->thash->mask) {
370 HASHdestroy(tokenBAT[INDEX].val);
371 BAThash(tokenBAT[INDEX].val);
372 }
373
374 GDKfree(url);
375 return MAL_SUCCEED;
376}
377
378#define SIZE (1 * 1024 * 1024)
379str
380TKNZRdepositFile(void *r, str *fnme)
381{
382 stream *fs;
383 bstream *bs;
384 char *s, *t;
385 int len = 0;
386 char buf[FILENAME_MAX];
387 oid pos;
388 str msg= MAL_SUCCEED;
389
390 if (TRANS == NULL)
391 throw(MAL, "tokenizer", "no tokenizer store open");
392
393 (void) r;
394 if (**fnme == '/')
395 len = snprintf(buf, FILENAME_MAX, "%s", *fnme);
396 else
397 len = snprintf(buf, FILENAME_MAX, "%s/%s", monet_cwd, *fnme);
398 if (len == -1 || len >= FILENAME_MAX)
399 throw(MAL, "tokenizer.depositFile", SQLSTATE(HY001) "tokenizer filename path is too large");
400 /* later, handle directory separator */
401 fs = open_rastream(buf);
402 if (fs == NULL)
403 throw(MAL, "tokenizer.depositFile", RUNTIME_FILE_NOT_FOUND "%s", buf);
404 if (mnstr_errnr(fs)) {
405 close_stream(fs);
406 throw(MAL, "tokenizer.depositFile", RUNTIME_FILE_NOT_FOUND "%s", buf);
407 }
408 bs = bstream_create(fs, SIZE);
409 if (bs == NULL)
410 throw(MAL, "tokenizer.depositFile", SQLSTATE(HY001) MAL_MALLOC_FAIL);
411 while (bstream_read(bs, bs->size - (bs->len - bs->pos)) != 0 &&
412 !mnstr_errnr(bs->s))
413 {
414 s = bs->buf;
415 for (t = s; *t;) {
416 while (t < bs->buf + bs->len && *t && *t != '\n')
417 t++;
418 if (t == bs->buf + bs->len || *t != '\n') {
419 /* read next block if possible after shift */
420 assert(t - s <= INT_MAX);
421 len = (int) (t - s);
422 memcpy(bs->buf, s, len);
423 bs->len = len;
424 bs->pos = 0;
425 break;
426 }
427 /* found a string to be processed */
428 *t = 0;
429 msg = TKNZRappend(&pos, &s);
430 if (msg ) {
431 bstream_destroy(bs);
432 close_stream(fs);
433 return msg;
434 }
435 *t = '\n';
436 s = t + 1;
437 t = s;
438 }
439 }
440
441 bstream_destroy(bs);
442 close_stream(fs);
443 return MAL_SUCCEED;
444}
445
446str
447TKNZRlocate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
448{
449 oid pos;
450 str url;
451 str parts[MAX_TKNZR_DEPTH];
452 int i = 0, depth;
453 BUN p;
454 oid prv = 0;
455 oid comp;
456 (void) cntxt;
457 (void) mb;
458
459 if (TRANS == NULL)
460 throw(MAL, "tokenizer", "no tokenizer store open");
461
462 url = (str) GDKmalloc(sizeof(char) *
463 (strlen(*getArgReference_str(stk, pci, 1)) + 1));
464 if (url == NULL)
465 throw(MAL, "tokenizer.locate", SQLSTATE(HY001) MAL_MALLOC_FAIL);
466 strcpy(url, *getArgReference_str(stk, pci, 1));
467
468
469 depth = TKNZRtokenize(url, parts, '/');
470
471 if (depth == 0) {
472 pos = oid_nil;
473 } else if (depth > MAX_TKNZR_DEPTH) {
474 GDKfree(url);
475 throw(MAL, "tokenizer.locate",
476 ILLEGAL_ARGUMENT "strings breaks to too many parts");
477 } else if (depth > tokenDepth) {
478 pos = oid_nil;
479 } else {
480 p = BUNfnd(tokenBAT[0].val, parts[0]);
481 if (p != BUN_NONE) {
482 prv = (oid) p;
483 for (i = 1; i < depth; i++) {
484 if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, (ptr) & prv, parts[i]))
485 break;
486 }
487 if (i < depth) {
488 pos = oid_nil;
489 } else {
490 comp = COMP(prv, i);
491 pos = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp);
492 }
493 } else {
494 pos = oid_nil;
495 }
496 }
497
498 VALset(&stk->stk[pci->argv[0]], TYPE_oid, &pos);
499 GDKfree(url);
500 return MAL_SUCCEED;
501}
502
503str
504takeOid(oid id, str *val)
505{
506 int i, depth;
507 str parts[MAX_TKNZR_DEPTH];
508 size_t lngth = 0;
509 str s;
510
511 if (id >= BATcount(tokenBAT[INDEX].val)) {
512 throw(MAL, "tokenizer.takeOid", OPERATION_FAILED " illegal oid");
513 }
514
515 id = *(oid *) Tloc(tokenBAT[INDEX].val, id);
516
517 depth = GET_d(id);
518 id = GET_h(id);
519
520 for (i = depth - 1; i >= 0; i--) {
521 BATiter bi = bat_iterator(tokenBAT[i].val);
522 parts[i] = (str) BUNtvar(bi, id);
523 id = BUNtoid(tokenBAT[i].idx, id);
524 lngth += strlen(parts[i]);
525 }
526
527 *val = (str) GDKmalloc(lngth+depth+1);
528 if( *val == NULL)
529 throw(MAL, "tokenizer.takeOid", SQLSTATE(HY001) MAL_MALLOC_FAIL);
530 s = *val;
531
532 for (i = 0; i < depth; i++) {
533 strcpy(s, parts[i]);
534 s += strlen(parts[i]);
535 *s++ = '/';
536 }
537 *s = '\0';
538
539 return MAL_SUCCEED;
540}
541
542str
543TKNZRtakeOid(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
544{
545 str ret, val = NULL;
546 oid id;
547 (void) cntxt;
548 (void) mb;
549
550 if (TRANS == NULL) {
551 throw(MAL, "tokenizer", "no tokenizer store open");
552 }
553 id = *getArgReference_oid(stk, pci, 1);
554 ret = takeOid(id, &val);
555 if (ret == MAL_SUCCEED) {
556 VALset(&stk->stk[pci->argv[0]], TYPE_str, val);
557 }
558 return ret;
559}
560
561str
562TKNZRgetIndex(bat *r)
563{
564 if (TRANS == NULL)
565 throw(MAL, "tokenizer", "no tokenizer store open");
566 *r = tokenBAT[INDEX].val->batCacheid;
567 BBPretain(*r);
568 return MAL_SUCCEED;
569}
570
571str
572TKNZRgetLevel(bat *r, int *level)
573{
574 BAT* view;
575 if (TRANS == NULL)
576 throw(MAL, "tokenizer", "no tokenizer store open");
577 if (*level < 0 || *level >= tokenDepth)
578 throw(MAL, "tokenizer.getLevel", OPERATION_FAILED " illegal level");
579 view = VIEWcreate(tokenBAT[*level].val->hseqbase, tokenBAT[*level].val);
580 if (view == NULL)
581 throw(MAL, "tokenizer.getLevel", SQLSTATE(HY001) MAL_MALLOC_FAIL);
582 *r = view->batCacheid;
583
584 BBPkeepref(*r);
585 return MAL_SUCCEED;
586}
587
588str
589TKNZRgetCount(bat *r)
590{
591 BAT *b;
592 int i;
593 lng cnt;
594
595 if (TRANS == NULL)
596 throw(MAL, "tokenizer", "no tokenizer store open");
597 b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
598 if (b == NULL)
599 throw(MAL, "tokenizer.getCount", SQLSTATE(HY001) MAL_MALLOC_FAIL);
600 for (i = 0; i < tokenDepth; i++) {
601 cnt = (lng) BATcount(tokenBAT[i].val);
602 if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
603 BBPreclaim(b);
604 throw(MAL, "tokenizer", SQLSTATE(HY001) MAL_MALLOC_FAIL);
605 }
606 }
607 BATsetcount(b, tokenDepth);
608 *r = b->batCacheid;
609 BBPkeepref(*r);
610 return MAL_SUCCEED;
611}
612
613str
614TKNZRgetCardinality(bat *r)
615{
616 BAT *b, *en;
617 int i;
618 lng cnt;
619
620 if (TRANS == NULL)
621 throw(MAL, "tokenizer", "no tokenizer store open");
622 b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
623 if (b == NULL)
624 throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY001) MAL_MALLOC_FAIL);
625 for (i = 0; i < tokenDepth; i++) {
626 if ((en = BATunique(tokenBAT[i].val, NULL)) == NULL) {
627 BBPreclaim(b);
628 throw(MAL, "tokenizer.getCardinality", GDK_EXCEPTION);
629 }
630 cnt = (lng) BATcount(en);
631 BBPunfix(en->batCacheid);
632 if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
633 BBPreclaim(b);
634 throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY001) MAL_MALLOC_FAIL);
635 }
636 }
637
638 BATsetcount(b, tokenDepth);
639 *r = b->batCacheid;
640 BBPkeepref(*r);
641 return MAL_SUCCEED;
642}
643
644