Line data Source code
1 : /*
2 : * This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 : *
6 : * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
7 : */
8 :
9 : /*
10 : * author Lefteris Sidirourgos
11 : * Tokenizer
12 : * This module implements a vertical fragmented tokenizer for strings.
13 : * It is based on the ideas of the urlbox module by mk.
14 : *
15 : * The input string is tokenized according to a separator character.
16 : * Each token is inserted to the next BAT with the same order of
17 : * appearance in the string. We currently support 255 tokens in each
18 : * string as this module is intended for use with short and similar
19 : * strings such as URLs. In addition we maintain a 2-dimensional index
20 : * that points to the depth and height of the last token of each string.
21 : * The 2-dimensional index is combined to one BAT where the 8 least
22 : * significant bits represent the depth, and the rest bits the height.
23 : *
24 : * The tokenizer can be accessed in two ways. Given the oid retrieve the
25 : * re-constructed string, or given a string return its oid if present,
26 : * otherwise nil.
27 : *
28 : * Strings can be added either in batch (from a file or a bat of
29 : * strings) and by appending a single string. Duplicate elimination is
30 : * always performed.
31 : *
32 : * There can be only one tokenizer open at the same time. This is
33 : * achieved by setting a TRANSaction bat. This might change in the
 * future. However there can be more than one tokenizer stored on
 * disk, each of which is identified by its name (usually the name of
36 : * the active schema of the db). These administrative issues and
37 : * security aspects (e.g., opening a tokenizer of a different schema)
38 : * should be addressed more thoroughly.
39 : */
40 : #include "monetdb_config.h"
41 : #include "bat5.h"
42 : #include "mal.h"
43 : #include "mal_client.h"
44 : #include "mal_interpreter.h"
45 : #include "mal_linker.h"
46 : #include "mal_exception.h"
47 :
48 : #define MAX_TKNZR_DEPTH 256
49 : #define INDEX MAX_TKNZR_DEPTH
50 : static int tokenDepth = 0;
51 : struct {
52 : BAT *idx, *val;
53 : } tokenBAT[MAX_TKNZR_DEPTH + 1];
54 :
55 : static BAT *TRANS = NULL; /* the catalog of tokenizers */
56 : static char name[128];
57 :
58 : #if SIZEOF_OID == 4 /* 32-bit oid */
59 : #define MAX_h ((((oid) 1) << 23) - 1)
60 : #else /* 64-bit oid */
61 : #define MAX_h ((((oid) 1) << 55) - 1)
62 : #endif
63 :
64 : #define COMP(h, d) ((h << 8) | (d & 255))
65 : #define GET_d(x) ((sht) ((x) & 255))
66 : #define GET_h(x) ((x) >> 8)
67 :
/*
 * Locate the token 'part' among the values of BAT b whose parent oid
 * (stored at the same position in the parallel BAT bidx) equals *prv.
 * On a match *prv is overwritten with the position of the token and
 * TRUE is returned; otherwise *prv is left untouched and FALSE is
 * returned.
 */
static int prvlocate(BAT* b, BAT* bidx, oid *prv, str part)
{
	BATiter bi = bat_iterator(b);
	BUN p;

	if (BAThash(b) == GDK_SUCCEED) {
		/* fast path: walk the hash chain of 'part' under the hash
		 * read lock; unlock before returning on a hit */
		MT_rwlock_rdlock(&b->thashlock);
		HASHloop_str(bi, b->thash, p, part) {
			if (BUNtoid(bidx, p) == *prv) {
				MT_rwlock_rdunlock(&b->thashlock);
				bat_iterator_end(&bi);
				*prv = (oid) p;
				return TRUE;
			}
		}
		MT_rwlock_rdunlock(&b->thashlock);
	} else {
		/* hash failed, slow scan */
		BUN q;

		BATloop(b, p, q) {
			if (BUNtoid(bidx, p) == *prv &&
				strcmp(BUNtail(bi, p), part) == 0) {
				bat_iterator_end(&bi);
				*prv = (oid) p;
				return TRUE;
			}
		}
	}
	bat_iterator_end(&bi);
	return FALSE;
}
100 :
101 : static str
102 1 : TKNZRopen(void *ret, str *in)
103 : {
104 : int depth;
105 : bat r;
106 : bat idx;
107 : char batname[134];
108 : BAT *b;
109 :
110 : (void) ret;
111 1 : if (strlen(*in) > 127)
112 0 : throw(MAL, "tokenizer.open",
113 : ILLEGAL_ARGUMENT " tokenizer name too long");
114 :
115 1 : MT_lock_set(&mal_contextLock);
116 1 : if (TRANS != NULL) {
117 0 : MT_lock_unset(&mal_contextLock);
118 0 : throw(MAL, "tokenizer.open", "Another tokenizer is already open");
119 : }
120 :
121 257 : for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
122 256 : tokenBAT[depth].idx = 0;
123 256 : tokenBAT[depth].val = 0;
124 : }
125 1 : tokenDepth = 0;
126 :
127 1 : TRANS = COLnew(0, TYPE_str, MAX_TKNZR_DEPTH + 1, TRANSIENT);
128 1 : if (TRANS == NULL) {
129 0 : MT_lock_unset(&mal_contextLock);
130 0 : throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
131 : }
132 : /* now we are sure that none overwrites the tokenizer table*/
133 1 : MT_lock_unset(&mal_contextLock);
134 :
135 1 : snprintf(name, 128, "%s", *in);
136 :
137 1 : snprintf(batname, sizeof(batname), "%s_index", name);
138 1 : idx = BBPindex(batname);
139 :
140 1 : if (idx == 0) { /* new tokenizer */
141 1 : b = COLnew(0, TYPE_oid, 1024, PERSISTENT);
142 1 : if (b == NULL)
143 0 : throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
144 : str msg;
145 2 : if ((msg = BKCsetName(&r, &b->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
146 2 : (msg = BKCsetPersistent(&r, &b->batCacheid)) != MAL_SUCCEED ||
147 1 : BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
148 0 : BBPreclaim(b);
149 0 : if (msg)
150 0 : return msg;
151 0 : throw(MAL, "tokenizer.open", GDK_EXCEPTION);
152 : }
153 1 : tokenBAT[INDEX].val = b;
154 : } else { /* existing tokenizer */
155 0 : tokenBAT[INDEX].val = BATdescriptor(idx);
156 :
157 0 : if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
158 0 : BBPunfix(tokenBAT[INDEX].val->batCacheid);
159 0 : tokenBAT[INDEX].val = NULL;
160 0 : throw(MAL, "tokenizer.open", OPERATION_FAILED);
161 : }
162 :
163 0 : for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
164 0 : snprintf(batname, sizeof(batname), "%s_%d", name, depth);
165 0 : idx = BBPindex(batname);
166 0 : if (idx == 0)
167 : break;
168 0 : tokenBAT[depth].val = BATdescriptor(idx);
169 0 : if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
170 0 : BBPunfix(tokenBAT[depth].val->batCacheid);
171 0 : tokenBAT[depth].val = NULL;
172 0 : throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
173 : }
174 :
175 : /* For idx BATs */
176 0 : snprintf(batname, sizeof(batname), "%s_idx_%d", name, depth);
177 0 : idx = BBPindex(batname);
178 0 : if (idx == 0)
179 : break;
180 0 : tokenBAT[depth].idx = BATdescriptor(idx);
181 0 : if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
182 0 : BBPunfix(tokenBAT[depth].idx->batCacheid);
183 0 : tokenBAT[depth].idx = NULL;
184 0 : throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
185 : }
186 :
187 : }
188 0 : tokenDepth = depth;
189 : }
190 :
191 : return MAL_SUCCEED;
192 : }
193 :
194 : static str
195 1 : TKNZRclose(void *r)
196 : {
197 : int i;
198 : (void) r;
199 :
200 1 : if (TRANS == NULL)
201 0 : throw(MAL, "tokenizer", "no tokenizer store open");
202 :
203 1 : TMsubcommit(TRANS);
204 :
205 5 : for (i = 0; i < tokenDepth; i++) {
206 4 : BBPunfix(tokenBAT[i].idx->batCacheid);
207 4 : BBPunfix(tokenBAT[i].val->batCacheid);
208 : }
209 1 : BBPunfix(tokenBAT[INDEX].val->batCacheid);
210 1 : tokenDepth = 0;
211 :
212 1 : BBPreclaim(TRANS);
213 1 : TRANS = NULL;
214 1 : return MAL_SUCCEED;
215 : }
216 :
217 : /*
218 : * Tokenize operations
219 : * The tokenizer operation assumes a private copy to mark the end of the
220 : * token separators with a zero byte. Tokens are separated by a single
221 : * character for simplicity. Might be a good scheme to assume that
222 : * strings to be broken are properly ended with either 0 or nl, not
223 : * both. It seems 0 can be assumed.
224 : */
225 : static int
226 15 : TKNZRtokenize(str in, str *parts, char tkn)
227 : {
228 : char *s, *t;
229 : int depth = 0;
230 :
231 : s = in;
232 68 : while (*s && *s != '\n') {
233 : t = s;
234 356 : while (*t != tkn && *t != '\n' && *t)
235 303 : t++;
236 53 : parts[depth++] = s;
237 53 : s = t + (*t != 0);
238 53 : *t = 0;
239 53 : if (depth > MAX_TKNZR_DEPTH)
240 : break;
241 : }
242 15 : return depth;
243 : }
244 :
245 : static str
246 14 : TKNZRappend(oid *pos, str *s)
247 : {
248 : str url;
249 : char batname[132];
250 : str parts[MAX_TKNZR_DEPTH];
251 : str msg;
252 : int i, new, depth;
253 : bat r;
254 : BAT *bVal;
255 : BAT *bIdx;
256 : BUN p;
257 : BUN idx = 0;
258 14 : oid prv = 0;
259 : oid comp;
260 :
261 14 : if (TRANS == NULL)
262 0 : throw(MAL, "tokenizer", "no tokenizer store open");
263 :
264 14 : if ((url = GDKstrdup(*s)) == NULL) {
265 0 : throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
266 : }
267 :
268 14 : depth = TKNZRtokenize(url, parts, '/');
269 : new = depth;
270 :
271 14 : if (depth == 0) {
272 0 : GDKfree(url);
273 0 : return MAL_SUCCEED;
274 : }
275 14 : if (depth > MAX_TKNZR_DEPTH) {
276 0 : GDKfree(url);
277 0 : throw(MAL, "tokenizer",
278 : ILLEGAL_ARGUMENT "input string breaks to too many parts");
279 : }
280 14 : if (depth > tokenDepth || tokenBAT[0].val == NULL) {
281 : new = tokenDepth;
282 6 : for (i = tokenDepth; i < depth; i++) {
283 : /* make new bat for value */
284 4 : snprintf(batname, sizeof(batname), "%s_%d", name, i);
285 4 : bVal = COLnew(0, TYPE_str, 1024, PERSISTENT);
286 4 : if (bVal == NULL) {
287 0 : GDKfree(url);
288 0 : throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
289 : }
290 :
291 4 : tokenBAT[i].val = bVal;
292 :
293 8 : if ((msg = BKCsetName(&r, &bVal->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
294 8 : (msg = BKCsetPersistent(&r, &bVal->batCacheid)) != MAL_SUCCEED ||
295 4 : BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
296 0 : GDKfree(url);
297 0 : return msg ? msg : createException(MAL, "tokenizer.append", GDK_EXCEPTION);
298 : }
299 :
300 : /* make new bat for index */
301 4 : snprintf(batname, sizeof(batname), "%s_idx_%d", name, i);
302 4 : bIdx = COLnew(0, TYPE_oid, 1024, PERSISTENT);
303 4 : if (bIdx == NULL) {
304 0 : GDKfree(url);
305 0 : throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
306 : }
307 :
308 4 : tokenBAT[i].idx = bIdx;
309 :
310 8 : if ((msg = BKCsetName(&r, &bIdx->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
311 8 : (msg = BKCsetPersistent(&r, &bIdx->batCacheid)) != MAL_SUCCEED ||
312 4 : BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
313 0 : GDKfree(url);
314 0 : return msg ? msg : createException(MAL, "tokenizer.append", GDK_EXCEPTION);
315 : }
316 :
317 : }
318 2 : tokenDepth = depth;
319 : }
320 :
321 : /* findcommn */
322 14 : p = BUNfnd(tokenBAT[0].val, parts[0]);
323 14 : if (p != BUN_NONE) {
324 12 : prv = (oid) p;
325 34 : for (i = 1; i < new; i++) {
326 29 : if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, &prv, parts[i]))
327 : break;
328 : }
329 : } else {
330 : i = 0;
331 : }
332 :
333 14 : if (i == depth) {
334 4 : comp = COMP(prv, depth);
335 4 : *pos = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp);
336 4 : if (*pos != BUN_NONE) {
337 : /* the string is already there */
338 : /* printf("The string %s is already there",url); */
339 3 : GDKfree(url);
340 3 : return MAL_SUCCEED;
341 : }
342 : }
343 :
344 : /* insremainder */
345 26 : for (; i < depth; i++) {
346 15 : idx = BATcount(tokenBAT[i].val);
347 15 : if (idx > MAX_h) {
348 0 : GDKfree(url);
349 0 : throw(MAL, "tokenizer.append",
350 : OPERATION_FAILED " no more free oid's");
351 : }
352 15 : if (BUNappend(tokenBAT[i].val, parts[i], false) != GDK_SUCCEED) {
353 0 : GDKfree(url);
354 0 : throw(MAL, "tokenizer.append",
355 : OPERATION_FAILED " could not append");
356 : }
357 :
358 15 : if (BUNappend(tokenBAT[i].idx, (ptr) & prv, false) != GDK_SUCCEED) {
359 0 : GDKfree(url);
360 0 : throw(MAL, "tokenizer.append",
361 : OPERATION_FAILED " could not append");
362 : }
363 :
364 15 : prv = (oid) idx;
365 : }
366 :
367 11 : *pos = (oid) BATcount(tokenBAT[INDEX].val);
368 11 : comp = COMP(prv, depth);
369 11 : if (BUNappend(tokenBAT[INDEX].val, &comp, false) != GDK_SUCCEED) {
370 0 : GDKfree(url);
371 0 : throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
372 : }
373 :
374 11 : GDKfree(url);
375 11 : return MAL_SUCCEED;
376 : }
377 :
378 : #define SIZE (1 * 1024 * 1024)
379 : static str
380 0 : TKNZRdepositFile(void *r, str *fnme)
381 : {
382 : stream *fs;
383 : bstream *bs;
384 : char *s, *t;
385 : int len = 0;
386 : char buf[FILENAME_MAX];
387 : oid pos;
388 : str msg= MAL_SUCCEED;
389 :
390 0 : if (TRANS == NULL)
391 0 : throw(MAL, "tokenizer", "no tokenizer store open");
392 :
393 : (void) r;
394 0 : if (**fnme == '/')
395 0 : len = snprintf(buf, FILENAME_MAX, "%s", *fnme);
396 : else
397 0 : len = snprintf(buf, FILENAME_MAX, "%s/%s", monet_cwd, *fnme);
398 0 : if (len == -1 || len >= FILENAME_MAX)
399 0 : throw(MAL, "tokenizer.depositFile", SQLSTATE(HY013) "tokenizer filename path is too large");
400 : /* later, handle directory separator */
401 0 : fs = open_rastream(buf);
402 0 : if (fs == NULL)
403 0 : throw(MAL, "tokenizer.depositFile", "%s", mnstr_peek_error(NULL));
404 0 : if (mnstr_errnr(fs)) {
405 0 : close_stream(fs);
406 0 : throw(MAL, "tokenizer.depositFile", "%s", mnstr_peek_error(NULL));
407 : }
408 0 : bs = bstream_create(fs, SIZE);
409 0 : if (bs == NULL) {
410 0 : close_stream(fs);
411 0 : throw(MAL, "tokenizer.depositFile", SQLSTATE(HY013) MAL_MALLOC_FAIL);
412 : }
413 0 : while (bstream_read(bs, bs->size - (bs->len - bs->pos)) != 0 &&
414 0 : !mnstr_errnr(bs->s))
415 : {
416 0 : s = bs->buf;
417 0 : for (t = s; *t;) {
418 0 : while (t < bs->buf + bs->len && *t && *t != '\n')
419 0 : t++;
420 0 : if (t == bs->buf + bs->len || *t != '\n') {
421 : /* read next block if possible after shift */
422 0 : assert(t - s <= INT_MAX);
423 0 : len = (int) (t - s);
424 0 : memcpy(bs->buf, s, len);
425 0 : bs->len = len;
426 0 : bs->pos = 0;
427 0 : break;
428 : }
429 : /* found a string to be processed */
430 0 : *t = 0;
431 0 : msg = TKNZRappend(&pos, &s);
432 0 : if (msg ) {
433 0 : bstream_destroy(bs);
434 0 : close_stream(fs);
435 0 : return msg;
436 : }
437 0 : *t = '\n';
438 0 : s = t + 1;
439 : t = s;
440 : }
441 : }
442 :
443 0 : bstream_destroy(bs);
444 0 : close_stream(fs);
445 0 : return MAL_SUCCEED;
446 : }
447 :
448 : static str
449 1 : TKNZRlocate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
450 : {
451 : oid pos;
452 : str url;
453 : str parts[MAX_TKNZR_DEPTH];
454 : int i = 0, depth;
455 : BUN p;
456 1 : oid prv = 0;
457 : oid comp;
458 : (void) cntxt;
459 : (void) mb;
460 :
461 1 : if (TRANS == NULL)
462 0 : throw(MAL, "tokenizer", "no tokenizer store open");
463 :
464 1 : url = (str) GDKmalloc(sizeof(char) *
465 : (strlen(*getArgReference_str(stk, pci, 1)) + 1));
466 1 : if (url == NULL)
467 0 : throw(MAL, "tokenizer.locate", SQLSTATE(HY013) MAL_MALLOC_FAIL);
468 1 : strcpy(url, *getArgReference_str(stk, pci, 1));
469 :
470 :
471 1 : depth = TKNZRtokenize(url, parts, '/');
472 :
473 1 : if (depth == 0) {
474 0 : pos = oid_nil;
475 1 : } else if (depth > MAX_TKNZR_DEPTH) {
476 0 : GDKfree(url);
477 0 : throw(MAL, "tokenizer.locate",
478 : ILLEGAL_ARGUMENT "strings breaks to too many parts");
479 1 : } else if (depth > tokenDepth) {
480 0 : pos = oid_nil;
481 : } else {
482 1 : p = BUNfnd(tokenBAT[0].val, parts[0]);
483 1 : if (p != BUN_NONE) {
484 1 : prv = (oid) p;
485 4 : for (i = 1; i < depth; i++) {
486 3 : if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, (ptr) & prv, parts[i]))
487 : break;
488 : }
489 1 : if (i < depth) {
490 0 : pos = oid_nil;
491 : } else {
492 1 : comp = COMP(prv, i);
493 1 : pos = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp);
494 : }
495 : } else {
496 0 : pos = oid_nil;
497 : }
498 : }
499 :
500 1 : VALset(&stk->stk[pci->argv[0]], TYPE_oid, &pos);
501 1 : GDKfree(url);
502 1 : return MAL_SUCCEED;
503 : }
504 :
/*
 * Reconstruct the string stored under 'id'.  The INDEX entry packs the
 * depth in the low 8 bits (GET_d) and the position of the last token
 * in the remaining bits (GET_h); the tokens are collected bottom-up
 * via the idx BATs and concatenated top-down with a '/' after every
 * token — note this produces a trailing '/', presumably intentional
 * for URL-style strings (confirm against callers).
 * The result is GDKmalloc'ed into *val; ownership passes to the caller.
 */
static str
takeOid(oid id, str *val)
{
	int i, depth;
	str parts[MAX_TKNZR_DEPTH];
	size_t lngth = 0;
	str s;

	if (id >= BATcount(tokenBAT[INDEX].val)) {
		throw(MAL, "tokenizer.takeOid", OPERATION_FAILED " illegal oid");
	}

	/* direct fixed-width read of the packed entry from the oid column */
	id = *(oid *) Tloc(tokenBAT[INDEX].val, id);

	depth = GET_d(id);
	id = GET_h(id);

	/* walk from the deepest token up to the root, following parent oids */
	for (i = depth - 1; i >= 0; i--) {
		BATiter bi = bat_iterator(tokenBAT[i].val);
		parts[i] = (str) BUNtvar(bi, id);
		bat_iterator_end(&bi);
		id = BUNtoid(tokenBAT[i].idx, id);
		lngth += strlen(parts[i]);
	}

	/* room for all tokens, one separator per token, and the NUL */
	*val = (str) GDKmalloc(lngth+depth+1);
	if( *val == NULL)
		throw(MAL, "tokenizer.takeOid", SQLSTATE(HY013) MAL_MALLOC_FAIL);
	s = *val;

	for (i = 0; i < depth; i++) {
		strcpy(s, parts[i]);
		s += strlen(parts[i]);
		*s++ = '/';
	}
	*s = '\0';

	return MAL_SUCCEED;
}
544 :
545 : static str
546 1 : TKNZRtakeOid(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
547 : {
548 1 : str ret, val = NULL;
549 : oid id;
550 : (void) cntxt;
551 : (void) mb;
552 :
553 1 : if (TRANS == NULL) {
554 0 : throw(MAL, "tokenizer", "no tokenizer store open");
555 : }
556 1 : id = *getArgReference_oid(stk, pci, 1);
557 1 : ret = takeOid(id, &val);
558 1 : if (ret == MAL_SUCCEED) {
559 1 : VALset(&stk->stk[pci->argv[0]], TYPE_str, val);
560 : }
561 : return ret;
562 : }
563 :
564 : static str
565 1 : TKNZRgetIndex(bat *r)
566 : {
567 1 : if (TRANS == NULL)
568 0 : throw(MAL, "tokenizer", "no tokenizer store open");
569 1 : *r = tokenBAT[INDEX].val->batCacheid;
570 1 : BBPretain(*r);
571 1 : return MAL_SUCCEED;
572 : }
573 :
574 : static str
575 4 : TKNZRgetLevel(bat *r, int *level)
576 : {
577 : BAT* view;
578 4 : if (TRANS == NULL)
579 0 : throw(MAL, "tokenizer", "no tokenizer store open");
580 4 : if (*level < 0 || *level >= tokenDepth)
581 0 : throw(MAL, "tokenizer.getLevel", OPERATION_FAILED " illegal level");
582 4 : view = VIEWcreate(tokenBAT[*level].val->hseqbase, tokenBAT[*level].val);
583 4 : if (view == NULL)
584 0 : throw(MAL, "tokenizer.getLevel", SQLSTATE(HY013) MAL_MALLOC_FAIL);
585 4 : *r = view->batCacheid;
586 :
587 4 : BBPkeepref(*r);
588 4 : return MAL_SUCCEED;
589 : }
590 :
591 : static str
592 1 : TKNZRgetCount(bat *r)
593 : {
594 : BAT *b;
595 : int i;
596 : lng cnt;
597 :
598 1 : if (TRANS == NULL)
599 0 : throw(MAL, "tokenizer", "no tokenizer store open");
600 1 : b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
601 1 : if (b == NULL)
602 0 : throw(MAL, "tokenizer.getCount", SQLSTATE(HY013) MAL_MALLOC_FAIL);
603 5 : for (i = 0; i < tokenDepth; i++) {
604 4 : cnt = (lng) BATcount(tokenBAT[i].val);
605 4 : if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
606 0 : BBPreclaim(b);
607 0 : throw(MAL, "tokenizer", SQLSTATE(HY013) MAL_MALLOC_FAIL);
608 : }
609 : }
610 1 : BATsetcount(b, tokenDepth);
611 1 : *r = b->batCacheid;
612 1 : BBPkeepref(*r);
613 1 : return MAL_SUCCEED;
614 : }
615 :
616 : static str
617 1 : TKNZRgetCardinality(bat *r)
618 : {
619 : BAT *b, *en;
620 : int i;
621 : lng cnt;
622 :
623 1 : if (TRANS == NULL)
624 0 : throw(MAL, "tokenizer", "no tokenizer store open");
625 1 : b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
626 1 : if (b == NULL)
627 0 : throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY013) MAL_MALLOC_FAIL);
628 5 : for (i = 0; i < tokenDepth; i++) {
629 4 : if ((en = BATunique(tokenBAT[i].val, NULL)) == NULL) {
630 0 : BBPreclaim(b);
631 0 : throw(MAL, "tokenizer.getCardinality", GDK_EXCEPTION);
632 : }
633 4 : cnt = (lng) canditer_init(&(struct canditer){0}, NULL, en);
634 4 : BBPunfix(en->batCacheid);
635 4 : if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
636 0 : BBPreclaim(b);
637 0 : throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY013) MAL_MALLOC_FAIL);
638 : }
639 : }
640 :
641 1 : BATsetcount(b, tokenDepth);
642 1 : *r = b->batCacheid;
643 1 : BBPkeepref(*r);
644 1 : return MAL_SUCCEED;
645 : }
646 :
#include "mel.h"
/* MAL signatures exported by the tokenizer module */
mel_func tokenizer_init_funcs[] = {
 command("tokenizer", "open", TKNZRopen, false, "open the named tokenizer store, a new one is created if the specified name does not exist", args(1,2, arg("",void),arg("name",str))),
 command("tokenizer", "close", TKNZRclose, false, "close the current tokenizer store", args(1,1, arg("",void))),
 pattern("tokenizer", "take", TKNZRtakeOid, false, "reconstruct and returns the i-th string", args(1,2, arg("",str),arg("i",oid))),
 pattern("tokenizer", "locate", TKNZRlocate, false, "if the given string is in the store returns its oid, otherwise oid_nil", args(1,2, arg("",oid),arg("s",str))),
 command("tokenizer", "append", TKNZRappend, false, "tokenize a new string and append it to the tokenizer (duplicate elimination is performed)", args(1,2, arg("",oid),arg("u",str))),
 command("tokenizer", "depositFile", TKNZRdepositFile, false, "batch insertion from a file of strings to tokenize, each string is separated by a new line", args(1,2, arg("",void),arg("fnme",str))),
 command("tokenizer", "getLevel", TKNZRgetLevel, false, "administrative function that returns the bat on level i", args(1,2, batarg("",str),arg("i",int))),
 command("tokenizer", "getIndex", TKNZRgetIndex, false, "administrative function that returns the INDEX bat", args(1,1, batarg("",oid))),
 command("tokenizer", "getCount", TKNZRgetCount, false, "debugging function that returns the size of the bats at each level", args(1,1, batarg("",lng))),
 command("tokenizer", "getCardinality", TKNZRgetCardinality, false, "debugging function that returns the unique tokens at each level", args(1,1, batarg("",lng))),
 { .imp=NULL }
};
#include "mal_import.h"
#ifdef _MSC_VER
#undef read
#pragma section(".CRT$XCU",read)
#endif
/* register the tokenizer module with the MAL runtime at library load time */
LIB_STARTUP_FUNC(init_tokenizer_mal)
{ mal_module("tokenizer", NULL, tokenizer_init_funcs); }
|