LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - tokenizer.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 186 315 59.0 %
Date: 2021-10-13 02:24:04 Functions: 13 14 92.9 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * author Lefteris Sidirourgos
      11             :  * Tokenizer
      12             :  * This module implements a vertical fragmented tokenizer for strings.
      13             :  * It is based on the ideas of the urlbox module by mk.
      14             :  *
      15             :  * The input string is tokenized according to a separator character.
      16             :  * Each token is inserted into the next BAT, in the same order of
      17             :  * appearance as in the string. We currently support 255 tokens in each
      18             :  * string as this module is intended for use with short and similar
      19             :  * strings such as URLs. In addition we maintain a 2-dimensional index
      20             :  * that points to the depth and height of the last token of each string.
      21             :  * The 2-dimensional index is combined into one BAT where the 8 least
      22             :  * significant bits represent the depth, and the remaining bits the height.
      23             :  *
      24             :  * The tokenizer can be accessed in two ways. Given the oid retrieve the
      25             :  * re-constructed string, or given a string return its oid if present,
      26             :  * otherwise nil.
      27             :  *
      28             :  * Strings can be added either in batch (from a file or a bat of
      29             :  * strings) or by appending a single string. Duplicate elimination is
      30             :  * always performed.
      31             :  *
      32             :  * There can be only one tokenizer open at a time. This is
      33             :  * achieved by setting a TRANSaction bat. This might change in the
      34             :  * future. However, more than one tokenizer can be stored on
      35             :  * disk, each of which is identified by its name (usually the name of
      36             :  * the active schema of the db). These administrative issues and
      37             :  * security aspects (e.g., opening a tokenizer of a different schema)
      38             :  * should be addressed more thoroughly.
      39             :  */
      40             : #include "monetdb_config.h"
      41             : #include "bat5.h"
      42             : #include "mal.h"
      43             : #include "mal_client.h"
      44             : #include "mal_interpreter.h"
      45             : #include "mal_linker.h"
      46             : #include "mal_exception.h"
      47             : 
      48             : #define MAX_TKNZR_DEPTH 256 /* number of per-level slots in tokenBAT[] */
      49             : #define INDEX MAX_TKNZR_DEPTH /* extra last slot of tokenBAT[]; its .val is the combined index BAT */
      50             : static int tokenDepth = 0; /* number of levels currently backed by BATs */
      51             : struct {
      52             :         BAT *idx, *val; /* val: tokens of this level; idx: for each token, the value of prv (previous-level position) at the time it was appended */
      53             : } tokenBAT[MAX_TKNZR_DEPTH + 1];
      54             : 
      55             : static BAT *TRANS = NULL;   /* the catalog of tokenizers; non-NULL while a tokenizer is open */
      56             : static char name[128]; /* name of the open tokenizer; used as prefix of its BAT names */
      57             : 
      58             : #if SIZEOF_OID == 4 /* 32-bit oid */
      59             : #define MAX_h ((((oid) 1) << 23) - 1) /* upper bound TKNZRappend puts on a level's BAT count */
      60             : #else /* 64-bit oid */
      61             : #define MAX_h ((((oid) 1) << 55) - 1) /* upper bound TKNZRappend puts on a level's BAT count */
      62             : #endif
      63             : 
      64             : #define COMP(h, d) ((h << 8) | (d & 255)) /* pack height h above the 8 depth bits; NOTE(review): h and d are expanded unparenthesized */
      65             : #define GET_d(x) ((sht) ((x) & 255)) /* depth: low 8 bits of a packed value */
      66             : #define GET_h(x) ((x) >> 8) /* height: everything above the low 8 bits */
      67             : 
      68          32 : static int prvlocate(BAT* b, BAT* bidx, oid *prv, str part) /* find a position p in b holding part whose bidx entry equals *prv */
      69             : { /* on a match *prv is set to p and TRUE is returned; otherwise FALSE is returned and *prv is left unchanged */
      70          32 :         BATiter bi = bat_iterator(b);
      71             :         BUN p;
      72             : 
      73          32 :         if (BAThash(b) == GDK_SUCCEED) { /* a hash on b is available: probe it */
      74          32 :                 MT_rwlock_rdlock(&b->thashlock); /* hold the hash read lock while iterating */
      75          44 :                 HASHloop_str(bi, b->thash, p, part) { /* presumably visits the positions whose string equals part -- confirm against the macro */
      76          31 :                         if (BUNtoid(bidx, p) == *prv) { /* recorded previous-level position matches the one we come from */
      77          25 :                                 MT_rwlock_rdunlock(&b->thashlock);
      78          25 :                                 bat_iterator_end(&bi);
      79          25 :                                 *prv = (oid) p; /* the match becomes the previous-level position for the next level */
      80          25 :                                 return TRUE;
      81             :                         }
      82             :                 }
      83           7 :                 MT_rwlock_rdunlock(&b->thashlock);
      84             :         } else {
      85             :                 /* hash failed, slow scan comparing both the idx entry and the string */
      86             :                 BUN q;
      87             : 
      88           0 :                 BATloop(b, p, q) {
      89           0 :                         if (BUNtoid(bidx, p) == *prv &&
      90           0 :                                 strcmp(BUNtail(bi, p), part) == 0) {
      91           0 :                                 bat_iterator_end(&bi);
      92           0 :                                 *prv = (oid) p;
      93           0 :                                 return TRUE;
      94             :                         }
      95             :                 }
      96             :         }
      97           7 :         bat_iterator_end(&bi); /* no position matched both the string and *prv */
      98           7 :         return FALSE;
      99             : }
     100             : 
     101             : static str /* Open the tokenizer named *in; when no "<name>_index" BAT exists a new index BAT is created */
     102           1 : TKNZRopen(void *ret, str *in) /* ret is unused; returns MAL_SUCCEED or an exception string */
     103             : {
     104             :         int depth;
     105             :         bat r;
     106             :         bat idx;
     107             :         char batname[134];
     108             :         BAT *b;
     109             : 
     110             :         (void) ret;
     111           1 :         if (strlen(*in) > 127) /* must fit name[128] including the terminator */
     112           0 :                 throw(MAL, "tokenizer.open",
     113             :                           ILLEGAL_ARGUMENT " tokenizer name too long");
     114             : 
     115           1 :         MT_lock_set(&mal_contextLock); /* held around the check and assignment of TRANS */
     116           1 :         if (TRANS != NULL) {
     117           0 :                 MT_lock_unset(&mal_contextLock);
     118           0 :                 throw(MAL, "tokenizer.open", "Another tokenizer is already open");
     119             :         }
     120             : 
     121         257 :         for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) { /* clears the level slots only; tokenBAT[INDEX] is not reset here */
     122         256 :                 tokenBAT[depth].idx = 0;
     123         256 :                 tokenBAT[depth].val = 0;
     124             :         }
     125           1 :         tokenDepth = 0;
     126             : 
     127           1 :         TRANS = COLnew(0, TYPE_str, MAX_TKNZR_DEPTH + 1, TRANSIENT); /* NOTE(review): the failure exits after the unlock below leave TRANS non-NULL (assuming throw() returns) */
     128           1 :         if (TRANS == NULL) {
     129           0 :                 MT_lock_unset(&mal_contextLock);
     130           0 :                 throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     131             :         }
     132             :         /* now we are sure that none overwrites the tokenizer table*/
     133           1 :         MT_lock_unset(&mal_contextLock);
     134             : 
     135           1 :         snprintf(name, 128, "%s", *in); /* remember the name; it prefixes every BAT name built below and in TKNZRappend */
     136             : 
     137           1 :         snprintf(batname, sizeof(batname), "%s_index", name);
     138           1 :         idx = BBPindex(batname); /* 0 is treated as "no such BAT yet" */
     139             : 
     140           1 :         if (idx == 0) { /* new tokenizer */
     141           1 :                 b = COLnew(0, TYPE_oid, 1024, PERSISTENT);
     142           1 :                 if (b == NULL)
     143           0 :                         throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     144             :                 str msg;
     145           2 :                 if ((msg = BKCsetName(&r, &b->batCacheid, &(const char*){batname})) != MAL_SUCCEED || /* name the BAT, make it persistent, and record the name in TRANS */
     146           2 :                         (msg = BKCsetPersistent(&r, &b->batCacheid)) != MAL_SUCCEED ||
     147           1 :                         BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     148           0 :                         BBPreclaim(b);
     149           0 :                         if (msg)
     150           0 :                                 return msg;
     151           0 :                         throw(MAL, "tokenizer.open", GDK_EXCEPTION); /* msg is NULL here, so it was the BUNappend that failed */
     152             :                 }
     153           1 :                 tokenBAT[INDEX].val = b;
     154             :         } else { /* existing tokenizer */
     155           0 :                 tokenBAT[INDEX].val = BATdescriptor(idx); /* NOTE(review): result is not checked for NULL before it is dereferenced */
     156             : 
     157           0 :                 if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     158           0 :                         BBPunfix(tokenBAT[INDEX].val->batCacheid);
     159           0 :                         tokenBAT[INDEX].val = NULL;
     160           0 :                         throw(MAL, "tokenizer.open", OPERATION_FAILED);
     161             :                 }
     162             : 
     163           0 :                 for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) { /* load the val/idx BAT pair of each level until a name is missing */
     164           0 :                         snprintf(batname, sizeof(batname), "%s_%d", name, depth);
     165           0 :                         idx = BBPindex(batname);
     166           0 :                         if (idx == 0)
     167             :                                 break;
     168           0 :                         tokenBAT[depth].val = BATdescriptor(idx); /* NOTE(review): not checked for NULL */
     169           0 :                         if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     170           0 :                                 BBPunfix(tokenBAT[depth].val->batCacheid);
     171           0 :                                 tokenBAT[depth].val = NULL;
     172           0 :                                 throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     173             :                         }
     174             : 
     175             :                         /* For idx BATs */
     176           0 :                         snprintf(batname, sizeof(batname), "%s_idx_%d", name, depth);
     177           0 :                         idx = BBPindex(batname);
     178           0 :                         if (idx == 0)
     179             :                                 break; /* NOTE(review): tokenBAT[depth].val stays set although this level is not counted in tokenDepth */
     180           0 :                         tokenBAT[depth].idx = BATdescriptor(idx); /* NOTE(review): not checked for NULL */
     181           0 :                         if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     182           0 :                                 BBPunfix(tokenBAT[depth].idx->batCacheid);
     183           0 :                                 tokenBAT[depth].idx = NULL;
     184           0 :                                 throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     185             :                         }
     186             : 
     187             :                 }
     188           0 :                 tokenDepth = depth; /* number of complete levels found */
     189             :         }
     190             : 
     191             :         return MAL_SUCCEED;
     192             : }
     193             : 
     194             : static str /* Close the open tokenizer: commit, release its BATs and clear TRANS */
     195           1 : TKNZRclose(void *r) /* r is unused; fails when no tokenizer is open */
     196             : {
     197             :         int i;
     198             :         (void) r;
     199             : 
     200           1 :         if (TRANS == NULL)
     201           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     202             : 
     203           1 :         TMsubcommit(TRANS); /* NOTE(review): the return value is ignored; presumably commits the BATs whose names were recorded in TRANS */
     204             : 
     205           5 :         for (i = 0; i < tokenDepth; i++) { /* release both BATs of every level in use */
     206           4 :                 BBPunfix(tokenBAT[i].idx->batCacheid);
     207           4 :                 BBPunfix(tokenBAT[i].val->batCacheid);
     208             :         }
     209           1 :         BBPunfix(tokenBAT[INDEX].val->batCacheid); /* release the index BAT */
     210           1 :         tokenDepth = 0;
     211             : 
     212           1 :         BBPreclaim(TRANS);
     213           1 :         TRANS = NULL; /* the other entry points test TRANS == NULL to detect "no tokenizer open" */
     214           1 :         return MAL_SUCCEED;
     215             : }
     216             : 
     217             : /*
     218             :  * Tokenize operations
     219             :  * The tokenize operation assumes it is given a private copy, because
     220             :  * it overwrites each token separator with a zero byte. Tokens are
     221             :  * separated by a single character for simplicity.  It might be a good
     222             :  * scheme to assume that strings to be broken are properly ended with
     223             :  * either 0 or nl, not both.  It seems 0 can be assumed.
     224             :  */
     225             : static int /* Split in into tokens ended by tkn, a newline or the end of the string; token start pointers go to parts[] */
     226          15 : TKNZRtokenize(str in, str *parts, char tkn) /* in is modified in place; returns the number of tokens stored */
     227             : {
     228             :         char *s, *t;
     229             :         int depth = 0;
     230             : 
     231             :         s = in;
     232          68 :         while (*s && *s != '\n') { /* stop when the next token would start at the end of the string or at a newline */
     233             :                 t = s;
     234         356 :                 while (*t != tkn && *t != '\n' && *t) /* advance t to the character that ends the current token */
     235         303 :                         t++;
     236          53 :                 parts[depth++] = s;
     237          53 :                 s = t + (*t != 0); /* next token starts after the ending character, unless that is the final 0 */
     238          53 :                 *t = 0; /* terminate the token in place */
     239          53 :                 if (depth > MAX_TKNZR_DEPTH) /* NOTE(review): tested after the store above, so parts[MAX_TKNZR_DEPTH] is written before the loop stops; callers here declare parts[MAX_TKNZR_DEPTH] */
     240             :                         break;
     241             :         }
     242          15 :         return depth; /* a value above MAX_TKNZR_DEPTH tells the caller there were too many tokens */
     243             : }
     244             : 
     245             : static str /* Add the string *s to the open tokenizer unless it is already present */
     246          14 : TKNZRappend(oid *pos, str *s) /* *pos receives the string's position in the index BAT; NOTE(review): *pos is not written when the input yields no tokens */
     247             : {
     248             :         str url;
     249             :         char batname[132];
     250             :         str parts[MAX_TKNZR_DEPTH];
     251             :         str msg;
     252             :         int i, new, depth;
     253             :         bat r;
     254             :         BAT *bVal;
     255             :         BAT *bIdx;
     256             :         BUN p;
     257             :         BUN idx = 0;
     258          14 :         oid prv = 0;
     259             :         oid comp;
     260             : 
     261          14 :         if (TRANS == NULL)
     262           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     263             : 
     264          14 :         if ((url = GDKstrdup(*s)) == NULL) { /* private copy: TKNZRtokenize writes zero bytes into it */
     265           0 :                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     266             :         }
     267             : 
     268          14 :         depth = TKNZRtokenize(url, parts, '/');
     269             :         new = depth; /* number of leading levels to search for existing tokens; lowered below when levels have to be created */
     270             : 
     271          14 :         if (depth == 0) {
     272           0 :                 GDKfree(url);
     273           0 :                 return MAL_SUCCEED;
     274             :         }
     275          14 :         if (depth > MAX_TKNZR_DEPTH) {
     276           0 :                 GDKfree(url);
     277           0 :                 throw(MAL, "tokenizer",
     278             :                                 ILLEGAL_ARGUMENT "input string breaks to too many parts");
     279             :         }
     280          14 :         if (depth > tokenDepth || tokenBAT[0].val == NULL) { /* the string is deeper than the levels that exist: create the missing ones */
     281             :                 new = tokenDepth;
     282           6 :                 for (i = tokenDepth; i < depth; i++) {
     283             :                         /* make new bat for value */
     284           4 :                         snprintf(batname, sizeof(batname), "%s_%d", name, i);
     285           4 :                         bVal = COLnew(0, TYPE_str, 1024, PERSISTENT);
     286           4 :                         if (bVal == NULL) {
     287           0 :                                 GDKfree(url);
     288           0 :                                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     289             :                         }
     290             : 
     291           4 :                         tokenBAT[i].val = bVal;
     292             : 
     293           8 :                         if ((msg = BKCsetName(&r, &bVal->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
     294           8 :                                 (msg = BKCsetPersistent(&r, &bVal->batCacheid)) != MAL_SUCCEED ||
     295           4 :                                 BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     296           0 :                                 GDKfree(url);
     297           0 :                                 return msg ? msg : createException(MAL, "tokenizer.append", GDK_EXCEPTION); /* NOTE(review): tokenBAT[i].val stays set although tokenDepth is not advanced */
     298             :                         }
     299             : 
     300             :                         /* make new bat for index */
     301           4 :                         snprintf(batname, sizeof(batname), "%s_idx_%d", name, i);
     302           4 :                         bIdx = COLnew(0, TYPE_oid, 1024, PERSISTENT);
     303           4 :                         if (bIdx == NULL) {
     304           0 :                                 GDKfree(url);
     305           0 :                                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     306             :                         }
     307             : 
     308           4 :                         tokenBAT[i].idx = bIdx;
     309             : 
     310           8 :                         if ((msg = BKCsetName(&r, &bIdx->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
     311           8 :                                 (msg = BKCsetPersistent(&r, &bIdx->batCacheid)) != MAL_SUCCEED ||
     312           4 :                                 BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     313           0 :                                 GDKfree(url);
     314           0 :                                 return msg ? msg : createException(MAL, "tokenizer.append", GDK_EXCEPTION);
     315             :                         }
     316             : 
     317             :                 }
     318           2 :                 tokenDepth = depth;
     319             :         }
     320             : 
     321             :         /* findcommn: follow the tokens that already exist, level by level */
     322          14 :         p = BUNfnd(tokenBAT[0].val, parts[0]);
     323          14 :         if (p != BUN_NONE) {
     324          12 :                 prv = (oid) p;
     325          34 :                 for (i = 1; i < new; i++) { /* i ends as the first level whose token is missing (or as new) */
     326          29 :                         if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, &prv, parts[i]))
     327             :                                 break;
     328             :                 }
     329             :         } else {
     330             :                 i = 0;
     331             :         }
     332             : 
     333          14 :         if (i == depth) { /* every token was found; check whether this exact (height, depth) pair is registered */
     334           4 :                 comp = COMP(prv, depth); /* NOTE(review): COMP keeps depth & 255, so the permitted depth of MAX_TKNZR_DEPTH (256) packs as 0 */
     335           4 :                 *pos = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp);
     336           4 :                 if (*pos != BUN_NONE) {
     337             :                         /* the string is already there */
     338             :                         /* printf("The string %s is already there",url); */
     339           3 :                         GDKfree(url);
     340           3 :                         return MAL_SUCCEED;
     341             :                 }
     342             :         }
     343             : 
     344             :         /* insremainder: append the tokens from level i on, each together with the position of its predecessor */
     345          26 :         for (; i < depth; i++) {
     346          15 :                 idx = BATcount(tokenBAT[i].val); /* position the token appended below will get */
     347          15 :                 if (idx > MAX_h) {
     348           0 :                         GDKfree(url);
     349           0 :                         throw(MAL, "tokenizer.append",
     350             :                                         OPERATION_FAILED " no more free oid's");
     351             :                 }
     352          15 :                 if (BUNappend(tokenBAT[i].val, parts[i], false) != GDK_SUCCEED) {
     353           0 :                         GDKfree(url);
     354           0 :                         throw(MAL, "tokenizer.append",
     355             :                                         OPERATION_FAILED " could not append");
     356             :                 }
     357             : 
     358          15 :                 if (BUNappend(tokenBAT[i].idx, (ptr) & prv, false) != GDK_SUCCEED) { /* NOTE(review): on failure the val BAT already holds one entry more than the idx BAT */
     359           0 :                         GDKfree(url);
     360           0 :                         throw(MAL, "tokenizer.append",
     361             :                                         OPERATION_FAILED " could not append");
     362             :                 }
     363             : 
     364          15 :                 prv = (oid) idx;
     365             :         }
     366             : 
     367          11 :         *pos = (oid) BATcount(tokenBAT[INDEX].val); /* position the new index entry will get */
     368          11 :         comp = COMP(prv, depth);
     369          11 :         if (BUNappend(tokenBAT[INDEX].val, &comp, false) != GDK_SUCCEED) {
     370           0 :                 GDKfree(url);
     371           0 :                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     372             :         }
     373             : 
     374          11 :         GDKfree(url);
     375          11 :         return MAL_SUCCEED;
     376             : }
     377             : 
     378             : #define SIZE (1 * 1024 * 1024) /* buffer size handed to bstream_create below */
     379             : static str /* Append every newline-terminated line of the file *fnme through TKNZRappend */
     380           0 : TKNZRdepositFile(void *r, str *fnme) /* r is unused; a name not starting with '/' is taken relative to monet_cwd */
     381             : {
     382             :         stream *fs;
     383             :         bstream *bs;
     384             :         char *s, *t;
     385             :         int len = 0;
     386             :         char buf[FILENAME_MAX];
     387             :         oid pos;
     388             :         str msg= MAL_SUCCEED;
     389             : 
     390           0 :         if (TRANS == NULL)
     391           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     392             : 
     393             :         (void) r;
     394           0 :         if (**fnme == '/')
     395           0 :                 len = snprintf(buf, FILENAME_MAX, "%s", *fnme);
     396             :         else
     397           0 :                 len = snprintf(buf, FILENAME_MAX, "%s/%s", monet_cwd, *fnme);
     398           0 :         if (len == -1 || len >= FILENAME_MAX) /* formatting error or truncated path */
     399           0 :                 throw(MAL, "tokenizer.depositFile", SQLSTATE(HY013) "tokenizer filename path is too large");
     400             :         /* later, handle directory separator */
     401           0 :         fs = open_rastream(buf);
     402           0 :         if (fs == NULL)
     403           0 :                 throw(MAL, "tokenizer.depositFile", "%s", mnstr_peek_error(NULL));
     404           0 :         if (mnstr_errnr(fs)) {
     405           0 :                 close_stream(fs);
     406           0 :                 throw(MAL, "tokenizer.depositFile", "%s", mnstr_peek_error(NULL)); /* NOTE(review): queried with NULL after fs was closed -- confirm this still reports the error of fs */
     407             :         }
     408           0 :         bs = bstream_create(fs, SIZE);
     409           0 :         if (bs == NULL) {
     410           0 :                 close_stream(fs);
     411           0 :                 throw(MAL, "tokenizer.depositFile", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     412             :         }
     413           0 :         while (bstream_read(bs, bs->size - (bs->len - bs->pos)) != 0 && /* read more data until nothing is read or the stream reports an error */
     414           0 :                    !mnstr_errnr(bs->s))
     415             :         {
     416           0 :                 s = bs->buf;
     417           0 :                 for (t = s; *t;) {
     418           0 :                         while (t < bs->buf + bs->len && *t && *t != '\n') /* scan to the end of the current line */
     419           0 :                                 t++;
     420           0 :                         if (t == bs->buf + bs->len || *t != '\n') { /* incomplete line: no newline in the buffered data */
     421             :                                 /* read next block if possible after shift  */
     422           0 :                                 assert(t - s <= INT_MAX);
     423           0 :                                 len = (int) (t - s);
     424           0 :                                 memcpy(bs->buf, s, len); /* NOTE(review): s points into bs->buf, so source and destination can overlap; memcpy requires non-overlapping regions */
     425           0 :                                 bs->len = len;
     426           0 :                                 bs->pos = 0;
     427           0 :                                 break; /* NOTE(review): a final line without a trailing newline is therefore never passed to TKNZRappend */
     428             :                         }
     429             :                         /* found a string to be processed */
     430           0 :                         *t = 0; /* temporarily terminate the line */
     431           0 :                         msg = TKNZRappend(&pos, &s);
     432           0 :                         if (msg ) {
     433           0 :                                 bstream_destroy(bs);
     434           0 :                                 close_stream(fs); /* NOTE(review): confirm bstream_destroy does not already close fs */
     435           0 :                                 return msg;
     436             :                         }
     437           0 :                         *t = '\n'; /* restore the newline */
     438           0 :                         s = t + 1;
     439             :                         t = s;
     440             :                 }
     441             :         }
     442             : 
     443           0 :         bstream_destroy(bs);
     444           0 :         close_stream(fs); /* NOTE(review): confirm bstream_destroy does not already close fs */
     445           0 :         return MAL_SUCCEED;
     446             : }
     447             : 
     448             : static str /* MAL entry point: look up the string in argument 1 and store its index position (or oid_nil) in argument 0 */
     449           1 : TKNZRlocate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) /* cntxt and mb are unused */
     450             : {
     451             :         oid pos;
     452             :         str url;
     453             :         str parts[MAX_TKNZR_DEPTH];
     454             :         int i = 0, depth;
     455             :         BUN p;
     456           1 :         oid prv = 0;
     457             :         oid comp;
     458             :         (void) cntxt;
     459             :         (void) mb;
     460             : 
     461           1 :         if (TRANS == NULL)
     462           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     463             : 
     464           1 :         url = (str) GDKmalloc(sizeof(char) * /* private copy: TKNZRtokenize writes zero bytes into it */
     465             :                         (strlen(*getArgReference_str(stk, pci, 1)) + 1));
     466           1 :         if (url == NULL)
     467           0 :                 throw(MAL, "tokenizer.locate", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     468           1 :         strcpy(url, *getArgReference_str(stk, pci, 1));
     469             : 
     470             : 
     471           1 :         depth = TKNZRtokenize(url, parts, '/');
     472             : 
     473           1 :         if (depth == 0) { /* no tokens: nothing to find */
     474           0 :                 pos = oid_nil;
     475           1 :         } else if (depth > MAX_TKNZR_DEPTH) {
     476           0 :                 GDKfree(url);
     477           0 :                 throw(MAL, "tokenizer.locate",
     478             :                                 ILLEGAL_ARGUMENT "strings breaks to too many parts");
     479           1 :         } else if (depth > tokenDepth) { /* more tokens than there are levels: cannot be present */
     480           0 :                 pos = oid_nil;
     481             :         } else {
     482           1 :                 p = BUNfnd(tokenBAT[0].val, parts[0]);
     483           1 :                 if (p != BUN_NONE) {
     484           1 :                         prv = (oid) p;
     485           4 :                         for (i = 1; i < depth; i++) { /* follow the remaining tokens level by level */
     486           3 :                                 if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, (ptr) & prv, parts[i]))
     487             :                                         break;
     488             :                         }
     489           1 :                         if (i < depth) { /* some token was not found */
     490           0 :                                 pos = oid_nil;
     491             :                         } else {
     492           1 :                                 comp = COMP(prv, i);
     493           1 :                                 pos = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp); /* NOTE(review): stores BUNfnd's result as is, i.e. BUN_NONE when absent -- confirm that equals oid_nil */
     494             :                         }
     495             :                 } else {
     496           0 :                         pos = oid_nil;
     497             :                 }
     498             :         }
     499             : 
     500           1 :         VALset(&stk->stk[pci->argv[0]], TYPE_oid, &pos);
     501           1 :         GDKfree(url);
     502           1 :         return MAL_SUCCEED;
     503             : }
     504             : 
     505             : static str /* Rebuild the string registered at index position id into a newly GDKmalloc'ed buffer returned in *val */
     506           1 : takeOid(oid id, str *val) /* every token, including the last one, is followed by '/' in the result */
     507             : {
     508             :         int i, depth;
     509             :         str parts[MAX_TKNZR_DEPTH];
     510             :         size_t lngth = 0;
     511             :         str s;
     512             : 
     513           1 :         if (id >= BATcount(tokenBAT[INDEX].val)) { /* id must be an existing position of the index BAT */
     514           0 :                 throw(MAL, "tokenizer.takeOid", OPERATION_FAILED " illegal oid");
     515             :         }
     516             : 
     517           1 :         id = *(oid *) Tloc(tokenBAT[INDEX].val, id); /* id now holds the packed (height, depth) value */
     518             : 
     519           1 :         depth = GET_d(id);
     520           1 :         id = GET_h(id); /* position of the last token in its level */
     521             : 
     522           5 :         for (i = depth - 1; i >= 0; i--) { /* walk from the last level back to level 0 through the idx BATs */
     523           4 :                 BATiter bi = bat_iterator(tokenBAT[i].val);
     524           4 :                 parts[i] = (str) BUNtvar(bi, id);
     525           4 :                 bat_iterator_end(&bi); /* NOTE(review): parts[i] is still read below, after the iterator was ended -- confirm the pointer stays valid */
     526           4 :                 id = BUNtoid(tokenBAT[i].idx, id); /* position of the preceding token, one level up */
     527           4 :                 lngth += strlen(parts[i]);
     528             :         }
     529             : 
     530           1 :         *val = (str) GDKmalloc(lngth+depth+1); /* token bytes + one '/' per token + terminator */
     531           1 :         if( *val == NULL)
     532           0 :                 throw(MAL, "tokenizer.takeOid", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     533             :         s = *val;
     534             : 
     535           5 :         for (i = 0; i < depth; i++) {
     536           4 :                 strcpy(s, parts[i]);
     537           4 :                 s += strlen(parts[i]);
     538           4 :                 *s++ = '/';
     539             :         }
     540           1 :         *s = '\0';
     541             : 
     542           1 :         return MAL_SUCCEED;
     543             : }
     544             : 
     545             : static str
     546           1 : TKNZRtakeOid(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     547             : {
     548           1 :         str ret, val = NULL;
     549             :         oid id;
     550             :         (void) cntxt;
     551             :         (void) mb;
     552             : 
     553           1 :         if (TRANS == NULL) {
     554           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     555             :         }
     556           1 :         id = *getArgReference_oid(stk, pci, 1);
     557           1 :         ret = takeOid(id, &val);
     558           1 :         if (ret == MAL_SUCCEED) {
     559           1 :                 VALset(&stk->stk[pci->argv[0]], TYPE_str, val);
     560             :         }
     561             :         return ret;
     562             : }
     563             : 
     564             : static str
     565           1 : TKNZRgetIndex(bat *r)
     566             : {
     567           1 :         if (TRANS == NULL)
     568           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     569           1 :         *r = tokenBAT[INDEX].val->batCacheid;
     570           1 :         BBPretain(*r);
     571           1 :         return MAL_SUCCEED;
     572             : }
     573             : 
     574             : static str
     575           4 : TKNZRgetLevel(bat *r, int *level)
     576             : {
     577             :         BAT* view;
     578           4 :         if (TRANS == NULL)
     579           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     580           4 :         if (*level < 0 || *level >= tokenDepth)
     581           0 :                 throw(MAL, "tokenizer.getLevel", OPERATION_FAILED " illegal level");
     582           4 :         view = VIEWcreate(tokenBAT[*level].val->hseqbase, tokenBAT[*level].val);
     583           4 :         if (view == NULL)
     584           0 :                 throw(MAL, "tokenizer.getLevel", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     585           4 :         *r = view->batCacheid;
     586             : 
     587           4 :         BBPkeepref(*r);
     588           4 :         return MAL_SUCCEED;
     589             : }
     590             : 
     591             : static str
     592           1 : TKNZRgetCount(bat *r)
     593             : {
     594             :         BAT *b;
     595             :         int i;
     596             :         lng cnt;
     597             : 
     598           1 :         if (TRANS == NULL)
     599           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     600           1 :         b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
     601           1 :         if (b == NULL)
     602           0 :                 throw(MAL, "tokenizer.getCount", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     603           5 :         for (i = 0; i < tokenDepth; i++) {
     604           4 :                 cnt = (lng) BATcount(tokenBAT[i].val);
     605           4 :                 if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
     606           0 :                         BBPreclaim(b);
     607           0 :                         throw(MAL, "tokenizer", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     608             :                 }
     609             :         }
     610           1 :         BATsetcount(b, tokenDepth);
     611           1 :         *r = b->batCacheid;
     612           1 :         BBPkeepref(*r);
     613           1 :         return MAL_SUCCEED;
     614             : }
     615             : 
     616             : static str
     617           1 : TKNZRgetCardinality(bat *r)
     618             : {
     619             :         BAT *b, *en;
     620             :         int i;
     621             :         lng cnt;
     622             : 
     623           1 :         if (TRANS == NULL)
     624           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     625           1 :         b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
     626           1 :         if (b == NULL)
     627           0 :                 throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     628           5 :         for (i = 0; i < tokenDepth; i++) {
     629           4 :                 if ((en = BATunique(tokenBAT[i].val, NULL)) == NULL) {
     630           0 :                         BBPreclaim(b);
     631           0 :                         throw(MAL, "tokenizer.getCardinality", GDK_EXCEPTION);
     632             :                 }
     633           4 :                 cnt = (lng) canditer_init(&(struct canditer){0}, NULL, en);
     634           4 :                 BBPunfix(en->batCacheid);
     635           4 :                 if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
     636           0 :                         BBPreclaim(b);
     637           0 :                         throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     638             :                 }
     639             :         }
     640             : 
     641           1 :         BATsetcount(b, tokenDepth);
     642           1 :         *r = b->batCacheid;
     643           1 :         BBPkeepref(*r);
     644           1 :         return MAL_SUCCEED;
     645             : }
     646             : 
     647             : #include "mel.h"
     648             : mel_func tokenizer_init_funcs[] = {
     649             :  command("tokenizer", "open", TKNZRopen, false, "open the named tokenizer store, a new one is created if the specified name does not exist", args(1,2, arg("",void),arg("name",str))),
     650             :  command("tokenizer", "close", TKNZRclose, false, "close the current tokenizer store", args(1,1, arg("",void))),
     651             :  pattern("tokenizer", "take", TKNZRtakeOid, false, "reconstruct and returns the i-th string", args(1,2, arg("",str),arg("i",oid))),
     652             :  pattern("tokenizer", "locate", TKNZRlocate, false, "if the given string is in the store returns its oid, otherwise oid_nil", args(1,2, arg("",oid),arg("s",str))),
     653             :  command("tokenizer", "append", TKNZRappend, false, "tokenize a new string and append it to the tokenizer (duplicate elimination is performed)", args(1,2, arg("",oid),arg("u",str))),
     654             :  command("tokenizer", "depositFile", TKNZRdepositFile, false, "batch insertion from a file of strings to tokenize, each string is separated by a new line", args(1,2, arg("",void),arg("fnme",str))),
     655             :  command("tokenizer", "getLevel", TKNZRgetLevel, false, "administrative function that returns the bat on level i", args(1,2, batarg("",str),arg("i",int))),
     656             :  command("tokenizer", "getIndex", TKNZRgetIndex, false, "administrative function that returns the INDEX bat", args(1,1, batarg("",oid))),
     657             :  command("tokenizer", "getCount", TKNZRgetCount, false, "debugging function that returns the size of the bats at each level", args(1,1, batarg("",lng))),
     658             :  command("tokenizer", "getCardinality", TKNZRgetCardinality, false, "debugging function that returns the unique tokens at each level", args(1,1, batarg("",lng))),
     659             :  { .imp=NULL }
     660             : };
     661             : #include "mal_import.h"
     662             : #ifdef _MSC_VER
     663             : #undef read
     664             : #pragma section(".CRT$XCU",read)
     665             : #endif
     666         259 : LIB_STARTUP_FUNC(init_tokenizer_mal)
     667         259 : { mal_module("tokenizer", NULL, tokenizer_init_funcs); }

Generated by: LCOV version 1.14