LCOV - code coverage report
Current view: top level - monetdb5/modules/mal - tokenizer.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 180 306 58.8 %
Date: 2021-01-13 20:07:21 Functions: 13 14 92.9 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * author Lefteris Sidirourgos
      11             :  * Tokenizer
      12             :  * This module implements a vertical fragmented tokenizer for strings.
      13             :  * It is based on the ideas of the urlbox module by mk.
      14             :  *
      15             :  * The input string is tokenized according to a separator character.
      16             :  * Each token is inserted into the next BAT in the same order of
      17             :  * appearance in the string. We currently support 255 tokens in each
      18             :  * string, as this module is intended for use with short and similar
      19             :  * strings such as URLs. In addition, we maintain a 2-dimensional index
      20             :  * that points to the depth and height of the last token of each string.
      21             :  * The 2-dimensional index is combined into one BAT where the 8 least
      22             :  * significant bits represent the depth and the remaining bits the height.
      23             :  *
      24             :  * The tokenizer can be accessed in two ways. Given the oid retrieve the
      25             :  * re-constructed string, or given a string return its oid if present,
      26             :  * otherwise nil.
      27             :  *
      28             :  * Strings can be added either in batch (from a file or a BAT of
      29             :  * strings) or by appending a single string. Duplicate elimination is
      30             :  * always performed.
      31             :  *
      32             :  * There can be only one tokenizer open at a time. This is
      33             :  * achieved by setting a TRANSaction BAT. This might change in the
      34             :  * future. However, more than one tokenizer can be stored on
      35             :  * disk, each of which is identified by its name (usually the name of
      36             :  * the active schema of the db). These administrative issues and
      37             :  * security aspects (e.g., opening a tokenizer of a different schema)
      38             :  * should be addressed more thoroughly.
      39             :  */
      40             : #include "monetdb_config.h"
      41             : #include "bat5.h"
      42             : #include "tokenizer.h"
      43             : #include "mal_linker.h"
      44             : 
      /*
       * Module-wide state and index encoding.
       *
       * tokenBAT[0 .. MAX_TKNZR_DEPTH-1] hold, per token level, a BAT with
       * the token strings (val) and a BAT with, for each token, the position
       * of its predecessor on the previous level (idx).  The extra slot
       * tokenBAT[INDEX] holds the combined index BAT in its val member.
       * tokenDepth is the number of levels currently in use.
       */
      45             : #define MAX_TKNZR_DEPTH 256
      46             : #define INDEX MAX_TKNZR_DEPTH
      47             : static int tokenDepth = 0;
      48             : struct {
      49             :         BAT *idx, *val;
      50             : } tokenBAT[MAX_TKNZR_DEPTH + 1];
      51             : 
                       /* TRANS collects the names of the BATs of the open tokenizer; a
                        * non-NULL TRANS is also the "a tokenizer is open" flag tested by
                        * every entry point.  name is the name of the open tokenizer and the
                        * prefix of all its BAT names. */
      52             : static BAT *TRANS = NULL;   /* the catalog of tokenizers */
      53             : static char name[128];
      54             : 
                       /* MAX_h: upper bound for a position on a level; TKNZRappend refuses
                        * to append once a level holds more than MAX_h entries. */
      55             : #if SIZEOF_OID == 4 /* 32-bit oid */
      56             : #define MAX_h ((((oid) 1) << 23) - 1)
      57             : #else /* 64-bit oid */
      58             : #define MAX_h ((((oid) 1) << 55) - 1)
      59             : #endif
      60             : 
                       /* COMP packs a position h and a depth d into one value: d in the low
                        * 8 bits, h above them.  GET_d / GET_h extract the two parts again.
                        * NOTE(review): COMP does not parenthesize h and d, so it is only
                        * safe with simple expressions as arguments (all uses in this file
                        * pass plain variables). */
      61             : #define COMP(h, d) ((h << 8) | (d & 255))
      62             : #define GET_d(x) ((sht) ((x) & 255))
      63             : #define GET_h(x) ((x) >> 8)
      64             : 
      /*
       * prvlocate: search token BAT b for an entry that equals `part` and
       * whose matching entry in bidx equals *prv (the position found on the
       * previous level).
       *
       * On a match *prv is overwritten with the matching position and TRUE is
       * returned; otherwise *prv is left unchanged and FALSE is returned.
       */
      65          32 : static int prvlocate(BAT* b, BAT* bidx, oid *prv, str part)
      66             : {
      67          32 :         BATiter bi = bat_iterator(b);
      68             :         BUN p;
      69             : 
                               /* preferred path: walk the hash entries for part and keep the one
                                * whose predecessor matches */
      70          32 :         if (BAThash(b) == GDK_SUCCEED) {
      71          82 :                 HASHloop_str(bi, b->thash, p, part) {
      72          31 :                         if (BUNtoid(bidx, p) == *prv) {
      73          25 :                                 *prv = (oid) p;
      74          25 :                                 return TRUE;
      75             :                         }
      76             :                 }
      77             :         } else {
      78             :                 /* hash failed, slow scan */
      79             :                 BUN q;
      80             : 
                                       /* compare predecessor first, then the string itself */
      81           0 :                 BATloop(b, p, q) {
      82           0 :                         if (BUNtoid(bidx, p) == *prv &&
      83           0 :                                 strcmp(BUNtail(bi, p), part) == 0) {
      84           0 :                                 *prv = (oid) p;
      85           0 :                                 return TRUE;
      86             :                         }
      87             :                 }
      88             :         }
      89             :         return FALSE;
      90             : }
      91             : 
      /*
       * TKNZRopen: open the tokenizer called *in, creating it if it does not
       * exist yet.
       *
       * ret is unused.  Fails if the name is longer than 127 characters or if
       * a tokenizer is already open (TRANS != NULL).  For a new tokenizer
       * only the "<name>_index" BAT is created; for an existing one the
       * per-level BATs "<name>_<d>" and "<name>_idx_<d>" are looked up until
       * one of them is missing, and tokenDepth is set to the number of
       * complete levels found.  Every BAT name used is appended to TRANS.
       *
       * NOTE(review): none of the failure exits after TRANS has been created
       * releases or resets TRANS, so the module still looks "open" after a
       * failed open -- confirm whether that is intended.
       */
      92             : str
      93           1 : TKNZRopen(void *ret, str *in)
      94             : {
      95             :         int depth;
      96             :         bat r;
      97             :         bat idx;
      98             :         char batname[134];
      99             :         BAT *b;
     100             : 
     101             :         (void) ret;
                               /* the name must fit in the 128-byte name buffer */
     102           1 :         if (strlen(*in) > 127)
     103           0 :                 throw(MAL, "tokenizer.open",
     104             :                           ILLEGAL_ARGUMENT " tokenizer name too long");
     105             : 
                               /* the "already open" test and the creation of TRANS happen under
                                * mal_contextLock */
     106           1 :         MT_lock_set(&mal_contextLock);
     107           1 :         if (TRANS != NULL) {
     108           0 :                 MT_lock_unset(&mal_contextLock);
     109           0 :                 throw(MAL, "tokenizer.open", "Another tokenizer is already open");
     110             :         }
     111             : 
                               /* reset the per-level slots; slot INDEX is assigned further down */
     112         257 :         for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
     113         256 :                 tokenBAT[depth].idx = 0;
     114         256 :                 tokenBAT[depth].val = 0;
     115             :         }
     116           1 :         tokenDepth = 0;
     117             : 
     118           1 :         TRANS = COLnew(0, TYPE_str, MAX_TKNZR_DEPTH + 1, TRANSIENT);
     119           1 :         if (TRANS == NULL) {
     120           0 :                 MT_lock_unset(&mal_contextLock);
     121           0 :                 throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     122             :         }
     123             :         /* now we are sure that none overwrites the tokenizer table*/
     124           1 :         MT_lock_unset(&mal_contextLock);
     125             : 
     126           1 :         snprintf(name, 128, "%s", *in);
     127             : 
                               /* the index BAT decides whether the tokenizer already exists */
     128           1 :         snprintf(batname, sizeof(batname), "%s_index", name);
     129           1 :         idx = BBPindex(batname);
     130             : 
     131           1 :         if (idx == 0) { /* new tokenizer */
     132           1 :                 b = COLnew(0, TYPE_oid, 1024, PERSISTENT);
     133           1 :                 if (b == NULL)
     134           0 :                         throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     135             :                 str msg;
                                       /* name the BAT, make it persistent and record its name in TRANS;
                                        * msg stays MAL_SUCCEED when only the BUNappend fails */
     136           2 :                 if ((msg = BKCsetName(&r, &b->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
     137           2 :                         (msg = BKCsetPersistent(&r, &b->batCacheid)) != MAL_SUCCEED ||
     138           1 :                         BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     139           0 :                         BBPreclaim(b);
     140           0 :                         if (msg)
     141           0 :                                 return msg;
     142           0 :                         throw(MAL, "tokenizer.open", GDK_EXCEPTION);
     143             :                 }
     144           1 :                 tokenBAT[INDEX].val = b;
     145             :         } else { /* existing tokenizer */
                                       /* NOTE(review): the results of BATdescriptor in this branch are
                                        * stored without a NULL check */
     146           0 :                 tokenBAT[INDEX].val = BATdescriptor(idx);
     147             : 
     148           0 :                 if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     149           0 :                         BBPunfix(tokenBAT[INDEX].val->batCacheid);
     150           0 :                         tokenBAT[INDEX].val = NULL;
     151           0 :                         throw(MAL, "tokenizer.open", OPERATION_FAILED);
     152             :                 }
     153             : 
                                       /* load level after level until a value or idx BAT is missing */
     154           0 :                 for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
     155           0 :                         snprintf(batname, sizeof(batname), "%s_%d", name, depth);
     156           0 :                         idx = BBPindex(batname);
     157           0 :                         if (idx == 0)
     158             :                                 break;
     159           0 :                         tokenBAT[depth].val = BATdescriptor(idx);
     160           0 :                         if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     161           0 :                                 BBPunfix(tokenBAT[depth].val->batCacheid);
     162           0 :                                 tokenBAT[depth].val = NULL;
     163           0 :                                 throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     164             :                         }
     165             : 
     166             :                         /* For idx BATs */
     167           0 :                         snprintf(batname, sizeof(batname), "%s_idx_%d", name, depth);
     168           0 :                         idx = BBPindex(batname);
                                               /* NOTE(review): leaving here keeps tokenBAT[depth].val set
                                                * although this level is not counted in tokenDepth */
     169           0 :                         if (idx == 0)
     170             :                                 break;
     171           0 :                         tokenBAT[depth].idx = BATdescriptor(idx);
     172           0 :                         if (BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     173           0 :                                 BBPunfix(tokenBAT[depth].idx->batCacheid);
     174           0 :                                 tokenBAT[depth].idx = NULL;
     175           0 :                                 throw(MAL, "tokenizer.open", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     176             :                         }
     177             : 
     178             :                 }
     179           0 :                 tokenDepth = depth;
     180             :         }
     181             : 
     182             :         return MAL_SUCCEED;
     183             : }
     184             : 
     /*
      * TKNZRclose: close the currently open tokenizer.
      *
      * r is unused.  Fails if no tokenizer is open.  Otherwise it calls
      * TMsubcommit on TRANS, releases the references to all level BATs and
      * to the index BAT, resets tokenDepth, and reclaims TRANS so that a new
      * tokenizer can be opened.
      */
     185             : str
     186           1 : TKNZRclose(void *r)
     187             : {
     188             :         int i;
     189             :         (void) r;
     190             : 
     191           1 :         if (TRANS == NULL)
     192           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     193             : 
                               /* presumably commits the BATs named in TRANS -- TODO confirm; the
                                * return value is not checked */
     194           1 :         TMsubcommit(TRANS);
     195             : 
     196           5 :         for (i = 0; i < tokenDepth; i++) {
     197           4 :                 BBPunfix(tokenBAT[i].idx->batCacheid);
     198           4 :                 BBPunfix(tokenBAT[i].val->batCacheid);
     199             :         }
     200           1 :         BBPunfix(tokenBAT[INDEX].val->batCacheid);
     201           1 :         tokenDepth = 0;
     202             : 
                               /* TRANS == NULL marks the module as closed again */
     203           1 :         BBPreclaim(TRANS);
     204           1 :         TRANS = NULL;
     205           1 :         return MAL_SUCCEED;
     206             : }
     207             : 
     208             : /*
     209             :  * Tokenize operations
     210             :  * The tokenize operation requires a private copy of the input, because
     211             :  * it overwrites each token separator with a zero byte. Tokens are
     212             :  * separated by a single character for simplicity.  Might be a good
     213             :  * scheme to assume that strings to be broken are properly ended with
     214             :  * either 0 or nl, not both.  It seems 0 can be assumed.
     215             :  */
     /*
      * TKNZRtokenize: split `in` in place at every occurrence of tkn.
      *
      * Scanning stops at the end of the string or at the first newline.  Each
      * separator (and a terminating newline) is overwritten with a zero byte
      * and the start of every token is stored in parts[], which must have
      * room for MAX_TKNZR_DEPTH entries.
      *
      * Returns the number of tokens stored.  If the input holds more than
      * MAX_TKNZR_DEPTH tokens, MAX_TKNZR_DEPTH + 1 is returned; callers
      * detect this with `depth > MAX_TKNZR_DEPTH`.
      *
      * Fix: the bound is now tested before a token is stored.  Previously
      * the test came after `parts[depth++] = s`, so the 257th token was
      * written to parts[MAX_TKNZR_DEPTH], one element past the callers'
      * arrays.
      */
     216             : static int
     217          15 : TKNZRtokenize(str in, str *parts, char tkn)
     218             : {
     219             :         char *s, *t;
     220             :         int depth = 0;
     221             : 
     222             :         s = in;
     223          68 :         while (*s && *s != '\n') {
                                       /* parts[] is full but input remains: report "too many" */
                                       if (depth == MAX_TKNZR_DEPTH)
                                               return MAX_TKNZR_DEPTH + 1;
     224             :                 t = s;
     225         356 :                 while (*t != tkn && *t != '\n' && *t)
     226         303 :                         t++;
     227          53 :                 parts[depth++] = s;
                                       /* step over the separator unless t already sits on the final 0 */
     228          53 :                 s = t + (*t != 0);
     229          53 :                 *t = 0;
     232             :         }
     233          15 :         return depth;
     234             : }
     235             : 
     /*
      * TKNZRappend: add the string *s to the open tokenizer and return its
      * position in the index BAT through *pos.
      *
      * The string is copied, split at '/', and matched level by level
      * against the stored tokens.  Tokens that are not stored yet are
      * appended to their level together with the position of their
      * predecessor, and a combined (position, depth) entry is appended to
      * the index BAT.  If the complete string is already present, its
      * existing index position is returned and nothing is inserted.
      *
      * An input without tokens returns MAL_SUCCEED without writing *pos.
      * Fails if no tokenizer is open, on allocation or append failure, if
      * the string has more than MAX_TKNZR_DEPTH tokens, or if a level
      * already holds more than MAX_h entries.
      */
     236             : str
     237          14 : TKNZRappend(oid *pos, str *s)
     238             : {
     239             :         str url;
     240             :         char batname[132];
     241             :         str parts[MAX_TKNZR_DEPTH];
     242             :         str msg;
     243             :         int i, new, depth;
     244             :         bat r;
     245             :         BAT *bVal;
     246             :         BAT *bIdx;
     247             :         BUN p;
     248             :         BUN idx = 0;
     249          14 :         oid prv = 0;
     250             :         oid comp;
     251             : 
     252          14 :         if (TRANS == NULL)
     253           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     254             : 
                               /* TKNZRtokenize writes into its input, so work on a private copy */
     255          14 :         if ((url = GDKstrdup(*s)) == NULL) {
     256           0 :                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     257             :         }
     258             : 
     259          14 :         depth = TKNZRtokenize(url, parts, '/');
                               /* new = number of leading levels that existed before this call and
                                * therefore can contain a match */
     260             :         new = depth;
     261             : 
     262          14 :         if (depth == 0) {
     263           0 :                 GDKfree(url);
     264           0 :                 return MAL_SUCCEED;
     265             :         }
     266          14 :         if (depth > MAX_TKNZR_DEPTH) {
     267           0 :                 GDKfree(url);
     268           0 :                 throw(MAL, "tokenizer",
     269             :                                 ILLEGAL_ARGUMENT "input string breaks to too many parts");
     270             :         }
                               /* the string is deeper than anything stored so far: create the
                                * missing value/idx BAT pairs for levels tokenDepth .. depth-1 */
     271          14 :         if (depth > tokenDepth || tokenBAT[0].val == NULL) {
     272             :                 new = tokenDepth;
     273           6 :                 for (i = tokenDepth; i < depth; i++) {
     274             :                         /* make new bat for value */
     275           4 :                         snprintf(batname, sizeof(batname), "%s_%d", name, i);
     276           4 :                         bVal = COLnew(0, TYPE_str, 1024, PERSISTENT);
     277           4 :                         if (bVal == NULL) {
     278           0 :                                 GDKfree(url);
     279           0 :                                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     280             :                         }
     281             : 
     282           4 :                         tokenBAT[i].val = bVal;
     283             : 
                                               /* NOTE(review): on failure bVal stays in tokenBAT[i].val and
                                                * is not released, while tokenDepth is not updated */
     284           8 :                         if ((msg = BKCsetName(&r, &bVal->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
     285           8 :                                 (msg = BKCsetPersistent(&r, &bVal->batCacheid)) != MAL_SUCCEED ||
     286           4 :                                 BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     287           0 :                                 GDKfree(url);
     288           0 :                                 return msg ? msg : createException(MAL, "tokenizer.append", GDK_EXCEPTION);
     289             :                         }
     290             : 
     291             :                         /* make new bat for index */
     292           4 :                         snprintf(batname, sizeof(batname), "%s_idx_%d", name, i);
     293           4 :                         bIdx = COLnew(0, TYPE_oid, 1024, PERSISTENT);
     294           4 :                         if (bIdx == NULL) {
     295           0 :                                 GDKfree(url);
     296           0 :                                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     297             :                         }
     298             : 
     299           4 :                         tokenBAT[i].idx = bIdx;
     300             : 
                                               /* NOTE(review): same as above, bIdx is not released on failure */
     301           8 :                         if ((msg = BKCsetName(&r, &bIdx->batCacheid, &(const char*){batname})) != MAL_SUCCEED ||
     302           8 :                                 (msg = BKCsetPersistent(&r, &bIdx->batCacheid)) != MAL_SUCCEED ||
     303           4 :                                 BUNappend(TRANS, batname, false) != GDK_SUCCEED) {
     304           0 :                                 GDKfree(url);
     305           0 :                                 return msg ? msg : createException(MAL, "tokenizer.append", GDK_EXCEPTION);
     306             :                         }
     307             : 
     308             :                 }
     309           2 :                 tokenDepth = depth;
     310             :         }
     311             : 
     312             :         /* find the longest prefix of the string that is already stored */
     313          14 :         p = BUNfnd(tokenBAT[0].val, parts[0]);
     314          14 :         if (p != BUN_NONE) {
     315          12 :                 prv = (oid) p;
                                       /* on exit i is the first level without a match and prv the
                                        * position matched on level i-1 */
     316          34 :                 for (i = 1; i < new; i++) {
     317          29 :                         if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, &prv, parts[i]))
     318             :                                 break;
     319             :                 }
     320             :         } else {
     321             :                 i = 0;
     322             :         }
     323             : 
                               /* all tokens matched: the string is present only if the index also
                                * holds an entry ending at exactly this position and depth */
     324          14 :         if (i == depth) {
     325           4 :                 comp = COMP(prv, depth);
     326           4 :                 *pos = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp);
     327           4 :                 if (*pos != BUN_NONE) {
     328             :                         /* the string is already there */
     329             :                         /* printf("The string %s is already there",url); */
     330           3 :                         GDKfree(url);
     331           3 :                         return MAL_SUCCEED;
     332             :                 }
     333             :         }
     334             : 
     335             :         /* insert the remaining tokens, each linked to its predecessor */
     336          26 :         for (; i < depth; i++) {
                                       /* idx = position the new token is about to receive on level i */
     337          15 :                 idx = BATcount(tokenBAT[i].val);
     338          15 :                 if (idx > MAX_h) {
     339           0 :                         GDKfree(url);
     340           0 :                         throw(MAL, "tokenizer.append",
     341             :                                         OPERATION_FAILED " no more free oid's");
     342             :                 }
     343          15 :                 if (BUNappend(tokenBAT[i].val, parts[i], false) != GDK_SUCCEED) {
     344           0 :                         GDKfree(url);
     345           0 :                         throw(MAL, "tokenizer.append",
     346             :                                         OPERATION_FAILED " could not append");
     347             :                 }
     348             : 
     349          15 :                 if (BUNappend(tokenBAT[i].idx, (ptr) & prv, false) != GDK_SUCCEED) {
     350           0 :                         GDKfree(url);
     351           0 :                         throw(MAL, "tokenizer.append",
     352             :                                         OPERATION_FAILED " could not append");
     353             :                 }
     354             : 
     355          15 :                 prv = (oid) idx;
     356             :         }
     357             : 
                               /* register the string: its oid is the position of the new index
                                * entry, which encodes the last token's position and the depth */
     358          11 :         *pos = (oid) BATcount(tokenBAT[INDEX].val);
     359          11 :         comp = COMP(prv, depth);
     360          11 :         if (BUNappend(tokenBAT[INDEX].val, &comp, false) != GDK_SUCCEED) {
     361           0 :                 GDKfree(url);
     362           0 :                 throw(MAL, "tokenizer.append", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     363             :         }
     364             : 
     365          11 :         GDKfree(url);
     366          11 :         return MAL_SUCCEED;
     367             : }
     368             : 
     /*
      * TKNZRdepositFile: append every newline-terminated line of the file
      * *fnme to the open tokenizer via TKNZRappend.
      *
      * r is unused.  A name that does not start with '/' is taken relative
      * to monet_cwd.  The file is read in blocks of SIZE bytes; an incomplete
      * line at the end of a block is moved to the front of the buffer and
      * completed by the next read.  Fails if no tokenizer is open, if the
      * path does not fit in FILENAME_MAX, if the file cannot be opened, on
      * allocation failure, or with the first error reported by TKNZRappend.
      *
      * Fixes: the stream is now closed when bstream_create fails (it was
      * leaked), and the carry-over of a partial line uses memmove because
      * source and destination both lie inside bs->buf and may overlap.
      */
     369             : #define SIZE (1 * 1024 * 1024)
     370             : str
     371           0 : TKNZRdepositFile(void *r, str *fnme)
     372             : {
     373             :         stream *fs;
     374             :         bstream *bs;
     375             :         char *s, *t;
     376             :         int len = 0;
     377             :         char buf[FILENAME_MAX];
     378             :         oid pos;
     379             :         str msg= MAL_SUCCEED;
     380             : 
     381           0 :         if (TRANS == NULL)
     382           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     383             : 
     384             :         (void) r;
     385           0 :         if (**fnme == '/')
     386           0 :                 len = snprintf(buf, FILENAME_MAX, "%s", *fnme);
     387             :         else
     388           0 :                 len = snprintf(buf, FILENAME_MAX, "%s/%s", monet_cwd, *fnme);
     389           0 :         if (len == -1 || len >= FILENAME_MAX)
     390           0 :                 throw(MAL, "tokenizer.depositFile", SQLSTATE(HY013) "tokenizer filename path is too large");
     391             :         /* later, handle directory separator */
     392           0 :         fs = open_rastream(buf);
     393           0 :         if (fs == NULL)
     394           0 :                 throw(MAL, "tokenizer.depositFile", "%s", mnstr_peek_error(NULL));
     395           0 :         if (mnstr_errnr(fs)) {
     396           0 :                 close_stream(fs);
     397           0 :                 throw(MAL, "tokenizer.depositFile", "%s", mnstr_peek_error(NULL));
     398             :         }
     399           0 :         bs = bstream_create(fs, SIZE);
                               if (bs == NULL) {
                                       /* no bstream wraps fs yet, so it has to be closed here */
                                       close_stream(fs);
     401           0 :                 throw(MAL, "tokenizer.depositFile", SQLSTATE(HY013) MAL_MALLOC_FAIL);
                               }
                               /* ask for as much data as fits behind the bytes carried over */
     402           0 :         while (bstream_read(bs, bs->size - (bs->len - bs->pos)) != 0 &&
     403           0 :                    !mnstr_errnr(bs->s))
     404             :         {
     405           0 :                 s = bs->buf;
     406           0 :                 for (t = s; *t;) {
     407           0 :                         while (t < bs->buf + bs->len && *t && *t != '\n')
     408           0 :                                 t++;
     409           0 :                         if (t == bs->buf + bs->len || *t != '\n') {
     410             :                                 /* read next block if possible after shift  */
     411             :                                 assert(t - s <= INT_MAX);
     412           0 :                                 len = (int) (t - s);
                                                       /* s points into bs->buf, so the ranges can overlap */
                                                       memmove(bs->buf, s, len);
     414           0 :                                 bs->len = len;
     415           0 :                                 bs->pos = 0;
     416           0 :                                 break;
     417             :                         }
     418             :                         /* found a string to be processed */
                                               /* terminate the line for TKNZRappend, restore '\n' afterwards */
     419           0 :                         *t = 0;
     420           0 :                         msg = TKNZRappend(&pos, &s);
     421           0 :                         if (msg ) {
     422           0 :                                 bstream_destroy(bs);
     423           0 :                                 close_stream(fs);
     424           0 :                                 return msg;
     425             :                         }
     426           0 :                         *t = '\n';
     427           0 :                         s = t + 1;
     428             :                         t = s;
     429             :                 }
     430             :         }
     431             : 
                               /* NOTE(review): if bstream_destroy already closes the wrapped stream,
                                * the close_stream(fs) calls that follow bstream_destroy (here and in
                                * the error exit above) act on a released stream -- verify against the
                                * stream library */
     432           0 :         bstream_destroy(bs);
     433           0 :         close_stream(fs);
     434           0 :         return MAL_SUCCEED;
     435             : }
     436             : 
     /*
      * TKNZRlocate: MAL entry point that looks up a string in the open
      * tokenizer.
      *
      * Argument 1 on the MAL stack is the string; the result (argument 0) is
      * the oid of the string, or oid_nil if it is not stored.  cntxt and mb
      * are unused.  Fails if no tokenizer is open, on allocation failure, or
      * if the string has more than MAX_TKNZR_DEPTH tokens.
      *
      * Fix: a miss in the final index lookup is now mapped to oid_nil.
      * Previously the BUNfnd result was stored unchecked, so a string whose
      * tokens all matched but which was never stored as a whole returned
      * BUN_NONE instead of nil, unlike every other not-found branch here.
      */
     437             : str
     438           1 : TKNZRlocate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     439             : {
     440             :         oid pos;
     441             :         str url;
     442             :         str parts[MAX_TKNZR_DEPTH];
     443             :         int i = 0, depth;
     444             :         BUN p;
     445           1 :         oid prv = 0;
     446             :         oid comp;
     447             :         (void) cntxt;
     448             :         (void) mb;
     449             : 
     450           1 :         if (TRANS == NULL)
     451           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     452             : 
                               /* TKNZRtokenize writes into its input, so work on a private copy */
     453           1 :         url = (str) GDKmalloc(sizeof(char) *
     454           1 :                         (strlen(*getArgReference_str(stk, pci, 1)) + 1));
     455           1 :         if (url == NULL)
     456           0 :                 throw(MAL, "tokenizer.locate", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     457           1 :         strcpy(url, *getArgReference_str(stk, pci, 1));
     458             : 
     459             : 
     460           1 :         depth = TKNZRtokenize(url, parts, '/');
     461             : 
     462           1 :         if (depth == 0) {
     463           0 :                 pos = oid_nil;
     464           1 :         } else if (depth > MAX_TKNZR_DEPTH) {
     465           0 :                 GDKfree(url);
     466           0 :                 throw(MAL, "tokenizer.locate",
     467             :                                 ILLEGAL_ARGUMENT "strings breaks to too many parts");
     468           1 :         } else if (depth > tokenDepth) {
                                       /* deeper than any stored string, so it cannot be present */
     469           0 :                 pos = oid_nil;
     470             :         } else {
     471           1 :                 p = BUNfnd(tokenBAT[0].val, parts[0]);
     472           1 :                 if (p != BUN_NONE) {
     473           1 :                         prv = (oid) p;
                                               /* follow the chain level by level; prv is the position
                                                * matched on the previous level */
     474           4 :                         for (i = 1; i < depth; i++) {
     475           3 :                                 if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, (ptr) & prv, parts[i]))
     476             :                                         break;
     477             :                         }
     478           1 :                         if (i < depth) {
     479           0 :                                 pos = oid_nil;
     480             :                         } else {
     481           1 :                                 comp = COMP(prv, i);
                                                       /* all tokens matched; the string exists only if the
                                                        * index holds this (position, depth) entry */
                                                       p = BUNfnd(tokenBAT[INDEX].val, (ptr) & comp);
                                                       pos = (p == BUN_NONE) ? oid_nil : (oid) p;
     483             :                         }
     484             :                 } else {
     485           0 :                         pos = oid_nil;
     486             :                 }
     487             :         }
     488             : 
     489           1 :         VALset(&stk->stk[pci->argv[0]], TYPE_oid, &pos);
     490           1 :         GDKfree(url);
     491           1 :         return MAL_SUCCEED;
     492             : }
     493             : 
     /*
      * takeOid: rebuild the string stored at index position id.
      *
      * The index entry at id is split into the depth (low 8 bits) and the
      * position of the last token.  The tokens are then collected from the
      * last level back to the first by following the idx BATs, and joined
      * into a buffer allocated with GDKmalloc and returned through *val.
      * Every token, including the last one, is followed by a '/'.
      *
      * Fails if id is not a valid position in the index BAT or if the
      * allocation fails.
      * NOTE(review): this function does not test TRANS itself; the caller
      * visible in this file (TKNZRtakeOid) does so before calling it.
      */
     494             : str
     495           1 : takeOid(oid id, str *val)
     496             : {
     497             :         int i, depth;
     498             :         str parts[MAX_TKNZR_DEPTH];
     499             :         size_t lngth = 0;
     500             :         str s;
     501             : 
     502           1 :         if (id >= BATcount(tokenBAT[INDEX].val)) {
     503           0 :                 throw(MAL, "tokenizer.takeOid", OPERATION_FAILED " illegal oid");
     504             :         }
     505             : 
                               /* from here on id holds the packed (position, depth) index entry */
     506           1 :         id = *(oid *) Tloc(tokenBAT[INDEX].val, id);
     507             : 
     508           1 :         depth = GET_d(id);
     509           1 :         id = GET_h(id);
     510             : 
                               /* walk from the last token to the first; the idx BAT of level i
                                * gives the position of the predecessor on level i-1 */
     511           5 :         for (i = depth - 1; i >= 0; i--) {
     512           4 :                 BATiter bi = bat_iterator(tokenBAT[i].val);
     513           4 :                 parts[i] = (str) BUNtvar(bi, id);
     514           4 :                 id = BUNtoid(tokenBAT[i].idx, id);
     515           4 :                 lngth += strlen(parts[i]);
     516             :         }
     517             : 
                               /* room for all tokens, one '/' per token and the terminating 0 */
     518           1 :         *val = (str) GDKmalloc(lngth+depth+1);
     519           1 :         if( *val == NULL)
     520           0 :                 throw(MAL, "tokenizer.takeOid", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     521             :         s = *val;
     522             : 
     523           5 :         for (i = 0; i < depth; i++) {
     524           4 :                 strcpy(s, parts[i]);
     525           4 :                 s += strlen(parts[i]);
     526           4 :                 *s++ = '/';
     527             :         }
     528           1 :         *s = '\0';
     529             : 
     530           1 :         return MAL_SUCCEED;
     531             : }
     532             : 
     /*
      * TKNZRtakeOid: MAL entry point around takeOid.
      *
      * Argument 1 on the MAL stack is the oid; on success the rebuilt string
      * is stored as the result (argument 0).  cntxt and mb are unused.  Fails
      * if no tokenizer is open, or with the error returned by takeOid.
      */
     533             : str
     534           1 : TKNZRtakeOid(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
     535             : {
     536           1 :         str ret, val = NULL;
     537             :         oid id;
     538             :         (void) cntxt;
     539             :         (void) mb;
     540             : 
     541           1 :         if (TRANS == NULL) {
     542           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     543             :         }
     544           1 :         id = *getArgReference_oid(stk, pci, 1);
     545           1 :         ret = takeOid(id, &val);
     546           1 :         if (ret == MAL_SUCCEED) {
                                       /* val was allocated by takeOid; presumably VALset takes over the
                                        * pointer -- TODO confirm, it is not freed here */
     547           1 :                 VALset(&stk->stk[pci->argv[0]], TYPE_str, val);
     548             :         }
     549             :         return ret;
     550             : }
     551             : 
     552             : str
     553           1 : TKNZRgetIndex(bat *r)
     554             : {
     555           1 :         if (TRANS == NULL)
     556           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     557           1 :         *r = tokenBAT[INDEX].val->batCacheid;
     558           1 :         BBPretain(*r);
     559           1 :         return MAL_SUCCEED;
     560             : }
     561             : 
     562             : str
     563           4 : TKNZRgetLevel(bat *r, int *level)
     564             : {
     565             :         BAT* view;
     566           4 :         if (TRANS == NULL)
     567           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     568           4 :         if (*level < 0 || *level >= tokenDepth)
     569           0 :                 throw(MAL, "tokenizer.getLevel", OPERATION_FAILED " illegal level");
     570           4 :         view = VIEWcreate(tokenBAT[*level].val->hseqbase, tokenBAT[*level].val);
     571           4 :         if (view == NULL)
     572           0 :                 throw(MAL, "tokenizer.getLevel", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     573           4 :         *r = view->batCacheid;
     574             : 
     575           4 :         BBPkeepref(*r);
     576           4 :         return MAL_SUCCEED;
     577             : }
     578             : 
     579             : str
     580           1 : TKNZRgetCount(bat *r)
     581             : {
     582             :         BAT *b;
     583             :         int i;
     584             :         lng cnt;
     585             : 
     586           1 :         if (TRANS == NULL)
     587           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     588           1 :         b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
     589           1 :         if (b == NULL)
     590           0 :                 throw(MAL, "tokenizer.getCount", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     591           5 :         for (i = 0; i < tokenDepth; i++) {
     592           4 :                 cnt = (lng) BATcount(tokenBAT[i].val);
     593           4 :                 if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
     594           0 :                         BBPreclaim(b);
     595           0 :                         throw(MAL, "tokenizer", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     596             :                 }
     597             :         }
     598           1 :         BATsetcount(b, tokenDepth);
     599           1 :         *r = b->batCacheid;
     600           1 :         BBPkeepref(*r);
     601           1 :         return MAL_SUCCEED;
     602             : }
     603             : 
     604             : str
     605           1 : TKNZRgetCardinality(bat *r)
     606             : {
     607             :         BAT *b, *en;
     608             :         int i;
     609             :         lng cnt;
     610             : 
     611           1 :         if (TRANS == NULL)
     612           0 :                 throw(MAL, "tokenizer", "no tokenizer store open");
     613           1 :         b = COLnew(0, TYPE_lng, tokenDepth + 1, TRANSIENT);
     614           1 :         if (b == NULL)
     615           0 :                 throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     616           5 :         for (i = 0; i < tokenDepth; i++) {
     617           4 :                 if ((en = BATunique(tokenBAT[i].val, NULL)) == NULL) {
     618           0 :                         BBPreclaim(b);
     619           0 :                         throw(MAL, "tokenizer.getCardinality", GDK_EXCEPTION);
     620             :                 }
     621           4 :                 cnt = (lng) BATcount(en);
     622           4 :                 BBPunfix(en->batCacheid);
     623           4 :                 if (BUNappend(b, &cnt, false) != GDK_SUCCEED) {
     624           0 :                         BBPreclaim(b);
     625           0 :                         throw(MAL, "tokenizer.getCardinality", SQLSTATE(HY013) MAL_MALLOC_FAIL);
     626             :                 }
     627             :         }
     628             : 
     629           1 :         BATsetcount(b, tokenDepth);
     630           1 :         *r = b->batCacheid;
     631           1 :         BBPkeepref(*r);
     632           1 :         return MAL_SUCCEED;
     633             : }
     634             : 
     635             : #include "mel.h"
     636             : mel_func tokenizer_init_funcs[] = {
     637             :  command("tokenizer", "open", TKNZRopen, false, "open the named tokenizer store, a new one is created if the specified name does not exist", args(1,2, arg("",void),arg("name",str))),
     638             :  command("tokenizer", "close", TKNZRclose, false, "close the current tokenizer store", args(1,1, arg("",void))),
     639             :  pattern("tokenizer", "take", TKNZRtakeOid, false, "reconstruct and returns the i-th string", args(1,2, arg("",str),arg("i",oid))),
     640             :  pattern("tokenizer", "locate", TKNZRlocate, false, "if the given string is in the store returns its oid, otherwise oid_nil", args(1,2, arg("",oid),arg("s",str))),
     641             :  command("tokenizer", "append", TKNZRappend, false, "tokenize a new string and append it to the tokenizer (duplicate elimination is performed)", args(1,2, arg("",oid),arg("u",str))),
     642             :  command("tokenizer", "depositFile", TKNZRdepositFile, false, "batch insertion from a file of strings to tokenize, each string is separated by a new line", args(1,2, arg("",void),arg("fnme",str))),
     643             :  command("tokenizer", "getLevel", TKNZRgetLevel, false, "administrative function that returns the bat on level i", args(1,2, batarg("",str),arg("i",int))),
     644             :  command("tokenizer", "getIndex", TKNZRgetIndex, false, "administrative function that returns the INDEX bat", args(1,1, batarg("",oid))),
     645             :  command("tokenizer", "getCount", TKNZRgetCount, false, "debugging function that returns the size of the bats at each level", args(1,1, batarg("",lng))),
     646             :  command("tokenizer", "getCardinality", TKNZRgetCardinality, false, "debugging function that returns the unique tokens at each level", args(1,1, batarg("",lng))),
     647             :  { .imp=NULL }
     648             : };
     649             : #include "mal_import.h"
     650             : #ifdef _MSC_VER
     651             : #undef read
     652             : #pragma section(".CRT$XCU",read)
     653             : #endif
     654         255 : LIB_STARTUP_FUNC(init_tokenizer_mal)
     655         255 : { mal_module("tokenizer", NULL, tokenizer_init_funcs); }

Generated by: LCOV version 1.14