LCOV - code coverage report
Current view: top level - gdk - gdk_posix.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 39 84 46.4 %
Date: 2021-09-14 19:48:19 Functions: 7 9 77.8 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
       7             :  */
       8             : 
       9             : /*
      10             :  * @a Niels Nes, Peter Boncz
      11             :  * @* System Independent Layer
      12             :  *
      13             :  * GDK is built on Posix. Exceptions are made for memory mapped files
      14             :  * and anonymous virtual memory, for which somewhat higher-level
      15             :  * functions are defined here.  Most of this file concerns itself with
      16             :  * emulation of Posix functionality on the WIN32 native platform.
      17             :  */
      18             : #include "monetdb_config.h"
      19             : #include "gdk.h"              /* includes gdk_posix.h */
      20             : #include "gdk_private.h"
      21             : #include "mutils.h"
      22             : #include <unistd.h>
      23             : #include <string.h>     /* strncpy */
      24             : 
      25             : #ifdef HAVE_FCNTL_H
      26             : # include <fcntl.h>
      27             : #endif
      28             : #ifdef HAVE_PROCFS_H
      29             : # include <procfs.h>
      30             : #endif
      31             : #ifdef HAVE_MACH_TASK_H
      32             : # include <mach/task.h>
      33             : #endif
      34             : #ifdef HAVE_MACH_MACH_INIT_H
      35             : # include <mach/mach_init.h>
      36             : #endif
      37             : #if defined(HAVE_KVM_H)
      38             : # include <kvm.h>
      39             : # include <sys/param.h>
      40             : # include <sys/sysctl.h>
      41             : # include <sys/user.h>
      42             : #endif
      43             : 
      44             : #if defined(__GNUC__) && defined(HAVE_VALGRIND)
      45             : #include <valgrind.h>
      46             : #else
      47             : #define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed)
      48             : #define VALGRIND_FREELIKE_BLOCK(addr, rzB)
      49             : #define VALGRIND_RESIZEINPLACE_BLOCK(addr, oldSizeB, newSizeB, rzB)
      50             : #endif
      51             : 
      52             : #ifndef MAP_NORESERVE
      53             : # define MAP_NORESERVE          MAP_PRIVATE
      54             : #endif
      55             : #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
      56             : #define MAP_ANONYMOUS           MAP_ANON
      57             : #endif
      58             : 
      59             : #define MMAP_ADVISE             7
      60             : #define MMAP_WRITABLE           (MMAP_WRITE|MMAP_COPY)
      61             : 
      62             : #ifndef O_CLOEXEC
      63             : #define O_CLOEXEC 0
      64             : #endif
      65             : 
      66             : /* Crude VM buffer management that keeps a list of all memory mapped
      67             :  * regions.
      68             :  *
      69             :  * a.k.a. "helping stupid VM implementations that ignore VM advice"
      70             :  *
      71             :  * The main goal is to be able to tell the OS to please stop buffering
      72             :  * all memory mapped pages when under pressure. A major problem is
      73             :  * materialization of large results in newly created memory mapped
      74             :  * files. Operating systems tend to cache all dirty pages, such that
      75             :  * when memory is out, all pages are dirty and cannot be unloaded
      76             :  * quickly. The VM panic occurs and comatose OS states may be
      77             :  * observed.  This is in spite of our use of
      78             :  * madvise(MADV_SEQUENTIAL). That is; we would want that the OS drops
      79             :  * pages after we've passed them. That does not happen; pages are
      80             :  * retained and pollute the buffer cache.
      81             :  *
      82             :  * Regrettably, at this level, we don't know anything about how Monet
      83             :  * is using the mmapped regions. Monet code is totally oblivious of
      84             :  * any I/O; that's why it is so easy to create CPU efficient code in
      85             :  * Monet.
      86             :  *
      87             :  * The current solution focuses on large writable maps. These often
      88             :  * represent newly created BATs, that are the result of some (running)
      89             :  * operator. We assume two things here:
      90             :  * - the BAT is created in sequential fashion (almost always true)
      91             :  * - afterwards, this BAT is used in sequential fashion (often true)
      92             :  *
      93             :  * A VMtrim thread keeps an eye on the RSS (memory pressure) and large
      94             :  * writable memory maps. If RSS approaches mem_maxsize(), it starts to
      95             :  * *worry*, and starts to write dirty data from these writable maps to
      96             :  * disk in 128MB tiles. So, if memory pressure rises further in the
      97             :  * near future, the OS has some option to release memory pages cheaply
      98             :  * (i.e. without needing I/O). This is also done explicitly by the
      99             :  * VM-thread: when RSS exceeds mem_maxsize() it explicitly asks the OS
     100             :  * to release pages.  The reason is that Linux is not smart enough to
     101             :  * do even this. Anyway..
     102             :  *
     103             :  * The way to free pages explicitly in Linux is to call
     104             :  * posix_fadvise(..,POSIX_FADV_DONTNEED).  Particularly,
     105             :  * posix_madvise(..,POSIX_MADV_DONTNEED) which is supported and
     106             :  * documented doesn't work on Linux. But we do both posix_madvise and
     107             :  * posix_fadvise, so on other unix systems that don't support
     108             :  * posix_fadvise, posix_madvise still might work.  On Windows, to our
     109             :  * knowledge, there is no way to tell it to stop buffering a memory
     110             :  * mapped region. msync (FlushViewOfFile) does work, though. So let's
     111             :  * hope the VM paging algorithm behaves better than Linux which just
     112             :  * runs off the cliff and if MonetDB does not prevent RSS from being
     113             :  * too high, enters coma.
     114             :  *
     115             :  * We will only be able to sensibly test this on Windows64. On
     116             :  * Windows32, mmap sizes do not significantly exceed RAM sizes so
     117             :  * MonetDB swapping actually will not happen (of course, you've got
     118             :  * this nasty problem of VM fragmentation and failing mmaps instead).
     119             :  *
     120             :  * In principle, page tiles are saved sequentially, and behind it, but
     121             :  * never overtaking it, is an "unload-cursor" that frees the pages if
     122             :  * that is needed to keep RSS down.  There is a tweak in the
     123             :  * algorithm, that re-sets the unload-cursor if it seems that all
     124             :  * tiles to the end have been saved (whether a tile is actually saved
     125             :  * is determined by timing the sync action). This means that the
     126             :  * producing operator has finished creating the BAT, and we assume it is
     127             :  * going to be used sequentially afterwards.  In that case, we should
     128             :  * start unloading right after the 'read-cursor', that is, from the
     129             :  * start.
     130             :  *
     131             :  * EXAMPLE
     132             :  * D = dirty tile
     133             :  * s = saved tile (i.e. clean)
     134             :  * u = unloaded tile
     135             :  * L = tile that is being loaded
     136             :  *
     137             :  *           +--> operator produces  BAT
     138             :  * (1) DDDDDD|......................................| end of reserved mmap
     139             :  *                      ____|RSS
     140             :  *                     |
     141             :  *                     | at 3/4 of RSS consumed we start to worry
     142             :  *                     +--> operator produces BAT
     143             :  * (2) DDDDDDDDDDDDDDDD|............................|
     144             :  *                    s<----------------------------- VM backwards save thread
     145             :  *                    |
     146             :  *                    + first tile of which saving costs anything
     147             :  *
     148             :  *                        +--> operator produces BAT
     149             :  * (3) DDDDDDDDDDDDDDDss|D|.........................|
     150             :  *     VM-thread save ->|
     151             :  *
     152             :  * When the RSS target is exceeded, we start unloading tiles..
     153             :  *
     154             :  *                     +-->  VM-thread unload starts at *second* 's'
     155             :  *                     |
     156             :  *                     |    +--> operator produces BAT
     157             :  * (4) DDDDDDDDDDDDDDDsus|DD|........................|
     158             :  *     VM-thread save -->|  | RSS = Full!
     159             :  *
     160             :  *                                  +-- 0 => save costs nothing!!
     161             :  *     VM-thread save ------------->|        assume bat complete
     162             :  * (5) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
     163             :  *                    |<-------- re-set unload cursor
     164             :  *                    +--- first tile was not unloaded.
     165             :  *
     166             :  * later.. some other operator sequentially reads the bat
     167             :  * first part is 'D', that is, nicely cached.
     168             :  *
     169             :  *     ---read------->|
     170             :  * (6) DDDDDDDDDDDDDDDsuuuuuuuuussss0................|
     171             :  *
     172             :  * now we're hitting the unloaded region. the query becomes
     173             :  * I/O read bound here (typically 20% CPU utilization).
     174             :  *
     175             :  *     ---read-------->|
     176             :  * (7) DDDDDDDDDDDDDDDuLuuuuuuuussss0................|
     177             :  *                   /  \
     178             :  *      unload cursor    load cursor
     179             :  *
     180             :  *     ---read---------------->|
     181             :  * (8) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
     182             :  *                           /  \
     183             :  *              unload cursor    load cursor
     184             :  *
     185             :  *     ---read--------------------->| done
     186             :  * (9) DDDDDDDDDDDDDDDuuuuuuuuuLssss0................|
     187             :  *                              ****
     188             :  *                              last part still cached
     189             :  *
     190             :  * note: if we had not reset the unload cursor (5)
     191             :  *       the last part would have been lost due to continuing
     192             :  *       RSS pressure from the 'L' read-cursor.
     193             :  *
     194             :  * If multiple write-mmaps exist, we do unload-tile and save-tile
     195             :  * selection on a round-robin basis among them.
     196             :  *
     197             :  * Of course, this is a simple solution for simple cases only.
     198             :  * (a) if the bat is produced too fast, (or your disk is too slow)
     199             :  *     RSS will exceed its limit and Linux will go into swapping.
     200             :  * (b) if your data is not produced and read sequentially.
     201             :  *     Examples are sorting or clustering on huge datasets.
     202             :  * (c) if RSS pressure is due to large read-maps, rather than
     203             :  *     intermediate results.
     204             :  *
     205             :  * Two crude suggestions:
     206             :  * - If we are under RSS pressure without unloadable tiles and with
     207             :  *   savable tiles, we should consider suspending *all* other threads
     208             :  *   until we manage to unload a tile.
     209             :  * - if there are no savable tiles (or in case of read-only maps)
     210             :  *   we could resort to saving and unloading random tiles.
     211             :  *
     212             :  * To do better, our BAT algorithms should provide even more detailed
     213             :  * advice on their access patterns, which may even consist of pointers
     214             :  * to the cursors (i.e. pointers to b->batBuns->free or the cursors
     215             :  * in radix-cluster), which an enhanced version of this thread might
     216             :  * take into account.
     217             :  *
     218             :  * [Kersten] The memory map table should be aligned to the number of
     219             :  * mapped files. In more recent applications, such as the SkyServer
     220             :  * this may be around 2000 BATs easily.
     221             :  */
     222             : 
     223             : #ifdef HAVE_PTHREAD_H
     224             : /* pthread.h on Windows includes config.h if HAVE_CONFIG_H is set */
     225             : #undef HAVE_CONFIG_H
     226             : #include <sched.h>
     227             : #include <pthread.h>
     228             : #endif
     229             : #ifdef HAVE_SEMAPHORE_H
     230             : #include <semaphore.h>
     231             : #endif
     232             : 
     233             : #ifndef NATIVE_WIN32
     234             : #ifdef HAVE_POSIX_FADVISE
     235             : #ifdef HAVE_UNAME
     236             : #include <sys/utsname.h>
     237             : #endif
     238             : #endif
     239             : 
     240             : void
     241         268 : MT_init_posix(void)
     242             : {
     243         268 : }
     244             : 
     245             : /* return RSS in bytes */
     246             : size_t
     247           0 : MT_getrss(void)
     248             : {
     249             : #if defined(HAVE_PROCFS_H) && defined(__sun__)
     250             :         /* retrieve RSS the Solaris way (2.6+) */
     251             :         int fd;
     252             :         psinfo_t psbuff;
     253             : 
     254             :         fd = open("/proc/self/psinfo", O_RDONLY | O_CLOEXEC);
     255             :         if (fd >= 0) {
     256             :                 if (read(fd, &psbuff, sizeof(psbuff)) == sizeof(psbuff)) {
     257             :                         close(fd);
     258             :                         return psbuff.pr_rssize * 1024;
     259             :                 }
     260             :                 close(fd);
     261             :         }
     262             : #elif defined(HAVE_TASK_INFO)
     263             :         /* Darwin/MACH call for process' RSS */
     264             :         task_t task = mach_task_self();
     265             :         struct task_basic_info_64 t_info;
     266             :         mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_64_COUNT;
     267             : 
     268             :         if (task_info(task, TASK_BASIC_INFO_64, (task_info_t)&t_info, &t_info_count) != KERN_INVALID_POLICY)
     269             :                 return t_info.resident_size;  /* bytes */
     270             : #elif defined(HAVE_KVM_H)
     271             :         /* get RSS on FreeBSD and NetBSD */
     272             :         struct kinfo_proc *ki;
     273             :         int ski = 1;
     274             :         kvm_t *kd;
     275             :         size_t rss = 0;
     276             : 
     277             :         kd = kvm_open(NULL, "/dev/null", NULL, O_RDONLY, "kvm_open");
     278             :         if (kd != NULL) {
     279             :                 ki = kvm_getprocs(kd, KERN_PROC_PID, getpid(), &ski);
     280             :                 if (ki != NULL) {
     281             : #ifdef __NetBSD__               /* should we use configure for this? */
     282             :                         /* see bug 3217 */
     283             :                         rss = ki->kp_eproc.e_vm.vm_rssize;
     284             : #else
     285             :                         rss = ki->ki_rssize;
     286             : #endif
     287             :                         kvm_close(kd);
     288             : 
     289             :                         return rss * MT_pagesize();
     290             :                 } else {
     291             :                         kvm_close(kd);
     292             :                 }
     293             :         }
     294             : #elif defined(__linux__)
     295             :         /* get RSS on Linux */
     296             :         int fd;
     297             : 
     298           0 :         fd = open("/proc/self/stat", O_RDONLY | O_CLOEXEC);
     299           0 :         if (fd >= 0) {
     300             :                 char buf[1024], *r = buf;
     301           0 :                 ssize_t i, sz = read(fd, buf, 1024);
     302             : 
     303           0 :                 close(fd);
     304           0 :                 if (sz > 0) {
     305           0 :                         for (i = 0; i < 23; i++) {
     306           0 :                                 while (*r && (*r == ' ' || *r == '\t'))
     307           0 :                                         r++;
     308           0 :                                 while (*r && (*r != ' ' && *r != '\t'))
     309           0 :                                         r++;
     310             :                         }
     311           0 :                         while (*r && (*r == ' ' || *r == '\t'))
     312           0 :                                 r++;
     313           0 :                         return ((size_t) atol(r)) * MT_pagesize();
     314             :                 }
     315             :         }
     316             : #endif
     317             :         return 0;
     318             : }
     319             : 
     320             : void *
     321        2047 : MT_mmap(const char *path, int mode, size_t len)
     322             : {
     323             :         int fd;
     324             :         void *ret;
     325             : 
     326        2047 :         fd = open(path, O_CREAT | ((mode & MMAP_WRITE) ? O_RDWR : O_RDONLY) | O_CLOEXEC, MONETDB_MODE);
     327        2081 :         if (fd < 0) {
     328           0 :                 GDKsyserror("open %s failed\n", path);
     329             :                 return MAP_FAILED;
     330             :         }
     331        2081 :         ret = mmap(NULL,
     332             :                    len,
     333        2081 :                    ((mode & MMAP_WRITABLE) ? PROT_WRITE : 0) | PROT_READ,
     334        2081 :                    (mode & MMAP_COPY) ? (MAP_PRIVATE | MAP_NORESERVE) : MAP_SHARED,
     335             :                    fd,
     336             :                    0);
     337        2099 :         if (ret == MAP_FAILED) {
     338           0 :                 GDKsyserror("mmap(%s,%zu) failed\n", path, len);
     339             :                 ret = NULL;
     340             :         }
     341        2099 :         close(fd);
     342             :         VALGRIND_MALLOCLIKE_BLOCK(ret, len, 0, 1);
     343        2099 :         return ret;
     344             : }
     345             : 
     346             : int
     347        2096 : MT_munmap(void *p, size_t len)
     348             : {
     349        2096 :         int ret = munmap(p, len);
     350             : 
     351        2099 :         if (ret < 0)
     352           0 :                 GDKsyserror("munmap(%p,%zu) failed\n", p, len);
     353             :         VALGRIND_FREELIKE_BLOCK(p, 0);
     354        2099 :         return ret;
     355             : }
     356             : 
     357             : /* expand or shrink a memory map (ala realloc).
     358             :  * the address returned may be different from the address going in.
     359             :  * in case of failure, the old address is still mapped and NULL is returned.
     360             :  */
     361             : void *
     362         510 : MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
     363             : {
     364             :         void *p;
     365             :         int fd = -1;
     366         510 :         int flags = mode & MMAP_COPY ? MAP_PRIVATE : MAP_SHARED;
     367             :         int prot = PROT_WRITE | PROT_READ;
     368             : 
     369             :         /* round up to multiple of page size */
     370         510 :         *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
     371             : 
     372             :         /* doesn't make sense for us to extend read-only memory map */
     373         510 :         assert(mode & MMAP_WRITABLE);
     374             : 
     375         510 :         if (*new_size < old_size) {
     376             : #ifndef __COVERITY__    /* hide this from static code analyzer */
     377             :                 /* shrink */
     378             :                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     379           0 :                 if (munmap((char *) old_address + *new_size,
     380             :                            old_size - *new_size) < 0) {
     381           0 :                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): munmap() failed\n", path?path:"NULL", old_address, old_size, *new_size);
     382             :                         /* even though the system call failed, we
     383             :                          * don't need to propagate the error up: the
     384             :                          * address should still work in the same way
     385             :                          * as it did before */
     386             :                         return old_address;
     387             :                 }
     388           0 :                 if (path && truncate(path, *new_size) < 0)
     389           0 :                         TRC_WARNING(GDK, "MT_mremap(%s): truncate failed: %s\n",
     390             :                                     path, GDKstrerror(errno, (char[64]){0}, 64));
     391             : #endif  /* !__COVERITY__ */
     392           0 :                 return old_address;
     393             :         }
     394         510 :         if (*new_size == old_size) {
     395             :                 /* do nothing */
     396             :                 return old_address;
     397             :         }
     398             : 
     399         510 :         if (!(mode & MMAP_COPY) && path != NULL) {
     400             :                 /* "normal" memory map */
     401             : 
     402         510 :                 if ((fd = open(path, O_RDWR | O_CLOEXEC)) < 0) {
     403           0 :                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): open failed\n",
     404             :                                     path, old_address, old_size, *new_size);
     405             :                         return NULL;
     406             :                 }
     407         510 :                 if (GDKextendf(fd, *new_size, path) != GDK_SUCCEED) {
     408           0 :                         close(fd);
     409           0 :                         TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextendf() failed\n", path, old_address, old_size, *new_size);
     410           0 :                         return NULL;
     411             :                 }
     412             : #ifdef HAVE_MREMAP
     413             :                 /* on Linux it's easy */
     414         510 :                 p = mremap(old_address, old_size, *new_size, MREMAP_MAYMOVE);
     415             : #ifdef HAVE_VALGRIND
     416             :                 if (p != MAP_FAILED) {
     417             :                         if (p == old_address) {
     418             :                                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     419             :                         } else {
     420             :                                 VALGRIND_FREELIKE_BLOCK(old_address, 0);
     421             :                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     422             :                         }
     423             :                 }
     424             : #endif
     425             : #else
     426             :                 /* try to map extension at end of current map */
     427             :                 p = mmap((char *) old_address + old_size, *new_size - old_size,
     428             :                          prot, flags, fd, old_size);
     429             :                 /* if it failed, there is no point trying a full mmap:
     430             :                  * that too won't fit */
     431             :                 if (p != MAP_FAILED) {
     432             :                         if (p == (char *) old_address + old_size) {
     433             :                                 /* we got the requested address, make
     434             :                                  * sure we return the correct (old)
     435             :                                  * address */
     436             :                                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     437             :                                 p = old_address;
     438             :                         } else {
     439             :                                 /* we got some other address: discard
     440             :                                  * it and make full mmap */
     441             :                                 if (munmap(p, *new_size - old_size) < 0)
     442             :                                         GDKsyserror("munmap");
     443             : #ifdef NO_MMAP_ALIASING
     444             :                                 if (msync(old_address, old_size, MS_SYNC) < 0)
     445             :                                         GDKsyserror("msync");
     446             : #endif
     447             :                                 /* first create full mmap, then, if
     448             :                                  * successful, remove old mmap */
     449             :                                 p = mmap(NULL, *new_size, prot, flags, fd, 0);
     450             :                                 if (p != MAP_FAILED) {
     451             :                                         VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     452             :                                         if (munmap(old_address, old_size) < 0)
     453             :                                                 GDKsyserror("munmap");
     454             :                                         VALGRIND_FREELIKE_BLOCK(old_address, 0);
     455             :                                 }
     456             :                         }
     457             :                 }
     458             : #endif  /* HAVE_MREMAP */
     459             :         } else {
     460             :                 /* "copy-on-write" or "anonymous" memory map */
     461             : #ifdef MAP_ANONYMOUS
     462           0 :                 flags |= MAP_ANONYMOUS;
     463             : #else
     464             :                 if ((fd = open("/dev/zero", O_RDWR | O_CLOEXEC)) < 0) {
     465             :                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): "
     466             :                                     "open('/dev/zero') failed\n",
     467             :                                     path ? path : "NULL", old_address,
     468             :                                     old_size, *new_size);
     469             :                         return NULL;
     470             :                 }
     471             : #endif
     472             :                 /* try to map an anonymous area as extent to the
     473             :                  * current map */
     474           0 :                 p = mmap((char *) old_address + old_size, *new_size - old_size,
     475             :                          prot, flags, fd, 0);
     476             :                 /* no point trying a full map if this didn't work:
     477             :                  * there isn't enough space */
     478           0 :                 if (p != MAP_FAILED) {
     479           0 :                         if (p == (char *) old_address + old_size) {
     480             :                                 /* we got the requested address, make
     481             :                                  * sure we return the correct (old)
     482             :                                  * address */
     483             :                                 VALGRIND_RESIZEINPLACE_BLOCK(old_address, old_size, *new_size, 0);
     484             :                                 p = old_address;
     485             :                         } else {
     486             :                                 /* we got some other address: discard
     487             :                                  * it and make full mmap */
     488           0 :                                 if (munmap(p, *new_size - old_size) < 0)
     489           0 :                                         GDKsyserror("munmap");
     490             : #ifdef HAVE_MREMAP
     491             :                                 /* first get an area large enough for
     492             :                                  * *new_size */
     493           0 :                                 p = mmap(NULL, *new_size, prot, flags, fd, 0);
     494           0 :                                 if (p != MAP_FAILED) {
     495             :                                         /* then overlay old mmap over new */
     496             :                                         void *q;
     497             : 
     498           0 :                                         q = mremap(old_address, old_size,
     499             :                                                    old_size,
     500             :                                                    MREMAP_FIXED | MREMAP_MAYMOVE,
     501             :                                                    p);
     502           0 :                                         assert(q == p || q == MAP_FAILED);
     503           0 :                                         if (q == MAP_FAILED) {
     504           0 :                                                 int e = errno;
     505             :                                                 /* we didn't expect this... */
     506           0 :                                                 if (munmap(p, *new_size) < 0)
     507           0 :                                                         GDKsyserror("munmap");
     508             :                                                 p = MAP_FAILED;
     509           0 :                                                 errno = e;
     510             :                                         }
     511             : #ifdef HAVE_VALGRIND
     512             :                                         else {
     513             :                                                 VALGRIND_FREELIKE_BLOCK(old_size, 0);
     514             :                                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     515             :                                         }
     516             : #endif
     517             :                                 }
     518             : #else
     519             :                                 p = MAP_FAILED;
     520             :                                 if (path == NULL ||
     521             :                                     *new_size <= GDK_mmap_minsize_persistent) {
     522             :                                         /* size not too big yet or
     523             :                                          * anonymous, try to make new
     524             :                                          * anonymous mmap and copy
     525             :                                          * data over */
     526             :                                         p = mmap(NULL, *new_size, prot, flags,
     527             :                                                  fd, 0);
     528             :                                         if (p != MAP_FAILED) {
     529             :                                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 0);
     530             :                                                 memcpy(p, old_address,
     531             :                                                        old_size);
     532             :                                                 munmap(old_address, old_size);
     533             :                                                 VALGRIND_FREELIKE_BLOCK(old_address, 0);
     534             :                                         }
     535             :                                         /* if it failed, try alternative */
     536             :                                 }
     537             :                                 if (p == MAP_FAILED && path != NULL) {
     538             :                                         /* write data to disk, then
     539             :                                          * mmap it to new address */
     540             :                                         if (fd >= 0)
     541             :                                                 close(fd);
     542             :                                         fd = -1;
     543             :                                         p = malloc(strlen(path) + 5);
     544             :                                         if (p == NULL){
     545             :                                                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
     546             :                                                 return NULL;
     547             :                                         }
     548             : 
     549             :                                         strcat(strcpy(p, path), ".tmp");
     550             :                                         fd = open(p, O_RDWR | O_CREAT | O_CLOEXEC,
     551             :                                                   MONETDB_MODE);
     552             :                                         if (fd < 0) {
     553             :                                                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): fd < 0\n", path, old_address, old_size, *new_size);
     554             :                                                 free(p);
     555             :                                                 return NULL;
     556             :                                         }
     557             :                                         free(p);
     558             :                                         if (write(fd, old_address,
     559             :                                                   old_size) < 0 ||
     560             : #ifdef HAVE_FALLOCATE
     561             :                                             /* prefer Linux-specific
     562             :                                              * fallocate over standard
     563             :                                              * posix_fallocate, since
     564             :                                              * glibc uses a rather
     565             :                                              * slow method of
     566             :                                              * allocating the file if
     567             :                                              * the file system doesn't
     568             :                                              * support the operation,
     569             :                                              * we just use ftruncate
     570             :                                              * in that case */
     571             :                                             (fallocate(fd, 0, (off_t) old_size, (off_t) *new_size - (off_t) old_size) < 0 && (errno != EOPNOTSUPP || ftruncate(fd, (off_t) *new_size) < 0))
     572             : #else
     573             : #ifdef HAVE_POSIX_FALLOCATE
     574             :                                             /* posix_fallocate returns
     575             :                                              * error number on
     576             :                                              * failure, not -1, and if
     577             :                                              * it returns EINVAL, the
     578             :                                              * underlying file system
     579             :                                              * may not support the
     580             :                                              * operation, so we then
     581             :                                              * need to try
     582             :                                              * ftruncate */
     583             :                                             ((errno = posix_fallocate(fd, (off_t) old_size, (off_t) *new_size - (off_t) old_size)) == EINVAL ? ftruncate(fd, (off_t) *new_size) < 0 : errno != 0)
     584             : #else
     585             :                                             ftruncate(fd, (off_t) *new_size) < 0
     586             : #endif
     587             : #endif
     588             :                                                 ) {
     589             :                                                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): write() or "
     590             : #ifdef HAVE_FALLOCATE
     591             :                                                             "fallocate()"
     592             : #else
     593             : #ifdef HAVE_POSIX_FALLOCATE
     594             :                                                             "posix_fallocate()"
     595             : #else
     596             :                                                             "ftruncate()"
     597             : #endif
     598             : #endif
     599             :                                                             " failed\n", path, old_address, old_size, *new_size);
     600             :                                                 /* extending failed:
     601             :                                                  * free any disk space
     602             :                                                  * allocated in the
     603             :                                                  * process */
     604             :                                                 if (ftruncate(fd, (off_t) old_size) < 0)
     605             :                                                         GDKsyserror("MT_mremap(%s,%p,%zu,%zu): ftruncate() failed\n", path, old_address, old_size, *new_size);
     606             :                                                 close(fd);
     607             :                                                 return NULL;
     608             :                                         }
     609             :                                         p = mmap(NULL, *new_size, prot, flags,
     610             :                                                  fd, 0);
     611             :                                         if (p != MAP_FAILED) {
     612             :                                                 VALGRIND_MALLOCLIKE_BLOCK(p, *new_size, 0, 1);
     613             :                                                 munmap(old_address, old_size);
     614             :                                                 VALGRIND_FREELIKE_BLOCK(old_address, 0);
     615             :                                         }
     616             :                                 }
     617             : #endif  /* HAVE_MREMAP */
     618             :                         }
     619             :                 }
     620             :         }
     621         510 :         if (p == MAP_FAILED)
     622           0 :                 GDKsyserror("MT_mremap(%s,%p,%zu,%zu): p == MAP_FAILED\n", path?path:"NULL", old_address, old_size, *new_size);
     623         510 :         if (fd >= 0)
     624         510 :                 close(fd);
     625         510 :         return p == MAP_FAILED ? NULL : p;
     626             : }
     627             : 
     628             : int
     629         835 : MT_msync(void *p, size_t len)
     630             : {
     631         835 :         int ret = msync(p, len, MS_SYNC);
     632             : 
     633         831 :         if (ret < 0)
     634           0 :                 GDKsyserror("msync failed\n");
     635         831 :         return ret;
     636             : }
     637             : 
     638             : bool
     639    72760725 : MT_path_absolute(const char *pathname)
     640             : {
     641    72760725 :         return (*pathname == DIR_SEP);
     642             : }
     643             : 
     644             : #ifdef HAVE_DLFCN_H
     645             : # include <dlfcn.h>
     646             : #endif
     647             : 
     648             : void *
     649           0 : mdlopen(const char *library, int mode)
     650             : {
     651             :         (void)library; /* Not used because of MacOs not handling dlopen on linked library */
     652           0 :         return dlopen(NULL, mode);
     653             : }
     654             : 
     655             : #else /* WIN32 native */
     656             : 
     657             : #ifndef BUFSIZ
     658             : #define BUFSIZ 1024
     659             : #endif
     660             : 
     661             : #undef _errno
     662             : 
     663             : #include <windows.h>
     664             : 
     665             : #ifdef _MSC_VER
     666             : #include <io.h>
     667             : #endif /* _MSC_VER */
     668             : #include <Psapi.h>
     669             : 
     670             : #define MT_SMALLBLOCK 256
     671             : 
     672             : static LONG WINAPI
     673             : MT_ignore_exceptions(struct _EXCEPTION_POINTERS *ExceptionInfo)
     674             : {
     675             :         (void) ExceptionInfo;
     676             :         return EXCEPTION_EXECUTE_HANDLER;
     677             : }
     678             : 
     679             : void
     680             : MT_init_posix(void)
     681             : {
     682             :         SetUnhandledExceptionFilter(MT_ignore_exceptions);
     683             : }
     684             : 
     685             : size_t
     686             : MT_getrss(void)
     687             : {
     688             :         PROCESS_MEMORY_COUNTERS ctr;
     689             :         if (GetProcessMemoryInfo(GetCurrentProcess(), &ctr, sizeof(ctr)))
     690             :                 return ctr.WorkingSetSize;
     691             :         return 0;
     692             : }
     693             : 
     694             : /* Windows mmap keeps a global list of base addresses for complex
     695             :  * (remapped) memory maps the reason is that each remapped segment
     696             :  * needs to be unmapped separately in the end. */
     697             : 
     698             : void *
     699             : MT_mmap(const char *path, int mode, size_t len)
     700             : {
     701             :         DWORD mode0 = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
     702             :         DWORD mode1 = FILE_SHARE_READ | FILE_SHARE_WRITE;
     703             :         DWORD mode2 = mode & MMAP_ADVISE;
     704             :         DWORD mode3 = PAGE_READONLY;
     705             :         int mode4 = FILE_MAP_READ;
     706             :         SECURITY_ATTRIBUTES sa;
     707             :         HANDLE h1, h2;
     708             :         void *ret;
     709             :         wchar_t *wpath = utf8towchar(path);
     710             :         if (wpath == NULL)
     711             :                 return NULL;
     712             : 
     713             :         if (mode & MMAP_WRITE) {
     714             :                 mode0 |= FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA;
     715             :         }
     716             :         if (mode2 == MMAP_RANDOM || mode2 == MMAP_DONTNEED) {
     717             :                 mode2 = FILE_FLAG_RANDOM_ACCESS;
     718             :         } else if (mode2 == MMAP_SEQUENTIAL || mode2 == MMAP_WILLNEED) {
     719             :                 mode2 = FILE_FLAG_SEQUENTIAL_SCAN;
     720             :         } else {
     721             :                 mode2 = FILE_FLAG_NO_BUFFERING;
     722             :         }
     723             :         if (mode & MMAP_SYNC) {
     724             :                 mode2 |= FILE_FLAG_WRITE_THROUGH;
     725             :         }
     726             :         if (mode & MMAP_COPY) {
     727             :                 mode3 = PAGE_WRITECOPY;
     728             :                 mode4 = FILE_MAP_COPY;
     729             :         } else if (mode & MMAP_WRITE) {
     730             :                 mode3 = PAGE_READWRITE;
     731             :                 mode4 = FILE_MAP_WRITE;
     732             :         }
     733             :         sa.nLength = sizeof(SECURITY_ATTRIBUTES);
     734             :         sa.bInheritHandle = TRUE;
     735             :         sa.lpSecurityDescriptor = 0;
     736             : 
     737             :         h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
     738             :         if (h1 == INVALID_HANDLE_VALUE) {
     739             :                 (void) SetFileAttributesW(wpath, FILE_ATTRIBUTE_NORMAL);
     740             :                 h1 = CreateFileW(wpath, mode0, mode1, &sa, OPEN_ALWAYS, mode2, NULL);
     741             :                 if (h1 == INVALID_HANDLE_VALUE) {
     742             :                         free(wpath);
     743             :                         GDKwinerror("CreateFile('%s', %lu, %lu, &sa, %lu, %lu, NULL) failed\n",
     744             :                                     path, (unsigned long) mode0, (unsigned long) mode1, (unsigned long) OPEN_ALWAYS, (unsigned long) mode2);
     745             :                         return NULL;
     746             :                 }
     747             :         }
     748             :         free(wpath);
     749             : 
     750             :         h2 = CreateFileMapping(h1, &sa, mode3, (DWORD) (((__int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)), (DWORD) (len & LL_CONSTANT(0xFFFFFFFF)), NULL);
     751             :         if (h2 == NULL) {
     752             :                 GDKwinerror("CreateFileMapping(%p, &sa, %lu, %lu, %lu, NULL) failed\n",
     753             :                             h1, (unsigned long) mode3,
     754             :                             (unsigned long) (((unsigned __int64) len >> 32) & LL_CONSTANT(0xFFFFFFFF)),
     755             :                             (unsigned long) (len & LL_CONSTANT(0xFFFFFFFF)));
     756             :                 CloseHandle(h1);
     757             :                 return NULL;
     758             :         }
     759             :         CloseHandle(h1);
     760             : 
     761             :         ret = MapViewOfFileEx(h2, mode4, (DWORD) 0, (DWORD) 0, len, NULL);
     762             :         if (ret == NULL)
     763             :                 errno = winerror(GetLastError());
     764             :         CloseHandle(h2);
     765             : 
     766             :         return ret;
     767             : }
     768             : 
     769             : int
     770             : MT_munmap(void *p, size_t dummy)
     771             : {
     772             :         int ret;
     773             : 
     774             :         (void) dummy;
     775             :         /*       Windows' UnmapViewOfFile returns success!=0, error== 0,
     776             :          * while Unix's   munmap          returns success==0, error==-1. */
     777             :         ret = UnmapViewOfFile(p);
     778             :         if (ret == 0) {
     779             :                 GDKwinerror("UnmapViewOfFile failed\n");
     780             :                 return -1;
     781             :         }
     782             :         return 0;
     783             : }
     784             : 
     785             : void *
     786             : MT_mremap(const char *path, int mode, void *old_address, size_t old_size, size_t *new_size)
     787             : {
     788             :         void *p;
     789             : 
     790             :         /* doesn't make sense for us to extend read-only memory map */
     791             :         assert(mode & MMAP_WRITABLE);
     792             : 
     793             :         /* round up to multiple of page size */
     794             :         *new_size = (*new_size + GDK_mmap_pagesize - 1) & ~(GDK_mmap_pagesize - 1);
     795             : 
     796             :         if (old_size >= *new_size) {
     797             :                 *new_size = old_size;
     798             :                 return old_address;     /* don't bother shrinking */
     799             :         }
     800             :         if (GDKextend(path, *new_size) != GDK_SUCCEED) {
     801             :                 TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): GDKextend() failed\n", path?path:"NULL", old_address, old_size, *new_size);
     802             :                 return NULL;
     803             :         }
     804             :         if (path && !(mode & MMAP_COPY))
     805             :                 MT_munmap(old_address, old_size);
     806             :         p = MT_mmap(path, mode, *new_size);
     807             :         if (p != NULL && (path == NULL || (mode & MMAP_COPY))) {
     808             :                 memcpy(p, old_address, old_size);
     809             :                 MT_munmap(old_address, old_size);
     810             :         }
     811             : 
     812             :         if (p == NULL)
     813             :                 TRC_ERROR(GDK, "MT_mremap(%s,%p,%zu,%zu): p == NULL\n", path?path:"NULL", old_address, old_size, *new_size);
     814             :         return p;
     815             : }
     816             : 
     817             : int
     818             : MT_msync(void *p, size_t len)
     819             : {
     820             :         int ret;
     821             : 
     822             :         /*       Windows' FlushViewOfFile returns success!=0, error== 0,
     823             :          * while Unix's   munmap          returns success==0, error==-1. */
     824             :         ret = FlushViewOfFile(p, len);
     825             :         if (ret == 0) {
     826             :                 GDKwinerror("FlushViewOfFile failed\n");
     827             :                 return -1;
     828             :         }
     829             :         return 0;
     830             : }
     831             : 
     832             : bool
     833             : MT_path_absolute(const char *pathname)
     834             : {
     835             :         /* drive letter, colon, directory separator */
     836             :         return (((('a' <= pathname[0] && pathname[0] <= 'z') ||
     837             :                   ('A' <= pathname[0] && pathname[0] <= 'Z')) &&
     838             :                  pathname[1] == ':' &&
     839             :                  (pathname[2] == '/' || pathname[2] == '\\')) ||
     840             :                 (pathname[0] == '\\')); // && pathname[1] == '\\'));
     841             : }
     842             : 
     843             : #ifndef HAVE_GETTIMEOFDAY
     844             : static int nodays[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
     845             : 
     846             : #define LEAPYEAR(y) ((((y)%4)==0 && ((y)%100)!=0) || ((y)%400)==0)
     847             : #define NODAYS(m,y) (((m)!=2)?nodays[(m)-1]:LEAPYEAR(y)?29:28)
     848             : 
     849             : int
     850             : gettimeofday(struct timeval *tv, int *ignore_zone)
     851             : {
     852             :         unsigned int year, day, month;
     853             :         SYSTEMTIME st;
     854             : 
     855             :         (void) ignore_zone;
     856             :         GetSystemTime(&st);
     857             :         day = 0;
     858             :         for (year = 1970; year < st.wYear; year++)
     859             :                 day += LEAPYEAR(year) ? 366 : 365;
     860             : 
     861             :         for (month = 1; month < st.wMonth; month++)
     862             :                 day += NODAYS(month, st.wYear);
     863             : 
     864             :         day += st.wDay;
     865             :         tv->tv_sec = 60 * (day * 24 * 60 + st.wMinute) + st.wSecond;
     866             :         tv->tv_usec = 1000 * st.wMilliseconds;
     867             :         return 0;
     868             : }
     869             : #endif
     870             : 
     871             : void *
     872             : mdlopen(const char *file, int mode)
     873             : {
     874             :         return dlopen(file, mode);
     875             : }
     876             : 
     877             : void *
     878             : dlopen(const char *file, int mode)
     879             : {
     880             :         (void) mode;
     881             :         if (file != NULL) {
     882             :                 return (void *) LoadLibrary(file);
     883             :         }
     884             :         return GetModuleHandle(NULL);
     885             : }
     886             : 
     887             : int
     888             : dlclose(void *handle)
     889             : {
     890             :         if (handle != NULL) {
     891             :                 return FreeLibrary((HINSTANCE) handle);
     892             :         }
     893             :         return -1;
     894             : }
     895             : 
     896             : void *
     897             : dlsym(void *handle, const char *name)
     898             : {
     899             :         if (handle != NULL) {
     900             :                 return (void *) GetProcAddress((HINSTANCE) handle, name);
     901             :         }
     902             :         return NULL;
     903             : }
     904             : 
     905             : char *
     906             : dlerror(void)
     907             : {
     908             :         static char msg[1024];
     909             : 
     910             :         FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL, GetLastError(), 0, msg, sizeof(msg), NULL);
     911             :         return msg;
     912             : }
     913             : #endif
     914             : 
     915             : void
     916      147660 : MT_sleep_ms(unsigned int ms)
     917             : {
     918             : #ifdef NATIVE_WIN32
     919             :         Sleep(ms);
     920             : #else
     921             : #ifdef HAVE_NANOSLEEP
     922      147660 :         (void) nanosleep(&(struct timespec) {.tv_sec = ms / 1000,
     923      147660 :                                 .tv_nsec = ms == 1 ? 1000 : (long) (ms % 1000) * 1000000,},
     924             :                 NULL);
     925             : #else
     926             :         (void) select(0, NULL, NULL, NULL,
     927             :                       &(struct timeval) {.tv_sec = ms / 1000,
     928             :                                       .tv_usec = ms == 1 ? 1 : (ms % 1000) * 1000,});
     929             : #endif
     930             : #endif
     931      147535 : }
     932             : 
     933             : #if !defined(HAVE_LOCALTIME_R) || !defined(HAVE_GMTIME_R) || !defined(HAVE_ASCTIME_R) || !defined(HAVE_CTIME_R)
     934             : static MT_Lock timelock = MT_LOCK_INITIALIZER(timelock);
     935             : #endif
     936             : 
     937             : #ifndef HAVE_LOCALTIME_R
     938             : struct tm *
     939             : localtime_r(const time_t *restrict timep, struct tm *restrict result)
     940             : {
     941             :         struct tm *tmp;
     942             :         MT_lock_set(&timelock);
     943             :         tmp = localtime(timep);
     944             :         if (tmp)
     945             :                 *result = *tmp;
     946             :         MT_lock_unset(&timelock);
     947             :         return tmp ? result : NULL;
     948             : }
     949             : #endif
     950             : 
     951             : #ifndef HAVE_GMTIME_R
     952             : struct tm *
     953             : gmtime_r(const time_t *restrict timep, struct tm *restrict result)
     954             : {
     955             :         struct tm *tmp;
     956             :         MT_lock_set(&timelock);
     957             :         tmp = gmtime(timep);
     958             :         if (tmp)
     959             :                 *result = *tmp;
     960             :         MT_lock_unset(&timelock);
     961             :         return tmp ? result : NULL;
     962             : }
     963             : #endif
     964             : 
     965             : #ifndef HAVE_ASCTIME_R
     966             : char *
     967             : asctime_r(const struct tm *restrict tm, char *restrict buf)
     968             : {
     969             :         char *tmp;
     970             :         MT_lock_set(&timelock);
     971             :         tmp = asctime(tm);
     972             :         if (tmp)
     973             :                 strcpy(buf, tmp);
     974             :         MT_lock_unset(&timelock);
     975             :         return tmp ? buf : NULL;
     976             : }
     977             : #endif
     978             : 
     979             : #ifndef HAVE_CTIME_R
     980             : char *
     981             : ctime_r(const time_t *restrict t, char *restrict buf)
     982             : {
     983             :         char *tmp;
     984             :         MT_lock_set(&timelock);
     985             :         tmp = ctime(t);
     986             :         if (tmp)
     987             :                 strcpy(buf, tmp);
     988             :         MT_lock_unset(&timelock);
     989             :         return tmp ? buf : NULL;
     990             : }
     991             : #endif
     992             : 
     993             : #ifndef HAVE_STRERROR_R
     994             : static MT_Lock strerrlock = MT_LOCK_INITIALIZER(strerrlock);
     995             : 
     996             : int
     997             : strerror_r(int errnum, char *buf, size_t buflen)
     998             : {
     999             :         char *msg;
    1000             :         MT_lock_set(&strerrlock);
    1001             :         msg = strerror(errnum);
    1002             :         strcpy_len(buf, msg, buflen);
    1003             :         MT_lock_unset(&strerrlock);
    1004             :         return 0;
    1005             : }
    1006             : #endif

Generated by: LCOV version 1.14