LCOV - code coverage report
Current view: top level - sql/backends/monet5/UDF/pyapi3 - unicode3.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 26 132 19.7 %
Date: 2020-06-29 20:00:14 Functions: 2 10 20.0 %

          Line data    Source code
       1             : /*
       2             :  * This Source Code Form is subject to the terms of the Mozilla Public
       3             :  * License, v. 2.0.  If a copy of the MPL was not distributed with this
       4             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
       5             :  *
       6             :  * Copyright 1997 - July 2008 CWI, August 2008 - 2020 MonetDB B.V.
       7             :  */
       8             : 
       9             : #include "monetdb_config.h"
      10             : #include "unicode.h"
      11             : 
      12             : #include <string.h>
      13             : 
      14           0 : int utf8_strlen(const char *utf8_str, bool *ascii)
      15             : {
      16           0 :         int utf8_char_count = 0;
      17           0 :         int i = 0;
      18             :         // we traverse the string and simply count the amount of utf8 characters in
      19             :         // the string
      20           0 :         while (true) {
      21           0 :                 int offset;
      22           0 :                 if (utf8_str[i] == '\0')
      23             :                         break;
      24           0 :                 offset = utf8_length(utf8_str[i]);
      25           0 :                 if (offset < 0)
      26             :                         return -1; // invalid utf8 character
      27           0 :                 i += offset;
      28           0 :                 utf8_char_count++;
      29             :         }
      30           0 :         if (ascii != NULL)
      31           0 :                 *ascii = i == utf8_char_count;
      32             :         return utf8_char_count;
      33             : }
      34             : 
      35           0 : size_t utf32_strlen(const Py_UNICODE *utf32_str)
      36             : {
      37           0 :         size_t i = 0;
      38           0 :         while (utf32_str[i] != 0)
      39           0 :                 i++;
      40           0 :         return i;
      41             : }
      42             : 
      43           0 : int utf8_length(unsigned char utf8_char)
      44             : {
      45             :         // the first byte tells us how many bytes the utf8 character uses
      46           0 :         if (utf8_char < 0x80)
      47             :                 return 1;
      48           0 :         else if (utf8_char < 0xe0)
      49             :                 return 2;
      50           0 :         else if (utf8_char < 0xf0)
      51             :                 return 3;
      52           0 :         else if (utf8_char < 0xf8)
      53             :                 return 4;
      54             :         else
      55           0 :                 return -1; // invalid utf8 character, the maximum value of the first
      56             :                                    // byte is 0xf7
      57             : }
      58             : 
      59     5000150 : int utf32_char_to_utf8_char(size_t position, char *utf8_storage,
      60             :                                                         unsigned int utf32_char)
      61             : {
      62     5000150 :         int utf8_size = 4;
      63     5000150 :         if (utf32_char < 0x80)
      64             :                 utf8_size = 1;
      65           2 :         else if (utf32_char < 0x800)
      66             :                 utf8_size = 2;
      67           0 :         else if (utf32_char < 0x10000)
      68             :                 utf8_size = 3;
      69           0 :         else if (utf32_char > 0x0010FFFF)
      70             :                 return -1; // utf32 character is out of legal range
      71             : 
      72     5000150 :         switch (utf8_size) {
      73             :                 case 4:
      74           0 :                         utf8_storage[position + 3] = ((utf32_char | 0x80) & 0xbf);
      75           0 :                         utf32_char >>= 6;
      76           0 :                         utf8_storage[position + 2] = ((utf32_char | 0x80) & 0xbf);
      77           0 :                         utf32_char >>= 6;
      78           0 :                         utf8_storage[position + 1] = ((utf32_char | 0x80) & 0xbf);
      79           0 :                         utf32_char >>= 6;
      80           0 :                         utf8_storage[position] = (utf32_char | 0xf0);
      81           0 :                         return utf8_size;
      82             :                 case 3:
      83           0 :                         utf8_storage[position + 2] = ((utf32_char | 0x80) & 0xbf);
      84           0 :                         utf32_char >>= 6;
      85           0 :                         utf8_storage[position + 1] = ((utf32_char | 0x80) & 0xbf);
      86           0 :                         utf32_char >>= 6;
      87           0 :                         utf8_storage[position] = (utf32_char | 0xe0);
      88           0 :                         return utf8_size;
      89             :                 case 2:
      90           2 :                         utf8_storage[position + 1] = ((utf32_char | 0x80) & 0xbf);
      91           2 :                         utf32_char >>= 6;
      92           2 :                         utf8_storage[position] = (utf32_char | 0xc0);
      93           2 :                         return utf8_size;
      94             :                 default:
      95     5000150 :                         utf8_storage[position] = (char)utf32_char;
      96     5000150 :                         return utf8_size;
      97             :         }
      98             : }
      99             : 
     100           0 : bool ucs2_to_utf8(size_t offset, size_t size, char *utf8_storage,
     101             :                                   const Py_UNICODE *ucs2)
     102             : {
     103           0 :         size_t i = 0;
     104           0 :         int position = 0;
     105           0 :         int shift;
     106           0 :         for (i = 0; i < size; i++) {
     107           0 :                 if (ucs2[offset + i] == 0) {
     108           0 :                         utf8_storage[position] = '\0';
     109           0 :                         return true;
     110             :                 }
     111           0 :                 shift =
     112           0 :                         utf32_char_to_utf8_char(position, utf8_storage, ucs2[offset + i]);
     113           0 :                 if (shift < 0)
     114             :                         return false;
     115           0 :                 position += shift;
     116             :         }
     117           0 :         utf8_storage[position] = '\0';
     118           0 :         return true;
     119             : }
     120             : 
     121     1000020 : bool utf32_to_utf8(size_t offset, size_t size, char *utf8_storage,
     122             :                                    const Py_UNICODE *utf32_input)
     123             : {
     124     1000020 :         size_t i = 0;
     125     1000020 :         int position = 0;
     126     1000020 :         int shift;
     127     1000020 :         unsigned int *utf32 = (unsigned int *)utf32_input;
     128             : 
     129     6000170 :         for (i = 0; i < size; i++) {
     130     5000160 :                 if (utf32[offset + i] == 0) {
     131           8 :                         utf8_storage[position] = '\0';
     132           8 :                         return true;
     133             :                 }
     134             : 
     135     5000150 :                 shift =
     136     5000150 :                         utf32_char_to_utf8_char(position, utf8_storage, utf32[offset + i]);
     137     5000150 :                 if (shift < 0)
     138             :                         return false;
     139     5000150 :                 position += shift;
     140             :         }
     141     1000020 :         utf8_storage[position] = '\0';
     142     1000020 :         return true;
     143             : }
     144             : 
     145           0 : bool unicode_to_utf8(size_t offset, size_t size, char *utf8_storage,
     146             :                                          const Py_UNICODE *unicode)
     147             : {
     148             : #if Py_UNICODE_SIZE == 2
     149             :         return ucs2_to_utf8(offset, size, utf8_storage, unicode);
     150             : #else
     151           0 :         return utf32_to_utf8(offset, size, utf8_storage, unicode);
     152             : #endif
     153             : }
     154             : 
     155           0 : int utf8_char_to_utf32_char(size_t position, Py_UNICODE *utf32_storage,
     156             :                                                         int offset, const unsigned char *utf8_char)
     157             : {
     158           0 :         unsigned char bytes[4];
     159           0 :         int utf8_size = 4;
     160           0 :         bytes[0] = utf8_char[offset];
     161           0 :         bytes[1] = 0xFF;
     162           0 :         bytes[2] = 0xFF;
     163           0 :         bytes[3] = 0xFF;
     164             :         // the first byte tells us how many bytes the utf8 character uses
     165           0 :         if (bytes[0] < 0x80)
     166             :                 utf8_size = 1;
     167           0 :         else if (bytes[0] < 0xe0)
     168             :                 utf8_size = 2;
     169           0 :         else if (bytes[0] < 0xf0)
     170             :                 utf8_size = 3;
     171           0 :         else if (bytes[0] < 0xf8)
     172             :                 utf8_size = 4;
     173             :         else
     174             :                 return -1; // invalid utf8 character, the maximum value of the first
     175             :                                    // byte is 0xf7
     176             : 
     177             : #if Py_UNICODE_SIZE == 2
     178             :         if (utf8_size > 2) {
     179             :                 // utf-8 character out of range on a UCS2 python compilation
     180             :                 return -1;
     181             :         }
     182             : #endif
     183             : 
     184           0 :         switch (utf8_size) {
     185             :                 case 4:
     186           0 :                         bytes[3] = utf8_char[offset + 3];
     187           0 :                         if (bytes[3] > 0xc0)
     188             :                                 return -1; // invalid utf8 character, the maximum value of the
     189             :                                                    // second, third and fourth bytes is 0xbf
     190             :                         /* fall through */
     191             :                 case 3:
     192           0 :                         bytes[2] = utf8_char[offset + 2];
     193           0 :                         if (bytes[2] > 0xc0)
     194             :                                 return -1;
     195             :                         /* fall through */
     196             :                 case 2:
     197           0 :                         bytes[1] = utf8_char[offset + 1];
     198           0 :                         if (bytes[1] > 0xc0)
     199             :                                 return -1;
     200             :         }
     201             : 
     202           0 :         utf32_storage[position] = 0;
     203             : 
     204           0 :         switch (utf8_size) {
     205           0 :                 case 4:
     206           0 :                         utf32_storage[position] |= (0x3f & bytes[3]);
     207           0 :                         utf32_storage[position] |= (0x3f & bytes[2]) << 6;
     208           0 :                         utf32_storage[position] |= (0x3f & bytes[1]) << 12;
     209           0 :                         utf32_storage[position] |= (0x7 & bytes[0]) << 18;
     210           0 :                         return utf8_size;
     211           0 :                 case 3:
     212           0 :                         utf32_storage[position] |= (0x3f & bytes[2]);
     213           0 :                         utf32_storage[position] |= (0x3f & bytes[1]) << 6;
     214           0 :                         utf32_storage[position] |= (0xf & bytes[0]) << 12;
     215           0 :                         return utf8_size;
     216           0 :                 case 2:
     217           0 :                         utf32_storage[position] |= (0x3f & bytes[1]);
     218           0 :                         utf32_storage[position] |= (0x1f & bytes[0]) << 6;
     219           0 :                         return utf8_size;
     220           0 :                 default:
     221           0 :                         utf32_storage[position] |= 0x7f & bytes[0];
     222           0 :                         return utf8_size;
     223             :         }
     224             : }
     225             : 
     226           0 : bool utf8_to_utf32(size_t offset, size_t size, Py_UNICODE *utf32_storage,
     227             :                                    const unsigned char *utf8)
     228             : {
     229           0 :         size_t i = 0;
     230           0 :         int position = 0;
     231           0 :         int shift;
     232           0 :         for (i = 0; i < size; i++) {
     233           0 :                 if (utf8[offset + position] == 0) {
     234           0 :                         utf32_storage[i] = '\0';
     235           0 :                         return true;
     236             :                 }
     237             : 
     238           0 :                 shift = utf8_char_to_utf32_char((int)i, utf32_storage,
     239           0 :                                                                                 (int)(offset + position), utf8);
     240           0 :                 if (shift < 0)
     241             :                         return false;
     242           0 :                 position += shift;
     243             :         }
     244             :         return true;
     245             : }
     246             : 
     247           0 : void _unicode_init(void) { _import_array(); }

Generated by: LCOV version 1.14