| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | #ifndef _GDK_ATOMS_H_ |
| 10 | #define _GDK_ATOMS_H_ |
| 11 | |
| 12 | #define MAXATOMS 128 |
| 13 | |
| 14 | /* |
| 15 | * @- maximum atomic string lengths |
| 16 | */ |
| 17 | #define bitStrlen 8 |
| 18 | #define bteStrlen 8 |
| 19 | #define shtStrlen 12 |
| 20 | #define intStrlen 24 |
| 21 | #if SIZEOF_OID == SIZEOF_INT |
| 22 | #define oidStrlen 24 |
| 23 | #else |
| 24 | #define oidStrlen 48 |
| 25 | #endif |
| 26 | #if SIZEOF_PTR == SIZEOF_INT |
| 27 | #define ptrStrlen 24 |
| 28 | #else |
| 29 | #define ptrStrlen 48 |
| 30 | #endif |
| 31 | #define lngStrlen 48 |
| 32 | #ifdef HAVE_HGE |
| 33 | #define hgeStrlen 96 |
| 34 | #endif |
| 35 | #define fltStrlen 48 |
| 36 | #define dblStrlen 96 |
| 37 | |
| 38 | /* |
| 39 | * The system comes with the traditional atomic types: int (4 bytes), |
| 40 | * bool(1 byte) and str (variable). In addition, we support the notion |
| 41 | * of an OID type, which ensures uniqueness of its members. This |
| 42 | * leads to the following type descriptor table. |
| 43 | */ |
| 44 | |
| 45 | #ifdef HAVE_HGE |
| 46 | gdk_export ssize_t hgeFromStr(const char *src, size_t *len, hge **dst, bool external); |
| 47 | gdk_export ssize_t hgeToStr(str *dst, size_t *len, const hge *src, bool external); |
| 48 | #endif |
| 49 | gdk_export ssize_t lngFromStr(const char *src, size_t *len, lng **dst, bool external); |
| 50 | gdk_export ssize_t lngToStr(str *dst, size_t *len, const lng *src, bool external); |
| 51 | gdk_export ssize_t intFromStr(const char *src, size_t *len, int **dst, bool external); |
| 52 | gdk_export ssize_t intToStr(str *dst, size_t *len, const int *src, bool external); |
| 53 | gdk_export ssize_t batFromStr(const char *src, size_t *len, bat **dst, bool external); |
| 54 | gdk_export ssize_t batToStr(str *dst, size_t *len, const bat *src, bool external); |
| 55 | gdk_export ssize_t ptrFromStr(const char *src, size_t *len, ptr **dst, bool external); |
| 56 | gdk_export ssize_t ptrToStr(str *dst, size_t *len, const ptr *src, bool external); |
| 57 | gdk_export ssize_t bitFromStr(const char *src, size_t *len, bit **dst, bool external); |
| 58 | gdk_export ssize_t bitToStr(str *dst, size_t *len, const bit *src, bool external); |
| 59 | gdk_export ssize_t OIDfromStr(const char *src, size_t *len, oid **dst, bool external); |
| 60 | gdk_export ssize_t OIDtoStr(str *dst, size_t *len, const oid *src, bool external); |
| 61 | gdk_export ssize_t shtFromStr(const char *src, size_t *len, sht **dst, bool external); |
| 62 | gdk_export ssize_t shtToStr(str *dst, size_t *len, const sht *src, bool external); |
| 63 | gdk_export ssize_t bteFromStr(const char *src, size_t *len, bte **dst, bool external); |
| 64 | gdk_export ssize_t bteToStr(str *dst, size_t *len, const bte *src, bool external); |
| 65 | gdk_export ssize_t fltFromStr(const char *src, size_t *len, flt **dst, bool external); |
| 66 | gdk_export ssize_t fltToStr(str *dst, size_t *len, const flt *src, bool external); |
| 67 | gdk_export ssize_t dblFromStr(const char *src, size_t *len, dbl **dst, bool external); |
| 68 | gdk_export ssize_t dblToStr(str *dst, size_t *len, const dbl *src, bool external); |
| 69 | gdk_export ssize_t GDKstrFromStr(unsigned char *restrict dst, const unsigned char *restrict src, ssize_t len); |
| 70 | gdk_export ssize_t strFromStr(const char *restrict src, size_t *restrict len, str *restrict dst, bool external); |
| 71 | gdk_export BUN strHash(const char *s); |
| 72 | gdk_export size_t strLen(const char *s); |
| 73 | gdk_export int strNil(const char *s); |
| 74 | gdk_export size_t escapedStrlen(const char *restrict src, const char *sep1, const char *sep2, int quote); |
| 75 | gdk_export size_t escapedStr(char *restrict dst, const char *restrict src, size_t dstlen, const char *sep1, const char *sep2, int quote); |
| 76 | /* |
| 77 | * @- nil values |
| 78 | * All types have a single value designated as a NIL value. It |
| 79 | * designates a missing value and it is ignored (forbidden) in several |
| 80 | * primitives. The current policy is to use the smallest value in any |
| 81 | * ordered domain. The routine atomnil returns a pointer to the nil |
| 82 | * value representation. |
| 83 | */ |
| 84 | #define GDK_bit_max ((bit) 1) |
| 85 | #define GDK_bit_min ((bit) 0) |
| 86 | #define GDK_bte_max ((bte) INT8_MAX) |
| 87 | #define GDK_bte_min ((bte) INT8_MIN+1) |
| 88 | #define GDK_sht_max ((sht) INT16_MAX) |
| 89 | #define GDK_sht_min ((sht) INT16_MIN+1) |
| 90 | #define GDK_int_max ((int) INT32_MAX) |
| 91 | #define GDK_int_min ((int) INT32_MIN+1) |
| 92 | #define GDK_lng_max ((lng) INT64_MAX) |
| 93 | #define GDK_lng_min ((lng) INT64_MIN+1) |
| 94 | #ifdef HAVE_HGE |
| 95 | #define GDK_hge_max ((((hge) 1) << 126) - 1 + (((hge) 1) << 126)) |
| 96 | #define GDK_hge_min (-GDK_hge_max) |
| 97 | #endif |
| 98 | #define GDK_flt_max ((flt) FLT_MAX) |
| 99 | #define GDK_flt_min ((flt) -FLT_MAX) |
| 100 | #define GDK_dbl_max ((dbl) DBL_MAX) |
| 101 | #define GDK_dbl_min ((dbl) -DBL_MAX) |
| 102 | #define GDK_oid_max (((oid) 1 << ((8 * SIZEOF_OID) - 1)) - 1) |
| 103 | #define GDK_oid_min ((oid) 0) |
| 104 | /* representation of the nil */ |
| 105 | gdk_export const bte bte_nil; |
| 106 | gdk_export const sht sht_nil; |
| 107 | gdk_export const int int_nil; |
| 108 | #ifdef NAN_CANNOT_BE_USED_AS_INITIALIZER |
| 109 | /* Definition of NAN is seriously broken on Intel compiler (at least |
| 110 | * in some versions), so we work around it. */ |
| 111 | union _flt_nil_t { |
| 112 | uint32_t l; |
| 113 | flt f; |
| 114 | }; |
| 115 | gdk_export const union _flt_nil_t _flt_nil_; |
| 116 | #define flt_nil (_flt_nil_.f) |
| 117 | union _dbl_nil_t { |
| 118 | uint64_t l; |
| 119 | dbl d; |
| 120 | }; |
| 121 | gdk_export const union _dbl_nil_t _dbl_nil_; |
| 122 | #define dbl_nil (_dbl_nil_.d) |
| 123 | #else |
| 124 | gdk_export const flt flt_nil; |
| 125 | gdk_export const dbl dbl_nil; |
| 126 | #endif |
| 127 | gdk_export const lng lng_nil; |
| 128 | #ifdef HAVE_HGE |
| 129 | gdk_export const hge hge_nil; |
| 130 | #endif |
| 131 | gdk_export const oid oid_nil; |
| 132 | gdk_export const char str_nil[2]; |
| 133 | gdk_export const ptr ptr_nil; |
| 134 | |
| 135 | /* derived NIL values - OIDDEPEND */ |
| 136 | #define bit_nil ((bit) bte_nil) |
| 137 | #define bat_nil ((bat) int_nil) |
| 138 | |
| 139 | #define void_nil oid_nil |
| 140 | |
| 141 | #define is_bit_nil(v) ((v) == bit_nil) |
| 142 | #define is_bte_nil(v) ((v) == bte_nil) |
| 143 | #define is_sht_nil(v) ((v) == sht_nil) |
| 144 | #define is_int_nil(v) ((v) == int_nil) |
| 145 | #define is_lng_nil(v) ((v) == lng_nil) |
| 146 | #ifdef HAVE_HGE |
| 147 | #define is_hge_nil(v) ((v) == hge_nil) |
| 148 | #endif |
| 149 | #define is_oid_nil(v) ((v) == oid_nil) |
| 150 | #define is_flt_nil(v) isnan(v) |
| 151 | #define is_dbl_nil(v) isnan(v) |
| 152 | #define is_bat_nil(v) ((v) == bat_nil || (v) == 0) |
| 153 | |
| 154 | #include <math.h> |
| 155 | |
| 156 | #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) && _MSC_VER < 1800 |
| 157 | #include <float.h> |
| 158 | #define isnan(x) _isnan(x) |
| 159 | #define isinf(x) (_fpclass(x) & (_FPCLASS_NINF | _FPCLASS_PINF)) |
| 160 | #define isfinite(x) _finite(x) |
| 161 | #endif |
| 162 | |
| 163 | /* |
| 164 | * @- Derived types |
| 165 | * In all algorithms across GDK, you will find switches on the types |
| 166 | * (bte, sht, int, flt, dbl, lng, hge, str). They respectively |
| 167 | * represent an octet, a 16-bit int, a 32-bit int, a 32-bit float, a |
| 168 | * 64-bit double, a 64-bit int, and a pointer-sized location of a |
| 169 | * char-buffer (ended by a zero char). |
| 170 | * |
| 171 | * In contrast, the types (bit, ptr, bat, oid) are derived types. They |
| 172 | * do not occur in the switches. The ATOMstorage macro maps them |
| 173 | * respectively onto a @code{ bte}, @code{ int} (pointers are 32-bit), |
| 174 | * @code{ int}, and @code{ int}. OIDs are 32-bit. |
| 175 | * |
| 176 | * This approach makes it tractable to switch to 64-bits OIDs, or to a |
| 177 | * fully 64-bits OS easily. One only has to map the @code{ oid} and |
| 178 | * @code{ ptr} types to @code{ lng} instead of @code{ int}. |
| 179 | * |
| 180 | * Derived types mimic their fathers in many ways. They inherit the |
| 181 | * @code{ size}, @code{ linear}, and @code{ null} |
| 182 | * properties of their father. The same goes for the |
| 183 | * ADT functions HASH, CMP, PUT, NULL, DEL, LEN, and HEAP. So, a |
| 184 | * derived type differs in only two ways from its father: |
| 185 | * @table @code |
| 186 | * @item [string representation] |
| 187 | * the only two ADT operations specific for a derived type are FROMSTR |
| 188 | * and TOSTR. |
| 189 | * @item [identity] |
| 190 | * (a @code{ bit} is really of a different type than @code{ bte}). The |
| 191 | * set of operations on derived type values or BATs of such types may |
| 192 | * differ from the sets of operations on the father type. |
| 193 | * @end table |
| 194 | */ |
| 195 | /* use "do ... while(0)" so that lhs can safely be used in if statements */ |
| 196 | #define ATOMstorage(t) BATatoms[t].storage |
| 197 | #define ATOMsize(t) BATatoms[t].size |
| 198 | #define ATOMfromstr(t,s,l,src,ext) BATatoms[t].atomFromStr(src,l,s,ext) |
| 199 | #define ATOMnilptr(t) BATatoms[t].atomNull |
| 200 | #define ATOMcompare(t) BATatoms[t].atomCmp |
| 201 | #define ATOMcmp(t,l,r) ((*ATOMcompare(t))(l, r)) |
| 202 | #define ATOMhash(t,src) BATatoms[t].atomHash(src) |
| 203 | #define ATOMdel(t,hp,src) do if (BATatoms[t].atomDel) BATatoms[t].atomDel(hp,src); while (0) |
| 204 | #define ATOMvarsized(t) (BATatoms[t].atomPut != NULL) |
| 205 | #define ATOMlinear(t) BATatoms[t].linear |
| 206 | #define ATOMtype(t) ((t) == TYPE_void ? TYPE_oid : (t)) |
| 207 | #define ATOMfix(t,v) do if (BATatoms[t].atomFix) BATatoms[t].atomFix(v); while (0) |
| 208 | #define ATOMunfix(t,v) do if (BATatoms[t].atomUnfix) BATatoms[t].atomUnfix(v); while (0) |
| 209 | |
| 210 | /* The base type is the storage type if the comparison function, the |
| 211 | * hash function, and the nil value are the same as those of the |
| 212 | * storage type; otherwise it is the type itself. */ |
| 213 | #define ATOMbasetype(t) ((t) != ATOMstorage(t) && \ |
| 214 | ATOMnilptr(t) == ATOMnilptr(ATOMstorage(t)) && \ |
| 215 | ATOMcompare(t) == ATOMcompare(ATOMstorage(t)) && \ |
| 216 | BATatoms[t].atomHash == BATatoms[ATOMstorage(t)].atomHash ? \ |
| 217 | ATOMstorage(t) : (t)) |
| 218 | |
| 219 | /* |
| 220 | * In case that atoms are added to a bat, their logical reference |
| 221 | * count should be incremented (and decremented if deleted). Notice |
| 222 | * that BATs with atomic types that have logical references (e.g. BATs |
| 223 | * of BATs but also BATs of ODMG odSet) can never be persistent, as |
| 224 | * this would make the commit tremendously complicated. |
| 225 | */ |
| 226 | #ifdef HAVE_HGE |
| 227 | #define ATOM_CASE_16_hge \ |
| 228 | case 16: \ |
| 229 | * (hge *) d_ = * (hge *) s_; \ |
| 230 | break |
| 231 | #else |
| 232 | #define ATOM_CASE_16_hge |
| 233 | #endif |
| 234 | |
| 235 | #define ATOMputVAR(type, heap, dst, src) \ |
| 236 | do { \ |
| 237 | assert(BATatoms[type].atomPut != NULL); \ |
| 238 | if ((*BATatoms[type].atomPut)(heap, dst, src) == 0) \ |
| 239 | goto bunins_failed; \ |
| 240 | } while (0) |
| 241 | #define ATOMputFIX(type, dst, src) \ |
| 242 | do { \ |
| 243 | int t_ = (type); \ |
| 244 | void *d_ = (dst); \ |
| 245 | const void *s_ = (src); \ |
| 246 | \ |
| 247 | assert(BATatoms[t_].atomPut == NULL); \ |
| 248 | ATOMfix(t_, s_); \ |
| 249 | switch (ATOMsize(t_)) { \ |
| 250 | case 0: /* void */ \ |
| 251 | break; \ |
| 252 | case 1: \ |
| 253 | * (bte *) d_ = * (bte *) s_; \ |
| 254 | break; \ |
| 255 | case 2: \ |
| 256 | * (sht *) d_ = * (sht *) s_; \ |
| 257 | break; \ |
| 258 | case 4: \ |
| 259 | * (int *) d_ = * (int *) s_; \ |
| 260 | break; \ |
| 261 | case 8: \ |
| 262 | * (lng *) d_ = * (lng *) s_; \ |
| 263 | break; \ |
| 264 | ATOM_CASE_16_hge; \ |
| 265 | default: \ |
| 266 | memcpy(d_, s_, ATOMsize(t_)); \ |
| 267 | break; \ |
| 268 | } \ |
| 269 | } while (0) |
| 270 | |
| 271 | #define ATOMreplaceVAR(type, heap, dst, src) \ |
| 272 | do { \ |
| 273 | int t_ = (type); \ |
| 274 | var_t *d_ = (var_t *) (dst); \ |
| 275 | const void *s_ = (src); \ |
| 276 | var_t loc_ = *d_; \ |
| 277 | Heap *h_ = (heap); \ |
| 278 | \ |
| 279 | assert(BATatoms[t_].atomPut != NULL); \ |
| 280 | if ((*BATatoms[t_].atomPut)(h_, &loc_, s_) == 0) \ |
| 281 | goto bunins_failed; \ |
| 282 | ATOMunfix(t_, d_); \ |
| 283 | ATOMdel(t_, h_, d_); \ |
| 284 | *d_ = loc_; \ |
| 285 | ATOMfix(t_, s_); \ |
| 286 | } while (0) |
| 287 | |
| 288 | /* string heaps: |
| 289 | * - strings are 8 byte aligned |
| 290 | * - start with a 1024 bucket hash table |
| 291 | * - heaps < 64KiB are fully duplicate eliminated with this hash tables |
| 292 | * - heaps >= 64KiB are opportunistically (imperfect) duplicate |
| 293 | * eliminated as only the last 128KiB chunk is considered and there |
| 294 | * is no linked list |
| 295 | * - buckets and next pointers are unsigned short "indices" |
| 296 | * - indices should be multiplied by 8 and takes from ELIMBASE to get |
| 297 | * an offset |
| 298 | * Note that a 64KiB chunk of the heap contains at most 8K 8-byte |
| 299 | * aligned strings. The 1K bucket list means that in worst load, the |
| 300 | * list length is 8 (OK). |
| 301 | */ |
| 302 | #define GDK_STRHASHTABLE (1<<10) /* 1024 */ |
| 303 | #define GDK_STRHASHMASK (GDK_STRHASHTABLE-1) |
| 304 | #define GDK_STRHASHSIZE (GDK_STRHASHTABLE * sizeof(stridx_t)) |
| 305 | #define GDK_ELIMPOWER 16 /* 64KiB is the threshold */ |
| 306 | #define GDK_ELIMDOUBLES(h) ((h)->free < GDK_ELIMLIMIT) |
| 307 | #define GDK_ELIMLIMIT (1<<GDK_ELIMPOWER) /* equivalently: ELIMBASE == 0 */ |
| 308 | #define GDK_ELIMBASE(x) (((x) >> GDK_ELIMPOWER) << GDK_ELIMPOWER) |
| 309 | #define GDK_VAROFFSET ((var_t) GDK_STRHASHSIZE) |
| 310 | |
| 311 | /* |
| 312 | * @- String Comparison, NILs and UTF-8 |
| 313 | * |
| 314 | * Using the char* type for strings is handy as this is the type of |
| 315 | * any constant strings in a C/C++ program. Therefore, MonetDB uses |
| 316 | * this definition for str. However, different compilers and |
| 317 | * platforms use either signed or unsigned characters for the char |
| 318 | * type. It is required that string ordering in MonetDB is consistent |
| 319 | * over platforms though. |
| 320 | * |
| 321 | * As for the choice how strings should be ordered, our support for |
| 322 | * UTF-8 actually imposes that it should follow 'unsigned char' |
| 323 | * doctrine (like in the AIX native compiler). In this semantics, |
| 324 | * though we have to take corrective action to ensure that str(nil) is |
| 325 | * the smallest value of the domain. |
| 326 | */ |
| 327 | #define GDK_STRNIL(s) ((s) == NULL || *(const char*) (s) == '\200') |
| 328 | #define GDK_STRLEN(s) ((GDK_STRNIL(s)?1:strlen(s))+1) |
| 329 | #define GDK_STRCMP(l,r) (GDK_STRNIL(l)?(GDK_STRNIL(r)?0:-1):GDK_STRNIL(r)?1: \ |
| 330 | (*(const unsigned char*)(l) < *(const unsigned char*)(r))?-1: \ |
| 331 | (*(const unsigned char*)(l) > *(const unsigned char*)(r))?1: \ |
| 332 | strCmpNoNil((const unsigned char*)(l),(const unsigned char*)(r))) |
| 333 | /* |
| 334 | * @- Hash Function |
| 335 | * The string hash function is a very simple hash function that xors |
| 336 | * and rotates all characters together. It is optimized to process 2 |
| 337 | * characters at a time (adding 16-bits to the hash value each |
| 338 | * iteration). |
| 339 | */ |
| 340 | #define GDK_STRHASH(x,y) \ |
| 341 | do { \ |
| 342 | const char *_key = (const char *) (x); \ |
| 343 | BUN _i; \ |
| 344 | for (_i = y = 0; _key[_i]; _i++) { \ |
| 345 | y += _key[_i]; \ |
| 346 | y += (y << 10); \ |
| 347 | y ^= (y >> 6); \ |
| 348 | } \ |
| 349 | y += (y << 3); \ |
| 350 | y ^= (y >> 11); \ |
| 351 | y += (y << 15); \ |
| 352 | } while (0) |
| 353 | |
| 354 | #endif /* _GDK_ATOMS_H_ */ |
| 355 | |