| 1 | /* |
| 2 | ** String handling. |
| 3 | ** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h |
| 4 | */ |
| 5 | |
| 6 | #define lj_str_c |
| 7 | #define LUA_CORE |
| 8 | |
| 9 | #include "lj_obj.h" |
| 10 | #include "lj_gc.h" |
| 11 | #include "lj_err.h" |
| 12 | #include "lj_str.h" |
| 13 | #include "lj_char.h" |
| 14 | #include "lj_prng.h" |
| 15 | |
| 16 | /* -- String helpers ------------------------------------------------------ */ |
| 17 | |
| 18 | /* Ordered compare of strings. Assumes string data is 4-byte aligned. */ |
| 19 | int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b) |
| 20 | { |
| 21 | MSize i, n = a->len > b->len ? b->len : a->len; |
| 22 | for (i = 0; i < n; i += 4) { |
| 23 | /* Note: innocuous access up to end of string + 3. */ |
| 24 | uint32_t va = *(const uint32_t *)(strdata(a)+i); |
| 25 | uint32_t vb = *(const uint32_t *)(strdata(b)+i); |
| 26 | if (va != vb) { |
| 27 | #if LJ_LE |
| 28 | va = lj_bswap(va); vb = lj_bswap(vb); |
| 29 | #endif |
| 30 | i -= n; |
| 31 | if ((int32_t)i >= -3) { |
| 32 | va >>= 32+(i<<3); vb >>= 32+(i<<3); |
| 33 | if (va == vb) break; |
| 34 | } |
| 35 | return va < vb ? -1 : 1; |
| 36 | } |
| 37 | } |
| 38 | return (int32_t)(a->len - b->len); |
| 39 | } |
| 40 | |
| 41 | /* Find fixed string p inside string s. */ |
| 42 | const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen) |
| 43 | { |
| 44 | if (plen <= slen) { |
| 45 | if (plen == 0) { |
| 46 | return s; |
| 47 | } else { |
| 48 | int c = *(const uint8_t *)p++; |
| 49 | plen--; slen -= plen; |
| 50 | while (slen) { |
| 51 | const char *q = (const char *)memchr(s, c, slen); |
| 52 | if (!q) break; |
| 53 | if (memcmp(q+1, p, plen) == 0) return q; |
| 54 | q++; slen -= (MSize)(q-s); s = q; |
| 55 | } |
| 56 | } |
| 57 | } |
| 58 | return NULL; |
| 59 | } |
| 60 | |
| 61 | /* Check whether a string has a pattern matching character. */ |
| 62 | int lj_str_haspattern(GCstr *s) |
| 63 | { |
| 64 | const char *p = strdata(s), *q = p + s->len; |
| 65 | while (p < q) { |
| 66 | int c = *(const uint8_t *)p++; |
| 67 | if (lj_char_ispunct(c) && strchr("^$*+?.([%-" , c)) |
| 68 | return 1; /* Found a pattern matching char. */ |
| 69 | } |
| 70 | return 0; /* No pattern matching chars found. */ |
| 71 | } |
| 72 | |
| 73 | /* -- String hashing ------------------------------------------------------ */ |
| 74 | |
| 75 | /* Keyed sparse ARX string hash. Constant time. */ |
| 76 | static StrHash hash_sparse(uint64_t seed, const char *str, MSize len) |
| 77 | { |
| 78 | /* Constants taken from lookup3 hash by Bob Jenkins. */ |
| 79 | StrHash a, b, h = len ^ (StrHash)seed; |
| 80 | if (len >= 4) { /* Caveat: unaligned access! */ |
| 81 | a = lj_getu32(str); |
| 82 | h ^= lj_getu32(str+len-4); |
| 83 | b = lj_getu32(str+(len>>1)-2); |
| 84 | h ^= b; h -= lj_rol(b, 14); |
| 85 | b += lj_getu32(str+(len>>2)-1); |
| 86 | } else { |
| 87 | a = *(const uint8_t *)str; |
| 88 | h ^= *(const uint8_t *)(str+len-1); |
| 89 | b = *(const uint8_t *)(str+(len>>1)); |
| 90 | h ^= b; h -= lj_rol(b, 14); |
| 91 | } |
| 92 | a ^= h; a -= lj_rol(h, 11); |
| 93 | b ^= a; b -= lj_rol(a, 25); |
| 94 | h ^= b; h -= lj_rol(b, 16); |
| 95 | return h; |
| 96 | } |
| 97 | |
| 98 | #if LUAJIT_SECURITY_STRHASH |
| 99 | /* Keyed dense ARX string hash. Linear time. */ |
| 100 | static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h, |
| 101 | const char *str, MSize len) |
| 102 | { |
| 103 | StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4)); |
| 104 | if (len > 12) { |
| 105 | StrHash a = (StrHash)seed; |
| 106 | const char *pe = str+len-12, *p = pe, *q = str; |
| 107 | do { |
| 108 | a += lj_getu32(p); |
| 109 | b += lj_getu32(p+4); |
| 110 | h += lj_getu32(p+8); |
| 111 | p = q; q += 12; |
| 112 | h ^= b; h -= lj_rol(b, 14); |
| 113 | a ^= h; a -= lj_rol(h, 11); |
| 114 | b ^= a; b -= lj_rol(a, 25); |
| 115 | } while (p < pe); |
| 116 | h ^= b; h -= lj_rol(b, 16); |
| 117 | a ^= h; a -= lj_rol(h, 4); |
| 118 | b ^= a; b -= lj_rol(a, 14); |
| 119 | } |
| 120 | return b; |
| 121 | } |
| 122 | #endif |
| 123 | |
| 124 | /* -- String interning ---------------------------------------------------- */ |
| 125 | |
| 126 | #define LJ_STR_MAXCOLL 32 |
| 127 | |
| 128 | /* Resize the string interning hash table (grow and shrink). */ |
| 129 | void lj_str_resize(lua_State *L, MSize newmask) |
| 130 | { |
| 131 | global_State *g = G(L); |
| 132 | GCRef *newtab, *oldtab = g->str.tab; |
| 133 | MSize i; |
| 134 | |
| 135 | /* No resizing during GC traversal or if already too big. */ |
| 136 | if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1) |
| 137 | return; |
| 138 | |
| 139 | newtab = lj_mem_newvec(L, newmask+1, GCRef); |
| 140 | memset(newtab, 0, (newmask+1)*sizeof(GCRef)); |
| 141 | |
| 142 | #if LUAJIT_SECURITY_STRHASH |
| 143 | /* Check which chains need secondary hashes. */ |
| 144 | if (g->str.second) { |
| 145 | int newsecond = 0; |
| 146 | /* Compute primary chain lengths. */ |
| 147 | for (i = g->str.mask; i != ~(MSize)0; i--) { |
| 148 | GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1); |
| 149 | while (o) { |
| 150 | GCstr *s = gco2str(o); |
| 151 | MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) : |
| 152 | s->hash; |
| 153 | hash &= newmask; |
| 154 | setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1); |
| 155 | o = gcnext(o); |
| 156 | } |
| 157 | } |
| 158 | /* Mark secondary chains. */ |
| 159 | for (i = newmask; i != ~(MSize)0; i--) { |
| 160 | int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL; |
| 161 | newsecond |= secondary; |
| 162 | setgcrefp(newtab[i], secondary); |
| 163 | } |
| 164 | g->str.second = newsecond; |
| 165 | } |
| 166 | #endif |
| 167 | |
| 168 | /* Reinsert all strings from the old table into the new table. */ |
| 169 | for (i = g->str.mask; i != ~(MSize)0; i--) { |
| 170 | GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1); |
| 171 | while (o) { |
| 172 | GCobj *next = gcnext(o); |
| 173 | GCstr *s = gco2str(o); |
| 174 | MSize hash = s->hash; |
| 175 | #if LUAJIT_SECURITY_STRHASH |
| 176 | uintptr_t u; |
| 177 | if (LJ_LIKELY(!s->hashalg)) { /* String hashed with primary hash. */ |
| 178 | hash &= newmask; |
| 179 | u = gcrefu(newtab[hash]); |
| 180 | if (LJ_UNLIKELY(u & 1)) { /* Switch string to secondary hash. */ |
| 181 | s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len); |
| 182 | s->hashalg = 1; |
| 183 | hash &= newmask; |
| 184 | u = gcrefu(newtab[hash]); |
| 185 | } |
| 186 | } else { /* String hashed with secondary hash. */ |
| 187 | MSize shash = hash_sparse(g->str.seed, strdata(s), s->len); |
| 188 | u = gcrefu(newtab[shash & newmask]); |
| 189 | if (u & 1) { |
| 190 | hash &= newmask; |
| 191 | u = gcrefu(newtab[hash]); |
| 192 | } else { /* Revert string back to primary hash. */ |
| 193 | s->hash = shash; |
| 194 | s->hashalg = 0; |
| 195 | hash = (shash & newmask); |
| 196 | } |
| 197 | } |
| 198 | /* NOBARRIER: The string table is a GC root. */ |
| 199 | setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); |
| 200 | setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1))); |
| 201 | #else |
| 202 | hash &= newmask; |
| 203 | /* NOBARRIER: The string table is a GC root. */ |
| 204 | setgcrefr(o->gch.nextgc, newtab[hash]); |
| 205 | setgcref(newtab[hash], o); |
| 206 | #endif |
| 207 | o = next; |
| 208 | } |
| 209 | } |
| 210 | |
| 211 | /* Free old table and replace with new table. */ |
| 212 | lj_str_freetab(g); |
| 213 | g->str.tab = newtab; |
| 214 | g->str.mask = newmask; |
| 215 | } |
| 216 | |
| 217 | #if LUAJIT_SECURITY_STRHASH |
| 218 | /* Rehash and rechain all strings in a chain. */ |
| 219 | static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc, |
| 220 | const char *str, MSize len) |
| 221 | { |
| 222 | global_State *g = G(L); |
| 223 | int ow = g->gc.state == GCSsweepstring ? otherwhite(g) : 0; /* Sweeping? */ |
| 224 | GCRef *strtab = g->str.tab; |
| 225 | MSize strmask = g->str.mask; |
| 226 | GCobj *o = gcref(strtab[hashc & strmask]); |
| 227 | setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1)); |
| 228 | g->str.second = 1; |
| 229 | while (o) { |
| 230 | uintptr_t u; |
| 231 | GCobj *next = gcnext(o); |
| 232 | GCstr *s = gco2str(o); |
| 233 | StrHash hash; |
| 234 | if (ow) { /* Must sweep while rechaining. */ |
| 235 | if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) { /* String alive? */ |
| 236 | lj_assertG(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED), |
| 237 | "sweep of undead string" ); |
| 238 | makewhite(g, o); |
| 239 | } else { /* Free dead string. */ |
| 240 | lj_assertG(isdead(g, o) || ow == LJ_GC_SFIXED, |
| 241 | "sweep of unlive string" ); |
| 242 | lj_str_free(g, s); |
| 243 | o = next; |
| 244 | continue; |
| 245 | } |
| 246 | } |
| 247 | hash = s->hash; |
| 248 | if (!s->hashalg) { /* Rehash with secondary hash. */ |
| 249 | hash = hash_dense(g->str.seed, hash, strdata(s), s->len); |
| 250 | s->hash = hash; |
| 251 | s->hashalg = 1; |
| 252 | } |
| 253 | /* Rechain. */ |
| 254 | hash &= strmask; |
| 255 | u = gcrefu(strtab[hash]); |
| 256 | setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); |
| 257 | setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1))); |
| 258 | o = next; |
| 259 | } |
| 260 | /* Try to insert the pending string again. */ |
| 261 | return lj_str_new(L, str, len); |
| 262 | } |
| 263 | #endif |
| 264 | |
| 265 | /* Reseed String ID from PRNG after random interval < 2^bits. */ |
| 266 | #if LUAJIT_SECURITY_STRID == 1 |
| 267 | #define STRID_RESEED_INTERVAL 8 |
| 268 | #elif LUAJIT_SECURITY_STRID == 2 |
| 269 | #define STRID_RESEED_INTERVAL 4 |
| 270 | #elif LUAJIT_SECURITY_STRID >= 3 |
| 271 | #define STRID_RESEED_INTERVAL 0 |
| 272 | #endif |
| 273 | |
| 274 | /* Allocate a new string and add to string interning table. */ |
| 275 | static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len, |
| 276 | StrHash hash, int hashalg) |
| 277 | { |
| 278 | GCstr *s = lj_mem_newt(L, lj_str_size(len), GCstr); |
| 279 | global_State *g = G(L); |
| 280 | uintptr_t u; |
| 281 | newwhite(g, s); |
| 282 | s->gct = ~LJ_TSTR; |
| 283 | s->len = len; |
| 284 | s->hash = hash; |
| 285 | #ifndef STRID_RESEED_INTERVAL |
| 286 | s->sid = g->str.id++; |
| 287 | #elif STRID_RESEED_INTERVAL |
| 288 | if (!g->str.idreseed--) { |
| 289 | uint64_t r = lj_prng_u64(&g->prng); |
| 290 | g->str.id = (StrID)r; |
| 291 | g->str.idreseed = (uint8_t)(r >> (64 - STRID_RESEED_INTERVAL)); |
| 292 | } |
| 293 | s->sid = g->str.id++; |
| 294 | #else |
| 295 | s->sid = (StrID)lj_prng_u64(&g->prng); |
| 296 | #endif |
| 297 | s->reserved = 0; |
| 298 | s->hashalg = (uint8_t)hashalg; |
| 299 | /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */ |
| 300 | *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0; |
| 301 | memcpy(strdatawr(s), str, len); |
| 302 | /* Add to string hash table. */ |
| 303 | hash &= g->str.mask; |
| 304 | u = gcrefu(g->str.tab[hash]); |
| 305 | setgcrefp(s->nextgc, (u & ~(uintptr_t)1)); |
| 306 | /* NOBARRIER: The string table is a GC root. */ |
| 307 | setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1))); |
| 308 | if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. */ |
| 309 | lj_str_resize(L, (g->str.mask<<1)+1); /* Grow string table. */ |
| 310 | return s; /* Return newly interned string. */ |
| 311 | } |
| 312 | |
| 313 | /* Intern a string and return string object. */ |
| 314 | GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx) |
| 315 | { |
| 316 | global_State *g = G(L); |
| 317 | if (lenx-1 < LJ_MAX_STR-1) { |
| 318 | MSize len = (MSize)lenx; |
| 319 | StrHash hash = hash_sparse(g->str.seed, str, len); |
| 320 | MSize coll = 0; |
| 321 | int hashalg = 0; |
| 322 | /* Check if the string has already been interned. */ |
| 323 | GCobj *o = gcref(g->str.tab[hash & g->str.mask]); |
| 324 | #if LUAJIT_SECURITY_STRHASH |
| 325 | if (LJ_UNLIKELY((uintptr_t)o & 1)) { /* Secondary hash for this chain? */ |
| 326 | hashalg = 1; |
| 327 | hash = hash_dense(g->str.seed, hash, str, len); |
| 328 | o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1); |
| 329 | } |
| 330 | #endif |
| 331 | while (o != NULL) { |
| 332 | GCstr *sx = gco2str(o); |
| 333 | if (sx->hash == hash && sx->len == len) { |
| 334 | if (memcmp(str, strdata(sx), len) == 0) { |
| 335 | if (isdead(g, o)) flipwhite(o); /* Resurrect if dead. */ |
| 336 | return sx; /* Return existing string. */ |
| 337 | } |
| 338 | coll++; |
| 339 | } |
| 340 | coll++; |
| 341 | o = gcnext(o); |
| 342 | } |
| 343 | #if LUAJIT_SECURITY_STRHASH |
| 344 | /* Rehash chain if there are too many collisions. */ |
| 345 | if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) { |
| 346 | return lj_str_rehash_chain(L, hash, str, len); |
| 347 | } |
| 348 | #endif |
| 349 | /* Otherwise allocate a new string. */ |
| 350 | return lj_str_alloc(L, str, len, hash, hashalg); |
| 351 | } else { |
| 352 | if (lenx) |
| 353 | lj_err_msg(L, LJ_ERR_STROV); |
| 354 | return &g->strempty; |
| 355 | } |
| 356 | } |
| 357 | |
| 358 | void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s) |
| 359 | { |
| 360 | g->str.num--; |
| 361 | lj_mem_free(g, s, lj_str_size(s->len)); |
| 362 | } |
| 363 | |
| 364 | void LJ_FASTCALL lj_str_init(lua_State *L) |
| 365 | { |
| 366 | global_State *g = G(L); |
| 367 | g->str.seed = lj_prng_u64(&g->prng); |
| 368 | lj_str_resize(L, LJ_MIN_STRTAB-1); |
| 369 | } |
| 370 | |
| 371 | |