1/*
2** String handling.
3** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
4*/
5
6#define lj_str_c
7#define LUA_CORE
8
9#include "lj_obj.h"
10#include "lj_gc.h"
11#include "lj_err.h"
12#include "lj_str.h"
13#include "lj_char.h"
14#include "lj_prng.h"
15
16/* -- String helpers ------------------------------------------------------ */
17
18/* Ordered compare of strings. Assumes string data is 4-byte aligned. */
19int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b)
20{
21 MSize i, n = a->len > b->len ? b->len : a->len;
22 for (i = 0; i < n; i += 4) {
23 /* Note: innocuous access up to end of string + 3. */
24 uint32_t va = *(const uint32_t *)(strdata(a)+i);
25 uint32_t vb = *(const uint32_t *)(strdata(b)+i);
26 if (va != vb) {
27#if LJ_LE
28 va = lj_bswap(va); vb = lj_bswap(vb);
29#endif
30 i -= n;
31 if ((int32_t)i >= -3) {
32 va >>= 32+(i<<3); vb >>= 32+(i<<3);
33 if (va == vb) break;
34 }
35 return va < vb ? -1 : 1;
36 }
37 }
38 return (int32_t)(a->len - b->len);
39}
40
41/* Find fixed string p inside string s. */
42const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen)
43{
44 if (plen <= slen) {
45 if (plen == 0) {
46 return s;
47 } else {
48 int c = *(const uint8_t *)p++;
49 plen--; slen -= plen;
50 while (slen) {
51 const char *q = (const char *)memchr(s, c, slen);
52 if (!q) break;
53 if (memcmp(q+1, p, plen) == 0) return q;
54 q++; slen -= (MSize)(q-s); s = q;
55 }
56 }
57 }
58 return NULL;
59}
60
61/* Check whether a string has a pattern matching character. */
62int lj_str_haspattern(GCstr *s)
63{
64 const char *p = strdata(s), *q = p + s->len;
65 while (p < q) {
66 int c = *(const uint8_t *)p++;
67 if (lj_char_ispunct(c) && strchr("^$*+?.([%-", c))
68 return 1; /* Found a pattern matching char. */
69 }
70 return 0; /* No pattern matching chars found. */
71}
72
73/* -- String hashing ------------------------------------------------------ */
74
75/* Keyed sparse ARX string hash. Constant time. */
76static StrHash hash_sparse(uint64_t seed, const char *str, MSize len)
77{
78 /* Constants taken from lookup3 hash by Bob Jenkins. */
79 StrHash a, b, h = len ^ (StrHash)seed;
80 if (len >= 4) { /* Caveat: unaligned access! */
81 a = lj_getu32(str);
82 h ^= lj_getu32(str+len-4);
83 b = lj_getu32(str+(len>>1)-2);
84 h ^= b; h -= lj_rol(b, 14);
85 b += lj_getu32(str+(len>>2)-1);
86 } else {
87 a = *(const uint8_t *)str;
88 h ^= *(const uint8_t *)(str+len-1);
89 b = *(const uint8_t *)(str+(len>>1));
90 h ^= b; h -= lj_rol(b, 14);
91 }
92 a ^= h; a -= lj_rol(h, 11);
93 b ^= a; b -= lj_rol(a, 25);
94 h ^= b; h -= lj_rol(b, 16);
95 return h;
96}
97
98#if LUAJIT_SECURITY_STRHASH
99/* Keyed dense ARX string hash. Linear time. */
100static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h,
101 const char *str, MSize len)
102{
103 StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4));
104 if (len > 12) {
105 StrHash a = (StrHash)seed;
106 const char *pe = str+len-12, *p = pe, *q = str;
107 do {
108 a += lj_getu32(p);
109 b += lj_getu32(p+4);
110 h += lj_getu32(p+8);
111 p = q; q += 12;
112 h ^= b; h -= lj_rol(b, 14);
113 a ^= h; a -= lj_rol(h, 11);
114 b ^= a; b -= lj_rol(a, 25);
115 } while (p < pe);
116 h ^= b; h -= lj_rol(b, 16);
117 a ^= h; a -= lj_rol(h, 4);
118 b ^= a; b -= lj_rol(a, 14);
119 }
120 return b;
121}
122#endif
123
124/* -- String interning ---------------------------------------------------- */
125
126#define LJ_STR_MAXCOLL 32
127
128/* Resize the string interning hash table (grow and shrink). */
129void lj_str_resize(lua_State *L, MSize newmask)
130{
131 global_State *g = G(L);
132 GCRef *newtab, *oldtab = g->str.tab;
133 MSize i;
134
135 /* No resizing during GC traversal or if already too big. */
136 if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1)
137 return;
138
139 newtab = lj_mem_newvec(L, newmask+1, GCRef);
140 memset(newtab, 0, (newmask+1)*sizeof(GCRef));
141
142#if LUAJIT_SECURITY_STRHASH
143 /* Check which chains need secondary hashes. */
144 if (g->str.second) {
145 int newsecond = 0;
146 /* Compute primary chain lengths. */
147 for (i = g->str.mask; i != ~(MSize)0; i--) {
148 GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
149 while (o) {
150 GCstr *s = gco2str(o);
151 MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) :
152 s->hash;
153 hash &= newmask;
154 setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1);
155 o = gcnext(o);
156 }
157 }
158 /* Mark secondary chains. */
159 for (i = newmask; i != ~(MSize)0; i--) {
160 int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL;
161 newsecond |= secondary;
162 setgcrefp(newtab[i], secondary);
163 }
164 g->str.second = newsecond;
165 }
166#endif
167
168 /* Reinsert all strings from the old table into the new table. */
169 for (i = g->str.mask; i != ~(MSize)0; i--) {
170 GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1);
171 while (o) {
172 GCobj *next = gcnext(o);
173 GCstr *s = gco2str(o);
174 MSize hash = s->hash;
175#if LUAJIT_SECURITY_STRHASH
176 uintptr_t u;
177 if (LJ_LIKELY(!s->hashalg)) { /* String hashed with primary hash. */
178 hash &= newmask;
179 u = gcrefu(newtab[hash]);
180 if (LJ_UNLIKELY(u & 1)) { /* Switch string to secondary hash. */
181 s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len);
182 s->hashalg = 1;
183 hash &= newmask;
184 u = gcrefu(newtab[hash]);
185 }
186 } else { /* String hashed with secondary hash. */
187 MSize shash = hash_sparse(g->str.seed, strdata(s), s->len);
188 u = gcrefu(newtab[shash & newmask]);
189 if (u & 1) {
190 hash &= newmask;
191 u = gcrefu(newtab[hash]);
192 } else { /* Revert string back to primary hash. */
193 s->hash = shash;
194 s->hashalg = 0;
195 hash = (shash & newmask);
196 }
197 }
198 /* NOBARRIER: The string table is a GC root. */
199 setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
200 setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1)));
201#else
202 hash &= newmask;
203 /* NOBARRIER: The string table is a GC root. */
204 setgcrefr(o->gch.nextgc, newtab[hash]);
205 setgcref(newtab[hash], o);
206#endif
207 o = next;
208 }
209 }
210
211 /* Free old table and replace with new table. */
212 lj_str_freetab(g);
213 g->str.tab = newtab;
214 g->str.mask = newmask;
215}
216
217#if LUAJIT_SECURITY_STRHASH
218/* Rehash and rechain all strings in a chain. */
219static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc,
220 const char *str, MSize len)
221{
222 global_State *g = G(L);
223 int ow = g->gc.state == GCSsweepstring ? otherwhite(g) : 0; /* Sweeping? */
224 GCRef *strtab = g->str.tab;
225 MSize strmask = g->str.mask;
226 GCobj *o = gcref(strtab[hashc & strmask]);
227 setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1));
228 g->str.second = 1;
229 while (o) {
230 uintptr_t u;
231 GCobj *next = gcnext(o);
232 GCstr *s = gco2str(o);
233 StrHash hash;
234 if (ow) { /* Must sweep while rechaining. */
235 if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) { /* String alive? */
236 lj_assertG(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED),
237 "sweep of undead string");
238 makewhite(g, o);
239 } else { /* Free dead string. */
240 lj_assertG(isdead(g, o) || ow == LJ_GC_SFIXED,
241 "sweep of unlive string");
242 lj_str_free(g, s);
243 o = next;
244 continue;
245 }
246 }
247 hash = s->hash;
248 if (!s->hashalg) { /* Rehash with secondary hash. */
249 hash = hash_dense(g->str.seed, hash, strdata(s), s->len);
250 s->hash = hash;
251 s->hashalg = 1;
252 }
253 /* Rechain. */
254 hash &= strmask;
255 u = gcrefu(strtab[hash]);
256 setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1));
257 setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1)));
258 o = next;
259 }
260 /* Try to insert the pending string again. */
261 return lj_str_new(L, str, len);
262}
263#endif
264
265/* Reseed String ID from PRNG after random interval < 2^bits. */
266#if LUAJIT_SECURITY_STRID == 1
267#define STRID_RESEED_INTERVAL 8
268#elif LUAJIT_SECURITY_STRID == 2
269#define STRID_RESEED_INTERVAL 4
270#elif LUAJIT_SECURITY_STRID >= 3
271#define STRID_RESEED_INTERVAL 0
272#endif
273
274/* Allocate a new string and add to string interning table. */
275static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len,
276 StrHash hash, int hashalg)
277{
278 GCstr *s = lj_mem_newt(L, lj_str_size(len), GCstr);
279 global_State *g = G(L);
280 uintptr_t u;
281 newwhite(g, s);
282 s->gct = ~LJ_TSTR;
283 s->len = len;
284 s->hash = hash;
285#ifndef STRID_RESEED_INTERVAL
286 s->sid = g->str.id++;
287#elif STRID_RESEED_INTERVAL
288 if (!g->str.idreseed--) {
289 uint64_t r = lj_prng_u64(&g->prng);
290 g->str.id = (StrID)r;
291 g->str.idreseed = (uint8_t)(r >> (64 - STRID_RESEED_INTERVAL));
292 }
293 s->sid = g->str.id++;
294#else
295 s->sid = (StrID)lj_prng_u64(&g->prng);
296#endif
297 s->reserved = 0;
298 s->hashalg = (uint8_t)hashalg;
299 /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */
300 *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0;
301 memcpy(strdatawr(s), str, len);
302 /* Add to string hash table. */
303 hash &= g->str.mask;
304 u = gcrefu(g->str.tab[hash]);
305 setgcrefp(s->nextgc, (u & ~(uintptr_t)1));
306 /* NOBARRIER: The string table is a GC root. */
307 setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1)));
308 if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. */
309 lj_str_resize(L, (g->str.mask<<1)+1); /* Grow string table. */
310 return s; /* Return newly interned string. */
311}
312
313/* Intern a string and return string object. */
314GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx)
315{
316 global_State *g = G(L);
317 if (lenx-1 < LJ_MAX_STR-1) {
318 MSize len = (MSize)lenx;
319 StrHash hash = hash_sparse(g->str.seed, str, len);
320 MSize coll = 0;
321 int hashalg = 0;
322 /* Check if the string has already been interned. */
323 GCobj *o = gcref(g->str.tab[hash & g->str.mask]);
324#if LUAJIT_SECURITY_STRHASH
325 if (LJ_UNLIKELY((uintptr_t)o & 1)) { /* Secondary hash for this chain? */
326 hashalg = 1;
327 hash = hash_dense(g->str.seed, hash, str, len);
328 o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1);
329 }
330#endif
331 while (o != NULL) {
332 GCstr *sx = gco2str(o);
333 if (sx->hash == hash && sx->len == len) {
334 if (memcmp(str, strdata(sx), len) == 0) {
335 if (isdead(g, o)) flipwhite(o); /* Resurrect if dead. */
336 return sx; /* Return existing string. */
337 }
338 coll++;
339 }
340 coll++;
341 o = gcnext(o);
342 }
343#if LUAJIT_SECURITY_STRHASH
344 /* Rehash chain if there are too many collisions. */
345 if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) {
346 return lj_str_rehash_chain(L, hash, str, len);
347 }
348#endif
349 /* Otherwise allocate a new string. */
350 return lj_str_alloc(L, str, len, hash, hashalg);
351 } else {
352 if (lenx)
353 lj_err_msg(L, LJ_ERR_STROV);
354 return &g->strempty;
355 }
356}
357
358void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s)
359{
360 g->str.num--;
361 lj_mem_free(g, s, lj_str_size(s->len));
362}
363
364void LJ_FASTCALL lj_str_init(lua_State *L)
365{
366 global_State *g = G(L);
367 g->str.seed = lj_prng_u64(&g->prng);
368 lj_str_resize(L, LJ_MIN_STRTAB-1);
369}
370
371