1 | /* |
2 | ** String handling. |
3 | ** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h |
4 | */ |
5 | |
6 | #define lj_str_c |
7 | #define LUA_CORE |
8 | |
9 | #include "lj_obj.h" |
10 | #include "lj_gc.h" |
11 | #include "lj_err.h" |
12 | #include "lj_str.h" |
13 | #include "lj_char.h" |
14 | #include "lj_prng.h" |
15 | |
16 | /* -- String helpers ------------------------------------------------------ */ |
17 | |
18 | /* Ordered compare of strings. Assumes string data is 4-byte aligned. */ |
19 | int32_t LJ_FASTCALL lj_str_cmp(GCstr *a, GCstr *b) |
20 | { |
21 | MSize i, n = a->len > b->len ? b->len : a->len; |
22 | for (i = 0; i < n; i += 4) { |
23 | /* Note: innocuous access up to end of string + 3. */ |
24 | uint32_t va = *(const uint32_t *)(strdata(a)+i); |
25 | uint32_t vb = *(const uint32_t *)(strdata(b)+i); |
26 | if (va != vb) { |
27 | #if LJ_LE |
28 | va = lj_bswap(va); vb = lj_bswap(vb); |
29 | #endif |
30 | i -= n; |
31 | if ((int32_t)i >= -3) { |
32 | va >>= 32+(i<<3); vb >>= 32+(i<<3); |
33 | if (va == vb) break; |
34 | } |
35 | return va < vb ? -1 : 1; |
36 | } |
37 | } |
38 | return (int32_t)(a->len - b->len); |
39 | } |
40 | |
41 | /* Find fixed string p inside string s. */ |
42 | const char *lj_str_find(const char *s, const char *p, MSize slen, MSize plen) |
43 | { |
44 | if (plen <= slen) { |
45 | if (plen == 0) { |
46 | return s; |
47 | } else { |
48 | int c = *(const uint8_t *)p++; |
49 | plen--; slen -= plen; |
50 | while (slen) { |
51 | const char *q = (const char *)memchr(s, c, slen); |
52 | if (!q) break; |
53 | if (memcmp(q+1, p, plen) == 0) return q; |
54 | q++; slen -= (MSize)(q-s); s = q; |
55 | } |
56 | } |
57 | } |
58 | return NULL; |
59 | } |
60 | |
61 | /* Check whether a string has a pattern matching character. */ |
62 | int lj_str_haspattern(GCstr *s) |
63 | { |
64 | const char *p = strdata(s), *q = p + s->len; |
65 | while (p < q) { |
66 | int c = *(const uint8_t *)p++; |
67 | if (lj_char_ispunct(c) && strchr("^$*+?.([%-" , c)) |
68 | return 1; /* Found a pattern matching char. */ |
69 | } |
70 | return 0; /* No pattern matching chars found. */ |
71 | } |
72 | |
73 | /* -- String hashing ------------------------------------------------------ */ |
74 | |
75 | /* Keyed sparse ARX string hash. Constant time. */ |
76 | static StrHash hash_sparse(uint64_t seed, const char *str, MSize len) |
77 | { |
78 | /* Constants taken from lookup3 hash by Bob Jenkins. */ |
79 | StrHash a, b, h = len ^ (StrHash)seed; |
80 | if (len >= 4) { /* Caveat: unaligned access! */ |
81 | a = lj_getu32(str); |
82 | h ^= lj_getu32(str+len-4); |
83 | b = lj_getu32(str+(len>>1)-2); |
84 | h ^= b; h -= lj_rol(b, 14); |
85 | b += lj_getu32(str+(len>>2)-1); |
86 | } else { |
87 | a = *(const uint8_t *)str; |
88 | h ^= *(const uint8_t *)(str+len-1); |
89 | b = *(const uint8_t *)(str+(len>>1)); |
90 | h ^= b; h -= lj_rol(b, 14); |
91 | } |
92 | a ^= h; a -= lj_rol(h, 11); |
93 | b ^= a; b -= lj_rol(a, 25); |
94 | h ^= b; h -= lj_rol(b, 16); |
95 | return h; |
96 | } |
97 | |
98 | #if LUAJIT_SECURITY_STRHASH |
99 | /* Keyed dense ARX string hash. Linear time. */ |
100 | static LJ_NOINLINE StrHash hash_dense(uint64_t seed, StrHash h, |
101 | const char *str, MSize len) |
102 | { |
103 | StrHash b = lj_bswap(lj_rol(h ^ (StrHash)(seed >> 32), 4)); |
104 | if (len > 12) { |
105 | StrHash a = (StrHash)seed; |
106 | const char *pe = str+len-12, *p = pe, *q = str; |
107 | do { |
108 | a += lj_getu32(p); |
109 | b += lj_getu32(p+4); |
110 | h += lj_getu32(p+8); |
111 | p = q; q += 12; |
112 | h ^= b; h -= lj_rol(b, 14); |
113 | a ^= h; a -= lj_rol(h, 11); |
114 | b ^= a; b -= lj_rol(a, 25); |
115 | } while (p < pe); |
116 | h ^= b; h -= lj_rol(b, 16); |
117 | a ^= h; a -= lj_rol(h, 4); |
118 | b ^= a; b -= lj_rol(a, 14); |
119 | } |
120 | return b; |
121 | } |
122 | #endif |
123 | |
124 | /* -- String interning ---------------------------------------------------- */ |
125 | |
126 | #define LJ_STR_MAXCOLL 32 |
127 | |
128 | /* Resize the string interning hash table (grow and shrink). */ |
129 | void lj_str_resize(lua_State *L, MSize newmask) |
130 | { |
131 | global_State *g = G(L); |
132 | GCRef *newtab, *oldtab = g->str.tab; |
133 | MSize i; |
134 | |
135 | /* No resizing during GC traversal or if already too big. */ |
136 | if (g->gc.state == GCSsweepstring || newmask >= LJ_MAX_STRTAB-1) |
137 | return; |
138 | |
139 | newtab = lj_mem_newvec(L, newmask+1, GCRef); |
140 | memset(newtab, 0, (newmask+1)*sizeof(GCRef)); |
141 | |
142 | #if LUAJIT_SECURITY_STRHASH |
143 | /* Check which chains need secondary hashes. */ |
144 | if (g->str.second) { |
145 | int newsecond = 0; |
146 | /* Compute primary chain lengths. */ |
147 | for (i = g->str.mask; i != ~(MSize)0; i--) { |
148 | GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1); |
149 | while (o) { |
150 | GCstr *s = gco2str(o); |
151 | MSize hash = s->hashalg ? hash_sparse(g->str.seed, strdata(s), s->len) : |
152 | s->hash; |
153 | hash &= newmask; |
154 | setgcrefp(newtab[hash], gcrefu(newtab[hash]) + 1); |
155 | o = gcnext(o); |
156 | } |
157 | } |
158 | /* Mark secondary chains. */ |
159 | for (i = newmask; i != ~(MSize)0; i--) { |
160 | int secondary = gcrefu(newtab[i]) > LJ_STR_MAXCOLL; |
161 | newsecond |= secondary; |
162 | setgcrefp(newtab[i], secondary); |
163 | } |
164 | g->str.second = newsecond; |
165 | } |
166 | #endif |
167 | |
168 | /* Reinsert all strings from the old table into the new table. */ |
169 | for (i = g->str.mask; i != ~(MSize)0; i--) { |
170 | GCobj *o = (GCobj *)(gcrefu(oldtab[i]) & ~(uintptr_t)1); |
171 | while (o) { |
172 | GCobj *next = gcnext(o); |
173 | GCstr *s = gco2str(o); |
174 | MSize hash = s->hash; |
175 | #if LUAJIT_SECURITY_STRHASH |
176 | uintptr_t u; |
177 | if (LJ_LIKELY(!s->hashalg)) { /* String hashed with primary hash. */ |
178 | hash &= newmask; |
179 | u = gcrefu(newtab[hash]); |
180 | if (LJ_UNLIKELY(u & 1)) { /* Switch string to secondary hash. */ |
181 | s->hash = hash = hash_dense(g->str.seed, s->hash, strdata(s), s->len); |
182 | s->hashalg = 1; |
183 | hash &= newmask; |
184 | u = gcrefu(newtab[hash]); |
185 | } |
186 | } else { /* String hashed with secondary hash. */ |
187 | MSize shash = hash_sparse(g->str.seed, strdata(s), s->len); |
188 | u = gcrefu(newtab[shash & newmask]); |
189 | if (u & 1) { |
190 | hash &= newmask; |
191 | u = gcrefu(newtab[hash]); |
192 | } else { /* Revert string back to primary hash. */ |
193 | s->hash = shash; |
194 | s->hashalg = 0; |
195 | hash = (shash & newmask); |
196 | } |
197 | } |
198 | /* NOBARRIER: The string table is a GC root. */ |
199 | setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); |
200 | setgcrefp(newtab[hash], ((uintptr_t)o | (u & 1))); |
201 | #else |
202 | hash &= newmask; |
203 | /* NOBARRIER: The string table is a GC root. */ |
204 | setgcrefr(o->gch.nextgc, newtab[hash]); |
205 | setgcref(newtab[hash], o); |
206 | #endif |
207 | o = next; |
208 | } |
209 | } |
210 | |
211 | /* Free old table and replace with new table. */ |
212 | lj_str_freetab(g); |
213 | g->str.tab = newtab; |
214 | g->str.mask = newmask; |
215 | } |
216 | |
217 | #if LUAJIT_SECURITY_STRHASH |
218 | /* Rehash and rechain all strings in a chain. */ |
219 | static LJ_NOINLINE GCstr *lj_str_rehash_chain(lua_State *L, StrHash hashc, |
220 | const char *str, MSize len) |
221 | { |
222 | global_State *g = G(L); |
223 | int ow = g->gc.state == GCSsweepstring ? otherwhite(g) : 0; /* Sweeping? */ |
224 | GCRef *strtab = g->str.tab; |
225 | MSize strmask = g->str.mask; |
226 | GCobj *o = gcref(strtab[hashc & strmask]); |
227 | setgcrefp(strtab[hashc & strmask], (void *)((uintptr_t)1)); |
228 | g->str.second = 1; |
229 | while (o) { |
230 | uintptr_t u; |
231 | GCobj *next = gcnext(o); |
232 | GCstr *s = gco2str(o); |
233 | StrHash hash; |
234 | if (ow) { /* Must sweep while rechaining. */ |
235 | if (((o->gch.marked ^ LJ_GC_WHITES) & ow)) { /* String alive? */ |
236 | lj_assertG(!isdead(g, o) || (o->gch.marked & LJ_GC_FIXED), |
237 | "sweep of undead string" ); |
238 | makewhite(g, o); |
239 | } else { /* Free dead string. */ |
240 | lj_assertG(isdead(g, o) || ow == LJ_GC_SFIXED, |
241 | "sweep of unlive string" ); |
242 | lj_str_free(g, s); |
243 | o = next; |
244 | continue; |
245 | } |
246 | } |
247 | hash = s->hash; |
248 | if (!s->hashalg) { /* Rehash with secondary hash. */ |
249 | hash = hash_dense(g->str.seed, hash, strdata(s), s->len); |
250 | s->hash = hash; |
251 | s->hashalg = 1; |
252 | } |
253 | /* Rechain. */ |
254 | hash &= strmask; |
255 | u = gcrefu(strtab[hash]); |
256 | setgcrefp(o->gch.nextgc, (u & ~(uintptr_t)1)); |
257 | setgcrefp(strtab[hash], ((uintptr_t)o | (u & 1))); |
258 | o = next; |
259 | } |
260 | /* Try to insert the pending string again. */ |
261 | return lj_str_new(L, str, len); |
262 | } |
263 | #endif |
264 | |
265 | /* Reseed String ID from PRNG after random interval < 2^bits. */ |
266 | #if LUAJIT_SECURITY_STRID == 1 |
267 | #define STRID_RESEED_INTERVAL 8 |
268 | #elif LUAJIT_SECURITY_STRID == 2 |
269 | #define STRID_RESEED_INTERVAL 4 |
270 | #elif LUAJIT_SECURITY_STRID >= 3 |
271 | #define STRID_RESEED_INTERVAL 0 |
272 | #endif |
273 | |
274 | /* Allocate a new string and add to string interning table. */ |
275 | static GCstr *lj_str_alloc(lua_State *L, const char *str, MSize len, |
276 | StrHash hash, int hashalg) |
277 | { |
278 | GCstr *s = lj_mem_newt(L, lj_str_size(len), GCstr); |
279 | global_State *g = G(L); |
280 | uintptr_t u; |
281 | newwhite(g, s); |
282 | s->gct = ~LJ_TSTR; |
283 | s->len = len; |
284 | s->hash = hash; |
285 | #ifndef STRID_RESEED_INTERVAL |
286 | s->sid = g->str.id++; |
287 | #elif STRID_RESEED_INTERVAL |
288 | if (!g->str.idreseed--) { |
289 | uint64_t r = lj_prng_u64(&g->prng); |
290 | g->str.id = (StrID)r; |
291 | g->str.idreseed = (uint8_t)(r >> (64 - STRID_RESEED_INTERVAL)); |
292 | } |
293 | s->sid = g->str.id++; |
294 | #else |
295 | s->sid = (StrID)lj_prng_u64(&g->prng); |
296 | #endif |
297 | s->reserved = 0; |
298 | s->hashalg = (uint8_t)hashalg; |
299 | /* Clear last 4 bytes of allocated memory. Implies zero-termination, too. */ |
300 | *(uint32_t *)(strdatawr(s)+(len & ~(MSize)3)) = 0; |
301 | memcpy(strdatawr(s), str, len); |
302 | /* Add to string hash table. */ |
303 | hash &= g->str.mask; |
304 | u = gcrefu(g->str.tab[hash]); |
305 | setgcrefp(s->nextgc, (u & ~(uintptr_t)1)); |
306 | /* NOBARRIER: The string table is a GC root. */ |
307 | setgcrefp(g->str.tab[hash], ((uintptr_t)s | (u & 1))); |
308 | if (g->str.num++ > g->str.mask) /* Allow a 100% load factor. */ |
309 | lj_str_resize(L, (g->str.mask<<1)+1); /* Grow string table. */ |
310 | return s; /* Return newly interned string. */ |
311 | } |
312 | |
313 | /* Intern a string and return string object. */ |
314 | GCstr *lj_str_new(lua_State *L, const char *str, size_t lenx) |
315 | { |
316 | global_State *g = G(L); |
317 | if (lenx-1 < LJ_MAX_STR-1) { |
318 | MSize len = (MSize)lenx; |
319 | StrHash hash = hash_sparse(g->str.seed, str, len); |
320 | MSize coll = 0; |
321 | int hashalg = 0; |
322 | /* Check if the string has already been interned. */ |
323 | GCobj *o = gcref(g->str.tab[hash & g->str.mask]); |
324 | #if LUAJIT_SECURITY_STRHASH |
325 | if (LJ_UNLIKELY((uintptr_t)o & 1)) { /* Secondary hash for this chain? */ |
326 | hashalg = 1; |
327 | hash = hash_dense(g->str.seed, hash, str, len); |
328 | o = (GCobj *)(gcrefu(g->str.tab[hash & g->str.mask]) & ~(uintptr_t)1); |
329 | } |
330 | #endif |
331 | while (o != NULL) { |
332 | GCstr *sx = gco2str(o); |
333 | if (sx->hash == hash && sx->len == len) { |
334 | if (memcmp(str, strdata(sx), len) == 0) { |
335 | if (isdead(g, o)) flipwhite(o); /* Resurrect if dead. */ |
336 | return sx; /* Return existing string. */ |
337 | } |
338 | coll++; |
339 | } |
340 | coll++; |
341 | o = gcnext(o); |
342 | } |
343 | #if LUAJIT_SECURITY_STRHASH |
344 | /* Rehash chain if there are too many collisions. */ |
345 | if (LJ_UNLIKELY(coll > LJ_STR_MAXCOLL) && !hashalg) { |
346 | return lj_str_rehash_chain(L, hash, str, len); |
347 | } |
348 | #endif |
349 | /* Otherwise allocate a new string. */ |
350 | return lj_str_alloc(L, str, len, hash, hashalg); |
351 | } else { |
352 | if (lenx) |
353 | lj_err_msg(L, LJ_ERR_STROV); |
354 | return &g->strempty; |
355 | } |
356 | } |
357 | |
358 | void LJ_FASTCALL lj_str_free(global_State *g, GCstr *s) |
359 | { |
360 | g->str.num--; |
361 | lj_mem_free(g, s, lj_str_size(s->len)); |
362 | } |
363 | |
364 | void LJ_FASTCALL lj_str_init(lua_State *L) |
365 | { |
366 | global_State *g = G(L); |
367 | g->str.seed = lj_prng_u64(&g->prng); |
368 | lj_str_resize(L, LJ_MIN_STRTAB-1); |
369 | } |
370 | |
371 | |