1 | /* QQ: TODO multi-pinbox */ |
2 | /* Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved. |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ |
16 | |
17 | /* |
18 | wait-free concurrent allocator based on pinning addresses |
19 | |
20 | It works as follows: every thread (strictly speaking - every CPU, but |
21 | it's too difficult to do) has a small array of pointers. They're called |
22 | "pins". Before using an object its address must be stored in this array |
23 | (pinned). When an object is no longer necessary its address must be |
24 | removed from this array (unpinned). When a thread wants to free() an |
25 | object it scans all pins of all threads to see if somebody has this |
26 | object pinned. If yes - the object is not freed (but stored in a |
27 | "purgatory"). To reduce the cost of a single free() pins are not scanned |
28 | on every free() but only added to (thread-local) purgatory. On every |
29 | LF_PURGATORY_SIZE free() purgatory is scanned and all unpinned objects |
30 | are freed. |
31 | |
32 | Pins are used to solve ABA problem. To use pins one must obey |
33 | a pinning protocol: |
34 | |
35 | 1. Let's assume that PTR is a shared pointer to an object. Shared means |
36 | that any thread may modify it anytime to point to a different object |
37 | and free the old object. Later the freed object may be potentially |
38 | allocated by another thread. If we're unlucky that other thread may |
39 | set PTR to point to this object again. This is ABA problem. |
40 | 2. Create a local pointer LOCAL_PTR. |
41 | 3. Pin the PTR in a loop: |
42 | do |
43 | { |
44 | LOCAL_PTR= PTR; |
45 | pin(PTR, PIN_NUMBER); |
46 | } while (LOCAL_PTR != PTR) |
47 | 4. It is guaranteed that after the loop has ended, LOCAL_PTR |
48 | points to an object (or NULL, if PTR may be NULL), that |
49 | will never be freed. It is not guaranteed though |
50 | that LOCAL_PTR == PTR (as PTR can change any time) |
51 | 5. When done working with the object, remove the pin: |
52 | unpin(PIN_NUMBER) |
53 | 6. When copying pins (as in the list traversing loop: |
54 | pin(CUR, 1); |
55 | while () |
56 | { |
57 | do // standard |
58 | { // pinning |
59 | NEXT=CUR->next; // loop |
60 | pin(NEXT, 0); // see #3 |
61 | } while (NEXT != CUR->next); // above |
62 | ... |
63 | ... |
64 | CUR=NEXT; |
65 | pin(CUR, 1); // copy pin[0] to pin[1] |
66 | } |
    which keeps CUR address constantly pinned), note that pins may be
68 | copied only upwards (!!!), that is pin[N] to pin[M], M > N. |
69 | 7. Don't keep the object pinned longer than necessary - the number of |
70 | pins you have is limited (and small), keeping an object pinned |
     prevents its reuse and causes unnecessary mallocs.
72 | |
73 | Explanations: |
74 | |
75 | 3. The loop is important. The following can occur: |
76 | thread1> LOCAL_PTR= PTR |
77 | thread2> free(PTR); PTR=0; |
78 | thread1> pin(PTR, PIN_NUMBER); |
79 | now thread1 cannot access LOCAL_PTR, even if it's pinned, |
80 | because it points to a freed memory. That is, it *must* |
81 | verify that it has indeed pinned PTR, the shared pointer. |
82 | |
83 | 6. When a thread wants to free some LOCAL_PTR, and it scans |
84 | all lists of pins to see whether it's pinned, it does it |
85 | upwards, from low pin numbers to high. Thus another thread |
86 | must copy an address from one pin to another in the same |
87 | direction - upwards, otherwise the scanning thread may |
88 | miss it. |
89 | |
90 | Implementation details: |
91 | |
92 | Pins are given away from a "pinbox". Pinbox is stack-based allocator. |
  It uses dynarray for storing pins; new elements are allocated by dynarray
94 | as necessary, old are pushed in the stack for reuse. ABA is solved by |
95 | versioning a pointer - because we use an array, a pointer to pins is 16 bit, |
96 | upper 16 bits are used for a version. |
97 | |
98 | It is assumed that pins belong to a THD and are not transferable |
99 | between THD's (LF_PINS::stack_ends_here being a primary reason |
100 | for this limitation). |
101 | */ |
102 | #include <my_global.h> |
103 | #include <my_sys.h> |
104 | #include <lf.h> |
105 | |
106 | /* |
107 | when using alloca() leave at least that many bytes of the stack - |
108 | for functions we might be calling from within this stack frame |
109 | */ |
110 | #define ALLOCA_SAFETY_MARGIN 8192 |
111 | |
/*
  pin-array indices are 16 bits wide (the low half of pinstack_top_ver),
  so at most 64k LF_PINS structures can ever be handed out
*/
#define LF_PINBOX_MAX_PINS 65536

static void lf_pinbox_real_free(LF_PINS *pins);
115 | |
116 | /* |
117 | Initialize a pinbox. Normally called from lf_alloc_init. |
118 | See the latter for details. |
119 | */ |
120 | void lf_pinbox_init(LF_PINBOX *pinbox, uint free_ptr_offset, |
121 | lf_pinbox_free_func *free_func, void *free_func_arg) |
122 | { |
123 | DBUG_ASSERT(free_ptr_offset % sizeof(void *) == 0); |
124 | lf_dynarray_init(&pinbox->pinarray, sizeof(LF_PINS)); |
125 | pinbox->pinstack_top_ver= 0; |
126 | pinbox->pins_in_array= 0; |
127 | pinbox->free_ptr_offset= free_ptr_offset; |
128 | pinbox->free_func= free_func; |
129 | pinbox->free_func_arg= free_func_arg; |
130 | } |
131 | |
/* Destroy a pinbox: release the dynarray that backs all LF_PINS storage. */
void lf_pinbox_destroy(LF_PINBOX *pinbox)
{
  lf_dynarray_destroy(&pinbox->pinarray);
}
136 | |
137 | /* |
138 | Get pins from a pinbox. Usually called via lf_alloc_get_pins() or |
139 | lf_hash_get_pins(). |
140 | |
141 | SYNOPSIS |
142 | pinbox - |
143 | |
144 | DESCRIPTION |
145 | get a new LF_PINS structure from a stack of unused pins, |
146 | or allocate a new one out of dynarray. |
147 | |
148 | NOTE |
149 | It is assumed that pins belong to a thread and are not transferable |
150 | between threads. |
151 | */ |
LF_PINS *lf_pinbox_get_pins(LF_PINBOX *pinbox)
{
  struct st_my_thread_var *var;
  uint32 pins, next, top_ver;
  LF_PINS *el;
  /*
    We have an array of max. 64k elements.
    The highest index currently allocated is pinbox->pins_in_array.
    Freed elements are in a lifo stack, pinstack_top_ver.
    pinstack_top_ver is 32 bits; 16 low bits are the index in the
    array, to the first element of the list. 16 high bits are a version
    (every time the 16 low bits are updated, the 16 high bits are
    incremented). Versioning prevents the ABA problem.
  */
  top_ver= pinbox->pinstack_top_ver;
  do
  {
    if (!(pins= top_ver % LF_PINBOX_MAX_PINS))
    {
      /* the stack of free elements is empty */
      pins= my_atomic_add32((int32 volatile*) &pinbox->pins_in_array, 1)+1;
      if (unlikely(pins >= LF_PINBOX_MAX_PINS))
        return 0;
      /*
        note that the first allocated element has index 1 (pins==1).
        index 0 is reserved to mean "NULL pointer"
      */
      el= (LF_PINS *)lf_dynarray_lvalue(&pinbox->pinarray, pins);
      if (unlikely(!el))
        return 0;
      break;
    }
    el= (LF_PINS *)lf_dynarray_value(&pinbox->pinarray, pins);
    next= el->link;
    /*
      pop: replace the low 16 bits (top-of-stack index) with the next
      element's index and bump the version in the high 16 bits, all in
      one CAS. On failure my_atomic_cas32 reloads top_ver and we retry.
    */
  } while (!my_atomic_cas32((int32 volatile*) &pinbox->pinstack_top_ver,
                            (int32*) &top_ver,
                            top_ver-pins+next+LF_PINBOX_MAX_PINS));
  /*
    set el->link to the index of el in the dynarray (el->link has two usages:
    - if element is allocated, it's its own index
    - if element is free, it's its next element in the free stack
  */
  el->link= pins;
  el->purgatory_count= 0;
  el->pinbox= pinbox;
  var= my_thread_var;
  /*
    Threads that do not call my_thread_init() should still be
    able to use the LF_HASH.
  */
  el->stack_ends_here= (var ? & var->stack_ends_here : NULL);
  return el;
}
205 | |
206 | /* |
207 | Put pins back to a pinbox. Usually called via lf_alloc_put_pins() or |
208 | lf_hash_put_pins(). |
209 | |
210 | DESCRIPTION |
211 | empty the purgatory (XXX deadlock warning below!), |
212 | push LF_PINS structure to a stack |
213 | */ |
void lf_pinbox_put_pins(LF_PINS *pins)
{
  LF_PINBOX *pinbox= pins->pinbox;
  uint32 top_ver, nr;
  nr= pins->link;           /* our own index in the dynarray (see get_pins) */

#ifndef DBUG_OFF
  {
    /* This thread should not hold any pin. */
    int i;
    for (i= 0; i < LF_PINBOX_PINS; i++)
      DBUG_ASSERT(pins->pin[i] == 0);
  }
#endif /* DBUG_OFF */

  /*
    XXX this will deadlock if other threads will wait for
    the caller to do something after lf_pinbox_put_pins(),
    and they would have pinned addresses that the caller wants to free.
    Thus: only free pins when all work is done and nobody can wait for you!!!
  */
  while (pins->purgatory_count)
  {
    lf_pinbox_real_free(pins);
    if (pins->purgatory_count)
      pthread_yield();      /* some objects still pinned - let others unpin */
  }
  /*
    push this LF_PINS onto the free stack: store the old top index in our
    link field, then CAS the low 16 bits to our index while bumping the
    version in the high 16 bits (ABA protection).
  */
  top_ver= pinbox->pinstack_top_ver;
  do
  {
    pins->link= top_ver % LF_PINBOX_MAX_PINS;
  } while (!my_atomic_cas32((int32 volatile*) &pinbox->pinstack_top_ver,
                            (int32*) &top_ver,
                            top_ver-pins->link+nr+LF_PINBOX_MAX_PINS));
  return;
}
250 | |
/* qsort() comparator: orders two pointed-to addresses numerically */
static int ptr_cmp(void **a, void **b)
{
  if (*a < *b)
    return -1;
  if (*a > *b)
    return 1;
  return 0;
}
255 | |
/*
  Prepend ADDR to the thread-local purgatory list of PINS.
  The list is threaded through the sizeof(void *) bytes at
  free_ptr_offset inside each object (guaranteed unused once the
  object is in the purgatory - see lf_alloc_init).
*/
#define add_to_purgatory(PINS, ADDR)                    \
  do                                                    \
  {                                                     \
    *(void **)((char *)(ADDR)+(PINS)->pinbox->free_ptr_offset)= \
      (PINS)->purgatory;                                \
    (PINS)->purgatory= (ADDR);                          \
    (PINS)->purgatory_count++;                          \
  } while (0)
264 | |
265 | /* |
266 | Free an object allocated via pinbox allocator |
267 | |
268 | DESCRIPTION |
    add an object to purgatory. If necessary, call lf_pinbox_real_free()
270 | to actually free something. |
271 | */ |
272 | void lf_pinbox_free(LF_PINS *pins, void *addr) |
273 | { |
274 | add_to_purgatory(pins, addr); |
275 | if (pins->purgatory_count % LF_PURGATORY_SIZE == 0) |
276 | lf_pinbox_real_free(pins); |
277 | } |
278 | |
/* accumulator passed to harvest_pins() via lf_dynarray_iterate() */
struct st_harvester {
  void **granary; /* output cursor: collected pinned addresses land here */
  int npins;      /* LF_PINS elements still expected (may go negative) */
};
283 | |
284 | /* |
  callback for lf_dynarray_iterate:
286 | scan all pins of all threads and accumulate all pins |
287 | */ |
288 | static int harvest_pins(LF_PINS *el, struct st_harvester *hv) |
289 | { |
290 | int i; |
291 | LF_PINS *el_end= el+MY_MIN(hv->npins, LF_DYNARRAY_LEVEL_LENGTH); |
292 | for (; el < el_end; el++) |
293 | { |
294 | for (i= 0; i < LF_PINBOX_PINS; i++) |
295 | { |
296 | void *p= el->pin[i]; |
297 | if (p) |
298 | *hv->granary++= p; |
299 | } |
300 | } |
301 | /* |
302 | hv->npins may become negative below, but it means that |
303 | we're on the last dynarray page and harvest_pins() won't be |
304 | called again. We don't bother to make hv->npins() correct |
305 | (that is 0) in this case. |
306 | */ |
307 | hv->npins-= LF_DYNARRAY_LEVEL_LENGTH; |
308 | return 0; |
309 | } |
310 | |
311 | /* |
  callback for lf_dynarray_iterate:
313 | scan all pins of all threads and see if addr is present there |
314 | */ |
315 | static int match_pins(LF_PINS *el, void *addr) |
316 | { |
317 | int i; |
318 | LF_PINS *el_end= el+LF_DYNARRAY_LEVEL_LENGTH; |
319 | for (; el < el_end; el++) |
320 | for (i= 0; i < LF_PINBOX_PINS; i++) |
321 | if (el->pin[i] == addr) |
322 | return 1; |
323 | return 0; |
324 | } |
325 | |
/* the hidden "next" link stored inside object X at P->free_ptr_offset */
#define next_node(P, X) (*((uchar * volatile *)(((uchar *)(X)) + (P)->free_ptr_offset)))
/* same, for the allocator's own pinbox (expects 'allocator' in scope) */
#define anext_node(X) next_node(&allocator->pinbox, (X))
328 | |
329 | /* |
330 | Scan the purgatory and free everything that can be freed |
331 | */ |
static void lf_pinbox_real_free(LF_PINS *pins)
{
  int npins;
  void *list;
  void **addr= NULL;
  void *first= NULL, *last= NULL;
  LF_PINBOX *pinbox= pins->pinbox;

  /* upper bound on the number of LF_PINS elements that may hold pins */
  npins= pinbox->pins_in_array+1;

#ifdef HAVE_ALLOCA
  if (pins->stack_ends_here != NULL)
  {
    int alloca_size= sizeof(void *)*LF_PINBOX_PINS*npins;
    /* create a sorted list of pinned addresses, to speed up searches */
    if (available_stack_size(&pinbox, *pins->stack_ends_here) >
        alloca_size + ALLOCA_SAFETY_MARGIN)
    {
      struct st_harvester hv;
      addr= (void **) alloca(alloca_size);
      hv.granary= addr;
      hv.npins= npins;
      /* scan the dynarray and accumulate all pinned addresses */
      lf_dynarray_iterate(&pinbox->pinarray,
                          (lf_dynarray_func)harvest_pins, &hv);

      npins= (int)(hv.granary-addr);
      /* and sort them */
      if (npins)
        qsort(addr, npins, sizeof(void *), (qsort_cmp)ptr_cmp);
    }
  }
#endif

  /* detach the whole purgatory; still-pinned objects are re-added below */
  list= pins->purgatory;
  pins->purgatory= 0;
  pins->purgatory_count= 0;
  while (list)
  {
    void *cur= list;
    list= *(void **)((char *)cur+pinbox->free_ptr_offset);
    if (npins)
    {
      if (addr) /* use binary search */
      {
        /* narrow [a,b] around cur until it spans at most two elements */
        void **a, **b, **c;
        for (a= addr, b= addr+npins-1, c= a+(b-a)/2; (b-a) > 1; c= a+(b-a)/2)
          if (cur == *c)
            a= b= c;
          else if (cur > *c)
            a= c;
          else
            b= c;
        if (cur == *a || cur == *b)
          goto found;
      }
      else /* no alloca - no cookie. linear search here */
      {
        if (lf_dynarray_iterate(&pinbox->pinarray,
                                (lf_dynarray_func)match_pins, cur))
          goto found;
      }
    }
    /* not pinned - freeing: append cur to the first..last chain */
    if (last)
      last= next_node(pinbox, last)= (uchar *)cur;
    else
      first= last= (uchar *)cur;
    continue;
found:
    /* pinned - keeping */
    add_to_purgatory(pins, cur);
  }
  /* hand the whole chain of unpinned objects to the owner in one call */
  if (last)
    pinbox->free_func(first, last, pinbox->free_func_arg);
}
408 | |
409 | /* lock-free memory allocator for fixed-size objects */ |
410 | |
411 | /* |
  callback for lf_pinbox_real_free() to free a list of unpinned objects -
413 | add it back to the allocator stack |
414 | |
415 | DESCRIPTION |
416 | 'first' and 'last' are the ends of the linked list of nodes: |
417 | first->el->el->....->el->last. Use first==last to free only one element. |
418 | */ |
static void alloc_free(uchar *first,
                       uchar volatile *last,
                       LF_ALLOCATOR *allocator)
{
  /*
    we need a union here to access type-punned pointer reliably.
    otherwise gcc -fstrict-aliasing will not see 'tmp' changed in the loop
  */
  union { uchar * node; void *ptr; } tmp;
  tmp.node= allocator->top;
  do
  {
    /* link the tail of the incoming list to the current stack top ... */
    anext_node(last)= tmp.node;
    /*
      ... and CAS the head of the list in as the new top. On failure
      my_atomic_casptr reloads tmp with the current top and we retry.
    */
  } while (!my_atomic_casptr((void **)(char *)&allocator->top,
                             (void **)&tmp.ptr, first) && LF_BACKOFF());
}
435 | |
436 | /* |
437 | initialize lock-free allocator |
438 | |
439 | SYNOPSIS |
440 | allocator - |
441 | size a size of an object to allocate |
442 | free_ptr_offset an offset inside the object to a sizeof(void *) |
443 | memory that is guaranteed to be unused after |
444 | the object is put in the purgatory. Unused by ANY |
445 | thread, not only the purgatory owner. |
446 | This memory will be used to link waiting-to-be-freed |
447 | objects in a purgatory list. |
448 | */ |
449 | void lf_alloc_init(LF_ALLOCATOR *allocator, uint size, uint free_ptr_offset) |
450 | { |
451 | lf_pinbox_init(&allocator->pinbox, free_ptr_offset, |
452 | (lf_pinbox_free_func *)alloc_free, allocator); |
453 | allocator->top= 0; |
454 | allocator->mallocs= 0; |
455 | allocator->element_size= size; |
456 | allocator->constructor= 0; |
457 | allocator->destructor= 0; |
458 | DBUG_ASSERT(size >= sizeof(void*) + free_ptr_offset); |
459 | } |
460 | |
461 | /* |
462 | destroy the allocator, free everything that's in it |
463 | |
464 | NOTE |
465 | As every other init/destroy function here and elsewhere it |
466 | is not thread safe. No, this function is no different, ensure |
467 | that no thread needs the allocator before destroying it. |
468 | We are not responsible for any damage that may be caused by |
469 | accessing the allocator when it is being or has been destroyed. |
470 | Oh yes, and don't put your cat in a microwave. |
471 | */ |
472 | void lf_alloc_destroy(LF_ALLOCATOR *allocator) |
473 | { |
474 | uchar *node= allocator->top; |
475 | while (node) |
476 | { |
477 | uchar *tmp= anext_node(node); |
478 | if (allocator->destructor) |
479 | allocator->destructor(node); |
480 | my_free(node); |
481 | node= tmp; |
482 | } |
483 | lf_pinbox_destroy(&allocator->pinbox); |
484 | allocator->top= 0; |
485 | } |
486 | |
487 | /* |
488 | Allocate and return an new object. |
489 | |
490 | DESCRIPTION |
    Pop an unused object from the stack, or malloc a new one if the stack
    is empty.
492 | pin[0] is used, it's removed on return. |
493 | */ |
494 | void *lf_alloc_new(LF_PINS *pins) |
495 | { |
496 | LF_ALLOCATOR *allocator= (LF_ALLOCATOR *)(pins->pinbox->free_func_arg); |
497 | uchar *node; |
498 | for (;;) |
499 | { |
500 | do |
501 | { |
502 | node= allocator->top; |
503 | lf_pin(pins, 0, node); |
504 | } while (node != allocator->top && LF_BACKOFF()); |
505 | if (!node) |
506 | { |
507 | node= (void *)my_malloc(allocator->element_size, MYF(MY_WME)); |
508 | if (allocator->constructor) |
509 | allocator->constructor(node); |
510 | #ifdef MY_LF_EXTRA_DEBUG |
511 | if (likely(node != 0)) |
512 | my_atomic_add32(&allocator->mallocs, 1); |
513 | #endif |
514 | break; |
515 | } |
516 | if (my_atomic_casptr((void **)(char *)&allocator->top, |
517 | (void *)&node, anext_node(node))) |
518 | break; |
519 | } |
520 | lf_unpin(pins, 0); |
521 | return node; |
522 | } |
523 | |
524 | /* |
525 | count the number of objects in a pool. |
526 | |
527 | NOTE |
528 | This is NOT thread-safe !!! |
529 | */ |
530 | uint lf_alloc_pool_count(LF_ALLOCATOR *allocator) |
531 | { |
532 | uint i; |
533 | uchar *node; |
534 | for (node= allocator->top, i= 0; node; node= anext_node(node), i++) |
535 | /* no op */; |
536 | return i; |
537 | } |
538 | |
539 | |