/*
 * Copyright 2011-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <limits.h>
#include <atomic>
#include <cassert>
#include <functional>
#include <limits>
#include <mutex>
#include <stdexcept>
#include <string>
#include <vector>

#include <glog/logging.h>

#include <folly/Exception.h>
#include <folly/Function.h>
#include <folly/Likely.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/SharedMutex.h>
#include <folly/container/Foreach.h>
#include <folly/detail/AtFork.h>
#include <folly/memory/Malloc.h>
#include <folly/portability/PThread.h>
#include <folly/synchronization/MicroSpinLock.h>

#include <folly/detail/StaticSingletonManager.h>
// In general, emutls cleanup is not guaranteed to play nice with the way
// StaticMeta mixes direct pthread calls and the use of __thread. This has
// caused problems on multiple platforms, so we don't use __thread there.
//
// XXX: Ideally we would instead determine whether emutls is in use at
// runtime, since it is possible to configure glibc on Linux to use emutls
// regardless.
#if !FOLLY_MOBILE && !defined(__APPLE__) && !defined(_MSC_VER)
#define FOLLY_TLD_USE_FOLLY_TLS 1
#else
#undef FOLLY_TLD_USE_FOLLY_TLS
#endif
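
// When FOLLY_TLD_USE_FOLLY_TLS is set, the code below (see StaticMeta::get()
// and getThreadEntrySlow()) caches per-thread state in FOLLY_TLS (__thread)
// variables as a fast path in front of pthread_getspecific(), e.g.
//
//   static FOLLY_TLS ThreadEntry* threadEntry{};
//
// When it is not set, every lookup falls back to the pthread-only slow path.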

namespace folly {

enum class TLPDestructionMode { THIS_THREAD, ALL_THREADS };
struct AccessModeStrict {};

namespace threadlocal_detail {

constexpr uint32_t kEntryIDInvalid = std::numeric_limits<uint32_t>::max();

struct ThreadEntry;
/* This represents a node in a doubly linked list where all the nodes
 * are part of an ElementWrapper struct that has the same id.
 * We cannot store prev and next as ThreadEntryNode pointers, since the
 * ThreadEntry::elements array can be reallocated, which would invalidate
 * them. Instead we keep ThreadEntry pointers (parent, prev, next) together
 * with the id, and recompute the node addresses on demand.
 * We traverse and update the list only while holding
 * StaticMetaBase::lock_.
 */
struct ThreadEntryNode {
  uint32_t id;
  ThreadEntry* parent;
  ThreadEntry* prev;
  ThreadEntry* next;

  void initIfZero(bool locked);

  void init(ThreadEntry* entry, uint32_t newId) {
    id = newId;
    parent = prev = next = entry;
  }

  void initZero(ThreadEntry* entry, uint32_t newId) {
    id = newId;
    parent = entry;
    prev = next = nullptr;
  }

  // whether the list this node is part of is empty
  FOLLY_ALWAYS_INLINE bool empty() const {
    return (next == parent);
  }

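  // true if this node has not yet been linked into a list (see initZero)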
  FOLLY_ALWAYS_INLINE bool zero() const {
    return (!prev);
  }

  FOLLY_ALWAYS_INLINE ThreadEntry* getThreadEntry() {
    return parent;
  }

  FOLLY_ALWAYS_INLINE ThreadEntryNode* getPrev();

  FOLLY_ALWAYS_INLINE ThreadEntryNode* getNext();

  void push_back(ThreadEntry* head);

  void eraseZero();
};
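
// Illustrative sketch (not part of the API): a node for slot `id` recovers
// its neighbors by re-indexing through the owning ThreadEntry instead of
// chasing a stored node pointer, roughly
//
//   ThreadEntryNode* p = &node.prev->elements[node.id].node;
//
// (see getPrev()/getNext() below). This stays correct even after a
// ThreadEntry::elements array is reallocated, because only ThreadEntry
// pointers are stored, never addresses into the elements arrays.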

/**
 * POD wrapper around an element (a void*) and an associated deleter.
 * This must be POD, as we memset() it to 0 and memcpy() it around.
 */
struct ElementWrapper {
  using DeleterFunType = void(void*, TLPDestructionMode);

  bool dispose(TLPDestructionMode mode) {
    if (ptr == nullptr) {
      return false;
    }

    DCHECK(deleter1 != nullptr);
    ownsDeleter ? (*deleter2)(ptr, mode) : (*deleter1)(ptr, mode);
    return true;
  }

  void* release() {
    auto retPtr = ptr;

    if (ptr != nullptr) {
      cleanup();
    }

    return retPtr;
  }

  template <class Ptr>
  void set(Ptr p) {
    auto guard = makeGuard([&] { delete p; });
    DCHECK(ptr == nullptr);
    DCHECK(deleter1 == nullptr);

    if (p) {
      node.initIfZero(true /*locked*/);
      ptr = p;
      deleter1 = [](void* pt, TLPDestructionMode) {
        delete static_cast<Ptr>(pt);
      };
      ownsDeleter = false;
      guard.dismiss();
    }
  }

  template <class Ptr, class Deleter>
  void set(Ptr p, const Deleter& d) {
    auto guard = makeGuard([&] {
      if (p) {
        d(p, TLPDestructionMode::THIS_THREAD);
      }
    });
    DCHECK(ptr == nullptr);
    DCHECK(deleter2 == nullptr);
    if (p) {
      node.initIfZero(true /*locked*/);
      ptr = p;
      auto d2 = d; // gcc-4.8 doesn't decay types correctly in lambda captures
      deleter2 = new std::function<DeleterFunType>(
          [d2](void* pt, TLPDestructionMode mode) {
            d2(static_cast<Ptr>(pt), mode);
          });
      ownsDeleter = true;
      guard.dismiss();
    }
  }

  void cleanup() {
    if (ownsDeleter) {
      delete deleter2;
    }
    ptr = nullptr;
    deleter1 = nullptr;
    ownsDeleter = false;
  }

  void* ptr;
  union {
    DeleterFunType* deleter1;
    std::function<DeleterFunType>* deleter2;
  };
  bool ownsDeleter;
  ThreadEntryNode node;
};
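
// Usage sketch (illustrative only; in practice ThreadLocalPtr drives this,
// and `Widget` is a hypothetical element type):
//
//   ElementWrapper w;
//   memset(&w, 0, sizeof(w));  // POD: all-zero is the reset state
//   w.set(new Widget());       // deleter1: a plain `delete` thunk
//   // or: w.set(new Widget(), [](Widget* p, TLPDestructionMode) { ... });
//   // which heap-allocates deleter2 and sets ownsDeleter
//   w.dispose(TLPDestructionMode::THIS_THREAD);
//   w.cleanup();               // reset pointers, free deleter2 if owned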

struct StaticMetaBase;
struct ThreadEntryList;

/**
 * Per-thread entry. Each thread using a StaticMeta object has one.
 * This is written from the owning thread only (under the lock), read
 * from the owning thread (no lock necessary), and read from other threads
 * (under the lock). The elementsCapacity of StaticMetaBase::head_ can be
 * read from any thread during reallocate (no lock needed).
 */
struct ThreadEntry {
  ElementWrapper* elements{nullptr};
  std::atomic<size_t> elementsCapacity{0};
  ThreadEntry* next{nullptr};
  ThreadEntry* prev{nullptr};
  ThreadEntryList* list{nullptr};
  ThreadEntry* listNext{nullptr};
  StaticMetaBase* meta{nullptr};
  bool removed_{false};

  size_t getElementsCapacity() const noexcept {
    return elementsCapacity.load(std::memory_order_relaxed);
  }

  void setElementsCapacity(size_t capacity) noexcept {
    elementsCapacity.store(capacity, std::memory_order_relaxed);
  }
};

struct ThreadEntryList {
  ThreadEntry* head{nullptr};
  size_t count{0};
};

struct PthreadKeyUnregisterTester;

FOLLY_ALWAYS_INLINE ThreadEntryNode* ThreadEntryNode::getPrev() {
  return &prev->elements[id].node;
}

FOLLY_ALWAYS_INLINE ThreadEntryNode* ThreadEntryNode::getNext() {
  return &next->elements[id].node;
}

/**
 * We want to disable the onThreadExit call at the end of shutdown; we
 * don't care about leaking memory at that point.
 *
 * Otherwise, if ThreadLocal is used in a shared library, onThreadExit may
 * be called after dlclose().
 *
 * This class has a single static instance; however, since it's so widely
 * used, directly or indirectly, by so many classes, we need to take care
 * to avoid problems stemming from the Static Initialization/Destruction
 * Order Fiasco. Therefore this class needs to be constexpr-constructible,
 * so that it need not participate in init/destruction order at all.
 */
class PthreadKeyUnregister {
 public:
  static constexpr size_t kMaxKeys = 1UL << 16;

  ~PthreadKeyUnregister() {
    // If static constructor priorities are not supported then
    // ~PthreadKeyUnregister logic is not safe.
#if !defined(__APPLE__) && !defined(_MSC_VER)
    MSLGuard lg(lock_);
    while (size_) {
      pthread_key_delete(keys_[--size_]);
    }
#endif
  }

  static void registerKey(pthread_key_t key) {
    instance_.registerKeyImpl(key);
  }

 private:
  /**
   * Only one global instance should exist, hence this is private.
   * See also the important note at the top of this class about `constexpr`
   * usage.
   */
  constexpr PthreadKeyUnregister() : lock_(), size_(0), keys_() {}
  friend struct folly::threadlocal_detail::PthreadKeyUnregisterTester;

  void registerKeyImpl(pthread_key_t key) {
    MSLGuard lg(lock_);
    if (size_ == kMaxKeys) {
      throw std::logic_error("pthread_key limit has already been reached");
    }
    keys_[size_++] = key;
  }

  MicroSpinLock lock_;
  size_t size_;
  pthread_key_t keys_[kMaxKeys];

  static PthreadKeyUnregister instance_;
};
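
// Registration sketch: a StaticMeta hooks its pthread key up roughly like
// this (the actual call site lives in the accompanying .cpp):
//
//   int ret = pthread_key_create(&pthreadKey_, &onThreadExit);
//   checkPosixError(ret, "pthread_key_create failed");
//   PthreadKeyUnregister::registerKey(pthreadKey_);
//
// Registered keys are deleted, newest first, when instance_ is destroyed.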

struct StaticMetaBase {
  // Represents an ID of a thread local object. Initially set to the maximum
  // uint32_t. This representation lets us avoid a branch when accessing TLS
  // data: a test of `capacity > id` always fails while id still holds the
  // invalid maximum, so unallocated ids fall through to the slow path with
  // no extra check. It also keeps the constructor constexpr, avoiding SIOF.
  class EntryID {
   public:
    std::atomic<uint32_t> value;

    constexpr EntryID() : value(kEntryIDInvalid) {}

    EntryID(EntryID&& other) noexcept : value(other.value.load()) {
      other.value = kEntryIDInvalid;
    }

    EntryID& operator=(EntryID&& other) {
      assert(this != &other);
      value = other.value.load();
      other.value = kEntryIDInvalid;
      return *this;
    }

    EntryID(const EntryID& other) = delete;
    EntryID& operator=(const EntryID& other) = delete;

    uint32_t getOrInvalid() {
      // It's OK for this to be relaxed, even though we're effectively doing
      // double checked locking in using this value. We only care about the
      // uniqueness of IDs, getOrAllocate does not modify any other memory
      // this thread will use.
      return value.load(std::memory_order_relaxed);
    }

    uint32_t getOrAllocate(StaticMetaBase& meta) {
      uint32_t id = getOrInvalid();
      if (id != kEntryIDInvalid) {
        return id;
      }
      // The lock inside allocate ensures that a single value is allocated
      return meta.allocate(this);
    }
  };
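
  // Worked example of the branch elision described above: a fresh EntryID
  // holds 0xFFFFFFFF, so the single fast-path test in StaticMeta::get(),
  //
  //   if (capacity <= id) { /* slow path */ }
  //
  // necessarily routes an unallocated id to the slow path (no capacity can
  // exceed the uint32_t maximum), with no separate "is this id allocated?"
  // branch.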

  StaticMetaBase(ThreadEntry* (*threadEntry)(), bool strict);

  void push_back(ThreadEntry* t) {
    t->next = &head_;
    t->prev = head_.prev;
    head_.prev->next = t;
    head_.prev = t;
  }

  void erase(ThreadEntry* t) {
    t->next->prev = t->prev;
    t->prev->next = t->next;
    t->next = t->prev = t;
  }

  FOLLY_EXPORT static ThreadEntryList* getThreadEntryList();

  static void onThreadExit(void* ptr);

  // returns the elementsCapacity of the current thread's ThreadEntry struct
  uint32_t elementsCapacity() const;

  uint32_t allocate(EntryID* ent);

  void destroy(EntryID* ent);

  /**
   * Reserve enough space in ThreadEntry::elements for the item with the
   * given @id to fit.
   */
  void reserve(EntryID* id);

  ElementWrapper& getElement(EntryID* ent);

  // reserve an id in the head_ ThreadEntry->elements
  // array if it is not already there
  void reserveHeadUnlocked(uint32_t id);

  // push back an entry in the doubly linked list
  // that corresponds to index id
  void pushBackLocked(ThreadEntry* t, uint32_t id);
  void pushBackUnlocked(ThreadEntry* t, uint32_t id);

  // static helper method to reallocate the ThreadEntry::elements;
  // returns != nullptr if ThreadEntry::elements was reallocated,
  // nullptr if it was just extended in place,
  // and throws std::bad_alloc if memory cannot be allocated
  static ElementWrapper*
  reallocate(ThreadEntry* threadEntry, uint32_t idval, size_t& newCapacity);

  uint32_t nextId_;
  std::vector<uint32_t> freeIds_;
  std::mutex lock_;
  SharedMutex accessAllThreadsLock_;
  pthread_key_t pthreadKey_;
  ThreadEntry head_;
  ThreadEntry* (*threadEntry_)();
  bool strict_;

 protected:
  ~StaticMetaBase() {}
};
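
// Id lifecycle sketch (illustrative; allocate()/destroy() live in the
// accompanying .cpp, and judging by the members above, ids are recycled
// through freeIds_ rather than growing nextId_ forever):
//
//   StaticMetaBase::EntryID ent;             // starts at kEntryIDInvalid
//   uint32_t id = ent.getOrAllocate(meta);   // stable until destroy()
//   meta.reserve(&ent);                      // grow elements[] to fit id
//   meta.destroy(&ent);                      // dispose slots, recycle id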

// Held in a singleton to track our global instances.
// We have one of these per "Tag", by default one for the whole system
// (Tag=void).
//
// Creating and destroying ThreadLocalPtr objects, as well as thread exit
// for threads that use ThreadLocalPtr objects, all contend on a lock inside
// StaticMeta; you can specify multiple Tag types to split that lock.
template <class Tag, class AccessMode>
struct StaticMeta final : StaticMetaBase {
  StaticMeta()
      : StaticMetaBase(
            &StaticMeta::getThreadEntrySlow,
            std::is_same<AccessMode, AccessModeStrict>::value) {
    detail::AtFork::registerHandler(
        this,
        /*prepare*/ &StaticMeta::preFork,
        /*parent*/ &StaticMeta::onForkParent,
        /*child*/ &StaticMeta::onForkChild);
  }

  ~StaticMeta() = delete;

  static StaticMeta<Tag, AccessMode>& instance() {
    // Leak it on exit, there's only one per process and we don't have to
    // worry about synchronization with exiting threads.
    return detail::createGlobal<StaticMeta<Tag, AccessMode>, void>();
  }

  FOLLY_EXPORT FOLLY_ALWAYS_INLINE static ElementWrapper& get(EntryID* ent) {
    // Eliminate as many branches and as much extra code as possible in the
    // cached fast path, leaving only one branch here and one indirection
    // below.
    uint32_t id = ent->getOrInvalid();
#ifdef FOLLY_TLD_USE_FOLLY_TLS
    static FOLLY_TLS ThreadEntry* threadEntry{};
    static FOLLY_TLS size_t capacity{};
#else
    ThreadEntry* threadEntry{};
    size_t capacity{};
#endif
    if (FOLLY_UNLIKELY(capacity <= id)) {
      getSlowReserveAndCache(ent, id, threadEntry, capacity);
    }
    return threadEntry->elements[id];
  }
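
  // Fast-path sketch: after the first call for a given thread and id, both
  // `threadEntry` and `capacity` are cached in FOLLY_TLS variables, so a
  // repeat lookup compiles down to roughly
  //
  //   if (capacity <= id) { ... }        // one predictable branch
  //   return threadEntry->elements[id];  // one indirection
  //
  // Without FOLLY_TLD_USE_FOLLY_TLS there is no per-call cache, so every
  // call takes getSlowReserveAndCache() through pthread_getspecific().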

  FOLLY_NOINLINE static void getSlowReserveAndCache(
      EntryID* ent,
      uint32_t& id,
      ThreadEntry*& threadEntry,
      size_t& capacity) {
    auto& inst = instance();
    threadEntry = inst.threadEntry_();
    if (FOLLY_UNLIKELY(threadEntry->getElementsCapacity() <= id)) {
      inst.reserve(ent);
      id = ent->getOrInvalid();
    }
    capacity = threadEntry->getElementsCapacity();
    assert(capacity > id);
  }

  FOLLY_EXPORT FOLLY_NOINLINE static ThreadEntry* getThreadEntrySlow() {
    auto& meta = instance();
    auto key = meta.pthreadKey_;
    ThreadEntry* threadEntry =
        static_cast<ThreadEntry*>(pthread_getspecific(key));
    if (!threadEntry) {
      ThreadEntryList* threadEntryList = StaticMeta::getThreadEntryList();
#ifdef FOLLY_TLD_USE_FOLLY_TLS
      static FOLLY_TLS ThreadEntry threadEntrySingleton;
      threadEntry = &threadEntrySingleton;
#else
      threadEntry = new ThreadEntry();
#endif
      // If the ThreadEntry already exists but pthread_getspecific returned
      // NULL, do not add the same entry to the list twice, since that would
      // create a loop in the list.
      if (!threadEntry->list) {
        threadEntry->list = threadEntryList;
        threadEntry->listNext = threadEntryList->head;
        threadEntryList->head = threadEntry;
      }

      // Whenever we add a thread entry, increment the list count, even if
      // the entry is reused.
      threadEntryList->count++;

      threadEntry->meta = &meta;
      int ret = pthread_setspecific(key, threadEntry);
      checkPosixError(ret, "pthread_setspecific failed");
    }
    return threadEntry;
  }

  static bool preFork() {
    return instance().lock_.try_lock(); // Make sure it's created
  }
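
  // Fork protocol sketch: AtFork (registered in the constructor) invokes
  // preFork() before fork(); once it succeeds, lock_ is held across the
  // fork so no other thread can be mid-update. onForkParent() then simply
  // unlocks, while onForkChild() first rebuilds the lists for the only
  // surviving thread:
  //
  //   preFork();       // pre-fork: lock_ acquired
  //   // fork()
  //   onForkParent();  // parent: unlock
  //   onForkChild();   // child: relink head_/thread entry, then unlock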

  static void onForkParent() {
    instance().lock_.unlock();
  }

  static void onForkChild() {
    // only the current thread survives
    auto& head = instance().head_;
    // init the head list
    head.next = head.prev = &head;
    // init the circular lists
    auto elementsCapacity = head.getElementsCapacity();
    for (size_t i = 0u; i < elementsCapacity; ++i) {
      head.elements[i].node.init(&head, static_cast<uint32_t>(i));
    }
    // init the thread entry
    ThreadEntry* threadEntry = instance().threadEntry_();
    elementsCapacity = threadEntry->getElementsCapacity();
    for (size_t i = 0u; i < elementsCapacity; ++i) {
      if (!threadEntry->elements[i].node.zero()) {
        threadEntry->elements[i].node.initZero(
            threadEntry, static_cast<uint32_t>(i));
        threadEntry->elements[i].node.initIfZero(false /*locked*/);
      }
    }

    // If this thread was in the list before the fork, add it back.
    if (elementsCapacity != 0) {
      instance().push_back(threadEntry);
    }
    instance().lock_.unlock();
  }
};
} // namespace threadlocal_detail
} // namespace folly