/*
 * Copyright 2011-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <limits.h>

#include <atomic>
#include <functional>
#include <mutex>
#include <string>
#include <vector>

#include <glog/logging.h>

#include <folly/Exception.h>
#include <folly/Function.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/SharedMutex.h>
#include <folly/container/Foreach.h>
#include <folly/detail/AtFork.h>
#include <folly/memory/Malloc.h>
#include <folly/portability/PThread.h>
#include <folly/synchronization/MicroSpinLock.h>

#include <folly/detail/StaticSingletonManager.h>

// In general, emutls cleanup is not guaranteed to play nice with the way
// StaticMeta mixes direct pthread calls and the use of __thread. This has
// caused problems on multiple platforms, so we don't use __thread there.
//
// XXX: Ideally we would instead determine whether emutls is in use at
// runtime, since glibc on Linux can be configured to use emutls regardless.
#if !FOLLY_MOBILE && !defined(__APPLE__) && !defined(_MSC_VER)
#define FOLLY_TLD_USE_FOLLY_TLS 1
#else
#undef FOLLY_TLD_USE_FOLLY_TLS
#endif

namespace folly {

enum class TLPDestructionMode { THIS_THREAD, ALL_THREADS };
struct AccessModeStrict {};

namespace threadlocal_detail {

constexpr uint32_t kEntryIDInvalid = std::numeric_limits<uint32_t>::max();

struct ThreadEntry;
/* This represents a node in a doubly linked list where all the nodes
 * are part of ElementWrapper structs that share the same id.
 * We cannot store prev and next as ThreadEntryNode pointers, because
 * ThreadEntry::elements can be reallocated and the pointers would become
 * stale. So instead we keep, for both prev and next, a pointer to the
 * owning ThreadEntry struct, together with the id.
 * We traverse and update the list only while holding
 * StaticMetaBase::lock_.
 */
struct ThreadEntryNode {
  uint32_t id;
  ThreadEntry* parent;
  ThreadEntry* prev;
  ThreadEntry* next;

  void initIfZero(bool locked);

  void init(ThreadEntry* entry, uint32_t newId) {
    id = newId;
    parent = prev = next = entry;
  }

  void initZero(ThreadEntry* entry, uint32_t newId) {
    id = newId;
    parent = entry;
    prev = next = nullptr;
  }

  // true if the list this node is part of is empty
  FOLLY_ALWAYS_INLINE bool empty() const {
    return (next == parent);
  }

  // true if this node has not yet been linked into any list
  FOLLY_ALWAYS_INLINE bool zero() const {
    return (!prev);
  }

  FOLLY_ALWAYS_INLINE ThreadEntry* getThreadEntry() {
    return parent;
  }

  FOLLY_ALWAYS_INLINE ThreadEntryNode* getPrev();

  FOLLY_ALWAYS_INLINE ThreadEntryNode* getNext();

  void push_back(ThreadEntry* head);

  void eraseZero();
};
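
// Illustrative sketch (not part of this header): how the per-id list formed
// by ThreadEntryNode is meant to be walked. head_.elements[id].node acts as
// the sentinel (its parent is the head ThreadEntry itself), and the walk must
// hold StaticMetaBase::lock_. The helper name forEachThreadEntryForId is
// hypothetical; real traversals of this shape live in StaticMetaBase.
//
//   void forEachThreadEntryForId(ThreadEntry* head, uint32_t id) {
//     ThreadEntryNode* node = head->elements[id].node.getNext();
//     while (node->getThreadEntry() != head) {
//       ThreadEntry* e = node->getThreadEntry();
//       // ... operate on e->elements[id] ...
//       node = node->getNext();
//     }
//   }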

/**
 * POD wrapper around an element (a void*) and an associated deleter.
 * This must be POD, as we memset() it to 0 and memcpy() it around.
 */
struct ElementWrapper {
  using DeleterFunType = void(void*, TLPDestructionMode);

  bool dispose(TLPDestructionMode mode) {
    if (ptr == nullptr) {
      return false;
    }

    DCHECK(deleter1 != nullptr);
    ownsDeleter ? (*deleter2)(ptr, mode) : (*deleter1)(ptr, mode);
    return true;
  }

  void* release() {
    auto retPtr = ptr;

    if (ptr != nullptr) {
      cleanup();
    }

    return retPtr;
  }

  template <class Ptr>
  void set(Ptr p) {
    auto guard = makeGuard([&] { delete p; });
    DCHECK(ptr == nullptr);
    DCHECK(deleter1 == nullptr);

    if (p) {
      node.initIfZero(true /*locked*/);
      ptr = p;
      deleter1 = [](void* pt, TLPDestructionMode) {
        delete static_cast<Ptr>(pt);
      };
      ownsDeleter = false;
      guard.dismiss();
    }
  }

  template <class Ptr, class Deleter>
  void set(Ptr p, const Deleter& d) {
    auto guard = makeGuard([&] {
      if (p) {
        d(p, TLPDestructionMode::THIS_THREAD);
      }
    });
    DCHECK(ptr == nullptr);
    DCHECK(deleter2 == nullptr);
    if (p) {
      node.initIfZero(true /*locked*/);
      ptr = p;
      auto d2 = d; // gcc-4.8 doesn't decay types correctly in lambda captures
      deleter2 = new std::function<DeleterFunType>(
          [d2](void* pt, TLPDestructionMode mode) {
            d2(static_cast<Ptr>(pt), mode);
          });
      ownsDeleter = true;
      guard.dismiss();
    }
  }

  void cleanup() {
    if (ownsDeleter) {
      delete deleter2;
    }
    ptr = nullptr;
    deleter1 = nullptr;
    ownsDeleter = false;
  }

  void* ptr;
  union {
    DeleterFunType* deleter1;
    std::function<DeleterFunType>* deleter2;
  };
  bool ownsDeleter;
  ThreadEntryNode node;
};
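
// Illustrative sketch (not part of this header): the intended call order on
// an ElementWrapper, as driven by ThreadLocalPtr. In real use the wrapper
// lives in ThreadEntry::elements, is zero-initialized there, and has its node
// set up by StaticMetaBase::reserve(); Widget is a hypothetical payload type.
//
//   ElementWrapper& w = threadEntry->elements[id];
//   w.set(new Widget(),                          // stores ptr and deleter2
//         [](Widget* p, TLPDestructionMode) { delete p; });
//   ...
//   w.dispose(TLPDestructionMode::THIS_THREAD);  // runs the stored deleter
//   w.cleanup();                                 // frees deleter2, resets ptr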

struct StaticMetaBase;
struct ThreadEntryList;

/**
 * Per-thread entry. Each thread using a StaticMeta object has one.
 * This is written from the owning thread only (under the lock), read
 * from the owning thread (no lock necessary), and read from other threads
 * (under the lock).
 * The elementsCapacity of StaticMetaBase::head_ can be read from any thread
 * during reallocate (without the lock), which is why it is stored in an
 * atomic and accessed with relaxed ordering.
 */
struct ThreadEntry {
  ElementWrapper* elements{nullptr};
  std::atomic<size_t> elementsCapacity{0};
  ThreadEntry* next{nullptr};
  ThreadEntry* prev{nullptr};
  ThreadEntryList* list{nullptr};
  ThreadEntry* listNext{nullptr};
  StaticMetaBase* meta{nullptr};
  bool removed_{false};

  size_t getElementsCapacity() const noexcept {
    return elementsCapacity.load(std::memory_order_relaxed);
  }

  void setElementsCapacity(size_t capacity) noexcept {
    elementsCapacity.store(capacity, std::memory_order_relaxed);
  }
};
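
// Illustrative sketch (not part of this header): the locking discipline the
// comment above describes, for reading other threads' entries. In the real
// implementation this kind of iteration is done by ThreadLocalPtr's
// accessAllThreads machinery; forAllThreadEntries and its callback are
// hypothetical.
//
//   void forAllThreadEntries(StaticMetaBase& meta,
//                            folly::FunctionRef<void(ThreadEntry*)> visit) {
//     std::lock_guard<std::mutex> g(meta.lock_); // required off-thread
//     for (ThreadEntry* e = meta.head_.next; e != &meta.head_; e = e->next) {
//       visit(e); // e->elements may be read safely while the lock is held
//     }
//   }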

struct ThreadEntryList {
  ThreadEntry* head{nullptr};
  size_t count{0};
};

struct PthreadKeyUnregisterTester;

FOLLY_ALWAYS_INLINE ThreadEntryNode* ThreadEntryNode::getPrev() {
  return &prev->elements[id].node;
}

FOLLY_ALWAYS_INLINE ThreadEntryNode* ThreadEntryNode::getNext() {
  return &next->elements[id].node;
}

/**
 * We want to disable the onThreadExit call at the end of shutdown; we don't
 * care about leaking memory at that point.
 *
 * Otherwise, if ThreadLocal is used in a shared library, onThreadExit may be
 * called after dlclose().
 *
 * This class has a single static instance; however, since it is so widely
 * used, directly or indirectly, by so many classes, we need to take care to
 * avoid problems stemming from the Static Initialization/Destruction Order
 * Fiascos. Therefore this class needs to be constexpr-constructible, so as
 * to avoid the need for it to participate in init/destruction order at all.
 */
class PthreadKeyUnregister {
 public:
  static constexpr size_t kMaxKeys = 1UL << 16;

  ~PthreadKeyUnregister() {
    // If static constructor priorities are not supported then
    // ~PthreadKeyUnregister logic is not safe.
#if !defined(__APPLE__) && !defined(_MSC_VER)
    MSLGuard lg(lock_);
    while (size_) {
      pthread_key_delete(keys_[--size_]);
    }
#endif
  }

  static void registerKey(pthread_key_t key) {
    instance_.registerKeyImpl(key);
  }

 private:
  /**
   * Only one global instance should exist, hence this is private.
   * See also the important note at the top of this class about `constexpr`
   * usage.
   */
  constexpr PthreadKeyUnregister() : lock_(), size_(0), keys_() {}
  friend struct folly::threadlocal_detail::PthreadKeyUnregisterTester;

  void registerKeyImpl(pthread_key_t key) {
    MSLGuard lg(lock_);
    if (size_ == kMaxKeys) {
      throw std::logic_error("pthread_key limit has already been reached");
    }
    keys_[size_++] = key;
  }

  MicroSpinLock lock_;
  size_t size_;
  pthread_key_t keys_[kMaxKeys];

  static PthreadKeyUnregister instance_;
};
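
// Illustrative sketch (not part of this header): how a pthread key is meant
// to be recorded so that it can be deleted before shutdown. This mirrors what
// the StaticMetaBase constructor does in the corresponding .cpp; treat the
// exact call site as an assumption.
//
//   pthread_key_t key;
//   int ret = pthread_key_create(&key, &StaticMetaBase::onThreadExit);
//   checkPosixError(ret, "pthread_key_create failed");
//   PthreadKeyUnregister::registerKey(key); // deleted in the destructor above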

struct StaticMetaBase {
  // Represents the ID of a thread-local object. Initially set to the maximum
  // uint32_t value (kEntryIDInvalid). This representation lets us avoid an
  // extra branch when accessing TLS data: the fast path only tests
  // capacity > id, and with id equal to maxint that test always fails, so
  // the slow path is taken automatically. It also lets us keep a constexpr
  // constructor and avoid SIOF.
  class EntryID {
   public:
    std::atomic<uint32_t> value;

    constexpr EntryID() : value(kEntryIDInvalid) {}

    EntryID(EntryID&& other) noexcept : value(other.value.load()) {
      other.value = kEntryIDInvalid;
    }

    EntryID& operator=(EntryID&& other) {
      assert(this != &other);
      value = other.value.load();
      other.value = kEntryIDInvalid;
      return *this;
    }

    EntryID(const EntryID& other) = delete;
    EntryID& operator=(const EntryID& other) = delete;

    uint32_t getOrInvalid() {
      // It's OK for this to be relaxed, even though we're effectively doing
      // double checked locking in using this value. We only care about the
      // uniqueness of IDs, getOrAllocate does not modify any other memory
      // this thread will use.
      return value.load(std::memory_order_relaxed);
    }

    uint32_t getOrAllocate(StaticMetaBase& meta) {
      uint32_t id = getOrInvalid();
      if (id != kEntryIDInvalid) {
        return id;
      }
      // The lock inside allocate ensures that a single value is allocated
      return meta.allocate(this);
    }
  };
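
  // Illustrative sketch (not part of this header): the branch-elimination
  // trick described above, as it plays out in StaticMeta::get(). A freshly
  // constructed EntryID holds kEntryIDInvalid (the maximum uint32_t), so:
  //
  //   uint32_t id = ent->getOrInvalid(); // == 0xFFFFFFFF before allocation
  //   if (FOLLY_UNLIKELY(capacity <= id)) {
  //     // always taken for an unallocated id, since no capacity can exceed
  //     // the maximum uint32_t; the slow path then allocates a real id
  //   }
  //
  // so no separate "has this id been allocated?" branch is needed on the
  // fast path.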

  StaticMetaBase(ThreadEntry* (*threadEntry)(), bool strict);

  void push_back(ThreadEntry* t) {
    t->next = &head_;
    t->prev = head_.prev;
    head_.prev->next = t;
    head_.prev = t;
  }

  void erase(ThreadEntry* t) {
    t->next->prev = t->prev;
    t->prev->next = t->next;
    t->next = t->prev = t;
  }

  FOLLY_EXPORT static ThreadEntryList* getThreadEntryList();

  static void onThreadExit(void* ptr);

  // Returns the elementsCapacity for the current thread's ThreadEntry struct.
  uint32_t elementsCapacity() const;

  uint32_t allocate(EntryID* ent);

  void destroy(EntryID* ent);

  /**
   * Reserve enough space in ThreadEntry::elements for the item
   * @id to fit in.
   */
  void reserve(EntryID* id);

  ElementWrapper& getElement(EntryID* ent);

  // Reserve an id in the head_ ThreadEntry->elements
  // array, if not already there.
  void reserveHeadUnlocked(uint32_t id);

  // Push back an entry in the doubly linked list
  // that corresponds to index id.
  void pushBackLocked(ThreadEntry* t, uint32_t id);
  void pushBackUnlocked(ThreadEntry* t, uint32_t id);

  // Static helper method to reallocate ThreadEntry::elements.
  // Returns non-nullptr if ThreadEntry::elements was reallocated,
  // nullptr if it was just extended in place,
  // and throws std::bad_alloc if memory cannot be allocated.
  static ElementWrapper*
  reallocate(ThreadEntry* threadEntry, uint32_t idval, size_t& newCapacity);

  uint32_t nextId_;
  std::vector<uint32_t> freeIds_;
  std::mutex lock_;
  SharedMutex accessAllThreadsLock_;
  pthread_key_t pthreadKey_;
  ThreadEntry head_;
  ThreadEntry* (*threadEntry_)();
  bool strict_;

 protected:
  ~StaticMetaBase() {}
};

// Held in a singleton to track our global instances.
// We have one of these per "Tag", by default one for the whole system
// (Tag=void).
//
// Creating and destroying ThreadLocalPtr objects, as well as thread exit
// for threads that use ThreadLocalPtr objects, collide on a lock inside
// StaticMeta; you can specify multiple Tag types to break that lock.
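//
// Illustrative sketch (not part of this header): how a user-level Tag selects
// a separate StaticMeta instance. The ThreadLocal template parameters are
// assumed from the public folly/ThreadLocal.h header, and MySubsystemTag is a
// hypothetical tag type.
//
//   struct MySubsystemTag {};
//
//   // These use StaticMeta<MySubsystemTag, void> and therefore do not
//   // contend on the lock inside the default StaticMeta<void, void>.
//   folly::ThreadLocal<int, MySubsystemTag> counterA;
//   folly::ThreadLocal<int, MySubsystemTag> counterB;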
template <class Tag, class AccessMode>
struct StaticMeta final : StaticMetaBase {
  StaticMeta()
      : StaticMetaBase(
            &StaticMeta::getThreadEntrySlow,
            std::is_same<AccessMode, AccessModeStrict>::value) {
    detail::AtFork::registerHandler(
        this,
        /*prepare*/ &StaticMeta::preFork,
        /*parent*/ &StaticMeta::onForkParent,
        /*child*/ &StaticMeta::onForkChild);
  }

  ~StaticMeta() = delete;

  static StaticMeta<Tag, AccessMode>& instance() {
    // Leak it on exit; there's only one per process, and we don't have to
    // worry about synchronization with exiting threads.
    return detail::createGlobal<StaticMeta<Tag, AccessMode>, void>();
  }

  FOLLY_EXPORT FOLLY_ALWAYS_INLINE static ElementWrapper& get(EntryID* ent) {
    // Eliminate as many branches and as much extra code as possible in the
    // cached fast path, leaving only one branch here and one indirection
    // below.
    uint32_t id = ent->getOrInvalid();
#ifdef FOLLY_TLD_USE_FOLLY_TLS
    static FOLLY_TLS ThreadEntry* threadEntry{};
    static FOLLY_TLS size_t capacity{};
#else
    ThreadEntry* threadEntry{};
    size_t capacity{};
#endif
    if (FOLLY_UNLIKELY(capacity <= id)) {
      getSlowReserveAndCache(ent, id, threadEntry, capacity);
    }
    return threadEntry->elements[id];
  }

  FOLLY_NOINLINE static void getSlowReserveAndCache(
      EntryID* ent,
      uint32_t& id,
      ThreadEntry*& threadEntry,
      size_t& capacity) {
    auto& inst = instance();
    threadEntry = inst.threadEntry_();
    if (UNLIKELY(threadEntry->getElementsCapacity() <= id)) {
      inst.reserve(ent);
      id = ent->getOrInvalid();
    }
    capacity = threadEntry->getElementsCapacity();
    assert(capacity > id);
  }

  FOLLY_EXPORT FOLLY_NOINLINE static ThreadEntry* getThreadEntrySlow() {
    auto& meta = instance();
    auto key = meta.pthreadKey_;
    ThreadEntry* threadEntry =
        static_cast<ThreadEntry*>(pthread_getspecific(key));
    if (!threadEntry) {
      ThreadEntryList* threadEntryList = StaticMeta::getThreadEntryList();
#ifdef FOLLY_TLD_USE_FOLLY_TLS
      static FOLLY_TLS ThreadEntry threadEntrySingleton;
      threadEntry = &threadEntrySingleton;
#else
      threadEntry = new ThreadEntry();
#endif
      // If the ThreadEntry already exists but pthread_getspecific returns
      // NULL, do not add the same entry twice to the list, since that would
      // create a loop in the list.
      if (!threadEntry->list) {
        threadEntry->list = threadEntryList;
        threadEntry->listNext = threadEntryList->head;
        threadEntryList->head = threadEntry;
      }

      // If we're adding a thread entry we need to increment the list count,
      // even if the entry is reused.
      threadEntryList->count++;

      threadEntry->meta = &meta;
      int ret = pthread_setspecific(key, threadEntry);
      checkPosixError(ret, "pthread_setspecific failed");
    }
    return threadEntry;
  }

  static bool preFork() {
    return instance().lock_.try_lock(); // Make sure it's created
  }

  static void onForkParent() {
    instance().lock_.unlock();
  }

  static void onForkChild() {
    // only the current thread survives
    auto& head = instance().head_;
    // init the head list
    head.next = head.prev = &head;
    // init the circular lists
    auto elementsCapacity = head.getElementsCapacity();
    for (size_t i = 0u; i < elementsCapacity; ++i) {
      head.elements[i].node.init(&head, static_cast<uint32_t>(i));
    }
    // init the thread entry
    ThreadEntry* threadEntry = instance().threadEntry_();
    elementsCapacity = threadEntry->getElementsCapacity();
    for (size_t i = 0u; i < elementsCapacity; ++i) {
      if (!threadEntry->elements[i].node.zero()) {
        threadEntry->elements[i].node.initZero(
            threadEntry, static_cast<uint32_t>(i));
        threadEntry->elements[i].node.initIfZero(false /*locked*/);
      }
    }

    // If this thread was in the list before the fork, add it back.
    if (elementsCapacity != 0) {
      instance().push_back(threadEntry);
    }
    instance().lock_.unlock();
  }
};
} // namespace threadlocal_detail
} // namespace folly