/*
 * Copyright 2014-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/detail/MemoryIdler.h>

#include <folly/GLog.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/memory/MallctlHelper.h>
#include <folly/memory/Malloc.h>
#include <folly/portability/PThread.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/CallOnce.h>

#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <utility>

namespace folly {
namespace detail {

AtomicStruct<std::chrono::steady_clock::duration>
    MemoryIdler::defaultIdleTimeout(std::chrono::seconds(5));

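// Flushes the calling thread's jemalloc tcache and, when the arena
// layout still looks like the default (narenas > 2 * num_cpus), asks
// jemalloc to purge the dirty pages of the arena currently assigned to
// this thread. A no-op when jemalloc is not in use.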
void MemoryIdler::flushLocalMallocCaches() {
  if (!usingJEMalloc()) {
    return;
  }
  if (!mallctl || !mallctlnametomib || !mallctlbymib) {
    FB_LOG_EVERY_MS(ERROR, 10000) << "mallctl* weak link failed";
    return;
  }

  try {
    // Not using mallctlCall as this will fail if tcache is disabled.
    mallctl("thread.tcache.flush", nullptr, nullptr, nullptr, 0);

    // By default jemalloc has 4 arenas per cpu, and then assigns each
    // thread to one of those arenas. This means that in any service
    // that doesn't perform a lot of context switching, the chances that
    // another thread will be using the current thread's arena (and
    // hence doing the appropriate dirty-page purging) are low. Some
    // well-tuned configurations (such as that used by hhvm) use fewer
    // arenas and then pin threads to avoid contended access. In that
    // case, purging the arenas is counter-productive. We use the
    // heuristic that if narenas <= 2 * num_cpus then we shouldn't do
    // anything here, which detects when narenas has been reduced from
    // the default.
    unsigned narenas;
    unsigned arenaForCurrent;
    size_t mib[3];
    size_t miblen = 3;

    mallctlRead("opt.narenas", &narenas);
    mallctlRead("thread.arena", &arenaForCurrent);
    if (narenas > 2 * CacheLocality::system().numCpus &&
        mallctlnametomib("arena.0.purge", mib, &miblen) == 0) {
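      // Retarget the translated "arena.0.purge" MIB at this thread's
      // arena by patching the second component, giving the effect of
      // "arena.<i>.purge".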
      mib[1] = static_cast<size_t>(arenaForCurrent);
      mallctlbymib(mib, miblen, nullptr, nullptr, nullptr, 0);
    }
  } catch (const std::runtime_error& ex) {
    FB_LOG_EVERY_MS(WARNING, 10000) << ex.what();
  }
}

// Stack madvise isn't Linux or glibc specific, but the system calls
// and arithmetic (and bug compatibility) are not portable. The set of
// platforms could be increased if it were useful.
#if (FOLLY_X64 || FOLLY_PPC64) && defined(_GNU_SOURCE) && \
    defined(__linux__) && !FOLLY_MOBILE && !FOLLY_SANITIZE_ADDRESS

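// Cached per-thread stack bounds. tls_stackSize doubles as a state
// flag: 0 means "not yet fetched" and 1 means "fetched but unusable",
// so unmapUnusedStack() can tell the two apart without refetching.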
static FOLLY_TLS uintptr_t tls_stackLimit;
static FOLLY_TLS size_t tls_stackSize;

static size_t pageSize() {
  static const size_t s_pageSize = sysconf(_SC_PAGESIZE);
  return s_pageSize;
}

static void fetchStackLimits() {
  int err;
  pthread_attr_t attr;
  if ((err = pthread_getattr_np(pthread_self(), &attr))) {
    // some restricted environments can't access /proc
    static folly::once_flag flag;
    folly::call_once(flag, [err]() {
      LOG(WARNING) << "pthread_getattr_np failed errno=" << err;
    });

    tls_stackSize = 1;
    return;
  }
  SCOPE_EXIT {
    pthread_attr_destroy(&attr);
  };

  void* addr;
  size_t rawSize;
  if ((err = pthread_attr_getstack(&attr, &addr, &rawSize))) {
    // unexpected, but in prod it is better to carry on than to crash
    // (the assert below only fires in debug builds)
    FB_LOG_EVERY_MS(ERROR, 10000) << "pthread_attr_getstack error " << err;
    assert(false);
    tls_stackSize = 1;
    return;
  }
  if (rawSize >= (1ULL << 32)) {
    // Avoid unmapping huge swaths of memory if there is an insane
    // stack size. The boundary of sanity is somewhat arbitrary: 4GB.
    //
    // If we went into /proc to find the actual contiguous mapped pages
    // before unmapping we wouldn't care about the stack size at all,
    // but our current strategy is to unmap the entire range that might
    // be used for the stack even if it hasn't been fully faulted-in.
    //
    // Very large stack size is a bug (hence the assert), but we can
    // carry on if we are in prod.
    FB_LOG_EVERY_MS(ERROR, 10000)
        << "pthread_attr_getstack returned insane stack size " << rawSize;
    assert(false);
    tls_stackSize = 1;
    return;
  }
  assert(addr != nullptr);
  assert(rawSize >= PTHREAD_STACK_MIN);

  // glibc subtracts guard page from stack size, even though pthread docs
  // seem to imply the opposite
  size_t guardSize;
  if (pthread_attr_getguardsize(&attr, &guardSize) != 0) {
    guardSize = 0;
  }
  assert(rawSize > guardSize);

  // stack goes down, so guard page adds to the base addr
  tls_stackLimit = reinterpret_cast<uintptr_t>(addr) + guardSize;
  tls_stackSize = rawSize - guardSize;

  assert((tls_stackLimit & (pageSize() - 1)) == 0);
}

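// Returns an address inside the current stack frame. FOLLY_NOINLINE
// keeps this frame from being folded into the caller, so the marker's
// address is a conservative (deep) estimate of the caller's stack
// pointer.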
FOLLY_NOINLINE static uintptr_t getStackPtr() {
  char marker;
  auto rv = reinterpret_cast<uintptr_t>(&marker);
  return rv;
}

void MemoryIdler::unmapUnusedStack(size_t retain) {
  if (tls_stackSize == 0) {
    fetchStackLimits();
  }
  if (tls_stackSize <= std::max(static_cast<size_t>(1), retain)) {
    // covers both missing stack info, and impossibly large retain
    return;
  }

  auto sp = getStackPtr();
  assert(sp >= tls_stackLimit);
  assert(sp - tls_stackLimit < tls_stackSize);

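  // Round the boundary down to a page so the madvise range stays
  // page-aligned; everything from end up through the current frame
  // (including the retain slack) keeps its contents.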
  auto end = (sp - retain) & ~(pageSize() - 1);
  if (end <= tls_stackLimit) {
    // no pages are eligible for unmapping
    return;
  }

  size_t len = end - tls_stackLimit;
  assert((len & (pageSize() - 1)) == 0);
  if (madvise(reinterpret_cast<void*>(tls_stackLimit), len, MADV_DONTNEED) !=
      0) {
    // It is likely that the stack vma hasn't been fully grown. In this
    // case madvise will apply dontneed to the present vmas, then return
    // errno of ENOMEM.
    // If thread stack pages are backed by locked or huge pages, madvise
    // will fail with EINVAL. (EINVAL may also be returned if the address
    // or length are bad.) Warn in debug mode, since MemoryIdler may not
    // function as expected.
    // We can also get an EAGAIN, theoretically.
    PLOG_IF(WARNING, kIsDebug && errno == EINVAL) << "madvise failed";
    assert(errno == EAGAIN || errno == ENOMEM || errno == EINVAL);
  }
}

#else

void MemoryIdler::unmapUnusedStack(size_t /* retain */) {}

#endif

} // namespace detail
} // namespace folly