/*
 * Copyright 2014-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/detail/MemoryIdler.h>

#include <folly/GLog.h>
#include <folly/Portability.h>
#include <folly/ScopeGuard.h>
#include <folly/concurrency/CacheLocality.h>
#include <folly/memory/MallctlHelper.h>
#include <folly/memory/Malloc.h>
#include <folly/portability/PThread.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>
#include <folly/synchronization/CallOnce.h>

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <utility>

namespace folly {
namespace detail {

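// Default amount of time a thread must be blocked before MemoryIdler
// reclaims its malloc caches and unused stack pages; declared in
// MemoryIdler.h and adjustable at runtime via this AtomicStruct.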
AtomicStruct<std::chrono::steady_clock::duration>
    MemoryIdler::defaultIdleTimeout(std::chrono::seconds(5));

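// Flushes the calling thread's jemalloc tcache and, when jemalloc is running
// with its default many-arenas configuration, purges the dirty pages of the
// arena this thread is bound to.  No-op when jemalloc isn't in use.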
void MemoryIdler::flushLocalMallocCaches() {
  if (!usingJEMalloc()) {
    return;
  }
  if (!mallctl || !mallctlnametomib || !mallctlbymib) {
    FB_LOG_EVERY_MS(ERROR, 10000) << "mallctl* weak link failed";
    return;
  }

  try {
    // Not using mallctlCall as this will fail if tcache is disabled.
    mallctl("thread.tcache.flush", nullptr, nullptr, nullptr, 0);

    // By default jemalloc has 4 arenas per cpu, and then assigns each
    // thread to one of those arenas.  This means that in any service
    // that doesn't perform a lot of context switching, the chances that
    // another thread will be using the current thread's arena (and hence
    // doing the appropriate dirty-page purging) are low.  Some well-tuned
    // configurations (such as that used by hhvm) use fewer arenas and
    // then pin threads to avoid contended access.  In that case, purging
    // the arenas is counter-productive.  We use the heuristic that if
    // narenas <= 2 * num_cpus then we shouldn't do anything here, which
    // detects when narenas has been reduced from the default.
    unsigned narenas;
    unsigned arenaForCurrent;
    size_t mib[3];
    size_t miblen = 3;

    mallctlRead("opt.narenas", &narenas);
    mallctlRead("thread.arena", &arenaForCurrent);
    if (narenas > 2 * CacheLocality::system().numCpus &&
        mallctlnametomib("arena.0.purge", mib, &miblen) == 0) {
      mib[1] = static_cast<size_t>(arenaForCurrent);
      mallctlbymib(mib, miblen, nullptr, nullptr, nullptr, 0);
    }
  } catch (const std::runtime_error& ex) {
    FB_LOG_EVERY_MS(WARNING, 10000) << ex.what();
  }
}

// Stack madvise isn't Linux or glibc specific, but the system calls
// and arithmetic (and bug compatibility) are not portable.  The set of
// platforms could be increased if it was useful.
#if (FOLLY_X64 || FOLLY_PPC64) && defined(_GNU_SOURCE) && \
    defined(__linux__) && !FOLLY_MOBILE && !FOLLY_SANITIZE_ADDRESS

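// Cached bounds of the current thread's stack.  tls_stackSize == 0 means the
// bounds haven't been fetched yet; tls_stackSize == 1 means fetching failed
// and unmapUnusedStack should do nothing.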
static FOLLY_TLS uintptr_t tls_stackLimit;
static FOLLY_TLS size_t tls_stackSize;

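// System page size, queried once and then cached.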
static size_t pageSize() {
  static const size_t s_pageSize = sysconf(_SC_PAGESIZE);
  return s_pageSize;
}

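// Populates tls_stackLimit (the lowest usable stack address, just above the
// guard region) and tls_stackSize for the current thread using its pthread
// stack attributes.  On any failure it records a size of 1 so that
// unmapUnusedStack backs off.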
static void fetchStackLimits() {
  int err;
  pthread_attr_t attr;
  if ((err = pthread_getattr_np(pthread_self(), &attr))) {
    // some restricted environments can't access /proc
    static folly::once_flag flag;
    folly::call_once(flag, [err]() {
      LOG(WARNING) << "pthread_getattr_np failed errno=" << err;
    });

    tls_stackSize = 1;
    return;
  }
  SCOPE_EXIT {
    pthread_attr_destroy(&attr);
  };

  void* addr;
  size_t rawSize;
  if ((err = pthread_attr_getstack(&attr, &addr, &rawSize))) {
    // unexpected, but it is better to continue in prod than do nothing
    FB_LOG_EVERY_MS(ERROR, 10000) << "pthread_attr_getstack error " << err;
    assert(false);
    tls_stackSize = 1;
    return;
  }
  if (rawSize >= (1ULL << 32)) {
    // Avoid unmapping huge swaths of memory if there is an insane
    // stack size.  The boundary of sanity is somewhat arbitrary: 4GB.
    //
    // If we went into /proc to find the actual contiguous mapped pages
    // before unmapping we wouldn't care about the stack size at all,
    // but our current strategy is to unmap the entire range that might
    // be used for the stack even if it hasn't been fully faulted-in.
    //
    // Very large stack size is a bug (hence the assert), but we can
    // carry on if we are in prod.
    FB_LOG_EVERY_MS(ERROR, 10000)
        << "pthread_attr_getstack returned insane stack size " << rawSize;
    assert(false);
    tls_stackSize = 1;
    return;
  }
  assert(addr != nullptr);
  assert(rawSize >= PTHREAD_STACK_MIN);

  // glibc subtracts guard page from stack size, even though pthread docs
  // seem to imply the opposite
  size_t guardSize;
  if (pthread_attr_getguardsize(&attr, &guardSize) != 0) {
    guardSize = 0;
  }
  assert(rawSize > guardSize);

  // stack goes down, so guard page adds to the base addr
  tls_stackLimit = reinterpret_cast<uintptr_t>(addr) + guardSize;
  tls_stackSize = rawSize - guardSize;

  assert((tls_stackLimit & (pageSize() - 1)) == 0);
}

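// Returns the address of a local variable as an approximation of the current
// stack pointer; kept out-of-line so the variable lives in a real stack
// frame rather than being folded into the caller.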
FOLLY_NOINLINE static uintptr_t getStackPtr() {
  char marker;
  auto rv = reinterpret_cast<uintptr_t>(&marker);
  return rv;
}

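// Releases the physical pages backing the unused portion of this thread's
// stack: everything from the stack limit up to a page boundary at least
// `retain` bytes below the current stack pointer is madvised away.  The
// mapping itself stays intact, so the pages fault back in on next use.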
void MemoryIdler::unmapUnusedStack(size_t retain) {
  if (tls_stackSize == 0) {
    fetchStackLimits();
  }
  if (tls_stackSize <= std::max(static_cast<size_t>(1), retain)) {
    // covers both missing stack info, and impossibly large retain
    return;
  }

  auto sp = getStackPtr();
  assert(sp >= tls_stackLimit);
  assert(sp - tls_stackLimit < tls_stackSize);

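  // Highest page boundary that is still at least `retain` bytes below the
  // current stack pointer; pages in [tls_stackLimit, end) may be released.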
  auto end = (sp - retain) & ~(pageSize() - 1);
  if (end <= tls_stackLimit) {
    // no pages are eligible for unmapping
    return;
  }

  size_t len = end - tls_stackLimit;
  assert((len & (pageSize() - 1)) == 0);
  if (madvise((void*)tls_stackLimit, len, MADV_DONTNEED) != 0) {
    // It is likely that the stack vma hasn't been fully grown.  In this
    // case madvise will apply dontneed to the present vmas, then return
    // errno of ENOMEM.
    // If thread stack pages are backed by locked or huge pages, madvise will
    // fail with EINVAL.  (EINVAL may also be returned if the address or
    // length are bad.)  Warn in debug mode, since MemoryIdler may not
    // function as expected.
    // We can also get an EAGAIN, theoretically.
    PLOG_IF(WARNING, kIsDebug && errno == EINVAL) << "madvise failed";
    assert(errno == EAGAIN || errno == ENOMEM || errno == EINVAL);
  }
}

#else

void MemoryIdler::unmapUnusedStack(size_t /* retain */) {}

#endif

} // namespace detail
} // namespace folly