/*
 * Copyright 2013-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/concurrency/CacheLocality.h>

#ifndef _MSC_VER
#define _GNU_SOURCE 1 // for RTLD_NOLOAD
#include <dlfcn.h>
#endif
#include <fstream>

#include <folly/Conv.h>
#include <folly/Exception.h>
#include <folly/FileUtil.h>
#include <folly/Format.h>
#include <folly/ScopeGuard.h>
namespace folly {

///////////// CacheLocality

/// Returns the best real CacheLocality information available
static CacheLocality getSystemLocalityInfo() {
  if (kIsLinux) {
    try {
      return CacheLocality::readFromSysfs();
    } catch (...) {
      // keep trying
    }
  }

  long numCpus = sysconf(_SC_NPROCESSORS_CONF);
  if (numCpus <= 0) {
    // This shouldn't happen, but if it does we should try to keep
    // going. We are probably not going to be able to parse /sys on
    // this box either (although we will try), which means we are going
    // to fall back to the SequentialThreadId splitter. On my 16 core
    // (x hyperthreading) dev box 16 stripes is enough to get pretty good
    // contention avoidance with SequentialThreadId, and there is little
    // improvement from going from 32 to 64. This default gives us some
    // wiggle room
    numCpus = 32;
  }
  return CacheLocality::uniform(size_t(numCpus));
}

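// The instance below is allocated with new and never freed; leaking it on
// purpose keeps the reference valid even during static destruction. A
// typical use (sketch):
//
//   const auto& locality = CacheLocality::system<std::atomic>();
//   size_t numStripes = locality.numCpus;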
template <>
const CacheLocality& CacheLocality::system<std::atomic>() {
  static auto* cache = new CacheLocality(getSystemLocalityInfo());
  return *cache;
}

// Each level of cache has sharing sets, which are the set of cpus
// that share a common cache at that level. These are available in a
// hex bitset form (/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_map,
// for example). They are also available in a human-readable list form,
// as in /sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list. The
// list is a comma-separated list of numbers and ranges, where the ranges
// are a pair of decimal numbers separated by a '-'.
//
// To sort the cpus for optimum locality we don't really need to parse
// the sharing sets; we just need a unique representative from the
// equivalence class. The smallest value works fine, and happens to be
// the first decimal number in the file. We load all of the equivalence
// class information from all of the cpu*/index* directories, order the
// cpus first by increasing last-level cache equivalence class, then by
// the smaller caches. Finally, we break ties with the cpu number itself.
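//
// For example (a hypothetical 4-cpu box with two cores, two hyperthreads
// per core, and one shared last-level cache): if cpus 0 and 2 share an L1
// (representative 0), cpus 1 and 3 share an L1 (representative 1), and all
// four share the LLC (representative 0), the sort below yields the cpu
// order {0, 2, 1, 3}, numCachesByLevel == {2, 1}, and
// localityIndexByCpu == {0, 2, 1, 3}.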

/// Returns the first decimal number in the string, or throws an exception
/// if the string does not start with a number terminated by ',', '-',
/// '\n', or eos.
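/// For example, parseLeadingNumber("0,4-7\n") returns 0.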
static size_t parseLeadingNumber(const std::string& line) {
  auto raw = line.c_str();
  char* end;
  unsigned long val = strtoul(raw, &end, 10);
  if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) {
    throw std::runtime_error(
        to<std::string>("error parsing list '", line, "'").c_str());
  }
  return val;
}

CacheLocality CacheLocality::readFromSysfsTree(
    const std::function<std::string(std::string)>& mapping) {
  // number of equivalence classes per level
  std::vector<size_t> numCachesByLevel;

  // the list of cache equivalence classes, where equivalence classes
  // are named by the smallest cpu in the class
  std::vector<std::vector<size_t>> equivClassesByCpu;

  std::vector<size_t> cpus;

  while (true) {
    auto cpu = cpus.size();
    std::vector<size_t> levels;
    for (size_t index = 0;; ++index) {
      auto dir =
          sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
      auto cacheType = mapping(dir + "type");
      auto equivStr = mapping(dir + "shared_cpu_list");
      if (cacheType.size() == 0 || equivStr.size() == 0) {
        // no more caches
        break;
      }
      if (cacheType[0] == 'I') {
        // cacheType in { "Data", "Instruction", "Unified" }. skip icache
        continue;
      }
      auto equiv = parseLeadingNumber(equivStr);
      auto level = levels.size();
      levels.push_back(equiv);

      if (equiv == cpu) {
        // we only want to count the equiv classes once, so we do it when
        // we first encounter them
        while (numCachesByLevel.size() <= level) {
          numCachesByLevel.push_back(0);
        }
        numCachesByLevel[level]++;
      }
    }

    if (levels.size() == 0) {
      // no levels at all for this cpu, we must be done
      break;
    }
    equivClassesByCpu.emplace_back(std::move(levels));
    cpus.push_back(cpu);
  }

  if (cpus.size() == 0) {
    throw std::runtime_error("unable to load cache sharing info");
  }

  std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
    // sort first by equiv class of cache with highest index,
    // direction doesn't matter. If different cpus have
    // different numbers of caches then this code might produce
    // a sub-optimal ordering, but it won't crash
    auto& lhsEquiv = equivClassesByCpu[lhs];
    auto& rhsEquiv = equivClassesByCpu[rhs];
    for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1;
         i >= 0;
         --i) {
      auto idx = size_t(i);
      if (lhsEquiv[idx] != rhsEquiv[idx]) {
        return lhsEquiv[idx] < rhsEquiv[idx];
      }
    }

    // break ties deterministically by cpu
    return lhs < rhs;
  });

  // the cpus are now sorted by locality, with neighboring entries closer
  // to each other than entries that are far away. For striping we want
  // the inverse map, since we are starting with the cpu
  std::vector<size_t> indexes(cpus.size());
  for (size_t i = 0; i < cpus.size(); ++i) {
    indexes[cpus[i]] = i;
  }

  return CacheLocality{
      cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
}

CacheLocality CacheLocality::readFromSysfs() {
  return readFromSysfsTree([](std::string name) {
    std::ifstream xi(name.c_str());
    std::string rv;
    std::getline(xi, rv);
    return rv;
  });
}
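
// Because the sysfs walk is parameterized by `mapping`, it can be exercised
// without a real /sys. A minimal sketch, with hypothetical file contents for
// a single-cpu machine with one unified cache:
//
//   std::map<std::string, std::string> fake{
//       {"/sys/devices/system/cpu/cpu0/cache/index0/type", "Unified"},
//       {"/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list", "0"},
//   };
//   auto locality = CacheLocality::readFromSysfsTree(
//       [&](std::string name) { return fake.count(name) ? fake[name] : ""; });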

CacheLocality CacheLocality::uniform(size_t numCpus) {
  CacheLocality rv;

  rv.numCpus = numCpus;

  // one cache shared by all cpus
  rv.numCachesByLevel.push_back(numCpus);

  // no permutations in locality index mapping
  for (size_t cpu = 0; cpu < numCpus; ++cpu) {
    rv.localityIndexByCpu.push_back(cpu);
  }

  return rv;
}
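
// For example, CacheLocality::uniform(4) yields numCpus == 4,
// numCachesByLevel == {4}, and localityIndexByCpu == {0, 1, 2, 3}.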

////////////// Getcpu

Getcpu::Func Getcpu::resolveVdsoFunc() {
#if !FOLLY_HAVE_LINUX_VDSO
  return nullptr;
#else
  void* h = dlopen("linux-vdso.so.1", RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
  if (h == nullptr) {
    return nullptr;
  }

  auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
  if (func == nullptr) {
    // technically a null result could either be a failure or a successful
    // lookup of a symbol with the null value, but the second can't actually
    // happen for this symbol. No point holding the handle forever if
    // we don't need the code
    dlclose(h);
  }

  return func;
#endif
}
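
// A sketch of how the resolved function is used, assuming the Func
// signature declared in CacheLocality.h (which mirrors getcpu(2)):
//
//   unsigned cpu;
//   if (auto getcpu = Getcpu::resolveVdsoFunc()) {
//     getcpu(&cpu, nullptr, nullptr); // vDSO call, no syscall overhead
//   }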

#ifdef FOLLY_TLS
/////////////// SequentialThreadId
template struct SequentialThreadId<std::atomic>;
#endif

/////////////// AccessSpreader
template struct AccessSpreader<std::atomic>;
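
// The explicit instantiation above covers the common case; callers pick a
// stripe for the current cpu with something like (sketch, assuming the
// interface declared in CacheLocality.h):
//
//   size_t stripe = AccessSpreader<std::atomic>::current(numStripes);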

SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
    : allocSize_{allocSize}, sz_(sz) {}

SimpleAllocator::~SimpleAllocator() {
  std::lock_guard<std::mutex> g(m_);
  for (auto& block : blocks_) {
    folly::aligned_free(block);
  }
}

void* SimpleAllocator::allocateHard() {
  // Allocate a new slab.
  mem_ = static_cast<uint8_t*>(folly::aligned_malloc(allocSize_, allocSize_));
  if (!mem_) {
    throw_exception<std::bad_alloc>();
  }
  end_ = mem_ + allocSize_;
  blocks_.push_back(mem_);

  // Install a pointer to ourselves as the allocator.
  *reinterpret_cast<SimpleAllocator**>(mem_) = this;
  static_assert(max_align_v >= sizeof(SimpleAllocator*), "alignment too small");
  mem_ += std::min(sz_, max_align_v);

  // New allocation.
  auto mem = mem_;
  mem_ += sz_;
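  // Because the slab is aligned to allocSize_ and its first bytes hold the
  // owning SimpleAllocator*, the companion deallocation path (declared in
  // CacheLocality.h) can round a pointer down to an allocSize_ boundary to
  // recover the allocator. Small allocations are deliberately never
  // 128-byte aligned, which is what distinguishes them from large,
  // 128-byte-aligned blocks that came straight from aligned_malloc.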
  assert(intptr_t(mem) % 128 != 0);
  return mem;
}

} // namespace folly