1 | /* |
2 | * Copyright 2013-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include <folly/concurrency/CacheLocality.h> |
18 | |
19 | #ifndef _MSC_VER |
20 | #define _GNU_SOURCE 1 // for RTLD_NOLOAD |
21 | #include <dlfcn.h> |
22 | #endif |
23 | #include <fstream> |
24 | |
25 | #include <folly/Conv.h> |
26 | #include <folly/Exception.h> |
27 | #include <folly/FileUtil.h> |
28 | #include <folly/Format.h> |
29 | #include <folly/ScopeGuard.h> |
30 | |
31 | namespace folly { |
32 | |
33 | ///////////// CacheLocality |
34 | |
35 | /// Returns the best real CacheLocality information available |
36 | static CacheLocality getSystemLocalityInfo() { |
37 | if (kIsLinux) { |
38 | try { |
39 | return CacheLocality::readFromSysfs(); |
40 | } catch (...) { |
41 | // keep trying |
42 | } |
43 | } |
44 | |
45 | long numCpus = sysconf(_SC_NPROCESSORS_CONF); |
46 | if (numCpus <= 0) { |
47 | // This shouldn't happen, but if it does we should try to keep |
48 | // going. We are probably not going to be able to parse /sys on |
49 | // this box either (although we will try), which means we are going |
50 | // to fall back to the SequentialThreadId splitter. On my 16 core |
51 | // (x hyperthreading) dev box 16 stripes is enough to get pretty good |
52 | // contention avoidance with SequentialThreadId, and there is little |
53 | // improvement from going from 32 to 64. This default gives us some |
54 | // wiggle room |
55 | numCpus = 32; |
56 | } |
57 | return CacheLocality::uniform(size_t(numCpus)); |
58 | } |
59 | |
60 | template <> |
61 | const CacheLocality& CacheLocality::system<std::atomic>() { |
62 | static auto* cache = new CacheLocality(getSystemLocalityInfo()); |
63 | return *cache; |
64 | } |
65 | |
66 | // Each level of cache has sharing sets, which are the set of cpus |
67 | // that share a common cache at that level. These are available in a |
68 | // hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map, |
69 | // for example). They are also available in a human-readable list form, |
70 | // as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list. The list |
71 | // is a comma-separated list of numbers and ranges, where the ranges are |
72 | // a pair of decimal numbers separated by a '-'. |
73 | // |
74 | // To sort the cpus for optimum locality we don't really need to parse |
75 | // the sharing sets, we just need a unique representative from the |
76 | // equivalence class. The smallest value works fine, and happens to be |
77 | // the first decimal number in the file. We load all of the equivalence |
78 | // class information from all of the cpu*/index* directories, order the |
79 | // cpus first by increasing last-level cache equivalence class, then by |
80 | // the smaller caches. Finally, we break ties with the cpu number itself. |
81 | |
82 | /// Returns the first decimal number in the string, or throws an exception |
83 | /// if the string does not start with a number terminated by ',', '-', |
84 | /// '\n', or eos. |
85 | static size_t parseLeadingNumber(const std::string& line) { |
86 | auto raw = line.c_str(); |
87 | char* end; |
88 | unsigned long val = strtoul(raw, &end, 10); |
89 | if (end == raw || (*end != ',' && *end != '-' && *end != '\n' && *end != 0)) { |
90 | throw std::runtime_error( |
91 | to<std::string>("error parsing list '" , line, "'" ).c_str()); |
92 | } |
93 | return val; |
94 | } |
95 | |
96 | CacheLocality CacheLocality::readFromSysfsTree( |
97 | const std::function<std::string(std::string)>& mapping) { |
98 | // number of equivalence classes per level |
99 | std::vector<size_t> numCachesByLevel; |
100 | |
101 | // the list of cache equivalence classes, where equivalance classes |
102 | // are named by the smallest cpu in the class |
103 | std::vector<std::vector<size_t>> equivClassesByCpu; |
104 | |
105 | std::vector<size_t> cpus; |
106 | |
107 | while (true) { |
108 | auto cpu = cpus.size(); |
109 | std::vector<size_t> levels; |
110 | for (size_t index = 0;; ++index) { |
111 | auto dir = |
112 | sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/" , cpu, index); |
113 | auto cacheType = mapping(dir + "type" ); |
114 | auto equivStr = mapping(dir + "shared_cpu_list" ); |
115 | if (cacheType.size() == 0 || equivStr.size() == 0) { |
116 | // no more caches |
117 | break; |
118 | } |
119 | if (cacheType[0] == 'I') { |
120 | // cacheType in { "Data", "Instruction", "Unified" }. skip icache |
121 | continue; |
122 | } |
123 | auto equiv = parseLeadingNumber(equivStr); |
124 | auto level = levels.size(); |
125 | levels.push_back(equiv); |
126 | |
127 | if (equiv == cpu) { |
128 | // we only want to count the equiv classes once, so we do it when |
129 | // we first encounter them |
130 | while (numCachesByLevel.size() <= level) { |
131 | numCachesByLevel.push_back(0); |
132 | } |
133 | numCachesByLevel[level]++; |
134 | } |
135 | } |
136 | |
137 | if (levels.size() == 0) { |
138 | // no levels at all for this cpu, we must be done |
139 | break; |
140 | } |
141 | equivClassesByCpu.emplace_back(std::move(levels)); |
142 | cpus.push_back(cpu); |
143 | } |
144 | |
145 | if (cpus.size() == 0) { |
146 | throw std::runtime_error("unable to load cache sharing info" ); |
147 | } |
148 | |
149 | std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool { |
150 | // sort first by equiv class of cache with highest index, |
151 | // direction doesn't matter. If different cpus have |
152 | // different numbers of caches then this code might produce |
153 | // a sub-optimal ordering, but it won't crash |
154 | auto& lhsEquiv = equivClassesByCpu[lhs]; |
155 | auto& rhsEquiv = equivClassesByCpu[rhs]; |
156 | for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - 1; |
157 | i >= 0; |
158 | --i) { |
159 | auto idx = size_t(i); |
160 | if (lhsEquiv[idx] != rhsEquiv[idx]) { |
161 | return lhsEquiv[idx] < rhsEquiv[idx]; |
162 | } |
163 | } |
164 | |
165 | // break ties deterministically by cpu |
166 | return lhs < rhs; |
167 | }); |
168 | |
169 | // the cpus are now sorted by locality, with neighboring entries closer |
170 | // to each other than entries that are far away. For striping we want |
171 | // the inverse map, since we are starting with the cpu |
172 | std::vector<size_t> indexes(cpus.size()); |
173 | for (size_t i = 0; i < cpus.size(); ++i) { |
174 | indexes[cpus[i]] = i; |
175 | } |
176 | |
177 | return CacheLocality{ |
178 | cpus.size(), std::move(numCachesByLevel), std::move(indexes)}; |
179 | } |
180 | |
181 | CacheLocality CacheLocality::readFromSysfs() { |
182 | return readFromSysfsTree([](std::string name) { |
183 | std::ifstream xi(name.c_str()); |
184 | std::string rv; |
185 | std::getline(xi, rv); |
186 | return rv; |
187 | }); |
188 | } |
189 | |
190 | CacheLocality CacheLocality::uniform(size_t numCpus) { |
191 | CacheLocality rv; |
192 | |
193 | rv.numCpus = numCpus; |
194 | |
195 | // one cache shared by all cpus |
196 | rv.numCachesByLevel.push_back(numCpus); |
197 | |
198 | // no permutations in locality index mapping |
199 | for (size_t cpu = 0; cpu < numCpus; ++cpu) { |
200 | rv.localityIndexByCpu.push_back(cpu); |
201 | } |
202 | |
203 | return rv; |
204 | } |
205 | |
206 | ////////////// Getcpu |
207 | |
208 | Getcpu::Func Getcpu::resolveVdsoFunc() { |
209 | #if !FOLLY_HAVE_LINUX_VDSO |
210 | return nullptr; |
211 | #else |
212 | void* h = dlopen("linux-vdso.so.1" , RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD); |
213 | if (h == nullptr) { |
214 | return nullptr; |
215 | } |
216 | |
217 | auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu" )); |
218 | if (func == nullptr) { |
219 | // technically a null result could either be a failure or a successful |
220 | // lookup of a symbol with the null value, but the second can't actually |
221 | // happen for this symbol. No point holding the handle forever if |
222 | // we don't need the code |
223 | dlclose(h); |
224 | } |
225 | |
226 | return func; |
227 | #endif |
228 | } |
229 | |
230 | #ifdef FOLLY_TLS |
231 | /////////////// SequentialThreadId |
232 | template struct SequentialThreadId<std::atomic>; |
233 | #endif |
234 | |
235 | /////////////// AccessSpreader |
236 | template struct AccessSpreader<std::atomic>; |
237 | |
238 | SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz) |
239 | : allocSize_{allocSize}, sz_(sz) {} |
240 | |
241 | SimpleAllocator::~SimpleAllocator() { |
242 | std::lock_guard<std::mutex> g(m_); |
243 | for (auto& block : blocks_) { |
244 | folly::aligned_free(block); |
245 | } |
246 | } |
247 | |
248 | void* SimpleAllocator::allocateHard() { |
249 | // Allocate a new slab. |
250 | mem_ = static_cast<uint8_t*>(folly::aligned_malloc(allocSize_, allocSize_)); |
251 | if (!mem_) { |
252 | throw_exception<std::bad_alloc>(); |
253 | } |
254 | end_ = mem_ + allocSize_; |
255 | blocks_.push_back(mem_); |
256 | |
257 | // Install a pointer to ourselves as the allocator. |
258 | *reinterpret_cast<SimpleAllocator**>(mem_) = this; |
259 | static_assert(max_align_v >= sizeof(SimpleAllocator*), "alignment too small" ); |
260 | mem_ += std::min(sz_, max_align_v); |
261 | |
262 | // New allocation. |
263 | auto mem = mem_; |
264 | mem_ += sz_; |
265 | assert(intptr_t(mem) % 128 != 0); |
266 | return mem; |
267 | } |
268 | |
269 | } // namespace folly |
270 | |