CacheLocality.cpp source code [folly/concurrency/CacheLocality.cpp]

1	/*
2	* Copyright 2013-present Facebook, Inc.
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*/
16
17	#include <folly/concurrency/CacheLocality.h>
18
19	#ifndef _MSC_VER
20	#define _GNU_SOURCE 1 // for RTLD_NOLOAD
21	#include <dlfcn.h>
22	#endif
23	#include <fstream>
24
25	#include <folly/Conv.h>
26	#include <folly/Exception.h>
27	#include <folly/FileUtil.h>
28	#include <folly/Format.h>
29	#include <folly/ScopeGuard.h>
30
31	namespace folly {
32
33	///////////// CacheLocality
34
35	/// Returns the best real CacheLocality information available
36	static CacheLocality getSystemLocalityInfo() {
37	if (kIsLinux) {
38	try {
39	return CacheLocality::readFromSysfs();
40	} catch (...) {
41	// keep trying
42	}
43	}
44
45	long numCpus = sysconf(_SC_NPROCESSORS_CONF);
46	if (numCpus <= `0`) {
47	// This shouldn't happen, but if it does we should try to keep
48	// going. We are probably not going to be able to parse /sys on
49	// this box either (although we will try), which means we are going
50	// to fall back to the SequentialThreadId splitter. On my 16 core
51	// (x hyperthreading) dev box 16 stripes is enough to get pretty good
52	// contention avoidance with SequentialThreadId, and there is little
53	// improvement from going from 32 to 64. This default gives us some
54	// wiggle room
55	numCpus = `32`;
56	}
57	return CacheLocality::uniform(size_t(numCpus));
58	}
59
60	template <>
61	const CacheLocality& CacheLocality::system<std::atomic>() {
62	static auto* cache = new CacheLocality(getSystemLocalityInfo());
63	return *cache;
64	}
65
66	// Each level of cache has sharing sets, which are the set of cpus
67	// that share a common cache at that level. These are available in a
68	// hex bitset form (/sys/devices/system/cpu/cpu0/index0/shared_cpu_map,
69	// for example). They are also available in a human-readable list form,
70	// as in /sys/devices/system/cpu/cpu0/index0/shared_cpu_list. The list
71	// is a comma-separated list of numbers and ranges, where the ranges are
72	// a pair of decimal numbers separated by a '-'.
73	//
74	// To sort the cpus for optimum locality we don't really need to parse
75	// the sharing sets, we just need a unique representative from the
76	// equivalence class. The smallest value works fine, and happens to be
77	// the first decimal number in the file. We load all of the equivalence
78	// class information from all of the cpu/index* directories, order the*
79	// cpus first by increasing last-level cache equivalence class, then by
80	// the smaller caches. Finally, we break ties with the cpu number itself.
81
82	/// Returns the first decimal number in the string, or throws an exception
83	/// if the string does not start with a number terminated by ',', '-',
84	/// '\n', or eos.
85	static size_t parseLeadingNumber(const std::string& line) {
86	auto raw = line.c_str();
87	char* end;
88	unsigned long val = strtoul(raw, &end, `10`);
89	if (end == raw \|\| (end != `','` && end != `'-'` && end != `'\n'` && end != `0`)) {
90	throw std::runtime_error (
91	to<std::string>("error parsing list '", line, "'").c_str());
92	}
93	return val;
94	}
95
96	CacheLocality CacheLocality::readFromSysfsTree(
97	const std::function<std::string(std::string)>& mapping) {
98	// number of equivalence classes per level
99	std::vector<size_t> numCachesByLevel;
100
101	// the list of cache equivalence classes, where equivalance classes
102	// are named by the smallest cpu in the class
103	std::vector<std::vector<size_t>> equivClassesByCpu;
104
105	std::vector<size_t> cpus;
106
107	while (true) {
108	auto cpu = cpus.size();
109	std::vector<size_t> levels;
110	for (size_t index = `0`;; ++index) {
111	auto dir =
112	sformat("/sys/devices/system/cpu/cpu{}/cache/index{}/", cpu, index);
113	auto cacheType = mapping (dir + "type");
114	auto equivStr = mapping (dir + "shared_cpu_list");
115	if (cacheType.size() == `0` \|\| equivStr.size() == `0`) {
116	// no more caches
117	break;
118	}
119	if (cacheType [`0`] == `'I'`) {
120	// cacheType in { "Data", "Instruction", "Unified" }. skip icache
121	continue;
122	}
123	auto equiv = parseLeadingNumber(equivStr);
124	auto level = levels.size();
125	levels.push_back(equiv);
126
127	if (equiv == cpu) {
128	// we only want to count the equiv classes once, so we do it when
129	// we first encounter them
130	while (numCachesByLevel.size() <= level) {
131	numCachesByLevel.push_back(`0`);
132	}
133	numCachesByLevel [level]++;
134	}
135	}
136
137	if (levels.size() == `0`) {
138	// no levels at all for this cpu, we must be done
139	break;
140	}
141	equivClassesByCpu.emplace_back(std::move(levels));
142	cpus.push_back(cpu);
143	}
144
145	if (cpus.size() == `0`) {
146	throw std::runtime_error ("unable to load cache sharing info");
147	}
148
149	std::sort(cpus.begin(), cpus.end(), [&](size_t lhs, size_t rhs) -> bool {
150	// sort first by equiv class of cache with highest index,
151	// direction doesn't matter. If different cpus have
152	// different numbers of caches then this code might produce
153	// a sub-optimal ordering, but it won't crash
154	auto& lhsEquiv = equivClassesByCpu [lhs];
155	auto& rhsEquiv = equivClassesByCpu [rhs];
156	for (ssize_t i = ssize_t(std::min(lhsEquiv.size(), rhsEquiv.size())) - `1`;
157	i >= `0`;
158	--i) {
159	auto idx = size_t(i);
160	if (lhsEquiv [idx] != rhsEquiv [idx]) {
161	return lhsEquiv [idx] < rhsEquiv [idx];
162	}
163	}
164
165	// break ties deterministically by cpu
166	return lhs < rhs;
167	});
168
169	// the cpus are now sorted by locality, with neighboring entries closer
170	// to each other than entries that are far away. For striping we want
171	// the inverse map, since we are starting with the cpu
172	std::vector<size_t> indexes(cpus.size());
173	for (size_t i = `0`; i < cpus.size(); ++i) {
174	indexes [cpus [i]] = i;
175	}
176
177	return CacheLocality{
178	cpus.size(), std::move(numCachesByLevel), std::move(indexes)};
179	}
180
181	CacheLocality CacheLocality::readFromSysfs() {
182	return readFromSysfsTree([](std::string name) {
183	std::ifstream xi(name.c_str());
184	std::string rv;
185	std::getline(xi, rv);
186	return rv;
187	});
188	}
189
190	CacheLocality CacheLocality::uniform(size_t numCpus) {
191	CacheLocality rv;
192
193	rv.numCpus = numCpus;
194
195	// one cache shared by all cpus
196	rv.numCachesByLevel.push_back(numCpus);
197
198	// no permutations in locality index mapping
199	for (size_t cpu = `0`; cpu < numCpus; ++cpu) {
200	rv.localityIndexByCpu.push_back(cpu);
201	}
202
203	return rv;
204	}
205
206	////////////// Getcpu
207
208	Getcpu::Func Getcpu::resolveVdsoFunc() {
209	#if !FOLLY_HAVE_LINUX_VDSO
210	return nullptr;
211	#else
212	void* h = dlopen("linux-vdso.so.1", RTLD_LAZY \| RTLD_LOCAL \| RTLD_NOLOAD);
213	if (h == nullptr) {
214	return nullptr;
215	}
216
217	auto func = Getcpu::Func(dlsym(h, "__vdso_getcpu"));
218	if (func == nullptr) {
219	// technically a null result could either be a failure or a successful
220	// lookup of a symbol with the null value, but the second can't actually
221	// happen for this symbol. No point holding the handle forever if
222	// we don't need the code
223	dlclose(h);
224	}
225
226	return func;
227	#endif
228	}
229
230	#ifdef FOLLY_TLS
231	/////////////// SequentialThreadId
232	template struct SequentialThreadId<std::atomic>;
233	#endif
234
235	/////////////// AccessSpreader
236	template struct AccessSpreader<std::atomic>;
237
238	SimpleAllocator::SimpleAllocator(size_t allocSize, size_t sz)
239	: allocSize_{allocSize}, sz_(sz) {}
240
241	SimpleAllocator::~SimpleAllocator() {
242	std::lock_guard<std::mutex> g(m_);
243	for (auto& block : blocks_) {
244	folly::aligned_free(block);
245	}
246	}
247
248	void* SimpleAllocator::allocateHard() {
249	// Allocate a new slab.
250	mem_ = static_cast<uint8_t*>(folly::aligned_malloc(allocSize_, allocSize_));
251	if (!mem_) {
252	throw_exception<std::bad_alloc>();
253	}
254	end_ = mem_ + allocSize_;
255	blocks_.push_back(mem_);
256
257	// Install a pointer to ourselves as the allocator.
258	*reinterpret_cast<SimpleAllocator>(mem_) = this**;
259	static_assert(max_align_v >= sizeof(SimpleAllocator*), "alignment too small");
260	mem_ += std::min(sz_, max_align_v);
261
262	// New allocation.
263	auto mem = mem_;
264	mem_ += sz_;
265	assert(intptr_t(mem) % `128` != `0`);
266	return mem;
267	}
268
269	} // namespace folly
270

Browse the source code of folly/concurrency/CacheLocality.cpp