/*
 * Copyright 2013-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
#include <stdexcept>
#include <system_error>
#include <type_traits>

#include <boost/type_traits/has_trivial_destructor.hpp>

#include <folly/Conv.h>
#include <folly/Likely.h>
#include <folly/Random.h>
#include <folly/Traits.h>
#include <folly/detail/AtomicUnorderedMapUtils.h>
#include <folly/lang/Bits.h>
#include <folly/portability/SysMman.h>
#include <folly/portability/Unistd.h>

namespace folly {

/// You're probably reading this because you are looking for an
/// AtomicUnorderedMap<K,V> that is fully general, highly concurrent (for
/// reads, writes, and iteration), and makes no performance compromises.
/// We haven't figured that one out yet.  What you will find here is a
/// hash table implementation that sacrifices generality so that it can
/// give you all of the other things.
///
/// LIMITATIONS:
///
/// * Insert only (*) - the only write operation supported directly by
///   AtomicUnorderedInsertMap is findOrConstruct.  There is a (*) because
///   values aren't moved, so you can roll your own concurrency control for
///   in-place updates of values (see MutableData and MutableAtom below),
///   but the hash table itself doesn't help you.
///
/// * No resizing - you must specify the capacity up front, and once
///   the hash map gets full you won't be able to insert.  Insert
///   performance will degrade once the load factor is high.  Insert is
///   O(1/(1-actual_load_factor)).  Note that this is a pretty strong
///   limitation, because you can't remove existing keys.
///
/// * 2^30 maximum default capacity - by default AtomicUnorderedInsertMap
///   uses uint32_t internal indexes (and steals 2 bits), limiting you
///   to about a billion entries.  If you need more you can fill in all
///   of the template params so you change IndexType to uint64_t, or you
///   can use AtomicUnorderedInsertMap64.  64-bit indexes will increase
///   the space overhead of the map, of course.
///
/// WHAT YOU GET IN EXCHANGE:
///
/// * Arbitrary key and value types - any K and V that can be used in a
///   std::unordered_map can be used here.  In fact, the key and value
///   types don't even have to be copyable or moveable!
///
/// * Keys and values in the map won't be moved - it is safe to keep
///   pointers or references to the keys and values in the map, because
///   they are never moved or destroyed (until the map itself is destroyed).
///
/// * Iterators are never invalidated - writes don't invalidate iterators,
///   so you can scan and insert in parallel.
///
/// * Fast wait-free reads - reads are usually only a single cache miss,
///   even when the hash table is very large.  Wait-freedom means that
///   you won't see latency outliers even in the face of concurrent writes.
///
/// * Lock-free insert - writes proceed in parallel.  If a thread in the
///   middle of a write is unlucky and gets suspended, it doesn't block
///   anybody else.
///
/// COMMENTS ON INSERT-ONLY
///
/// This map provides wait-free linearizable reads and lock-free
/// linearizable inserts.  Inserted values won't be moved, but no
/// concurrency control is provided for safely updating them.  To remind
/// you of that fact they are only provided in const form.  This is the
/// only simple safe thing to do while preserving something like the normal
/// std::map iteration form, which requires that iteration be exposed
/// via std::pair (and prevents encapsulation of access to the value).
///
/// There are a couple of reasonable policies for doing in-place
/// concurrency control on the values.  I am hoping that the policy can
/// be injected via the value type or an extra template param, to keep
/// the core AtomicUnorderedInsertMap insert-only:
///
/// CONST: this is the currently implemented strategy, which is simple,
/// performant, and not that expressive.  You can always put in a value
/// with a mutable field (see MutableAtom below), but that doesn't look
/// as pretty as it should.
///
/// ATOMIC: for integers and integer-size trivially copyable structs
/// (via an adapter like tao/queues/AtomicStruct) the value can be a
/// std::atomic and read and written atomically.
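///
/// For example, a lock-free counter map is just (a sketch; kNumCounters
/// and key are placeholders):
///
///   folly::AtomicUnorderedInsertMap<int64_t, folly::MutableAtom<int64_t>>
///       counters(kNumCounters);
///   counters.findOrConstruct(key, [](void* raw) {
///     new (raw) folly::MutableAtom<int64_t>(0);
///   }).first->second.data.fetch_add(1);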
///
/// SEQ-LOCK: attach a counter incremented before and after write.
/// Writers serialize by using CAS to make an even->odd transition,
/// then odd->even after the write.  Readers grab the value with memcpy,
/// checking sequence value before and after.  Readers retry until they
/// see an even sequence number that doesn't change.  This works for
/// larger structs, but still requires memcpy to be equivalent to copy
/// assignment, and it is no longer lock-free.  It scales very well,
/// because the readers are still invisible (no cache line writes).
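///
/// A minimal sketch of that protocol (illustration only; it glosses over
/// the formal C++ data-race rules for the non-atomic payload, which a
/// production seq-lock has to address):
///
///   template <typename T>
///   struct SeqLocked {
///     std::atomic<uint32_t> seq{0};
///     T data;
///
///     void store(const T& v) {
///       auto s = seq.load(std::memory_order_relaxed);
///       while (true) {
///         if ((s & 1) != 0) {
///           s = seq.load(std::memory_order_relaxed);  // writer active
///         } else if (seq.compare_exchange_weak(s, s + 1)) {
///           break;  // we own the even->odd transition
///         }
///         // on CAS failure s was refreshed, so just loop again
///       }
///       data = v;                                     // payload write
///       seq.store(s + 2, std::memory_order_release);  // odd -> even
///     }
///
///     T load() const {
///       while (true) {
///         auto before = seq.load(std::memory_order_acquire);
///         T copy = data;  // may be torn; validated below
///         std::atomic_thread_fence(std::memory_order_acquire);
///         auto after = seq.load(std::memory_order_relaxed);
///         if (before == after && (before & 1) == 0) {
///           return copy;
///         }
///       }
///     }
///   };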
///
/// LOCK: folly's SharedMutex would be a good choice here.
///
/// MEMORY ALLOCATION
///
/// Underlying memory is allocated as a big anonymous mmap chunk, which
/// might be cheaper than calloc() and is certainly not more expensive
/// for large maps.  If the SkipKeyValueDeletion template param is true
/// then deletion of the map consists of unmapping the backing memory,
/// which is much faster than destructing all of the keys and values.
/// Feel free to override SkipKeyValueDeletion if the default trait
/// check (boost::has_trivial_destructor) isn't recognizing the
/// triviality of your destructors.
template <
    typename Key,
    typename Value,
    typename Hash = std::hash<Key>,
    typename KeyEqual = std::equal_to<Key>,
    bool SkipKeyValueDeletion =
        (boost::has_trivial_destructor<Key>::value &&
         boost::has_trivial_destructor<Value>::value),
    template <typename> class Atom = std::atomic,
    typename IndexType = uint32_t,
    typename Allocator = folly::detail::MMapAlloc>

struct AtomicUnorderedInsertMap {
  typedef Key key_type;
  typedef Value mapped_type;
  typedef std::pair<Key, Value> value_type;
  typedef std::size_t size_type;
  typedef std::ptrdiff_t difference_type;
  typedef Hash hasher;
  typedef KeyEqual key_equal;
  typedef const value_type& const_reference;
  typedef struct ConstIterator {
    ConstIterator(const AtomicUnorderedInsertMap& owner, IndexType slot)
        : owner_(owner), slot_(slot) {}

    ConstIterator(const ConstIterator&) = default;
    ConstIterator& operator=(const ConstIterator&) = default;

    const value_type& operator*() const {
      return owner_.slots_[slot_].keyValue();
    }

    const value_type* operator->() const {
      return &owner_.slots_[slot_].keyValue();
    }

    // pre-increment
    const ConstIterator& operator++() {
      while (slot_ > 0) {
        --slot_;
        if (owner_.slots_[slot_].state() == LINKED) {
          break;
        }
      }
      return *this;
    }

    // post-increment
    ConstIterator operator++(int /* dummy */) {
      auto prev = *this;
      ++*this;
      return prev;
    }

    bool operator==(const ConstIterator& rhs) const {
      return slot_ == rhs.slot_;
    }
    bool operator!=(const ConstIterator& rhs) const {
      return !(*this == rhs);
    }

   private:
    const AtomicUnorderedInsertMap& owner_;
    IndexType slot_;
  } const_iterator;

  friend ConstIterator;

  /// Constructs a map that will support the insertion of maxSize key-value
  /// pairs without exceeding the max load factor.  Load factors of greater
  /// than 1 are not supported, and once the actual load factor of the
  /// map approaches 1 the insert performance will suffer.  The capacity
  /// is limited to 2^30 (about a billion) for the default IndexType,
  /// beyond which we will throw invalid_argument.
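  ///
  /// For example (the capacity here is illustrative):
  ///
  ///   // room for one million keys at the default max load factor of 0.8
  ///   folly::AtomicUnorderedInsertMap<uint64_t, uint64_t> m(1000000);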
  explicit AtomicUnorderedInsertMap(
      size_t maxSize,
      float maxLoadFactor = 0.8f,
      const Allocator& alloc = Allocator())
      : allocator_(alloc) {
    size_t capacity = size_t(maxSize / std::min(1.0f, maxLoadFactor) + 128);
    size_t avail = size_t{1} << (8 * sizeof(IndexType) - 2);
    if (capacity > avail && maxSize < avail) {
      // we'll do our best
      capacity = avail;
    }
    if (capacity < maxSize || capacity > avail) {
      throw std::invalid_argument(
          "AtomicUnorderedInsertMap capacity must fit in IndexType with 2 bits "
          "left over");
    }

    numSlots_ = capacity;
    slotMask_ = folly::nextPowTwo(capacity * 4) - 1;
    mmapRequested_ = sizeof(Slot) * capacity;
    slots_ = reinterpret_cast<Slot*>(allocator_.allocate(mmapRequested_));
    zeroFillSlots();
    // mark the zero-th slot as in-use but not valid, since that happens
    // to be our nil value
    slots_[0].stateUpdate(EMPTY, CONSTRUCTING);
  }

  ~AtomicUnorderedInsertMap() {
    if (!SkipKeyValueDeletion) {
      for (size_t i = 1; i < numSlots_; ++i) {
        slots_[i].~Slot();
      }
    }
    allocator_.deallocate(reinterpret_cast<char*>(slots_), mmapRequested_);
  }

  /// Searches for the key, returning (iter,false) if it is found.
  /// If it is not found calls the functor Func with a void* argument
  /// that is raw storage suitable for placement construction of a Value
  /// (see raw_value_type), then returns (iter,true).  May call Func and
  /// then return (iter,false) if there are other concurrent writes, in
  /// which case the newly constructed value will be immediately destroyed.
  ///
  /// This function does not block other readers or writers.  If there
  /// are other concurrent writes, many parallel calls to func may happen
  /// and only the first one to complete will win.  The values constructed
  /// by the other calls to func will be destroyed.
  ///
  /// Usage:
  ///
  ///   AtomicUnorderedInsertMap<std::string,std::string> memo;
  ///
  ///   auto value = memo.findOrConstruct(key, [=](void* raw) {
  ///     new (raw) std::string(computation(key));
  ///   }).first->second;
  template <typename Func>
  std::pair<const_iterator, bool> findOrConstruct(const Key& key, Func&& func) {
    auto const slot = keyToSlotIdx(key);
    auto prev = slots_[slot].headAndState_.load(std::memory_order_acquire);

    auto existing = find(key, slot);
    if (existing != 0) {
      return std::make_pair(ConstIterator(*this, existing), false);
    }

    auto idx = allocateNear(slot);
    new (&slots_[idx].keyValue().first) Key(key);
    func(static_cast<void*>(&slots_[idx].keyValue().second));

    while (true) {
      slots_[idx].next_ = prev >> 2;

      // we can merge the head update and the CONSTRUCTING -> LINKED update
      // into a single CAS if slot == idx (which should happen often)
      auto after = idx << 2;
      if (slot == idx) {
        after += LINKED;
      } else {
        after += (prev & 3);
      }

      if (slots_[slot].headAndState_.compare_exchange_strong(prev, after)) {
        // success
        if (idx != slot) {
          slots_[idx].stateUpdate(CONSTRUCTING, LINKED);
        }
        return std::make_pair(ConstIterator(*this, idx), true);
      }
      // compare_exchange_strong updates its first arg on failure, so
      // there is no need to reread prev

      existing = find(key, slot);
      if (existing != 0) {
        // our allocated key and value are no longer needed
        slots_[idx].keyValue().first.~Key();
        slots_[idx].keyValue().second.~Value();
        slots_[idx].stateUpdate(CONSTRUCTING, EMPTY);

        return std::make_pair(ConstIterator(*this, existing), false);
      }
    }
  }

  /// This isn't really emplace, but it is what we need to test.
  /// Eventually we can duplicate all of the std::pair constructor
  /// forms, including a recursive tuple forwarding template (see
  /// http://functionalcpp.wordpress.com/2013/08/28/tuple-forwarding/).
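  ///
  /// For example (the map and its capacity are illustrative):
  ///
  ///   folly::AtomicUnorderedInsertMap<std::string, std::string> m(100);
  ///   auto rv = m.emplace("hello", "world");
  ///   // rv.second is true iff this call did the insert; either way
  ///   // rv.first points at the (const) entry for "hello"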
  template <class K, class V>
  std::pair<const_iterator, bool> emplace(const K& key, V&& value) {
    return findOrConstruct(
        key, [&](void* raw) { new (raw) Value(std::forward<V>(value)); });
  }

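  /// Returns a const_iterator positioned at the entry for key, or cend()
  /// if key is not present.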
  const_iterator find(const Key& key) const {
    return ConstIterator(*this, find(key, keyToSlotIdx(key)));
  }

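  /// Iteration is not invalidated by concurrent inserts, so a scan like
  /// the following sketch is safe to run in parallel with writers
  /// (process() is a placeholder):
  ///
  ///   for (auto it = m.cbegin(); it != m.cend(); ++it) {
  ///     process(it->first, it->second);
  ///   }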
  const_iterator cbegin() const {
    IndexType slot = numSlots_ - 1;
    while (slot > 0 && slots_[slot].state() != LINKED) {
      --slot;
    }
    return ConstIterator(*this, slot);
  }

  const_iterator cend() const {
    return ConstIterator(*this, 0);
  }

 private:
  enum : IndexType {
    kMaxAllocationTries = 1000, // after this we throw
  };

  enum BucketState : IndexType {
    EMPTY = 0,
    CONSTRUCTING = 1,
    LINKED = 2,
  };

  /// Lock-free insertion is easiest by prepending to collision chains.
  /// A large chaining hash table takes two cache misses instead of
  /// one, however.  Our solution is to colocate the bucket storage and
  /// the head storage, so that even though we are traversing chains we
  /// are likely to stay within the same cache line.  Just make sure to
  /// traverse head before looking at any keys.  This strategy gives us
  /// 32 bit pointers and fast iteration.
  struct Slot {
    /// The bottom two bits are the BucketState, the rest is the index
    /// of the first bucket for the chain whose keys map to this slot.
    /// When things are going well the head usually links to this slot,
    /// but that doesn't always have to happen.
    Atom<IndexType> headAndState_;

    /// The next bucket in the chain
    IndexType next_;

    /// Key and Value
    aligned_storage_for_t<value_type> raw_;

    ~Slot() {
      auto s = state();
      assert(s == EMPTY || s == LINKED);
      if (s == LINKED) {
        keyValue().first.~Key();
        keyValue().second.~Value();
      }
    }

    BucketState state() const {
      return BucketState(headAndState_.load(std::memory_order_acquire) & 3);
    }

    void stateUpdate(BucketState before, BucketState after) {
      assert(state() == before);
      headAndState_ += (after - before);
    }

    value_type& keyValue() {
      assert(state() != EMPTY);
      return *static_cast<value_type*>(static_cast<void*>(&raw_));
    }

    const value_type& keyValue() const {
      assert(state() != EMPTY);
      return *static_cast<const value_type*>(static_cast<const void*>(&raw_));
    }
  };

  // We manually manage the slot memory so we can bypass initialization
  // (by getting a zero-filled mmap chunk) and optionally destruction of
  // the slots

  size_t mmapRequested_;
  size_t numSlots_;

  /// tricky, see keyToSlotIdx
  size_t slotMask_;

  Allocator allocator_;
  Slot* slots_;

  IndexType keyToSlotIdx(const Key& key) const {
    size_t h = hasher()(key);
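    // slotMask_ covers a power-of-two range at least 4x numSlots_, so the
    // mask below is cheap but can leave h >= numSlots_; the loop then folds
    // h into [0, numSlots_) with a few subtractions instead of a modulo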
    h &= slotMask_;
    while (h >= numSlots_) {
      h -= numSlots_;
    }
    return h;
  }

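  /// Returns the index of the slot that holds key in the collision chain
  /// headed at slot, or 0 (the never-linked nil slot) if key is not present.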
  IndexType find(const Key& key, IndexType slot) const {
    KeyEqual ke = {};
    auto hs = slots_[slot].headAndState_.load(std::memory_order_acquire);
    for (slot = hs >> 2; slot != 0; slot = slots_[slot].next_) {
      if (ke(key, slots_[slot].keyValue().first)) {
        return slot;
      }
    }
    return 0;
  }

  /// Allocates a slot and returns its index.  Tries to put it near
  /// slots_[start].
  IndexType allocateNear(IndexType start) {
    for (IndexType tries = 0; tries < kMaxAllocationTries; ++tries) {
      auto slot = allocationAttempt(start, tries);
      auto prev = slots_[slot].headAndState_.load(std::memory_order_acquire);
      if ((prev & 3) == EMPTY &&
          slots_[slot].headAndState_.compare_exchange_strong(
              prev, prev + CONSTRUCTING - EMPTY)) {
        return slot;
      }
    }
    throw std::bad_alloc();
  }

  /// Returns the slot we should attempt to allocate after `tries` failed
  /// attempts, starting from the specified slot.  This is pulled out so we
  /// can specialize it differently during deterministic testing.
  IndexType allocationAttempt(IndexType start, IndexType tries) const {
    if (LIKELY(tries < 8 && start + tries < numSlots_)) {
      return IndexType(start + tries);
    } else {
      IndexType rv;
      if (sizeof(IndexType) <= 4) {
        rv = IndexType(folly::Random::rand32(numSlots_));
      } else {
        rv = IndexType(folly::Random::rand64(numSlots_));
      }
      assert(rv < numSlots_);
      return rv;
    }
  }

  void zeroFillSlots() {
    using folly::detail::GivesZeroFilledMemory;
    if (!GivesZeroFilledMemory<Allocator>::value) {
      memset(static_cast<void*>(slots_), 0, mmapRequested_);
    }
  }
};

/// AtomicUnorderedInsertMap64 is just a type alias that makes it easier
/// to select a 64 bit slot index type.  Use this if you need a capacity
/// bigger than 2^30 (about a billion).  This increases memory overheads,
/// obviously.
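///
/// For example (sized only to illustrate the alias; a map this large
/// reserves a lot of memory up front):
///
///   folly::AtomicUnorderedInsertMap64<uint64_t, uint64_t> big(3ULL << 30);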
template <
    typename Key,
    typename Value,
    typename Hash = std::hash<Key>,
    typename KeyEqual = std::equal_to<Key>,
    bool SkipKeyValueDeletion =
        (boost::has_trivial_destructor<Key>::value &&
         boost::has_trivial_destructor<Value>::value),
    template <typename> class Atom = std::atomic,
    typename Allocator = folly::detail::MMapAlloc>
using AtomicUnorderedInsertMap64 = AtomicUnorderedInsertMap<
    Key,
    Value,
    Hash,
    KeyEqual,
    SkipKeyValueDeletion,
    Atom,
    uint64_t,
    Allocator>;

/// MutableAtom is a tiny wrapper that gives you the option of atomically
/// updating values inserted into an AtomicUnorderedInsertMap<K,
/// MutableAtom<V>>.  This relies on AtomicUnorderedInsertMap's guarantee
/// that it doesn't move values.
template <typename T, template <typename> class Atom = std::atomic>
struct MutableAtom {
  mutable Atom<T> data;

  explicit MutableAtom(const T& init) : data(init) {}
};

/// MutableData is a tiny wrapper that gives you the option of using an
/// external concurrency control mechanism for updating values inserted
/// into an AtomicUnorderedInsertMap.
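///
/// For example, pairing it with an external mutex (a sketch; Widget,
/// mutate(), kCapacity, and key are placeholders):
///
///   folly::AtomicUnorderedInsertMap<int, folly::MutableData<Widget>> m(
///       kCapacity);
///   std::mutex widgetLock;  // or folly::SharedMutex, a sharded lock, ...
///
///   auto it = m.find(key);
///   if (it != m.cend()) {
///     std::lock_guard<std::mutex> g(widgetLock);
///     mutate(it->second.data);  // in-place update guarded by the caller
///   }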
template <typename T>
struct MutableData {
  mutable T data;
  explicit MutableData(const T& init) : data(init) {}
};

} // namespace folly