1/*
2 * Copyright 2015-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <glog/logging.h>
20
21#ifdef _MSC_VER
22#include <immintrin.h>
23#endif
24
25#include <folly/CpuId.h>
26#include <folly/Portability.h>
27#include <folly/lang/Assume.h>
28#include <folly/portability/Builtins.h>
29
30namespace folly {
31namespace compression {
32namespace instructions {
33
34// NOTE: It's recommended to compile EF coding with -msse4.2, starting
35// with Nehalem, Intel CPUs support POPCNT instruction and gcc will emit
36// it for __builtin_popcountll intrinsic.
37// But we provide an alternative way for the client code: it can switch to
38// the appropriate version of EliasFanoReader<> at runtime (client should
39// implement this switching logic itself) by specifying instruction set to
40// use explicitly.
41
42struct Default {
43 static bool supported(const folly::CpuId& /* cpuId */ = {}) {
44 return true;
45 }
46 static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) {
47 return uint64_t(__builtin_popcountll(value));
48 }
49 static FOLLY_ALWAYS_INLINE int ctz(uint64_t value) {
50 DCHECK_GT(value, 0u);
51 return __builtin_ctzll(value);
52 }
53 static FOLLY_ALWAYS_INLINE int clz(uint64_t value) {
54 DCHECK_GT(value, 0u);
55 return __builtin_clzll(value);
56 }
57 static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) {
58 return value & (value - 1);
59 }
60
61 // Extract `length` bits starting from `start` from value. Only bits [0:63]
62 // will be extracted. All higher order bits in the
63 // result will be zeroed. If no bits are extracted, return 0.
64 static FOLLY_ALWAYS_INLINE uint64_t
65 bextr(uint64_t value, uint32_t start, uint32_t length) {
66 if (start > 63) {
67 return 0ULL;
68 }
69 if (start + length > 64) {
70 length = 64 - start;
71 }
72
73 return (value >> start) &
74 ((length == 64) ? (~0ULL) : ((1ULL << length) - 1ULL));
75 }
76
77 // Clear high bits starting at position index.
78 static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) {
79 if (index > 63) {
80 return 0;
81 }
82 return value & ((uint64_t(1) << index) - 1);
83 }
84};
85
86struct Nehalem : public Default {
87 static bool supported(const folly::CpuId& cpuId = {}) {
88 return cpuId.popcnt();
89 }
90
91 static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) {
92// POPCNT is supported starting with Intel Nehalem, AMD K10.
93#if defined(__GNUC__) || defined(__clang__)
94 // GCC and Clang won't inline the intrinsics.
95 uint64_t result;
96 asm("popcntq %1, %0" : "=r"(result) : "r"(value));
97 return result;
98#else
99 return uint64_t(_mm_popcnt_u64(value));
100#endif
101 }
102};
103
104struct Haswell : public Nehalem {
105 static bool supported(const folly::CpuId& cpuId = {}) {
106 return Nehalem::supported(cpuId) && cpuId.bmi1() && cpuId.bmi2();
107 }
108
109 static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) {
110// BMI1 is supported starting with Intel Haswell, AMD Piledriver.
111// BLSR combines two instructions into one and reduces register pressure.
112#if defined(__GNUC__) || defined(__clang__)
113 // GCC and Clang won't inline the intrinsics.
114 uint64_t result;
115 asm("blsrq %1, %0" : "=r"(result) : "r"(value));
116 return result;
117#else
118 return _blsr_u64(value);
119#endif
120 }
121
122 static FOLLY_ALWAYS_INLINE uint64_t
123 bextr(uint64_t value, uint32_t start, uint32_t length) {
124#if defined(__GNUC__) || defined(__clang__)
125 // GCC and Clang won't inline the intrinsics.
126 // Encode parameters in `pattern` where `pattern[0:7]` is `start` and
127 // `pattern[8:15]` is `length`.
128 // Ref: Intel Advanced Vector Extensions Programming Reference
129 uint64_t pattern = start & 0xFF;
130 pattern = pattern | ((length & 0xFF) << 8);
131 uint64_t result;
132 asm("bextrq %2, %1, %0" : "=r"(result) : "r"(value), "r"(pattern));
133 return result;
134#else
135 return _bextr_u64(value, start, length);
136#endif
137 }
138
139 static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) {
140#if defined(__GNUC__) || defined(__clang__)
141 // GCC and Clang won't inline the intrinsics.
142 const uint64_t index64 = index;
143 uint64_t result;
144 asm("bzhiq %2, %1, %0" : "=r"(result) : "r"(value), "r"(index64));
145 return result;
146#else
147 return _bzhi_u64(value, index);
148#endif
149 }
150};
151
152enum class Type {
153 DEFAULT,
154 NEHALEM,
155 HASWELL,
156};
157
158inline Type detect() {
159 const static Type type = [] {
160 if (instructions::Haswell::supported()) {
161 VLOG(2) << "Will use folly::compression::instructions::Haswell";
162 return Type::HASWELL;
163 } else if (instructions::Nehalem::supported()) {
164 VLOG(2) << "Will use folly::compression::instructions::Nehalem";
165 return Type::NEHALEM;
166 } else {
167 VLOG(2) << "Will use folly::compression::instructions::Default";
168 return Type::DEFAULT;
169 }
170 }();
171 return type;
172}
173
174template <class F>
175auto dispatch(Type type, F&& f) -> decltype(f(std::declval<Default>())) {
176 switch (type) {
177 case Type::HASWELL:
178 return f(Haswell());
179 case Type::NEHALEM:
180 return f(Nehalem());
181 case Type::DEFAULT:
182 return f(Default());
183 }
184
185 assume_unreachable();
186}
187
188template <class F>
189auto dispatch(F&& f) -> decltype(f(std::declval<Default>())) {
190 return dispatch(detect(), std::forward<F>(f));
191}
192
193} // namespace instructions
194} // namespace compression
195} // namespace folly
196