1 | /* |
2 | * Copyright 2015-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <glog/logging.h> |
20 | |
21 | #ifdef _MSC_VER |
22 | #include <immintrin.h> |
23 | #endif |
24 | |
25 | #include <folly/CpuId.h> |
26 | #include <folly/Portability.h> |
27 | #include <folly/lang/Assume.h> |
28 | #include <folly/portability/Builtins.h> |
29 | |
30 | namespace folly { |
31 | namespace compression { |
32 | namespace instructions { |
33 | |
34 | // NOTE: It's recommended to compile EF coding with -msse4.2, starting |
35 | // with Nehalem, Intel CPUs support POPCNT instruction and gcc will emit |
36 | // it for __builtin_popcountll intrinsic. |
37 | // But we provide an alternative way for the client code: it can switch to |
38 | // the appropriate version of EliasFanoReader<> at runtime (client should |
39 | // implement this switching logic itself) by specifying instruction set to |
40 | // use explicitly. |
41 | |
42 | struct Default { |
43 | static bool supported(const folly::CpuId& /* cpuId */ = {}) { |
44 | return true; |
45 | } |
46 | static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) { |
47 | return uint64_t(__builtin_popcountll(value)); |
48 | } |
49 | static FOLLY_ALWAYS_INLINE int ctz(uint64_t value) { |
50 | DCHECK_GT(value, 0u); |
51 | return __builtin_ctzll(value); |
52 | } |
53 | static FOLLY_ALWAYS_INLINE int clz(uint64_t value) { |
54 | DCHECK_GT(value, 0u); |
55 | return __builtin_clzll(value); |
56 | } |
57 | static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) { |
58 | return value & (value - 1); |
59 | } |
60 | |
61 | // Extract `length` bits starting from `start` from value. Only bits [0:63] |
62 | // will be extracted. All higher order bits in the |
63 | // result will be zeroed. If no bits are extracted, return 0. |
64 | static FOLLY_ALWAYS_INLINE uint64_t |
65 | bextr(uint64_t value, uint32_t start, uint32_t length) { |
66 | if (start > 63) { |
67 | return 0ULL; |
68 | } |
69 | if (start + length > 64) { |
70 | length = 64 - start; |
71 | } |
72 | |
73 | return (value >> start) & |
74 | ((length == 64) ? (~0ULL) : ((1ULL << length) - 1ULL)); |
75 | } |
76 | |
77 | // Clear high bits starting at position index. |
78 | static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) { |
79 | if (index > 63) { |
80 | return 0; |
81 | } |
82 | return value & ((uint64_t(1) << index) - 1); |
83 | } |
84 | }; |
85 | |
86 | struct Nehalem : public Default { |
87 | static bool supported(const folly::CpuId& cpuId = {}) { |
88 | return cpuId.popcnt(); |
89 | } |
90 | |
91 | static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) { |
92 | // POPCNT is supported starting with Intel Nehalem, AMD K10. |
93 | #if defined(__GNUC__) || defined(__clang__) |
94 | // GCC and Clang won't inline the intrinsics. |
95 | uint64_t result; |
96 | asm("popcntq %1, %0" : "=r" (result) : "r" (value)); |
97 | return result; |
98 | #else |
99 | return uint64_t(_mm_popcnt_u64(value)); |
100 | #endif |
101 | } |
102 | }; |
103 | |
104 | struct Haswell : public Nehalem { |
105 | static bool supported(const folly::CpuId& cpuId = {}) { |
106 | return Nehalem::supported(cpuId) && cpuId.bmi1() && cpuId.bmi2(); |
107 | } |
108 | |
109 | static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) { |
110 | // BMI1 is supported starting with Intel Haswell, AMD Piledriver. |
111 | // BLSR combines two instructions into one and reduces register pressure. |
112 | #if defined(__GNUC__) || defined(__clang__) |
113 | // GCC and Clang won't inline the intrinsics. |
114 | uint64_t result; |
115 | asm("blsrq %1, %0" : "=r" (result) : "r" (value)); |
116 | return result; |
117 | #else |
118 | return _blsr_u64(value); |
119 | #endif |
120 | } |
121 | |
122 | static FOLLY_ALWAYS_INLINE uint64_t |
123 | bextr(uint64_t value, uint32_t start, uint32_t length) { |
124 | #if defined(__GNUC__) || defined(__clang__) |
125 | // GCC and Clang won't inline the intrinsics. |
126 | // Encode parameters in `pattern` where `pattern[0:7]` is `start` and |
127 | // `pattern[8:15]` is `length`. |
128 | // Ref: Intel Advanced Vector Extensions Programming Reference |
129 | uint64_t pattern = start & 0xFF; |
130 | pattern = pattern | ((length & 0xFF) << 8); |
131 | uint64_t result; |
132 | asm("bextrq %2, %1, %0" : "=r" (result) : "r" (value), "r" (pattern)); |
133 | return result; |
134 | #else |
135 | return _bextr_u64(value, start, length); |
136 | #endif |
137 | } |
138 | |
139 | static FOLLY_ALWAYS_INLINE uint64_t bzhi(uint64_t value, uint32_t index) { |
140 | #if defined(__GNUC__) || defined(__clang__) |
141 | // GCC and Clang won't inline the intrinsics. |
142 | const uint64_t index64 = index; |
143 | uint64_t result; |
144 | asm("bzhiq %2, %1, %0" : "=r" (result) : "r" (value), "r" (index64)); |
145 | return result; |
146 | #else |
147 | return _bzhi_u64(value, index); |
148 | #endif |
149 | } |
150 | }; |
151 | |
152 | enum class Type { |
153 | DEFAULT, |
154 | NEHALEM, |
155 | HASWELL, |
156 | }; |
157 | |
158 | inline Type detect() { |
159 | const static Type type = [] { |
160 | if (instructions::Haswell::supported()) { |
161 | VLOG(2) << "Will use folly::compression::instructions::Haswell" ; |
162 | return Type::HASWELL; |
163 | } else if (instructions::Nehalem::supported()) { |
164 | VLOG(2) << "Will use folly::compression::instructions::Nehalem" ; |
165 | return Type::NEHALEM; |
166 | } else { |
167 | VLOG(2) << "Will use folly::compression::instructions::Default" ; |
168 | return Type::DEFAULT; |
169 | } |
170 | }(); |
171 | return type; |
172 | } |
173 | |
174 | template <class F> |
175 | auto dispatch(Type type, F&& f) -> decltype(f(std::declval<Default>())) { |
176 | switch (type) { |
177 | case Type::HASWELL: |
178 | return f(Haswell()); |
179 | case Type::NEHALEM: |
180 | return f(Nehalem()); |
181 | case Type::DEFAULT: |
182 | return f(Default()); |
183 | } |
184 | |
185 | assume_unreachable(); |
186 | } |
187 | |
188 | template <class F> |
189 | auto dispatch(F&& f) -> decltype(f(std::declval<Default>())) { |
190 | return dispatch(detect(), std::forward<F>(f)); |
191 | } |
192 | |
193 | } // namespace instructions |
194 | } // namespace compression |
195 | } // namespace folly |
196 | |