1 | #ifndef SIMDJSON_ARM64_BITMASK_H |
2 | #define SIMDJSON_ARM64_BITMASK_H |
3 | |
4 | namespace simdjson { |
5 | namespace SIMDJSON_IMPLEMENTATION { |
6 | namespace { |
7 | |
//
// Compute the running (prefix) XOR of a 64-bit mask, scanning from the least
// significant bit upward: output bit i is the XOR of input bits 0 through i.
// Put differently, every 1 in the input toggles all output bits from its own
// position up to the top of the word.
//
// Example (most significant bit written first):
//   prefix_xor(00100100) == 00011100
//
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
  // Alternative that was considered and deliberately left out: a single
  // polynomial multiply by all-ones,
  //     vmull_p64(-1ULL, bitmask)
  // (PMULL; it would need an __ARM_FEATURE_CRYPTO guard because some ARM
  // processors lack the extension). It is apparently slow.
  //
  // Analysis by @sebpop:
  //   Diffing the assembly for src/stage1_find_marks.cpp shows the eors spread
  //   out in between other vector code, so the extra cycles of the shift/xor
  //   sequence effectively do not matter: the GPR units are otherwise idle and
  //   the critical path is on the FP side.
  //   PMULL would also require two extra fmovs: GPR->FP (3 cycles on N1,
  //   5 cycles on A72) and FP->GPR (2 cycles on N1, 5 cycles on A72).

  // Doubling scan: once the step with shift k has run, each bit holds the XOR
  // of the 2*k input bits ending at its position. Six steps cover all 64 bits.
  uint64_t running = bitmask;
  running = running ^ (running << 1);
  running = running ^ (running << 2);
  running = running ^ (running << 4);
  running = running ^ (running << 8);
  running = running ^ (running << 16);
  running = running ^ (running << 32);
  return running;
}
35 | |
36 | } // unnamed namespace |
37 | } // namespace arm64 |
38 | } // namespace simdjson |
39 | |
40 | #endif |
41 | |