#ifndef SIMDJSON_ARM64_BITMASK_H
#define SIMDJSON_ARM64_BITMASK_H

namespace simdjson {
namespace SIMDJSON_IMPLEMENTATION {
namespace {

//
// Perform a "cumulative bitwise xor," flipping bits each time a 1 is encountered.
//
// For example, prefix_xor(00100100) == 00011100
//
simdjson_inline uint64_t prefix_xor(uint64_t bitmask) {
  /////////////
  // We could do this with PMULL, but it is apparently slow.
  //
  //#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
  //return vmull_p64(-1ULL, bitmask);
  //#else
  // Analysis by @sebpop:
  // When diffing the assembly for src/stage1_find_marks.cpp, I see that the eors are all spread
  // out in between other vector code, so effectively the extra cycles of the sequence do not
  // matter: the GPR units are otherwise idle and the critical path is on the FP side.
  // Also, PMULL requires two extra fmovs: GPR->FP (3 cycles on N1, 5 cycles on A72)
  // and FP->GPR (2 cycles on N1, 5 cycles on A72).
  /////////////
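  // Parallel-prefix xor: each shift-xor step doubles the span of bits folded
  // into every position, so six steps cover all 64 bits (log2(64) == 6).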
  bitmask ^= bitmask << 1;
  bitmask ^= bitmask << 2;
  bitmask ^= bitmask << 4;
  bitmask ^= bitmask << 8;
  bitmask ^= bitmask << 16;
  bitmask ^= bitmask << 32;
  return bitmask;
}
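
// Illustrative check (a minimal sketch, not part of the simdjson build): the
// example above implies the following self-contained program would pass.
//
//   #include <cassert>
//   #include <cstdint>
//   int main() {
//     // 00100100 -> 00011100, as in the comment above prefix_xor.
//     assert(prefix_xor(0x24ULL) == 0x1CULL);
//     return 0;
//   }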

} // unnamed namespace
} // namespace SIMDJSON_IMPLEMENTATION
} // namespace simdjson

#endif // SIMDJSON_ARM64_BITMASK_H