位运算计算奇偶校验的最快方法是什么？

Question

我的解决方案（对于输入块的每一位，都有这样一行）：

*parity ^= (((x[0] >> 30) & 0x00000001) * 0xc3e0d69f);

所有类型均为 uint32。该行获取输入 x 的第二位，将其移位到 LSB 并将所有其他位设置为零。然后，将 32 位奇偶校验与该位的相应奇偶校验集进行异或。

我发现这个乘法解决方案是执行条件异或的最快方法。有没有更快的方法？

Answer 1

请参阅 并行计算奇偶校验，了解一些用于计算字、字节等奇偶校验的巧妙技巧。

Answer 2

我不完全明白你的意思是什么样的奇偶校验，但是如果这行代码正在做你想要的事情，它可能会得到改进。

一般规则：对于 {0, 1} 中的 x x * N == -x & N

这是因为 -x 表示 0 表示所有位均重置，而表示 1 表示 -1 表示所有位均置位。

所以原来的代码行可以重写为：

*parity ^= (-((x[0] >> 30) & 0x00000001) & 0xc3e0d69f);

在许多微处理器上，哪两个运算的计算时间比乘法要短，但您应该检查一下。

代码也可以利用有符号右移

*parity ^= (((int32_t)x[0] << 1 >> 31) & 0xc3e0d69f);

首先将第 30 位移位到第 31 位，即符号位，然后第二次扩展所有其他位上的符号位，因为在大多数机器上右移充当下限（x / 2^N），从而填充移位位带符号位 (

abc...yz>>3 == aaaabc...yz

)。

但是这些技巧在 C 标准中被描述为“未定义行为”，因此“不可移植”。小心使用它们。有些处理器会为您执行此操作。请参阅 x86 的

奇偶校验标志

Answer 3

奇偶校验计算任务相当于计数个数。它也称为“计数设置位”、“总体计数”或简称为“popcount”。一些处理器有一个有效的指令来计算它（POPCNT，VCNT）。

Answer 4

可以通过内联汇编器或使用内置程序来访问它：

__builtin_popcount()/ __popcnt()/ std::bitset::count()

适用于

GCC

、

Visual Studio

和 C++。我个人更喜欢使用 __builtin_parity() 将这项工作交给编译器

。

如果我正确理解了问题，那么你正在做

Answer 5

for (i = 0; i < 32; i++)
    *parity ^= (((x[0] >> i) & 1) * SOME_CONST[i]);

如果是这样，最好使用查找表：

for (i = 0; i < 4; i++)
    *parity ^= PARITY_LUT[i][ (x[0] >> (i*8)) & 0xFF];

这将花费 256 KB，但速度会快得多。

stanford bithacks 代码非常次优。为了完整起见，这里是跨所有平台和所有环境的最佳代码：

Answer 6

#include <stdint.h>

#if !defined(__wasm__) && defined(__cplusplus) && defined(__cpp_lib_bitops)
#   include <bit>
#   define paritytree_parity32(x) (std::popcount(x) & 1)
#elif !defined(__wasm__) && defined(__has_builtin)
#   if __has_builtin(__builtin_parity)
#       if INT_MAX >= INT32_MAX
#           define paritytree_parity32(x) __builtin_parity(x)
#       else
#           define paritytree_parity32(x) __builtin_parityl(x)
#       endif
#   endif
#endif
#if !defined(__wasm__) && defined(_MSC_VER) && defined(_M_X64) && !defined(paritytree_parity32)
#   include <intrin.h>
#   define paritytree_parity32(x) __popcnt(x)
#elif !defined(paritytree_parity32)
    static inline uint32_t paritytree_parity32(uint32_t v) {
        // On x86: 2 lea, 2 and, 1 xor, 1 mul, 1 shr, and 0 mov(!!!)
        // On arm64: 2 and, 1 add, 1 eor, 1 mul, 1 lsr, and 1 mov(!!!)
        v = (v ^ (v << 1)) & 0xAAAAAAAA;
        return (v*5 & UINT32_C(0x88888888)) * UINT32_C(0x11111111) >> 31;
    }
#endif

这是用于显示程序集的测试 C 代码：

uint8_t fastParity(uint32_t x) {
    return paritytree_parity32(x);
}

在 x86_64 上，Clang 17 和 GCC 13 都产生了这个精彩的作品。请注意它如何使用 x86 的 16 位半寄存器来删除 2 条移动和移位指令。另请注意

setnp

，这是一个鲜为人知的 x86 gem，它获取受异或影响的奇偶校验标志寄存器的值。是的！，每个 16 位异或都根据其目标结果的奇偶校验设置奇偶校验标志。请参阅

https://c9x.me/x86/html/file_module_x86_id_288.html

。

; Clang 17 and GCC 13 on x86_64
fastParity:
        mov     eax, edi
        shr     eax, 16
        xor     eax, edi
        xor     al, ah
        setnp   al
        ret

在使用 MSVC 2019 的 x86_64 上，它会生成此内容。请注意，popcnt 至少需要 2008 Intel Nehalem（第 1 代），Windows 10 至少需要 2017 Intel Coffeelake（第 8 代），因此在不受支持的硬件上运行无需担心。

; MSVC 2019 on x86_64
fastParity PROC
        popcnt  eax, ecx
        ret     0
fastParity ENDP

arm64 上的 GCC 会产生以下结果：

; gcc 13 on arm64
fastParity:
        fmov    s0, w0
        cnt     v0.8b, v0.8b
        addv    b0, v0.8b
        fmov    w0, s0
        and     w0, w0, 1
        ret

arm64 上的 Clang 会产生以下结果：

; clang 17 on arm64
fastParity:                          // @fastParity
        eor     w8, w0, w0, lsr #16
        eor     w8, w8, w8, lsr #8
        eor     w8, w8, w8, lsr #4
        eor     w8, w8, w8, lsr #2
        eor     w8, w8, w8, lsr #1
        and     w0, w8, #0x1
        ret

arm64 上的 MSVC 会产生以下结果：

; MSVC 2019 on arm64
|fastParity| PROC
        eor         w8,w0,w0,lsl #1
        and         w8,w8,#0xAAAAAAAA
        add         w8,w8,w8,lsl #2
        and         w9,w8,#0x88888888
        mov         w8,#0x11111111
        mul         w8,w9,w8
        lsr         w0,w8,#0x1F
        ret

        ENDP  ; |fastParity|

生成的WebAssembly为：

; WebAssembly
 (func $fastParity (; 1 ;) (param $0 i32) (result i32)
  (i32.shr_u
   (i32.mul
    (i32.and
     (i32.mul
      (i32.and
       (i32.xor
        (i32.shl
         (get_local $0)
         (i32.const 1)
        )
        (get_local $0)
       )
       (i32.const -1431655766)
      )
      (i32.const 5)
     )
     (i32.const -2004318072)
    )
    (i32.const 286331153)
   )
   (i32.const 31)
  )
 )

这个 WebAssembly 没有内存或浮点访问，需要额外的检查器代码，所以它应该被编译到这个漂亮的小 x86_64 中：

; Webassembly ran on a x86_64
fastParity:
        lea     eax, [rdi+rdi]
        xor     eax, edi
        and     eax, -1431655766
        lea     eax, [rax+rax*4]
        and     eax, -2004318072
        imul    eax, eax, 286331153
        shr     eax, 31
        ret

这个在arm64上：

; Webassembly ran on an arm64
fastParity:
        eor     w9, w0, w0, lsl #1
        mov     w8, #286331153
        and     w9, w9, #0xaaaaaaaa
        add     w9, w9, w9, lsl #2
        and     w9, w9, #0x88888888
        mul     w8, w9, w8
        lsr     w0, w8, #31
        ret

为了进行比较，这里是斯坦福大学 bithacks 页面非常次优的 C 代码和生成的汇编。有 3 条额外的汇编指令，并且没有测试更快的特定于硬件的奇偶校验指令。

uint32_t stanfordParityDoNotUse(uint32_t v) {
    v ^= v >> 1;
    v ^= v >> 2;
    v = (v & 0x11111111U) * 0x11111111U;
    return (v >> 28) & 1;
}
stanfordParityDoNotUse:
        mov     edx, edi
        shr     edx
        xor     edx, edi
        mov     eax, edx
        shr     eax, 2
        xor     eax, edx
        and     eax, 286331153
        imul    eax, eax, 286331153
        shr     eax, 28
        and     eax, 1
        ret

位运算计算奇偶校验的最快方法是什么？

问题描述投票：0回答：6

6个回答

最新问题

位运算计算奇偶校验的最快方法是什么？

问题描述 投票：0回答：6

6个回答

最新问题

问题描述投票：0回答：6