具有内在函数的前导零计算

问题描述 投票:4回答:1

我正在尝试优化嵌入式系统(FLAC解码,Windows CE,ARM 926 MCU)中的某些代码。

default implementation使用宏和查找表:

/* counts the # of zero MSBs in a word */
#define COUNT_ZERO_MSBS(word) ( \
 (word) <= 0xffff ? \
  ( (word) <= 0xff? byte_to_unary_table[word] + 24 : \
              byte_to_unary_table[(word) >> 8] + 16 ) : \
  ( (word) <= 0xffffff? byte_to_unary_table[word >> 16] + 8 : \
              byte_to_unary_table[(word) >> 24] ) \
)

static const unsigned char byte_to_unary_table[] = {
    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

然而大多数CPU已经有专用指令,x86上的bsr和ARM(clz)上的http://www.devmaster.net/articles/fixed-point-optimizations/,应该更有效。

在Windows CE上,我们具有固有函数_CountLeadingZeros,该函数应仅调用该值。但是,它比宏慢4倍(以1000万次迭代计算)。

(应该)依赖于专用ASM指令的内在函数怎么可能慢4倍?

arm bit-manipulation windows-ce intrinsics leading-zero
1个回答
5
投票

检查拆卸。您确定编译器已插入指令吗?在“备注”部分中有以下文本:

© www.soinside.com 2019 - 2024. All rights reserved.