这个问题的灵感来自于我最近在 Stackoverflow 上遇到的一个问题。它提醒我,在 x86-64 ISA 开发的早期,我曾编写过用于 BCD 加法的 32 位 x86 代码,而没有使用 x86 ISA 中提供的 BCD 支持指令。根据注释,这指的是在某些平台上可能称为“打包”BCD 的内容,即每个四位组(通常称为半字节,nibble)存储一个十进制数字。
虽然从性能角度来看,删除 DAA
和
DAS
指令(加法/减法后的十进制调整)是安全的,但最好手头有具体的证据,比如在当时的 Athlon 处理器上,如何在没有这些指令的情况下实现 BCD 加法。我当时的代码是从互联网深处挖掘出来的,稍作调整并应用了错误修复,包含在下面。请注意,当时的
RCR
并不是现在那样的性能杀手。我从今天的角度重新审视了这个问题,选择继续以 32 位块进行处理,以便于跨架构进行阐述。如何扩展到 64 位 ISA 上的 64 位块处理是显而易见的。从概念上讲,BCD 加法很简单。为了促进半字节间进位传播,将两个源操作数的二进制和的每个半字节临时加6。对于那些不产生进位的半字节,要么从半字节中减去 6(撤消方法),要么重复原始二进制加法,但不向该半字节添加 6(重做方法)。在 1999 年处理我的原始代码时,我发现使用重做方法会导致性能稍高,这可能是由于依赖链较短。 BCD 加法的主要开销是确定半字节间进位发生的位置。我的方法是检查下一个高半字节的 LSB:如果它的值与和位的初始值(两个源 LSB 的异或)不同,则必须发生进位,因此下一个低半字节产生了进位输出(carry-out)。一个有点丑陋的特殊情况是处理最高有效半字节的进位输出。当 Knuth 在 2011 年出版 TAOCP 的第 4a 卷时,我发现它包含了一种使用撤消方法进行 BCD 加法的算法,它通过检查两个源操作数中相应半字节的 MSB 以及调整后总和中该半字节的 MSB,来检测该半字节是否产生了半字节间进位。当后一个功能通过 3 取多数(majority-of-3)运算来表达时,这使得代码变得非常优雅;Knuth 更喜欢用更通用的名称“中位数”(median)来称呼该运算。
我通过实验发现,在多种架构中,重做方法通常优于撤消方法:x86、ARM、RISC-V、Sparc、Power。对于半字节进位检测,LSB 方法和 MSB 方法在性能方面通常看起来不相上下,但 Knuth 的 MSB 方法在能够利用附加逻辑运算更有效地实现中值运算的架构上更胜一筹(例如 ORN
、
ANDN
、
XNOR
、NOR
、NAND
)超出最小逻辑运算集(OR
、AND
、XOR
、NOT
)。这在现代 NVIDIA GPU 上尤其如此,它提供了 LOP3
指令,可以计算三个输入的任意逻辑运算。这会将中值运算(任何输入都可能取反)映射到
LOP3
的单个实例。
虽然我认为我的位操作(bit-twiddling)技能相当扎实,但过去的经验表明,与此处答案中提出的最佳解决方案相比,我通常只达到 85%。具体来说,我想知道:是否有更有效的方法来处理半字节进位问题?使用简单的 ISA(如 32 位 RISC-V)可能会提供对此问题的最佳见解。任何适用于该 ISA 的卓越解决方案都不太可能损害具有更广泛可用指令的 ISA。除此之外,任何可以减少半字节进位检测开销的聪明想法(如减少指令数或缩短依赖链所证明的那样)都是受欢迎的。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
/* build-time test configuration */
#define NUM_TESTS (100000000)
#define TEST_PORTABLE_BCD64 (1) // 0: x86-specific test of 18-digit BCD arithm.
// 1: portable test of 16-digit BCD arithmetic
#define PORTABLE (1) // 0: x86-specific inline assembly
#define NATIVE_64BIT (0) // applies only if PORTABLE==1
#define USE_KNUTH (1) // applies only if PORTABLE==1 && NATIVE_64BIT==0
#if PORTABLE && !NATIVE_64BIT
/* Bitwise majority-of-3: a result bit is set iff the corresponding bit is
   set in at least two of x, y, z. Knuth calls this the median operation. */
uint32_t median32 (uint32_t x, uint32_t y, uint32_t z)
{
    return (x & y) | (x & z) | (y & z);
}
/* 8-digit BCD addition with carry-out.
   x, y: packed-BCD operands, one decimal digit per 4-bit nibble.
   *cy : receives the carry-out of the most significant digit (0 or 1).
   Returns the 8-digit packed-BCD sum (mod 10**8). */
uint32_t BCD32_ADDcc (uint32_t x, uint32_t y, uint32_t *cy)
{
#if USE_KNUTH
// derived from Knuth, TAOCP 4a (2011), answer to exercise 100 section 7.1.3
uint32_t z, u, t;
z = y + 0x66666666; // bias every nibble by 6; cannot overflow for valid BCD
u = x + z; // probe sum: binary nibble carries now coincide with decimal carries
t = median32 (x, z, ~u) & 0x88888888; // nibble MSB set iff that nibble carried out
*cy = t >> 31; // carry-out from the most significant nibble
return (x + y) + (t - (t >> 2)); // t - (t >> 2) puts 6 into each carrying nibble
#else // ! USE_KNUTH
const uint32_t sixes = 0x66666666;
const uint32_t eights = 0x88888888;
uint32_t xe, s, u, carry_out;
xe = x + sixes; // add "nibble offset"; cannot overflow
s = xe ^ y; // preliminary sum bits
u = xe + y;
carry_out = u < xe; // carry-out from most significant nibble
s = (carry_out << 31) | ((s ^ u) >> 1); // sum bits with carry-in
s = s & eights; // isolate nibble lsbs with carry-in
*cy = carry_out; // record carry-out from most significant nibble
u = (x + y) + (s - (s >> 2)); // add "nibble offset" to nibbles that need it
return u;
#endif // USE_KNUTH
}
/* 8-digit BCD addition with carry-in.
   x, y: packed-BCD operands, one decimal digit per 4-bit nibble.
   cy : carry-in from a less significant chunk; must be 0 or 1.
   Returns the 8-digit packed-BCD sum (mod 10**8); any carry-out is
   discarded, so this variant is intended for the most significant chunk. */
uint32_t BCD32_ADDC (uint32_t x, uint32_t y, uint32_t cy)
{
#if USE_KNUTH
// derived from Knuth, TAOCP 4a (2011), answer to exercise 100 section 7.1.3
uint32_t z, u, t;
x = x + cy; // incorporate carry-in; cannot overflow for valid BCD
z = y + 0x66666666; // bias every nibble by 6
u = x + z; // probe sum: binary nibble carries now coincide with decimal carries
t = median32 (x, z, ~u) & 0x88888888; // nibble MSB set iff that nibble carried out
return (x + y) + (t - (t >> 2)); // t - (t >> 2) puts 6 into each carrying nibble
#else // !USE_KNUTH
const uint32_t sixes = 0x66666666;
const uint32_t eights = 0x88888888;
uint32_t xe, s, u, carry_out;
y = y + cy; // incorporate carry-in; cannot overflow
xe = x + sixes; // add "nibble offset"; cannot overflow
s = xe ^ y; // preliminary sum bits
u = xe + y;
carry_out = u < xe; // carry-out from most significant nibble
s = (carry_out << 31) | ((s ^ u) >> 1); // sum bits with carry-in
s = s & eights; // isolate nibble lsbs with carry-in
u = (x + y) + (s - (s >> 2)); // add "nibble offset" to nibbles that need it
return u;
#endif // USE_KNUTH
}
/* 8-digit BCD addition with carry-in and carry-out.
   x, y: packed-BCD operands, one decimal digit per 4-bit nibble.
   *cy : on entry, carry-in (must be 0 or 1); on exit, carry-out of the
         most significant digit (0 or 1). Intended for middle chunks of
         multi-chunk BCD numbers.
   Returns the 8-digit packed-BCD sum (mod 10**8). */
uint32_t BCD32_ADDCcc (uint32_t x, uint32_t y, uint32_t *cy)
{
#if USE_KNUTH
// derived from Knuth, TAOCP 4a (2011), answer to exercise 100 section 7.1.3
uint32_t z, u, t;
x = x + (*cy); // incorporate carry-in; cannot overflow for valid BCD
z = y + 0x66666666; // bias every nibble by 6
u = x + z; // probe sum: binary nibble carries now coincide with decimal carries
t = median32 (x, z, ~u) & 0x88888888; // nibble MSB set iff that nibble carried out
*cy = t >> 31; // carry-out from the most significant nibble
return (x + y) + (t - (t >> 2)); // t - (t >> 2) puts 6 into each carrying nibble
#else // !USE_KNUTH
const uint32_t sixes = 0x66666666;
const uint32_t eights = 0x88888888;
uint32_t xe, s, u, carry_out;
y = y + (*cy); // incorporate carry-in; cannot overflow
xe = x + sixes; // add "nibble offset"; cannot overflow
s = xe ^ y; // preliminary sum bits
u = xe + y;
carry_out = u < xe; // carry-out from most significant nibble
s = (carry_out << 31) | ((s ^ u) >> 1); // sum bits with carry-in
s = s & eights; // isolate nibble lsbs with carry-in
*cy = carry_out; // record carry-out from most significant nibble
u = (x + y) + (s - (s >> 2)); // add "nibble offset" to nibbles that need it
return u;
#endif // USE_KNUTH
}
#endif // PORTABLE && !NATIVE_64BIT
#if PORTABLE
#if NATIVE_64BIT
/* Bitwise majority-of-3: a result bit is set iff the corresponding bit is
   set in at least two of x, y, z. Knuth calls this the median operation. */
uint64_t median64 (uint64_t x, uint64_t y, uint64_t z)
{
    return (x & y) | (x & z) | (y & z);
}
/* 16-digit BCD addition (mod 10**16), processed as one 64-bit chunk.
   x, y: packed-BCD operands, one decimal digit per nibble. */
uint64_t BCD64_ADD (uint64_t x, uint64_t y)
{
    // derived from Knuth, TAOCP 4a (2011), answer to exercise 100 section 7.1.3
    uint64_t biased, probe, carries;
    biased  = y + 0x6666666666666666ULL;   // +6 in every nibble
    probe   = x + biased;                  // trial sum exposing decimal carries
    carries = median64 (x, biased, ~probe) & 0x8888888888888888ULL;
    // carries - (carries >> 2) places a 6 in exactly the nibbles that carried
    return (x + y) + (carries - (carries >> 2));
}
#else // !NATIVE_64BIT
/* 16-digit BCD addition (mod 10**16), performed as two 8-digit additions
   with the carry of the low half fed into the high half. */
uint64_t BCD64_ADD (uint64_t x, uint64_t y)
{
    uint32_t carry;
    const uint32_t xl = (uint32_t)x;
    const uint32_t xh = (uint32_t)(x >> 32);
    const uint32_t yl = (uint32_t)y;
    const uint32_t yh = (uint32_t)(y >> 32);
    const uint32_t lo = BCD32_ADDcc (xl, yl, &carry);
    const uint32_t hi = BCD32_ADDC (xh, yh, carry);
    return ((uint64_t)hi << 32) | lo;
}
#endif // NATIVE_64BIT
#else // !PORTABLE
/*
16-digit BCD addition; N. Juffa, about 1999. Code retrieved on 3/24/2024 from
https://web.archive.org/web/20001211131100/http://www.azillionmonkeys.com/qed/asmexample.html
Bug fixed and adjusted for Intel compiler's inline asm (may need -fasm-blocks)

Adds packed-BCD operands x and y (mod 10**16) in two 8-digit halves.
Per half ("redo" method): the sum biased by 0x66666666 exposes the decimal
carries; the XOR of preliminary and biased sum bits, shifted right by one,
isolates the nibbles that received a carry, and t3 - (t3 >> 2) adds 6 back
into exactly those nibbles. SBB captures the carry out of the low half and
RCR shifts it back in; the carry is then propagated into the high half.
*/
uint64_t BCD64_ADD (uint64_t x, uint64_t y)
{
uint64_t r;
__asm mov eax, dword ptr [x] ; // x (lo)
__asm mov ebx, dword ptr [y] ; // y (lo)
__asm mov edx, dword ptr [x+4] ; // x (hi)
__asm mov ecx, dword ptr [y+4] ; // y (hi)
// x in EDX:EAX, y in ECX:EBX. Add least significant 8 BCD digits
__asm mov esi, eax ; // x
__asm lea edi, [eax+66666666h] ; // x + 0x66666666
__asm xor esi, ebx ; // x ^ y
__asm add eax, ebx ; // x + y
__asm shr esi, 1 ; // t1 = (x ^ y) >> 1
__asm add edi, ebx ; // x + y + 0x66666666
__asm sbb ebx, ebx ; // capture carry: carry ? (-1) : 0
__asm rcr edi, 1 ; // t2 = (x + y + 0x66666666) >> 1
__asm xor esi, edi ; // t1 ^ t2
__asm and esi, 88888888h ; // t3 = (t1 ^ t2) & 0x88888888
__asm add eax, esi ; // x + y + t3
__asm shr esi, 2 ; // t3 >> 2
__asm sub eax, esi ; // x + y + t3 - (t3 >> 2)
// Add most significant 8 BCD digits
__asm sub ecx, ebx ; // y - carry // propagate carry
__asm mov esi, edx ; // x
__asm lea edi, [edx+66666666h] ; // x + 0x66666666
__asm xor esi, ecx ; // x ^ y
__asm add edx, ecx ; // x + y
__asm shr esi, 1 ; // t1 = (x ^ y) >> 1
__asm add edi, ecx ; // x + y + 0x66666666
// __asm sbb ebx, ebx ; capture carry: carry ? (-1) : 0
__asm rcr edi, 1 ; // t2 = (x + y + 0x66666666) >> 1
__asm xor esi, edi ; // t1 ^ t2
__asm and esi, 88888888h ; // t3 = (t1 ^ t2) & 0x88888888
__asm add edx, esi ; // x + y + t3
__asm shr esi, 2 ; // t3 >> 2
__asm sub edx, esi ; // x + y + t3 - (t3 >> 2)
// Now EDX:EAX = x + y
__asm mov dword ptr [r], eax ; // save result (lo)
__asm mov dword ptr [r+4], edx ; // save result (hi)
return r;
}
#endif // PORTABLE
/* Convert binary operand x in [0, 9999] to 4-digit packed BCD.
   Works in fixed point: one multiply scales x so the thousands digit
   lands above the binary point; each remaining digit is then peeled
   off by multiplying the fraction by 5 and shifting one bit less. */
uint32_t cvt_bin2bcd_4dig (uint32_t x)
{
    /* convert to 21:11 fixed point; 0x8312/2**14 ~= 2**11 / 10**3 */
    uint32_t acc = ((0x8312 * x) + 0x6000) >> 14;
    uint32_t res = acc >> 11;              // thousands digit
    int frac_bits = 11;
    /* pull out the remaining three decimal digits, one per iteration */
    for (int k = 0; k < 3; k++) {
        acc = (acc & ((1u << frac_bits) - 1)) * 5; // *10, keeping one less bit
        frac_bits--;
        res = (res << 4) | (acc >> frac_bits);     // append next digit
    }
    return res;
}
/* Convert binary operand x in [0, 99999999] to 8-digit packed BCD by
   splitting into two 4-digit groups. */
uint32_t cvt_bin2bcd_8dig (uint32_t x)
{
    const uint32_t upper = x / 10000;  // leading four decimal digits
    const uint32_t lower = x % 10000;  // trailing four decimal digits
    return (cvt_bin2bcd_4dig (upper) << 16) | cvt_bin2bcd_4dig (lower);
}
/* Convert binary operand x in [0, 9999999999999999] to 16-digit packed
   BCD by splitting into two 8-digit groups. */
uint64_t uint64_to_bcd64 (uint64_t x)
{
    const uint32_t upper = (uint32_t)(x / 100000000ULL); // leading 8 digits
    const uint32_t lower = (uint32_t)(x % 100000000ULL); // trailing 8 digits
    uint64_t res = cvt_bin2bcd_8dig (upper);
    return (res << 32) | cvt_bin2bcd_8dig (lower);
}
/* Return the most significant 32 bits of the full 64-bit product of two
   32-bit operands. */
uint32_t umul32hi (uint32_t a, uint32_t b)
{
    return (uint32_t)(((uint64_t)a * b) >> 32);
}
/* Convert packed-BCD operand x in [0, 99999999] to binary.
   Based on Knuth TAOCP 2, answer on p. 638 to exercise 19 in section 4.4.
   Three correction passes merge digit groups of width 1, 2, then 4: each
   pass subtracts a fixed-point multiple of the upper half of every group,
   turning base-10/100/10000 digit pairs into plain binary. */
uint32_t cvt_bcd2bin_8dig (uint32_t x)
{
    static const uint32_t group_mask[3] = {
        0xf0f0f0f0u,  // upper nibble of each byte
        0xff00ff00u,  // upper byte of each half-word
        0xffff0000u   // upper half-word of the word
    };
    static const uint32_t scale[3] = {
        0x60000000u,  // 0.375000000000
        0x9c000000u,  // 0.609375000000
        0xd8f00000u   // 0.847412109375
    };
    for (int pass = 0; pass < 3; pass++) {
        x = x - umul32hi (scale[pass], x & group_mask[pass]);
    }
    return x;
}
/* Convert 16-digit packed-BCD operand a in [0, 9999999999999999] to
   binary by converting each 8-digit half and recombining. */
uint64_t bcd64_to_uint64 (uint64_t a)
{
    const uint32_t upper = (uint32_t)(a >> 32); // leading 8 BCD digits
    const uint32_t lower = (uint32_t)a;         // trailing 8 BCD digits
    uint64_t res = cvt_bcd2bin_8dig (upper);
    return res * 100000000ULL + cvt_bcd2bin_8dig (lower);
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <[email protected]>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
/* KISS64 generator state; seed values as given in Marsaglia's posting */
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t; // scratch for MWC64
/* multiply-with-carry component, period 2^121+2^63-1 */
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
/* xorshift component, period 2^64-1 */
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
/* linear congruential component, period 2^64 */
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
/* combined generator: evaluates to the next 64-bit pseudo-random value */
#define KISS64 (MWC64 + XSH64 + CNG64)
#if TEST_PORTABLE_BCD64
/* Randomized self-test: compares 16-digit BCD addition against binary
   addition round-tripped through the BCD converters. */
int main (void)
{
    uint64_t bx, by, x, y, ref, res, i;
    printf ("Testing 16-digit BCD arithmetic\n");
    for (i = 0; i < NUM_TESTS; i++) {
        x = KISS64 % 10000000000000000ULL;
        y = KISS64 % 10000000000000000ULL;
        bx = uint64_to_bcd64 (x);
        by = uint64_to_bcd64 (y);
        res = BCD64_ADD (bx, by);
        ref = uint64_to_bcd64 ((x + y) % 10000000000000000ULL);
        if (res != ref) {
            /* cast to unsigned long long: %llx with a bare uint64_t is
               undefined on platforms where uint64_t is unsigned long */
            printf ("!!!! x=%016llx y=%016llx bx=%016llx by=%016llx res=%016llx ref=%016llx\n",
                    (unsigned long long)x, (unsigned long long)y,
                    (unsigned long long)bx, (unsigned long long)by,
                    (unsigned long long)res, (unsigned long long)ref);
            return EXIT_FAILURE;
        }
    }
    printf ("test PASSED\n");
    return EXIT_SUCCESS;
}
#else // TEST_PORTABLE_BCD64
/* Mirrors the x87 80-bit packed-BCD ("tbyte") layout: the 16 low decimal
   digits in low64, digits 17-18 plus the sign byte in high16. */
typedef struct ten_byte {
uint64_t low64;
uint16_t high16;
} ten_byte;
/* 18-digit BCD addition modulo 10**18 using the x87 FPU as a reference.
   Loads both operands with FBLD, adds them in extended precision, reduces
   modulo 10**18 via truncating divide/multiply/subtract, and stores the
   result back to packed BCD with FBSTP. The control word is temporarily
   set to round-toward-zero, 64-bit precision, and restored on exit. */
ten_byte bcd18dig_add_ref (ten_byte a, ten_byte b)
{
const double exp10_18 = 1e18; // exactly representable: 10**18 = 2**18 * 5**18, 5**18 < 2**53
const uint16_t cw_extended_chop = 0xf7f;
ten_byte aa, bb, r;
uint16_t cw_original;
aa = a; // workaround: bad code generated when data not local to function
bb = b;
__asm fstcw [cw_original] ;// save current CW
__asm fldcw [cw_extended_chop] ;// RC=truncate PC=extended
__asm fld qword ptr [exp10_18] ;// 10**18
__asm fbld tbyte ptr [aa] ;// a, 10**18
__asm fbld tbyte ptr [bb] ;// b, a, 10**18
__asm fadd st, st(1) ;// a+b, a, 10**18
__asm fst st(1) ;// a+b, a+b, 10**18
__asm fdiv st, st(2) ;// (a+b)/10**18, a+b, 10**18
__asm frndint ;// trunc((a+b)/10**18), a+b, 10**18
__asm fmulp st(2), st ;// a+b, trunc((a+b)/10**18)*10**18
__asm fsubrp st(1),st ;// a+b - trunc((a+b)/10**18)*10**18;
__asm fbstp tbyte ptr [r] ;// <empty>
__asm fldcw [cw_original] ;// restore original CW
return r;
}
/* 18-digit BCD addition modulo 10**18, processed in 8-digit chunks: the
   carry ripples low -> middle -> high; the high chunk holds only two
   digits, so its result is masked to one byte. */
ten_byte bcd18dig_add (ten_byte x, ten_byte y)
{
    ten_byte sum;
    uint32_t carry;
    const uint32_t xl = (uint32_t)(x.low64);
    const uint32_t xm = (uint32_t)(x.low64 >> 32);
    const uint32_t xh = (uint32_t)(x.high16);
    const uint32_t yl = (uint32_t)(y.low64);
    const uint32_t ym = (uint32_t)(y.low64 >> 32);
    const uint32_t yh = (uint32_t)(y.high16);
    const uint32_t rl = BCD32_ADDcc (xl, yl, &carry);
    const uint32_t rm = BCD32_ADDCcc(xm, ym, &carry);
    const uint32_t rh = BCD32_ADDC (xh, yh, carry) & 0xff; // keep digits 17-18
    sum.low64 = ((uint64_t)rm << 32) | rl;
    sum.high16 = (uint16_t)rh;
    return sum;
}
/* Randomized self-test: compares the chunked 18-digit BCD addition
   against the x87 FBLD/FBSTP reference implementation. */
int main (void)
{
    uint64_t x, y;
    ten_byte bcd_augend, bcd_addend, bcd_sum_ref, bcd_sum_res;
    printf ("Testing 18-digit BCD arithmetic against x87 FPU\n");
    for (int i = 0; i < NUM_TESTS; i++) {
        /* low 16 digits of each operand */
        x = KISS64 % 10000000000000000ULL;
        y = KISS64 % 10000000000000000ULL;
        bcd_augend.low64 = uint64_to_bcd64 (x);
        bcd_addend.low64 = uint64_to_bcd64 (y); // BUG FIX: was 'x', which left
                                                // 'y' unused and made the addend
                                                // always mirror the augend
        /* top two digits of each operand */
        x = KISS64 % 100ULL;
        y = KISS64 % 100ULL;
        bcd_augend.high16 = (uint16_t)uint64_to_bcd64 (x);
        bcd_addend.high16 = (uint16_t)uint64_to_bcd64 (y);
        bcd_sum_ref = bcd18dig_add_ref (bcd_augend, bcd_addend);
        bcd_sum_res = bcd18dig_add (bcd_augend, bcd_addend);
        /* compare only the 9 payload bytes (18 BCD digits) */
        if (memcmp (&bcd_sum_res, &bcd_sum_ref, 9) != 0) {
            /* cast to unsigned long long: %llx with a bare uint64_t is
               undefined on platforms where uint64_t is unsigned long */
            printf ("a = %02x%016llx\n", bcd_augend.high16,
                    (unsigned long long)bcd_augend.low64);
            printf ("b = %02x%016llx\n", bcd_addend.high16,
                    (unsigned long long)bcd_addend.low64);
            printf ("res = %02x%016llx\n", bcd_sum_res.high16,
                    (unsigned long long)bcd_sum_res.low64);
            printf ("ref = %02x%016llx\n", bcd_sum_ref.high16,
                    (unsigned long long)bcd_sum_ref.low64);
            return EXIT_FAILURE;
        }
    }
    printf ("test PASSED\n");
    return EXIT_SUCCESS;
}
#endif // TEST_PORTABLE_BCD64
我想要的只是与性能相关的答案。
计算任意位位置溢出的简单公式是已知的(参见 Hacker's Delight 示例章节)。 打包 BCD 的任何属性都不允许更快地发现溢出,AFAICT。