我最近遇到了一个给定的问题:
向量中有8个元素,每个元素用int8_t表示。
在 x86_64 中实现一个算法,将两个这样的向量(每个向量以 uint64_t 类型表示)相加。
元素相加时应采用饱和运算(saturation arithmetic),即结果超出 int8_t 范围时取边界值。
例如:
80 + 60 = 127
(−40) + (−100) = −128
事实证明,最大的挑战是施加的限制:
我想不出任何符合这些限制的解决方案。 谁能给我一些提示?欢迎使用 C 语言示例。
我只能使用“标准”、传输、算术、逻辑指令和标准寄存器:
使用
paddsb
指令对字节向量执行带符号饱和加法。实现可能是这样的(假设是 amd64 sysv abi):
# MMX version (GNU as, AT&T syntax: source operand first, destination last).
# ABI: System V AMD64 -- a arrives in %rdi, b in %rsi, result leaves in %rax.
        movq    %rdi, %mm0              # move the first operand to an MMX register
        movq    %rsi, %mm1              # move the second operand to an MMX register
        paddsb  %mm1, %mm0              # mm0 += mm1, packed bytes with signed saturation
        movq    %mm0, %rax              # move the result back to a scalar register
        emms                            # end MMX mode
        ret                             # return to caller
没有MMX,可以使用下面的方法。这个想法是使用 SWAR 技术对所有字节并行执行以下算法:
/*
 * Signed saturating addition of two 8-bit values.
 *
 * Returns a + b clamped to the int8_t range: sums above 127 yield 127,
 * sums below -128 yield -128.
 */
int8_t addsb(int8_t a, int8_t b) {
    /* sum truncated to 8 bits; may have wrapped around */
    const int8_t wrapped = a + b;

    /* overflow is only possible when both operands have the same sign ... */
    const int same_sign = ((a ^ b) & 0x80) == 0;
    /* ... and it happened iff the truncated sum's sign differs from theirs */
    const int sign_flipped = ((a ^ wrapped) & 0x80) != 0;

    if (same_sign && sign_flipped) {
        /* clamp toward the side the operands were on */
        return (a & 0x80) ? INT8_MIN : INT8_MAX;
    }

    return wrapped;
}
以下代码未经测试但应该有效:
#-----------------------------------------------------------------------
# uint64_t paddsb(uint64_t a, uint64_t b)
#
# Adds eight packed int8_t lanes with signed saturation, using only
# general-purpose registers (SWAR).
#
# Syntax: GNU as, AT&T (source first, destination last)
# ABI:    System V AMD64
# In:     rdi = a, rsi = b
# Out:    rax = per-lane saturated sum
# Clobb:  rcx, rdx, rsi, rdi, r8, flags (no callee-saved register is used)
#
# The low seven bits of each lane are added on their own, so a carry can
# reach bit 7 of its lane but never the next lane; the sign bits are then
# folded in with XOR.  A lane overflowed iff a and b have the same sign
# and the wrapped sum has the opposite sign.
#-----------------------------------------------------------------------
paddsb: mov     $0x7f7f7f7f7f7f7f7f, %rdx   # L = low seven bits of every lane
        mov     %rdi, %rax
        and     %rdx, %rax                  # a & L
        mov     %rsi, %rcx
        and     %rdx, %rcx                  # b & L
        add     %rcx, %rax                  # 7-bit sums; bit 7 = carry into bit 7

        mov     %rdi, %rcx
        xor     %rsi, %rcx                  # a ^ b
        not     %rdx                        # H = 0x80 in every lane (sign bits)
        mov     %rcx, %r8
        and     %rdx, %r8                   # (a ^ b) & H
        xor     %r8, %rax                   # q = wrapped per-lane sum

        not     %rcx                        # ~(a ^ b): bit 7 set where signs agree
        xor     %rax, %rdi                  # a ^ q: bit 7 set where sign changed
        and     %rcx, %rdi
        and     %rdx, %rdi                  # 0x80 where overflow, 0x00 where not
        shr     $7, %rdi                    # 0x01 where overflow
        imul    $0xff, %rdi, %rdi           # 0xff where overflow, 0x00 where not

        and     %rdx, %rsi                  # sign bits of b only
        shr     $7, %rsi                    # 0x01 where b negative, 0x00 where not
        not     %rdx                        # back to L = 0x7f in every lane
        add     %rsi, %rdx                  # 0x80 where b negative, 0x7f where not
        and     %rdi, %rdx                  # saturation value, overflowed lanes only

        not     %rdi                        # 0x00 where overflow, 0xff where not
        and     %rdi, %rax                  # q, non-overflowed lanes only
        or      %rdx, %rax                  # merge: saturated sum of a and b
        ret
请注意,需要进行一些额外的处理,以避免进位从一个字节传播到下一个字节。
我是这样用C++写的:
#include <cstdint>
// Adds two vectors of eight int8_t lanes packed into uint64_t values,
// saturating each lane to [-128, 127] independently.
uint64_t add(uint64_t a, uint64_t b) {
    const uint64_t kSignBits = 0x8080808080808080ULL;  // bit 7 of every lane
    const uint64_t kLowBits = 0x7f7f7f7f7f7f7f7fULL;   // bits 0..6 of every lane

    const uint64_t signA = a & kSignBits;
    const uint64_t signB = b & kSignBits;

    // Add only the low 7 bits of each lane so no carry crosses a lane
    // boundary, then fold the operand sign bits back in to get the
    // 8-bit wrapped sum of every lane.
    uint64_t result = ((a & kLowBits) + (b & kLowBits)) ^ signA ^ signB;
    const uint64_t signSum = result & kSignBits;

    // Lanes where both operands were non-negative but the sum went negative.
    const uint64_t overflowHigh = signSum & ~(signA | signB);
    // Lanes where both operands were negative but the sum went non-negative.
    const uint64_t overflowLow = signA & signB & ~signSum;

    // Widen each 0x80 marker to a full-lane 0xFF mask.
    const uint64_t highLanes = (overflowHigh >> 7) * 0xFF;
    const uint64_t lowLanes = (overflowLow >> 7) * 0xFF;

    // Drop the wrapped value in overflowed lanes and substitute the limit.
    result &= ~(highLanes | lowLanes);
    result |= highLanes & kLowBits;   // 0x7f == +127
    result |= lowLanes & kSignBits;   // 0x80 == -128
    return result;
}
然后我去了 https://godbolt.org/ 看看各种编译器生成了什么。 clang-16 给出了 33 条指令:
# Compiler listing (Intel syntax: destination first) for the C++ add() above.
# NOTE(review): register roles below assume the System V AMD64 convention
# (rdi = a, rsi = b, result in rax) -- confirm against the build target.
# Variable names in the comments refer to the C++ source of add().
add(unsigned long, unsigned long):
movabs rdx, -9187201950435737472  # rdx = 0x8080808080808080 (sign bit of every byte)
mov rax, rdi
and rax, rdx  # rax = asigns
mov rcx, rsi
and rcx, rdx  # rcx = bsigns
movabs r8, 9187201950435737471  # r8 = 0x7f7f7f7f7f7f7f7f (low 7 bits of every byte)
mov r9, rdi
and r9, r8  # r9 = a with sign bits cleared (a ^ asigns)
and r8, rsi  # r8 = b with sign bits cleared (b ^ bsigns)
add r8, r9  # r8 = sum of the 7-bit parts; carries stay inside each byte
xor rax, rcx  # rax = asigns ^ bsigns
xor rax, r8  # rax = sum after the sign fix-up (8-bit wrapped sums)
or rsi, rdi  # rsi = a | b
not rsi  # rsi = ~(a | b)
and rdx, rsi  # rdx = sign mask where neither a nor b is negative
and rdx, r8  # rdx = first sat: both inputs non-negative, sum sign bit set
mov rsi, rdx
shr rsi, 7  # rsi = sat >> 7
mov r8, rdx
sub r8, rsi  # r8 = sat - (sat >> 7), i.e. (sat >> 7) * 127: 0x7f in saturating bytes
or r8, rax  # r8 = sum | 0x7f-bytes
xor r8, rdx  # clear the sat sign bits (known set in sum): bytes become 0x7f
not rax  # rax = ~sum
and rcx, rdi  # rcx = bsigns & a = asigns & bsigns
and rcx, rax  # rcx = second sat: both inputs negative, sum sign bit clear
mov rdx, rcx
shr rdx, 7  # rdx = sat >> 7
mov rax, rcx
sub rax, rdx  # rax = (sat >> 7) * 127: 0x7f in saturating bytes
not rax
and rax, r8  # clear the low 7 bits of the negatively saturating bytes
or rax, rcx  # set their sign bit: bytes become 0x80; rax = result
ret
您可以尝试各种其他选项。