将两个向量（uint64_t 类型）相加，并对每个 int8_t 元素做饱和处理

Question

我最近遇到了一个给定的问题：

向量中有8个元素，每个元素用int8_t表示。

在 x86_64 中实现一个算法，将两个这样的向量（uint64_t 类型）逐元素相加。

元素相加时应使用饱和算术（结果超出 int8_t 范围时取边界值）。

例如：

80 + 60 = 127

(−40) + (−100) = −128

事实证明，最大的挑战是施加的限制：

除 ret 外不得使用任何条件指令：不能使用条件跳转、条件传送（cmov）、条件置位（setcc）等
解决方案不能超过48条指令（存在小于37条指令的解决方案）

我想不出任何符合这些限制的解决方案。谁能给我一些提示？欢迎使用 C 语言示例。

我只能使用“标准”、传输、算术、逻辑指令和标准寄存器：

mov cbw/cwde/cdqe cwd/cdq/cqo movzx movsx
add sub imul mul idiv div inc dec neg
and or xor not sar sarx shr shrx shl shlx ror rol
lea ret

Answer 1

使用

paddsb

指令对字节向量做有符号饱和加法。实现可能是这样的（假设使用 amd64 SysV ABI）：

    # Saturating signed add of eight packed int8_t lanes using MMX.
    # AT&T syntax (source, destination); System V AMD64 ABI:
    # a arrives in %rdi, b in %rsi, the result is returned in %rax.
    movq    %rdi, %mm0  # move the first operand to an MMX register
    movq    %rsi, %mm1  # move the second operand to an MMX register
    paddsb  %mm1, %mm0  # mm0 += mm1, packed bytes with signed saturation
    movq    %mm0, %rax  # move the result back to a scalar register
    emms                # end MMX mode so later x87 code sees a clean FPU state
    ret                 # return to caller

没有MMX，可以使用下面的方法。这个想法是使用 SWAR 技术对所有字节并行执行以下算法：

/*
 * Scalar reference for one lane: add two int8_t values with signed
 * saturation.  Returns a + b clamped to the range [-128, 127].
 */
int8_t addsb(int8_t a, int8_t b) {
    int8_t wrapped = a + b;

    /* overflow is only possible when both operands share a sign ... */
    int same_sign = ((a ^ b) & 0x80) == 0;
    /* ... and it happened if the wrapped sum's sign differs from a's */
    int sign_flipped = ((a ^ wrapped) & 0x80) != 0;

    if (!same_sign || !sign_flipped)
        return wrapped;

    /* clamp toward the sign of the operands */
    if (a & 0x80)
        return 0x80;
    return 0x7f;
}

以下代码未经测试但应该有效：

# uint64_t paddsb(uint64_t a, uint64_t b)
# Adds the eight int8_t lanes of a and b with signed saturation (SWAR).
# Syntax: AT&T.  ABI: System V AMD64 (a = %rdi, b = %rsi, result = %rax).
# Clobbers only caller-saved registers: %rcx, %rdx, %rsi, %rdi, %r8, flags.
# Uses no conditional instructions.
#
# The sign bits are stripped before the add so that a carry out of the low
# 7 bits of a lane lands in that lane's own bit 7 and can never spill into
# the neighbouring lane; the true bit 7 of each sum is then restored by xor.
paddsb: mov     $0x7f7f7f7f7f7f7f7f, %rdx       # low-7-bit mask for every lane
        mov     %rdi, %rax
        and     %rdx, %rax                      # a with sign bits cleared
        mov     %rsi, %rcx
        and     %rdx, %rcx                      # b with sign bits cleared
        add     %rcx, %rax                      # 7-bit sums, carry stays inside each lane
        mov     %rdi, %rcx
        xor     %rsi, %rcx                      # a ^ b
        not     %rdx                            # 0x80 in every lane (sign-bit mask)
        mov     %rcx, %r8
        and     %rdx, %r8                       # sign bits of a ^ b
        xor     %r8, %rax                       # q = a + b, wrapped per lane
        not     %rcx                            # ~(a ^ b): bit 7 set where a and b share a sign
        xor     %rax, %rdi                      # a ^ q: bit 7 set where q's sign differs from a's
        and     %rcx, %rdi
        and     %rdx, %rdi                      # 0x80 where overflow, 0x00 where not
        shr     $7, %rdi                        # 0x01 where overflow
        imul    $0xff, %rdi, %rdi               # 0xff where overflow, 0x00 where not
        and     %rdx, %rsi                      # 0x80 where b negative
        shr     $7, %rsi                        # 0x01 where b negative, 0x00 where not
        not     %rdx                            # back to 0x7f in every lane
        add     %rsi, %rdx                      # 0x80 where b negative, 0x7f where not
        and     %rdi, %rdx                      # saturation value, only in overflowed lanes
        not     %rdi                            # 0x00 where overflow, 0xff where not
        and     %rdi, %rax                      # q, only in lanes that did not overflow
        or      %rdx, %rax                      # merge: saturated signed sum of a and b
        ret

请注意，需要做一些额外的处理，以避免进位从一个字节传播到下一个字节。

Answer 2

我是这样用C++写的：

#include <cstdint>

// Adds the eight int8_t lanes packed in a and b with signed saturation.
// Each lane of the result is the lane sum clamped to [-128, 127].
uint64_t add(uint64_t a, uint64_t b) {
    const uint64_t kSignBits = 0x8080808080808080L;

    const uint64_t signA = a & kSignBits;
    const uint64_t signB = b & kSignBits;

    // Add only the low 7 bits of every lane so no carry crosses a lane
    // boundary, then fold the sign bits back in to get the wrapped sums.
    uint64_t result = ((a & ~kSignBits) + (b & ~kSignBits)) ^ signA ^ signB;
    const uint64_t signSum = result & kSignBits;

    // Both inputs non-negative but the sum looks negative: clamp to 0x7f.
    const uint64_t overflowHigh = signSum & ~signA & ~signB;
    // Both inputs negative but the sum looks non-negative: clamp to 0x80.
    const uint64_t overflowLow = signA & signB & ~signSum;

    result = (result | ((overflowHigh >> 7) * 127)) & ~overflowHigh;
    result = (result & ~((overflowLow >> 7) * 127)) | overflowLow;
    return result;
}

然后我去了 https://godbolt.org/ 看看各种编译器生成了什么。 clang-16 给出了 33 条指令：

# Compiler listing (Intel syntax: destination first) for the C++ add() above.
# Inputs are read from rdi (a) and rsi (b); the result is left in rax.
add(unsigned long, unsigned long):
        movabs  rdx, -9187201950435737472   # rdx = 0x8080808080808080 (sign bit of every byte)
        mov     rax, rdi
        and     rax, rdx                    # rax = asigns
        mov     rcx, rsi
        and     rcx, rdx                    # rcx = bsigns
        movabs  r8, 9187201950435737471     # r8 = 0x7f7f7f7f7f7f7f7f (low 7 bits of every byte)
        mov     r9, rdi
        and     r9, r8                      # r9 = a with sign bits cleared
        and     r8, rsi                     # r8 = b with sign bits cleared
        add     r8, r9                      # r8 = 7-bit sums (carry stays within each byte)
        xor     rax, rcx                    # rax = asigns ^ bsigns
        xor     rax, r8                     # rax = sum, wrapped per byte
        or      rsi, rdi
        not     rsi                         # rsi = ~(a | b)
        and     rdx, rsi                    # sign-bit positions where both a and b are non-negative
        and     rdx, r8                     # rdx = sat: those positions where the sum went negative
        mov     rsi, rdx
        shr     rsi, 7
        mov     r8, rdx
        sub     r8, rsi                     # r8 = sat - (sat >> 7): 0x7f in each saturating byte
        or      r8, rax                     # force low 7 bits to ones in those bytes
        xor     r8, rdx                     # clear their sign bit -> 0x7f; r8 = sum after high clamp
        not     rax                         # rax = ~sum (the wrapped sum, before clamping)
        and     rcx, rdi                    # rcx = asigns & bsigns
        and     rcx, rax                    # rcx = sat: both negative but the sum came out non-negative
        mov     rdx, rcx
        shr     rdx, 7
        mov     rax, rcx
        sub     rax, rdx                    # rax = 0x7f in each byte that saturates low
        not     rax
        and     rax, r8                     # clear the low 7 bits of those bytes
        or      rax, rcx                    # set their sign bit -> 0x80
        ret

您可以尝试各种其他选项。

为每个 int8_t 元素添加两个具有饱和度的向量（uint64_t 类型）

问题描述投票：0回答：2

2个回答

最新问题

为每个 int8_t 元素添加两个具有饱和度的向量（uint64_t 类型）

问题描述 投票：0回答：2

2个回答

最新问题

问题描述投票：0回答：2