Bug:等效向量内部代码 c

问题描述 投票:0回答:1

目前我有一段 C 代码,它是用 Intel 内在函数编写的,它使用 AVX512BW + AVX512VL 和 SSE 指令。

尝试加载 16 字节数据的地方

__m128i input16 = _mm_loadu_epi8(len + diff);
__m128i match16 = _mm_loadu_epi8(len + 0);
_BitScanForward(&matchLength, (~(_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ))) | 0x10000));
len = len + (unsigned)matchLength;
if (matchLength != 16)
   break;

目前我想使用向量指令为同一组代码实现汇编版本。

vmovdqu8   xmm1, XMMWORD PTR [diff + 1 * len]
vmovdqu8   xmm0, XMMWORD PTR [cur + 1 * len]
pcmpeqb xmm1, xmm0
pmovmskb ecx, xmm1
not ecx
bts ecx, 16
bsf edx, ecx

上述代码的编译没有错误,但是在上述代码运行时执行时,执行突然停止,没有任何错误。无法调试,因为没有正确的方法来检查/打印错误。尝试以传统方式调试,仅添加部分代码,目前观察到仅在引入

pmovmskb ecx, xmm1
行时才会出现执行错误,前三行代码执行正常,没有错误。

对上述汇编代码片段的问题有什么想法吗?

使用微软汇编器:来自 VS 2022,ml64.exe。 {Microsoft (R) 宏汇编器 (x64) 版本 14.37.32824.0}

以上代码是以下代码部分中原始工作汇编代码中的 match_loop 的一部分。

; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
; 2021-07-21: Igor Pavlov : Public domain
;

ifndef x64
; x64=1
; .err <x64_IS_REQUIRED>
endif

include 7zAsm.asm

MY_ASM_START

_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'

MY_ALIGN macro num:req
        align  num
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm


t0_L    equ x0_L
t0_x    equ x0
t0      equ r0
t1_x    equ x3
t1      equ r3

cp_x    equ t1_x
cp_r    equ t1
m       equ x5
m_r     equ r5
len_x   equ x6
len     equ r6
diff_x  equ x7
diff    equ r7
len0    equ r10
len1_x  equ x11
len1    equ r11
maxLen_x equ x12
maxLen  equ r12
d       equ r13
ptr0    equ r14
ptr1    equ r15

d_lim       equ m_r
cycSize     equ len_x
hash_lim    equ len0
delta1_x    equ len1_x
delta1_r    equ len1
delta_x     equ maxLen_x
delta_r     equ maxLen
hash        equ ptr0
src         equ ptr1



if (IS_LINUX gt 0)

; r1 r2  r8 r9        : win32
; r7 r6  r2 r1  r8 r9 : linux

lenLimit        equ r8
lenLimit_x      equ x8
; pos_r           equ r2
pos             equ x2
cur             equ r1
son             equ r9

else

lenLimit        equ REG_ABI_PARAM_2
lenLimit_x      equ REG_ABI_PARAM_2_x
pos             equ REG_ABI_PARAM_1_x
cur             equ REG_ABI_PARAM_0
son             equ REG_ABI_PARAM_3

endif


if (IS_LINUX gt 0)
    maxLen_OFFS         equ  (REG_SIZE * (6 + 1))
else
    cutValue_OFFS       equ  (REG_SIZE * (8 + 1 + 4))
    d_OFFS              equ  (REG_SIZE + cutValue_OFFS)
    maxLen_OFFS         equ  (REG_SIZE + d_OFFS)
endif
    hash_OFFS           equ  (REG_SIZE + maxLen_OFFS)
    limit_OFFS          equ  (REG_SIZE + hash_OFFS)
    size_OFFS           equ  (REG_SIZE + limit_OFFS)
    cycPos_OFFS         equ  (REG_SIZE + size_OFFS)
    cycSize_OFFS        equ  (REG_SIZE + cycPos_OFFS)
    posRes_OFFS         equ  (REG_SIZE + cycSize_OFFS)
    
if (IS_LINUX gt 0)
else
    cutValue_PAR        equ  [r0 + cutValue_OFFS]
    d_PAR               equ  [r0 + d_OFFS]
endif
    maxLen_PAR          equ  [r0 + maxLen_OFFS]
    hash_PAR            equ  [r0 + hash_OFFS]
    limit_PAR           equ  [r0 + limit_OFFS]
    size_PAR            equ  [r0 + size_OFFS]
    cycPos_PAR          equ  [r0 + cycPos_OFFS]
    cycSize_PAR         equ  [r0 + cycSize_OFFS]
    posRes_PAR          equ  [r0 + posRes_OFFS]


    cutValue_VAR        equ  DWORD PTR [r4 + 8 * 0]
    cutValueCur_VAR     equ  DWORD PTR [r4 + 8 * 0 + 4]
    cycPos_VAR          equ  DWORD PTR [r4 + 8 * 1 + 0]
    cycSize_VAR         equ  DWORD PTR [r4 + 8 * 1 + 4]
    hash_VAR            equ  QWORD PTR [r4 + 8 * 2]
    limit_VAR           equ  QWORD PTR [r4 + 8 * 3]
    size_VAR            equ  QWORD PTR [r4 + 8 * 4]
    distances           equ  QWORD PTR [r4 + 8 * 5]
    maxLen_VAR          equ  QWORD PTR [r4 + 8 * 6]

    Old_RSP             equ  QWORD PTR [r4 + 8 * 7]
    LOCAL_SIZE          equ  8 * 8

COPY_VAR_32 macro dest_var, src_var
        mov     x3, src_var
        mov     dest_var, x3
endm

COPY_VAR_64 macro dest_var, src_var
        mov     r3, src_var
        mov     dest_var, r3
endm


; MY_ALIGN_64
MY_PROC GetMatchesSpecN_2, 13
MY_PUSH_PRESERVED_ABI_REGS
        mov     r0, RSP
        lea     r3, [r0 - LOCAL_SIZE]
        and     r3, -64
        mov     RSP, r3
        mov     Old_RSP, r0

if (IS_LINUX gt 0)
        mov     d,            REG_ABI_PARAM_5       ; r13 = r9
        mov     cutValue_VAR, REG_ABI_PARAM_4_x     ;     = r8
        mov     son,          REG_ABI_PARAM_3       ;  r9 = r1
        mov     r8,           REG_ABI_PARAM_2       ;  r8 = r2
        mov     pos,          REG_ABI_PARAM_1_x     ;  r2 = x6
        mov     r1,           REG_ABI_PARAM_0       ;  r1 = r7
else
        COPY_VAR_32 cutValue_VAR, cutValue_PAR
        mov     d, d_PAR
endif

        COPY_VAR_64 limit_VAR, limit_PAR
        
        mov     hash_lim, size_PAR
        mov     size_VAR, hash_lim
        
        mov     cp_x, cycPos_PAR
        mov     hash, hash_PAR

        mov     cycSize, cycSize_PAR
        mov     cycSize_VAR, cycSize
        
        ; we want cur in (rcx). So we change the cur and lenLimit variables
        sub     lenLimit, cur
        neg     lenLimit_x
        inc     lenLimit_x
        
        mov     t0_x, maxLen_PAR
        sub     t0, lenLimit
        mov     maxLen_VAR, t0

        jmp     main_loop

MY_ALIGN_64
fill_empty:
        ; ptr0 = *ptr1 = kEmptyHashValue;
        mov     QWORD PTR [ptr1], 0
        inc     pos
        inc     cp_x
        mov     DWORD PTR [d - 4], 0
        cmp     d, limit_VAR
        jae     fin
        cmp     hash, hash_lim
        je      fin

; MY_ALIGN_64
main_loop:
        ; UInt32 delta = *hash++;
        mov     diff_x, [hash]  ; delta
        add     hash, 4
        ; mov     cycPos_VAR, cp_x
       
        inc     cur
        add     d, 4
        mov     m, pos
        sub     m, diff_x;      ; matchPos
        
        ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
        lea     ptr1, [son + 8 * cp_r]
        ; mov     cycSize, cycSize_VAR
        cmp     pos, cycSize
        jb      directMode      ; if (pos < cycSize_VAR)
        
        ; CYC MODE

        cmp     diff_x, cycSize
        jae     fill_empty      ; if (delta >= cycSize_VAR)
        
        xor     t0_x, t0_x
        mov     cycPos_VAR, cp_x
        sub     cp_x, diff_x
        ; jae     prepare_for_tree_loop
        ; add     cp_x, cycSize
        cmovb   t0_x, cycSize
        add     cp_x, t0_x      ; cp_x +=  (cycPos < delta ? cycSize : 0)
        jmp     prepare_for_tree_loop
        
        
directMode:
        cmp     diff_x,  pos
        je      fill_empty      ; if (delta == pos)
        jae     fin_error       ; if (delta >= pos)
        
        mov     cycPos_VAR, cp_x
        mov     cp_x, m
        
prepare_for_tree_loop:
        mov     len0, lenLimit
        mov     hash_VAR, hash
        ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
        lea     ptr0, [ptr1 + 4]
        ; UInt32 *_distances = ++d;
        mov     distances, d

        neg     len0
        mov     len1, len0

        mov     t0_x, cutValue_VAR
        mov     maxLen, maxLen_VAR
        mov     cutValueCur_VAR, t0_x

MY_ALIGN_32
tree_loop:
        neg     diff
        mov     len, len0
        cmp     len1, len0
        cmovb   len, len1       ; len = (len1 < len0 ? len1 : len0);
        add     diff, cur

        mov     t0_x, [son + cp_r * 8]  ; prefetch
        movzx   t0_x, BYTE PTR [diff + 1 * len]
        lea     cp_r, [son + cp_r * 8]
        cmp     [cur + 1 * len], t0_L
        je      matched_1
        
        jb      left_0

        mov     [ptr1], m
        mov        m, [cp_r + 4]
        lea     ptr1, [cp_r + 4]
        sub     diff, cur ; FIX32
        jmp     next_node

MY_ALIGN_32
left_0:
        mov     [ptr0], m
        mov        m, [cp_r]
        mov     ptr0, cp_r
        sub     diff, cur ; FIX32
        ; jmp     next_node

; ------------ NEXT NODE ------------
; MY_ALIGN_32
next_node:
        mov     cycSize, cycSize_VAR
        dec     cutValueCur_VAR
        je      finish_tree
        
        add     diff_x, pos     ; prev_match = pos + diff
        cmp     m, diff_x
        jae     fin_error       ; if (new_match >= prev_match)
        
        mov     diff_x, pos
        sub     diff_x, m       ; delta = pos - new_match
        cmp     pos, cycSize
        jae     cyc_mode_2      ; if (pos >= cycSize)

        mov     cp_x, m
        test    m, m
        jne     tree_loop       ; if (m != 0)
        
finish_tree:
        ; ptr0 = *ptr1 = kEmptyHashValue;
        mov     DWORD PTR [ptr0], 0
        mov     DWORD PTR [ptr1], 0

        inc     pos
        
        ; _distances[-1] = (UInt32)(d - _distances);
        mov     t0, distances
        mov     t1, d
        sub     t1, t0
        shr     t1_x, 2
        mov     [t0 - 4], t1_x

        cmp     d, limit_VAR
        jae     fin             ; if (d >= limit)
   
        mov     cp_x, cycPos_VAR
        mov     hash, hash_VAR
        mov     hash_lim, size_VAR
        inc     cp_x
        cmp     hash, hash_lim
        jne     main_loop       ; if (hash != size)
        jmp     fin
        

MY_ALIGN_32
cyc_mode_2:
        cmp     diff_x, cycSize
        jae     finish_tree     ; if (delta >= cycSize)

        mov     cp_x, cycPos_VAR
        xor     t0_x, t0_x
        sub     cp_x, diff_x    ; cp_x = cycPos - delta
        cmovb   t0_x, cycSize
        add     cp_x, t0_x      ; cp_x += (cycPos < delta ? cycSize : 0)
        jmp     tree_loop

        
MY_ALIGN_32
matched_1:

        inc     len
        ; cmp     len_x, lenLimit_x
        je      short lenLimit_reach
        movzx   t0_x, BYTE PTR [diff + 1 * len]
        cmp     [cur + 1 * len], t0_L
        jne     mismatch

        
MY_ALIGN_32
match_loop:
        ;  while (++len != lenLimit)  (len[diff] != len[0]) ;

        inc     len
        ; cmp     len_x, lenLimit_x
        je      short lenLimit_reach
        movzx   t0_x, BYTE PTR [diff + 1 * len]
        cmp     BYTE PTR [cur + 1 * len], t0_L
        je      match_loop

mismatch:
        jb      left_2

        mov     [ptr1], m
        mov        m, [cp_r + 4]
        lea     ptr1, [cp_r + 4]
        mov     len1, len

        jmp     max_update
        
MY_ALIGN_32
left_2:
        mov     [ptr0], m
        mov        m, [cp_r]
        mov     ptr0, cp_r
        mov     len0, len

max_update:
        sub     diff, cur       ; restore diff

        cmp     maxLen, len
        jae     next_node
        
        mov     maxLen, len
        add     len, lenLimit
        mov     [d], len_x
        mov     t0_x, diff_x
        not     t0_x
        mov     [d + 4], t0_x
        add     d, 8
       
        jmp     next_node


        
MY_ALIGN_32
lenLimit_reach:

        mov     delta_r, cur
        sub     delta_r, diff
        lea     delta1_r, [delta_r - 1]

        mov     t0_x, [cp_r]
        mov     [ptr1], t0_x
        mov     t0_x, [cp_r + 4]
        mov     [ptr0], t0_x

        mov     [d], lenLimit_x
        mov     [d + 4], delta1_x
        add     d, 8

        ; _distances[-1] = (UInt32)(d - _distances);
        mov     t0, distances
        mov     t1, d
        sub     t1, t0
        shr     t1_x, 2
        mov     [t0 - 4], t1_x

        mov     hash, hash_VAR
        mov     hash_lim, size_VAR

        inc     pos
        mov     cp_x, cycPos_VAR
        inc     cp_x

        mov     d_lim, limit_VAR
        mov     cycSize, cycSize_VAR
        ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
        ;    break;
        cmp     hash, hash_lim
        je      fin
        cmp     d, d_lim
        jae     fin
        cmp     delta_x, [hash]
        jne     main_loop
        movzx   t0_x, BYTE PTR [diff]
        cmp     [cur], t0_L
        jne     main_loop

        ; jmp     main_loop     ; bypass for debug
        
        mov     cycPos_VAR, cp_x
        shl     len, 3          ; cycSize * 8
        sub     diff, cur       ; restore diff
        xor     t0_x, t0_x
        cmp     cp_x, delta_x   ; cmp (cycPos_VAR, delta)
        lea     cp_r, [son + 8 * cp_r]  ; dest
        lea     src, [cp_r + 8 * diff]
        cmovb   t0, len         ; t0 =  (cycPos_VAR < delta ? cycSize * 8 : 0)
        add     src, t0
        add     len, son        ; len = son + cycSize * 8

       
MY_ALIGN_32
long_loop:
        add     hash, 4
        
        ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
        
        mov     t0, [src]
        add     src, 8
        mov     [cp_r], t0
        add     cp_r, 8
        cmp     src, len
        cmove   src, son       ; if end of (son) buffer is reached, we wrap to begin

        mov     DWORD PTR [d], 2
        mov     [d + 4], lenLimit_x
        mov     [d + 8], delta1_x
        add     d, 12

        inc     cur

        cmp     hash, hash_lim
        je      long_footer
        cmp     delta_x, [hash]
        jne     long_footer
        movzx   t0_x, BYTE PTR [diff + 1 * cur]
        cmp     [cur], t0_L
        jne     long_footer
        cmp     d, d_lim
        jb      long_loop

long_footer:
        sub     cp_r, son
        shr     cp_r, 3
        add     pos, cp_x
        sub     pos, cycPos_VAR
        mov     cycSize, cycSize_VAR
        
        cmp     d, d_lim
        jae     fin
        cmp     hash, hash_lim
        jne     main_loop
        jmp     fin



fin_error:
        xor     d, d
        
fin:
        mov     RSP, Old_RSP
        mov     t0, [r4 + posRes_OFFS]
        mov     [t0], pos
        mov     r0, d

MY_POP_PRESERVED_ABI_REGS
MY_ENDP

_TEXT$LZFINDOPT ENDS

end

`

期望有一个正确的工作装配版本

x86-64 masm intrinsics avx2 avx512
1个回答
0
投票

使用 Visual Studio 的以下独立代码,我已经测试了所需汇编代码片段的工作。

; 64bit - MASMTest.asm - add two integers

;.386
;.model flat,stdcall
;.stack 4096
;ExitProcess proto,dwExitCode:dword

.data
numbers db 17 dup(15h)
len dword ?
temp dword ?
sum qword ?

.code
main PROC   
    MOV RAX, 7
    ADD RAX, 4
    MOV SUM, RAX
    mov numbers[16], 45h
    vmovdqu    xmm1, XMMWORD PTR numbers
    vmovdqu    xmm0, XMMWORD PTR [numbers + 1]
    pcmpeqb    xmm1, xmm0
    pmovmskb   EAX, xmm1
    not        EAX
    bts        EAX, 16
    bsf        EAX, EAX

    ;mov           EAX, EBX
    add        len, EAX

    cmp        EAX, 16
    jne        exit_process
    ret

    ;invoke ExitProcess, 0 only for 32-bit

    main ENDP
    ;end main

    exit_process:
    ; Perform an exit system call (syscall number 60 on x86-64)
    mov       rax, 60         ; syscall number for sys_exit
    xor       rdi, rdi        ; status 0
    syscall             ; make syscall

    END
© www.soinside.com 2019 - 2024. All rights reserved.