目前我有一段 C 代码,它是用 Intel 内在函数编写的,它使用 AVX512BW + AVX512VL 和 SSE 指令。
尝试加载 16 字节数据的地方
__m128i input16 = _mm_loadu_epi8(len + diff);
__m128i match16 = _mm_loadu_epi8(len + 0);
_BitScanForward(&matchLength, (~(_mm_movemask_epi8( _mm_cmpeq_epi8( input16, match16 ))) | 0x10000));
len = len + (unsigned)matchLength;
if (matchLength != 16)
break;
目前我想使用向量指令为同一组代码实现汇编版本。
vmovdqu8 xmm1, XMMWORD PTR [diff + 1 * len]
vmovdqu8 xmm0, XMMWORD PTR [cur + 1 * len]
pcmpeqb xmm1, xmm0
pmovmskb ecx, xmm1
not ecx
bts ecx, 16
bsf edx, ecx
上述代码的编译没有错误,但是在上述代码运行时执行时,执行突然停止,没有任何错误。无法调试,因为没有正确的方法来检查/打印错误。尝试以传统方式调试,仅添加部分代码,目前观察到仅在引入
pmovmskb ecx, xmm1
行时才会出现执行错误,前三行代码执行正常,没有错误。
对上述汇编代码片段的问题有什么想法吗?
使用微软汇编器:来自 VS 2022,ml64.exe。 {Microsoft (R) 宏汇编器 (x64) 版本 14.37.32824.0}
以上代码是以下代码部分中原始工作汇编代码中的 match_loop 的一部分。
; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
; 2021-07-21: Igor Pavlov : Public domain
;
ifndef x64
; x64=1
; .err <x64_IS_REQUIRED>
endif
include 7zAsm.asm
MY_ASM_START
_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
align num
endm
MY_ALIGN_32 macro
MY_ALIGN 32
endm
MY_ALIGN_64 macro
MY_ALIGN 64
endm
t0_L equ x0_L
t0_x equ x0
t0 equ r0
t1_x equ x3
t1 equ r3
cp_x equ t1_x
cp_r equ t1
m equ x5
m_r equ r5
len_x equ x6
len equ r6
diff_x equ x7
diff equ r7
len0 equ r10
len1_x equ x11
len1 equ r11
maxLen_x equ x12
maxLen equ r12
d equ r13
ptr0 equ r14
ptr1 equ r15
d_lim equ m_r
cycSize equ len_x
hash_lim equ len0
delta1_x equ len1_x
delta1_r equ len1
delta_x equ maxLen_x
delta_r equ maxLen
hash equ ptr0
src equ ptr1
if (IS_LINUX gt 0)
; r1 r2 r8 r9 : win32
; r7 r6 r2 r1 r8 r9 : linux
lenLimit equ r8
lenLimit_x equ x8
; pos_r equ r2
pos equ x2
cur equ r1
son equ r9
else
lenLimit equ REG_ABI_PARAM_2
lenLimit_x equ REG_ABI_PARAM_2_x
pos equ REG_ABI_PARAM_1_x
cur equ REG_ABI_PARAM_0
son equ REG_ABI_PARAM_3
endif
if (IS_LINUX gt 0)
maxLen_OFFS equ (REG_SIZE * (6 + 1))
else
cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4))
d_OFFS equ (REG_SIZE + cutValue_OFFS)
maxLen_OFFS equ (REG_SIZE + d_OFFS)
endif
hash_OFFS equ (REG_SIZE + maxLen_OFFS)
limit_OFFS equ (REG_SIZE + hash_OFFS)
size_OFFS equ (REG_SIZE + limit_OFFS)
cycPos_OFFS equ (REG_SIZE + size_OFFS)
cycSize_OFFS equ (REG_SIZE + cycPos_OFFS)
posRes_OFFS equ (REG_SIZE + cycSize_OFFS)
if (IS_LINUX gt 0)
else
cutValue_PAR equ [r0 + cutValue_OFFS]
d_PAR equ [r0 + d_OFFS]
endif
maxLen_PAR equ [r0 + maxLen_OFFS]
hash_PAR equ [r0 + hash_OFFS]
limit_PAR equ [r0 + limit_OFFS]
size_PAR equ [r0 + size_OFFS]
cycPos_PAR equ [r0 + cycPos_OFFS]
cycSize_PAR equ [r0 + cycSize_OFFS]
posRes_PAR equ [r0 + posRes_OFFS]
cutValue_VAR equ DWORD PTR [r4 + 8 * 0]
cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4]
cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0]
cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4]
hash_VAR equ QWORD PTR [r4 + 8 * 2]
limit_VAR equ QWORD PTR [r4 + 8 * 3]
size_VAR equ QWORD PTR [r4 + 8 * 4]
distances equ QWORD PTR [r4 + 8 * 5]
maxLen_VAR equ QWORD PTR [r4 + 8 * 6]
Old_RSP equ QWORD PTR [r4 + 8 * 7]
LOCAL_SIZE equ 8 * 8
COPY_VAR_32 macro dest_var, src_var
mov x3, src_var
mov dest_var, x3
endm
COPY_VAR_64 macro dest_var, src_var
mov r3, src_var
mov dest_var, r3
endm
; MY_ALIGN_64
MY_PROC GetMatchesSpecN_2, 13
MY_PUSH_PRESERVED_ABI_REGS
mov r0, RSP
lea r3, [r0 - LOCAL_SIZE]
and r3, -64
mov RSP, r3
mov Old_RSP, r0
if (IS_LINUX gt 0)
mov d, REG_ABI_PARAM_5 ; r13 = r9
mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8
mov son, REG_ABI_PARAM_3 ; r9 = r1
mov r8, REG_ABI_PARAM_2 ; r8 = r2
mov pos, REG_ABI_PARAM_1_x ; r2 = x6
mov r1, REG_ABI_PARAM_0 ; r1 = r7
else
COPY_VAR_32 cutValue_VAR, cutValue_PAR
mov d, d_PAR
endif
COPY_VAR_64 limit_VAR, limit_PAR
mov hash_lim, size_PAR
mov size_VAR, hash_lim
mov cp_x, cycPos_PAR
mov hash, hash_PAR
mov cycSize, cycSize_PAR
mov cycSize_VAR, cycSize
; we want cur in (rcx). So we change the cur and lenLimit variables
sub lenLimit, cur
neg lenLimit_x
inc lenLimit_x
mov t0_x, maxLen_PAR
sub t0, lenLimit
mov maxLen_VAR, t0
jmp main_loop
MY_ALIGN_64
fill_empty:
; ptr0 = *ptr1 = kEmptyHashValue;
mov QWORD PTR [ptr1], 0
inc pos
inc cp_x
mov DWORD PTR [d - 4], 0
cmp d, limit_VAR
jae fin
cmp hash, hash_lim
je fin
; MY_ALIGN_64
main_loop:
; UInt32 delta = *hash++;
mov diff_x, [hash] ; delta
add hash, 4
; mov cycPos_VAR, cp_x
inc cur
add d, 4
mov m, pos
sub m, diff_x; ; matchPos
; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2;
lea ptr1, [son + 8 * cp_r]
; mov cycSize, cycSize_VAR
cmp pos, cycSize
jb directMode ; if (pos < cycSize_VAR)
; CYC MODE
cmp diff_x, cycSize
jae fill_empty ; if (delta >= cycSize_VAR)
xor t0_x, t0_x
mov cycPos_VAR, cp_x
sub cp_x, diff_x
; jae prepare_for_tree_loop
; add cp_x, cycSize
cmovb t0_x, cycSize
add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
jmp prepare_for_tree_loop
directMode:
cmp diff_x, pos
je fill_empty ; if (delta == pos)
jae fin_error ; if (delta >= pos)
mov cycPos_VAR, cp_x
mov cp_x, m
prepare_for_tree_loop:
mov len0, lenLimit
mov hash_VAR, hash
; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1;
lea ptr0, [ptr1 + 4]
; UInt32 *_distances = ++d;
mov distances, d
neg len0
mov len1, len0
mov t0_x, cutValue_VAR
mov maxLen, maxLen_VAR
mov cutValueCur_VAR, t0_x
MY_ALIGN_32
tree_loop:
neg diff
mov len, len0
cmp len1, len0
cmovb len, len1 ; len = (len1 < len0 ? len1 : len0);
add diff, cur
mov t0_x, [son + cp_r * 8] ; prefetch
movzx t0_x, BYTE PTR [diff + 1 * len]
lea cp_r, [son + cp_r * 8]
cmp [cur + 1 * len], t0_L
je matched_1
jb left_0
mov [ptr1], m
mov m, [cp_r + 4]
lea ptr1, [cp_r + 4]
sub diff, cur ; FIX32
jmp next_node
MY_ALIGN_32
left_0:
mov [ptr0], m
mov m, [cp_r]
mov ptr0, cp_r
sub diff, cur ; FIX32
; jmp next_node
; ------------ NEXT NODE ------------
; MY_ALIGN_32
next_node:
mov cycSize, cycSize_VAR
dec cutValueCur_VAR
je finish_tree
add diff_x, pos ; prev_match = pos + diff
cmp m, diff_x
jae fin_error ; if (new_match >= prev_match)
mov diff_x, pos
sub diff_x, m ; delta = pos - new_match
cmp pos, cycSize
jae cyc_mode_2 ; if (pos >= cycSize)
mov cp_x, m
test m, m
jne tree_loop ; if (m != 0)
finish_tree:
; ptr0 = *ptr1 = kEmptyHashValue;
mov DWORD PTR [ptr0], 0
mov DWORD PTR [ptr1], 0
inc pos
; _distances[-1] = (UInt32)(d - _distances);
mov t0, distances
mov t1, d
sub t1, t0
shr t1_x, 2
mov [t0 - 4], t1_x
cmp d, limit_VAR
jae fin ; if (d >= limit)
mov cp_x, cycPos_VAR
mov hash, hash_VAR
mov hash_lim, size_VAR
inc cp_x
cmp hash, hash_lim
jne main_loop ; if (hash != size)
jmp fin
MY_ALIGN_32
cyc_mode_2:
cmp diff_x, cycSize
jae finish_tree ; if (delta >= cycSize)
mov cp_x, cycPos_VAR
xor t0_x, t0_x
sub cp_x, diff_x ; cp_x = cycPos - delta
cmovb t0_x, cycSize
add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0)
jmp tree_loop
MY_ALIGN_32
matched_1:
inc len
; cmp len_x, lenLimit_x
je short lenLimit_reach
movzx t0_x, BYTE PTR [diff + 1 * len]
cmp [cur + 1 * len], t0_L
jne mismatch
MY_ALIGN_32
match_loop:
; while (++len != lenLimit) (len[diff] != len[0]) ;
inc len
; cmp len_x, lenLimit_x
je short lenLimit_reach
movzx t0_x, BYTE PTR [diff + 1 * len]
cmp BYTE PTR [cur + 1 * len], t0_L
je match_loop
mismatch:
jb left_2
mov [ptr1], m
mov m, [cp_r + 4]
lea ptr1, [cp_r + 4]
mov len1, len
jmp max_update
MY_ALIGN_32
left_2:
mov [ptr0], m
mov m, [cp_r]
mov ptr0, cp_r
mov len0, len
max_update:
sub diff, cur ; restore diff
cmp maxLen, len
jae next_node
mov maxLen, len
add len, lenLimit
mov [d], len_x
mov t0_x, diff_x
not t0_x
mov [d + 4], t0_x
add d, 8
jmp next_node
MY_ALIGN_32
lenLimit_reach:
mov delta_r, cur
sub delta_r, diff
lea delta1_r, [delta_r - 1]
mov t0_x, [cp_r]
mov [ptr1], t0_x
mov t0_x, [cp_r + 4]
mov [ptr0], t0_x
mov [d], lenLimit_x
mov [d + 4], delta1_x
add d, 8
; _distances[-1] = (UInt32)(d - _distances);
mov t0, distances
mov t1, d
sub t1, t0
shr t1_x, 2
mov [t0 - 4], t1_x
mov hash, hash_VAR
mov hash_lim, size_VAR
inc pos
mov cp_x, cycPos_VAR
inc cp_x
mov d_lim, limit_VAR
mov cycSize, cycSize_VAR
; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit)
; break;
cmp hash, hash_lim
je fin
cmp d, d_lim
jae fin
cmp delta_x, [hash]
jne main_loop
movzx t0_x, BYTE PTR [diff]
cmp [cur], t0_L
jne main_loop
; jmp main_loop ; bypass for debug
mov cycPos_VAR, cp_x
shl len, 3 ; cycSize * 8
sub diff, cur ; restore diff
xor t0_x, t0_x
cmp cp_x, delta_x ; cmp (cycPos_VAR, delta)
lea cp_r, [son + 8 * cp_r] ; dest
lea src, [cp_r + 8 * diff]
cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0)
add src, t0
add len, son ; len = son + cycSize * 8
MY_ALIGN_32
long_loop:
add hash, 4
; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff];
mov t0, [src]
add src, 8
mov [cp_r], t0
add cp_r, 8
cmp src, len
cmove src, son ; if end of (son) buffer is reached, we wrap to begin
mov DWORD PTR [d], 2
mov [d + 4], lenLimit_x
mov [d + 8], delta1_x
add d, 12
inc cur
cmp hash, hash_lim
je long_footer
cmp delta_x, [hash]
jne long_footer
movzx t0_x, BYTE PTR [diff + 1 * cur]
cmp [cur], t0_L
jne long_footer
cmp d, d_lim
jb long_loop
long_footer:
sub cp_r, son
shr cp_r, 3
add pos, cp_x
sub pos, cycPos_VAR
mov cycSize, cycSize_VAR
cmp d, d_lim
jae fin
cmp hash, hash_lim
jne main_loop
jmp fin
fin_error:
xor d, d
fin:
mov RSP, Old_RSP
mov t0, [r4 + posRes_OFFS]
mov [t0], pos
mov r0, d
MY_POP_PRESERVED_ABI_REGS
MY_ENDP
_TEXT$LZFINDOPT ENDS
end
`
期望有一个正确的工作装配版本
使用 Visual Studio 的以下独立代码,我已经测试了所需汇编代码片段的工作。
; 64bit - MASMTest.asm - add two integers
;.386
;.model flat,stdcall
;.stack 4096
;ExitProcess proto,dwExitCode:dword
.data
numbers db 17 dup(15h)
len dword ?
temp dword ?
sum qword ?
.code
main PROC
MOV RAX, 7
ADD RAX, 4
MOV SUM, RAX
mov numbers[16], 45h
vmovdqu xmm1, XMMWORD PTR numbers
vmovdqu xmm0, XMMWORD PTR [numbers + 1]
pcmpeqb xmm1, xmm0
pmovmskb EAX, xmm1
not EAX
bts EAX, 16
bsf EAX, EAX
;mov EAX, EBX
add len, EAX
cmp EAX, 16
jne exit_process
ret
;invoke ExitProcess, 0 only for 32-bit
main ENDP
;end main
exit_process:
; Perform an exit system call (syscall number 60 on x86-64)
mov rax, 60 ; syscall number for sys_exit
xor rdi, rdi ; status 0
syscall ; make syscall
END