重新审视 AVR 8 位处理器上的 32 位整数除法,我尝试编写无符号非恢复除法。不觉得看起来太糟糕,有一个问题:
对于设置了最高位的除数,在必要时不容易得到商 0。
归结为 8 位除法,然后是一些 libgcc 代码:
#define bits __zero_reg__
#define R r25
#define D r22
#define NQ r24
ldi NQ, 25
ldi r22, 129
__udivmodqi4NR: ; 19 instructions
clr R
ldi bits, 8
lsl NQ
_udivmodqi4_pos:
rol R ; 1
sub R, D ; 2 diminish remainder
brcs _udivmodqi4_nep ; 4/3 remainder got < 0, was < divisor
sec ; 4 How to obviate this?
_udivmodqi4_ep:
rol NQ ; 5 rotate 1 quotient bit into dendQuot
dec bits ; 6 decrement loop counter
brne _udivmodqi4_pos ; 8/7
ret
_udivmodqi4_neg:
rol R ; 1
add R, D ; 2 increase remainder
brcs _udivmodqi4_ep ; 4/3 remainder restored to >= 0
_udivmodqi4_nep:
lsl NQ ; 4/5 shift 0 quotient bit into dendQuot
dec bits ; 5/6 decrement loop counter
brne _udivmodqi4_neg ; 7/6/8
add R, D
ret
sleep
; libgcc (or close enough), 12 instructions, worst case not significantly slower:
#define r_rem r25 /* remainder */
#define r_arg1 r24 /* dividend, quotient */
#define r_arg2 r22 /* divisor */
#define q_cnt r23 /* loop count */
__udivmodqi4:
sub r_rem, r_rem ; clear remainder and carry
ldi q_cnt, 8 ; init loop counter
lsl r_arg1
__udivmodqi4_loop:
rol r_rem ; 1 shift dividend into remainder
cp r_rem, r_arg2 ; 2 compare remainder & divisor
brcs __udivmodqi4_ep ; 4/3 remainder <= divisor
sub r_rem, r_arg2 ; 4 restore remainder
__udivmodqi4_ep:
rol r_arg1 ; 5 shift dividend (with CARRY)
dec q_cnt ; 6 decrement loop counter
brne __udivmodqi4_loop;8/7
com r_arg1 ; 67 complement result
; because C flag was complemented in loop
ret
sleep
zeroOne:
cp r_arg1L , r_arg2L ; compare remainder & divisor
cpc r_arg1H , r_arg2H
cpc r_arg1HL, r_arg2HL
cpc r_arg1HH, r_arg2HH
brcs zero
sub r_arg1L , r_arg2L ; subtract divisor
sbc r_arg1H , r_arg2H
sbc r_arg1HL, r_arg2HL
sbc r_arg1HH, r_arg2HH
zero:
mov_l r_arg2L , r_remHL
mov_h r_arg2H , r_remHH
mov_l r_arg2HL, r_remHL
mov_h r_arg2HH, r_remHH
sbci r_arg2L, -1
ret
__udivmodsi4NonRestoring:
sub r_remHH, r_remHH
sub r_remHL, r_remHL; clear remainder and carry
; cpi r_arg2HH, 128 ; any advantage to 64?
; brsh zeroOne ; how to avoid special casing divisor MSB set?
sub r_remH , r_remH
sub r_remL , r_remL ; clear remainder and carry
ldi byteCnt, 4
rjmp setDendQuot
byteLoop:
mov r_cnt, dendQuot
mov r_arg1HH, r_arg1HL
mov r_arg1HL, r_arg1H
mov r_arg1H , r_arg1L
mov r_arg1L , r_cnt
setDendQuot:
ldi r_cnt, 8
brcs callNeg
lsl dendQuot
_udivmodsi4_pos:
rol r_remL ; 1 shift dividend into remainder
rol r_remH ; 2
rol r_remHL ; 3
rol r_remHH ; 4
sub r_remL, r_arg2L ; 5 diminish remainder
sbc r_remH, r_arg2H ; 6
sbc r_remHL, r_arg2HL; 7
sbc r_remHH, r_arg2HH; 8
brcs _udivmodsi4_nep; 9/10 remainder got < 0, was < divisor
sec ; 10
_udivmodsi4_ep:
rol dendQuot ; 11 rotate 1 into dendQuot
dec r_cnt ; 12 decrement loop counter
brne _udivmodsi4_pos; 14/13
clc ; __zero_reg__ now restored (r_cnt == 0)
dec byteCnt
brne byteLoop
rjmp bitsDone ; pop dec brne byteLoop
callNeg:
lsl dendQuot
_udivmodsi4_neg:
rol r_remL ; shift dividend into remainder
rol r_remH ; 2
rol r_remHL ; 3
rol r_remHH ; 4
add r_remL, r_arg2L ; 5 raise remainder
adc r_remH, r_arg2H ; 6
adc r_remHL, r_arg2HL; 7
adc r_remHH, r_arg2HH; 8
brcs _udivmodsi4_ep ; 9/10 remainder restored to >= 0
_udivmodsi4_nep:
lsl dendQuot ; 10/11 shift 0 as quotient bit into dendQuot
dec r_cnt ; 11/12 decrement loop counter
brne _udivmodsi4_neg; 13/14
sec ; __zero_reg__ now restored (r_cnt == 0)
wrapUp:
dec byteCnt
brne byteLoop
add r_remL , r_arg2L ; raise remainder
adc r_remH , r_arg2H
adc r_remHL, r_arg2HL
adc r_remHH, r_arg2HH
bitsDone:
mov_l r_arg2L , dendQuot ; quotient
mov_h r_arg2H , r_arg1L
mov_l r_arg2HL, r_arg1H
mov_h r_arg2HH, r_arg1HL
moveRem:
mov_l r_arg1L , r_remL ; remainder
mov_h r_arg1H , r_remH
mov_l r_arg1HL, r_remHL
mov_h r_arg1HH, r_remHH
ret
sleep
如果没有 MSB 1 除数的附加指令,如何保持非恢复除法的速度?
(明显的?)无答案:
可以使用条件减除法来避免“非恢复除法”中的一种特殊情况显然必需的代码膨胀。 更快的直接替代
libgcc 的 AVR 部门
在嵌套循环中处理被除数字节:
#define quotient __tmp_reg__
#define dividend r_arg1HH
__udivmodsi4B:
ldi r_cnt, 4 ; init loop counter
sub r_remL, r_remL ; could use clr just as well
sub r_remH, r_remH ; clear remainder and carry
mov_l r_remHL, r_remL
mov_h r_remHH, r_remH
byteLoop:
ldi quotient, 1 ; takes NBBY(8) rol instructions to appear as carry
__udivmodsi4_loopB:
lsl dividend
rol r_remL ; shift dividend into remainder
rol r_remH
rol r_remHL
rol r_remHH
cp r_remL,r_arg2L ; compare remainder & divisor
cpc r_remH,r_arg2H
cpc r_remHL,r_arg2HL
cpc r_remHH,r_arg2HH
brcs __udivmodsi4_epB; remainder <= divisor
sub r_remL,r_arg2L ; restore remainder
sbc r_remH,r_arg2H
sbc r_remHL,r_arg2HL
sbc r_remHH,r_arg2HH
__udivmodsi4_epB:
rol quotient ; (with CARRY)
brcc __udivmodsi4_loopB
com quotient
mov r_arg1HH, r_arg1HL ; shift by bytes. On reduced cores (no movw) one
mov r_arg1HL, r_arg1H ; additional (rjmp) instruction saves 2 cycles:
mov r_arg1H , r_arg1L ; shift on byteLoop top (skip on 1st iteration)
mov r_arg1L , quotient ; and adjust the quotient mov instructions
dec r_cnt
brne byteLoop
; __zero_reg__ now restored (r_cnt == 0)
; div/mod results to return registers, as for the ldiv() function
mov_l r_arg2L, r_arg1L ; quotient
mov_h r_arg2H, r_arg1H
mov_l r_arg2HL, r_arg1HL
mov_h r_arg2HH, r_arg1HH
mov_l r_arg1L, r_remL ; remainder
mov_h r_arg1H, r_remH
mov_l r_arg1HL, r_remHL
mov_h r_arg1HH, r_remHH
ret
sleep
可以想象,使用“部分单个”整数的变体,加速会更小,并比 __udivmodpsi4 多执行一条指令。
和非恢复除法是独立的,这个是快速但大型实现的第一站,其次是非恢复。
将两者结合起来可能是第三个。