我当然知道cmath(math.h)中有一个很好的pow()函数,但不接触pow()的背景什么是最快的方法来计算数字用我自己的双手?
您在问两个不同的问题:
这些有不同的答案。
pow
很快。标准库的实现通常由非常聪明的人编写,由其他聪明的人审查,然后由更聪明的人重构。因此,使用提供的标准库实现几乎总是比尝试自己重新实现标准库更好。
但是,如果您坚持创建自己的
pow
实现,则应首先使用泰勒级数展开来实现 exp
和 log
。然后使用以下属性:
pow(base,power) = exp( power * log(base) )
请注意,如果
base
为负数,则应首先计算 pow(-base,power)
,然后对 base
进行奇偶校验以确定结果的符号。
在我的系统中,Visual Studio 中的标准 pow() 函数的运行速度几乎比我使用 x87 指令编写的简单函数慢 1.4 倍。但我努力工作,使用 AVX 指令编写了几乎同样精确的函数,尽管我忽略了一些优化,但它的运行速度比标准 pow() 快约 2.5 倍。下面我提供了两个版本的代码(x86架构)。 这些函数在整个输入数据范围内都能正常工作。 AVX 功能错误是几个 ULP。 x87 代码几乎完全准确(如果 FPU 处于默认模式)。
FPU代码
_declspec(naked) double _fastcall pwr(const double &x, const double &y)
{
_asm
{
fld qword ptr [edx] // Load exponent y
fld qword ptr [ecx] // Load base x
fyl2x // Calculate y*lb(x). This will throw an error if x<0
fxam // Check result type to reveal values +-Inf
fnstsw ax // Move the swr register into ax (there is the check result)
sahf // Set cf if y*lb(x) is equal +-Inf or NaN
fld1 // On stack: y*lb(x); 1
fld st(1) // On stack: y*lb(x); 1; y*lb(x)
frndint // At st(0) - exponent rounded to integer number (yi)
fsub st(2),st(0) // At st(2) - fraction (yf) in the range from -0.5 to 0.5
fxch st(2) // On stack: yi; 1; yf
fcmovb st(0),st(1) // In case |y*lb(x)|=Inf replace yf (equal NaN) by 1
f2xm1 // Calculate 2^yf-1
faddp st(1),st(0) // Add 1 to result, we get 2^yf
fscale // Take into account the integer part by scaling
fstp st(1) // Remove yi from stack, result at st(0)
ret // Return
}
}
AVX 代码
_declspec(naked) double _vectorcall pwr(double x, double y)
{
_declspec(align(16)) static const double ct[20] = // Constants table
{
// Additional constants
1.0, // 1
1.41421356237309505, // 2^0.5
// Chebyshev approximation coefficients for calculating the function lb(x)
0.262334391706879229, // b5
0.320598534127666242, // b4
0.412198585848994984, // b3
0.577078016345480558, // b2
0.961796693925989802, // b1
2.88539008177792681, // b0
0.213668367342346557, // b7
0.220912115142181838, // b6
// Exponent-correcting coefficients
-2.0, // -2
-5.55111512312578270E-17, // -2^-54
// Pade-Chebyshev approximation coefficients for calculating the function 2^x
6.79350226696114100E-7, // a3
8.40034003010100485E-9, // b3
6.36842571235065203E-4, // a2
2.64772893849914684E-5, // b2
0.105006705553276333, // a1
0.0101081831663138722, // b1
1.89422861932497200, // a0
0.656489613410531136 // b0
};
_asm
{
vmovapd xmm2,xmm0 // xmm2 = x
mov edx,offset ct // edx contains the constants table address
vpextrw eax,xmm0,3 // Read into ax the high word of a number containing the sign bit and the exponent
vunpcklpd xmm5,xmm1,xmm1 // xmm5 = y : y
cmp ax,0x7FF0 // Check the condition that the input is +Inf or +NaN
jge lb_nan_inf // Handle abnormal input value
lb_denorm: // Entry point for handling denormal numbers
mov cx,0x3FF0 // Set the low byte to ch and prepare a mask in cl
or cl,al // Set low exponent byte of x
sar ax,4 // Get in eax the exponent of the number plus the offset 1023
jg lb_x_ok // Goto if x>0 and a number is normalized
vsqrtsd xmm2,xmm2,xmm2 // Take the root of the supposedly denormalized x (NaN if x<0)
vaddpd xmm5,xmm5,xmm5 // Accordingly, the exponent y is multiplied by 2
vpextrw eax,xmm2,3 // Read into ax the high word of a number containing the sign bit and the exponent
vucomisd xmm2,xmm0 // Check if xmm2 has increased after rooting
ja lb_denorm // Repeat order extraction if x>0 and has been normalized
test al,al // al=0 for x=+-0 only
jnz lb_nan_inf // Goto if x=Inf or x=NaN
or eax,-16 // If x=0 form ax=FFF0h - high word of value -Inf
vpinsrw xmm2,xmm2,eax,3 // xmm2 = -Inf if x=0
lb_nan_inf: // A non-standard logarithm value is ready in xmm2 (NaN, -Inf or Inf)
push 48 // Store a number 48 on the stack
vcvtsd2si ecx,xmm2 // ecx = -2^31
pop eax // eax = 48 - loop counter for calculating exponential function
jmp pwr_bad_calc // Goto the end of the logarithm calculation block
lb_x_ok: // Continue with a valid value of x
vpinsrw xmm0,xmm2,ecx,3 // Replace the exponent of x. Now 1<=x<2 (reduced value)
sub eax,1023 // Calculate in eax the integer part of the logarithm
vcomisd xmm0,[edx+8] // Compare x and root of 2
jb lb_x_done // Goto if 1 <= x < 2^0.5, i.e. no exponent correction required
and ecx,-17 // Subtract 1 from exponent to become 2^0.5/2 <= x < 1
inc eax // The integer part of the logarithm increases by 1 accordingly
vpinsrw xmm0,xmm0,ecx,3 // Update x value in xmm0
lb_x_done: // In xmm0 - the reduced value of x such that 2^0.5/2 <= x < 2^0.5
vaddsd xmm2,xmm0,[edx] // xmm2 = x+1
vsubsd xmm4,xmm0,[edx] // xmm4 = x-1
vcvtsi2sd xmm3,xmm3,eax // In xmm3 - the integer part of the binary logarithm
vdivsd xmm0,xmm4,xmm2 // xmm0 = (x-1)/(x+1) = u
vmovapd xmm1,[edx+64] // Initialize sums with low coefficients: xmm1 = b6 : b7
vmulsd xmm2,xmm0,xmm0 // xmm2 = u^2
xor eax,eax // eax = 0 - initialize the loop counter
vmulsd xmm4,xmm2,xmm2 // xmm4 = u^4
vunpcklpd xmm4,xmm4,xmm4 // xmm4 = u^4 : u^4
lb_loop: // Cycle for calculating polynomials using Horner's scheme
add eax,16 // Get in eax the offset of the next pair of coefficients relative to ct
vfmadd213pd xmm1,xmm4,[edx+eax] // Multiply current sums by u^4 and add coefficients
jnp lb_loop // Repeat the cycle 3 times (take into account all 4 pairs of coefficients)
vunpckhpd xmm4,xmm1,xmm1 // In xmm1 - the sum over odd terms, in xmm4 - over even terms
vfmadd213sd xmm1,xmm2,xmm4 // In xmm1 - total sum
vmulsd xmm0,xmm0,xmm1 // xmm0 = lb_f - fractional part of the binary logarithm
vunpcklpd xmm1,xmm3,xmm0 // xmm1 = lb_f : lb_i
vmulpd xmm2,xmm5,xmm1 // xmm2 = y*lb_f : y*lb_i
vroundpd xmm2,xmm2,0 // xmm2 = (y*lb_f)_i : (y*lb_i)_i
vfmsub213pd xmm1,xmm5,xmm2 // xmm1 = (y*lb_f)_f : (y*lb_i)_f
vhaddpd xmm2,xmm2,xmm1 // xmm2 = (y*lb_f)_f + (y*lb_i)_f = (y*lb)_f : (y*lb_f)_i + (y*lb_i)_i = (y*lb)_i
// Found lb(x^y)_i=k и lb(x^y)_f=t. Let's start calculating the function 2^k*2^t
vcvtsd2si ecx,xmm2 // In ecx - additional exponent k of the result
vucomisd xmm2,xmm0 // cf=1 if k<0; pf=1 if y=+-Inf or the result is NaN
jnp res_no_nan // Goto if |y|<Inf and the result is not NaN
vaddsd xmm2,xmm0,xmm3 // xmm2 = lb(x)
pwr_bad_calc: // Entry point for handling situations x=NaN, x<=0 or x=Inf
vmulsd xmm0,xmm2,xmm5 // xmm0 = y*lb(x)
vucomisd xmm0,[edx] // Identify the infinity sign or xmm0 is NaN
jp res_done // Exit if result is NaN
res_no_nan: // Checking for k overflow provided the result is not NaN
vmovapd xmm4,[edx+80] // Prepare coefficients for exponent correction
dec ecx // of=1 if |k| is too big
jo res_0_inf // Go to set zero or infinity on overflow
add ecx,1023 // Get binary representation of exponent of value 2^(k-1)
jg test_big_exp // If exponent is not too small, check for overflow
vunpckhpd xmm4,xmm4,xmm4 // Take the coefficient -2^-54 for the denormalized result
sub ecx,-55 // For a denormalized result increase the exponent by 54
jle res_0_inf // Go to set zero if k<<0
test_big_exp: // Checking the upper bound of the exponent
cmp ecx,2046 // Compare exponent with limit value
jbe calc_exp // Goto if there is no upward exponent overflow
res_0_inf: // Set result is 0 if cf=1, and is Inf if cf=0
vxorpd xmm0,xmm0,xmm0 // xmm0 = 0
jbe res_done // Exit with result 0 if k<<0
mov cx,0x07FF // Place the high 12 bits of the Inf value into the low bits of ecx
calc_exp: // Form the exponent of the factor - additional exponent
vunpckhpd xmm1,xmm2,xmm2 // xmm1 = t : t
vmovapd xmm3,[edx+96] // xmm3 = b3 : a3 - initialize sums
vmulpd xmm2,xmm1,xmm1 // xmm2 = t^2 : t^2
vmovd xmm0,ecx // Move additional exponent to xmm0
exp_loop: // Cycle for calculating polynomials using Horner's scheme
add eax,8 // In eax - half-offset of the next pair of coefficients relative to ct
vfmadd213pd xmm3,xmm2,[edx+2*eax] // Multiply current sums by t^2 and add coefficients
jnp exp_loop // Repeat the loop body 3 times
vunpcklpd xmm2,xmm3,xmm3 // xmm2 = P : P
vunpckhpd xmm3,xmm3,xmm3 // xmm3 = Q : Q
vpsllq xmm0,xmm0,52 // xmm0 = 2^k or Inf
vfmaddsub213pd xmm3,xmm1,xmm2 // xmm3 = t*Q+P : t*Q-P
vunpckhpd xmm2,xmm3,xmm3 // xmm3 = t*Q-P; xmm2 = t*Q+P
vdivsd xmm2,xmm2,xmm3 // xmm2 = -(P+t*Q)/(P-t*Q)
vmulsd xmm2,xmm2,xmm4 // Multiply the denormalized result by -2^-54, the normalized result by -2
vmulsd xmm0,xmm0,xmm2 // Consider additional exponent: xmm0 = 2^k*(P+t*Q)/(P-t*Q)
res_done: // Result is ready in xmm0
ret // Return
}
}