对实数求幂最快的方法是什么？

Question

我当然知道cmath(math.h)中有一个很好的pow()函数，但不接触pow()的背景什么是最快的方法来计算数字用我自己的双手？

Answer 1

您在问两个不同的问题：

为实数提供动力的最快方法是什么？
用自己的双手为数字提供动力的最快方法是什么？

这些有不同的答案。

pow

很快。标准库的实现通常由非常聪明的人编写，由其他聪明的人审查，然后由更聪明的人重构。因此，使用提供的标准库实现几乎总是比尝试自己重新实现标准库更好。

但是，如果您坚持创建自己的

pow

实现，则应首先使用泰勒级数展开来实现

exp

和

log

。然后使用以下属性：

pow(base,power) = exp( power * log(base) )

请注意，如果

base

为负数，则应首先计算

pow(-base,power)

，然后对

base

进行奇偶校验以确定结果的符号。

Answer 2

在我的系统中，Visual Studio 中的标准 pow() 函数的运行速度几乎比我使用 x87 指令编写的简单函数慢 1.4 倍。但我努力工作，使用 AVX 指令编写了几乎同样精确的函数，尽管我忽略了一些优化，但它的运行速度比标准 pow() 快约 2.5 倍。下面我提供了两个版本的代码（x86架构）。这些函数在整个输入数据范围内都能正常工作。 AVX 功能错误是几个 ULP。 x87 代码几乎完全准确（如果 FPU 处于默认模式）。

FPU代码

_declspec(naked) double _fastcall pwr(const double &x, const double &y)
{
  _asm
  {
    fld qword ptr [edx] // Load exponent y
    fld qword ptr [ecx] // Load base x
    fyl2x               // Calculate y*lb(x). This will throw an error if x<0
    fxam                // Check result type to reveal values +-Inf
    fnstsw ax           // Move the swr register into ax (there is the check result)
    sahf                // Set cf if y*lb(x) is equal +-Inf or NaN
    fld1                // On stack: y*lb(x); 1
    fld st(1)           // On stack: y*lb(x); 1; y*lb(x)
    frndint             // At st(0) - exponent rounded to integer number (yi)
    fsub st(2),st(0)    // At st(2) - fraction (yf) in the range from -0.5 to 0.5
    fxch st(2)          // On stack: yi; 1; yf
    fcmovb st(0),st(1)  // In case |y*lb(x)|=Inf replace yf (equal NaN) by 1
    f2xm1               // Calculate 2^yf-1
    faddp st(1),st(0)   // Add 1 to result, we get 2^yf
    fscale              // Take into account the integer part by scaling
    fstp st(1)          // Remove yi from stack, result at st(0)
    ret                 // Return
  }
}

AVX 代码

_declspec(naked) double _vectorcall pwr(double x, double y)
  {
  _declspec(align(16)) static const double ct[20] = // Constants table
    {
    // Additional constants
    1.0,                        // 1
    1.41421356237309505,        // 2^0.5
    // Chebyshev approximation coefficients for calculating the function lb(x)
    0.262334391706879229,       // b5
    0.320598534127666242,       // b4
    0.412198585848994984,       // b3
    0.577078016345480558,       // b2
    0.961796693925989802,       // b1
    2.88539008177792681,        // b0
    0.213668367342346557,       // b7
    0.220912115142181838,       // b6
    // Exponent-correcting coefficients
    -2.0,                       // -2
    -5.55111512312578270E-17,   // -2^-54
    // Pade-Chebyshev approximation coefficients for calculating the function 2^x
    6.79350226696114100E-7,     // a3
    8.40034003010100485E-9,     // b3
    6.36842571235065203E-4,     // a2
    2.64772893849914684E-5,     // b2
    0.105006705553276333,       // a1
    0.0101081831663138722,      // b1
    1.89422861932497200,        // a0
    0.656489613410531136        // b0
    };
  _asm
    {
    vmovapd xmm2,xmm0           // xmm2 = x
    mov edx,offset ct           // edx contains the constants table address
    vpextrw eax,xmm0,3          // Read into ax the high word of a number containing the sign bit and the exponent
    vunpcklpd xmm5,xmm1,xmm1    // xmm5 = y : y
    cmp ax,0x7FF0               // Check the condition that the input is +Inf or +NaN
    jge lb_nan_inf              // Handle abnormal input value
      lb_denorm:                // Entry point for handling denormal numbers
    mov cx,0x3FF0               // Set the low byte to ch and prepare a mask in cl
    or cl,al                    // Set low exponent byte of x
    sar ax,4                    // Get in eax the exponent of the number plus the offset 1023
    jg lb_x_ok                  // Goto if x>0 and a number is normalized
    vsqrtsd xmm2,xmm2,xmm2      // Take the root of the supposedly denormalized x (NaN if x<0)
    vaddpd xmm5,xmm5,xmm5       // Accordingly, the exponent y is multiplied by 2
    vpextrw eax,xmm2,3          // Read into ax the high word of a number containing the sign bit and the exponent
    vucomisd xmm2,xmm0          // Check if xmm2 has increased after rooting
    ja lb_denorm                // Repeat order extraction if x>0 and has been normalized
    test al,al                  // al=0 for x=+-0 only
    jnz lb_nan_inf              // Goto if x=Inf or x=NaN
    or eax,-16                  // If x=0 form ax=FFF0h - high word of value -Inf
    vpinsrw xmm2,xmm2,eax,3     // xmm2 = -Inf if x=0
      lb_nan_inf:               // A non-standard logarithm value is ready in xmm2 (NaN, -Inf or Inf)
    push 48                     // Store a number 48 on the stack
    vcvtsd2si ecx,xmm2          // ecx = -2^31
    pop eax                     // eax = 48 - loop counter for calculating exponential function
    jmp pwr_bad_calc            // Goto the end of the logarithm calculation block
      lb_x_ok:                  // Continue with a valid value of x
    vpinsrw xmm0,xmm2,ecx,3     // Replace the exponent of x. Now 1<=x<2 (reduced value)
    sub eax,1023                // Calculate in eax the integer part of the logarithm
    vcomisd xmm0,[edx+8]        // Compare x and root of 2
    jb lb_x_done                // Goto if 1 <= x < 2^0.5, i.e. no exponent correction required
    and ecx,-17                 // Subtract 1 from exponent to become 2^0.5/2 <= x < 1
    inc eax                     // The integer part of the logarithm increases by 1 accordingly
    vpinsrw xmm0,xmm0,ecx,3     // Update x value in xmm0
      lb_x_done:                // In xmm0 - the reduced value of x such that 2^0.5/2 <= x < 2^0.5
    vaddsd xmm2,xmm0,[edx]      // xmm2 = x+1
    vsubsd xmm4,xmm0,[edx]      // xmm4 = x-1
    vcvtsi2sd xmm3,xmm3,eax     // In xmm3 - the integer part of the binary logarithm
    vdivsd xmm0,xmm4,xmm2       // xmm0 = (x-1)/(x+1) = u
    vmovapd xmm1,[edx+64]       // Initialize sums with low coefficients: xmm1 = b6 : b7
    vmulsd xmm2,xmm0,xmm0       // xmm2 = u^2
    xor eax,eax                 // eax = 0 - initialize the loop counter
    vmulsd xmm4,xmm2,xmm2       // xmm4 = u^4
    vunpcklpd xmm4,xmm4,xmm4    // xmm4 = u^4 : u^4
      lb_loop:                  // Cycle for calculating polynomials using Horner's scheme
    add eax,16                  // Get in eax the offset of the next pair of coefficients relative to ct
    vfmadd213pd xmm1,xmm4,[edx+eax] // Multiply current sums by u^4 and add coefficients
    jnp lb_loop                 // Repeat the cycle 3 times (take into account all 4 pairs of coefficients)
    vunpckhpd xmm4,xmm1,xmm1    // In xmm1 - the sum over odd terms, in xmm4 - over even terms
    vfmadd213sd xmm1,xmm2,xmm4  // In xmm1 - total sum
    vmulsd xmm0,xmm0,xmm1       // xmm0 = lb_f - fractional part of the binary logarithm
    vunpcklpd xmm1,xmm3,xmm0    // xmm1 = lb_f : lb_i
    vmulpd xmm2,xmm5,xmm1       // xmm2 = y*lb_f : y*lb_i
    vroundpd xmm2,xmm2,0        // xmm2 = (y*lb_f)_i : (y*lb_i)_i
    vfmsub213pd xmm1,xmm5,xmm2  // xmm1 = (y*lb_f)_f : (y*lb_i)_f
    vhaddpd xmm2,xmm2,xmm1      // xmm2 = (y*lb_f)_f + (y*lb_i)_f = (y*lb)_f : (y*lb_f)_i + (y*lb_i)_i = (y*lb)_i
    // Found lb(x^y)_i=k и lb(x^y)_f=t. Let's start calculating the function 2^k*2^t
    vcvtsd2si ecx,xmm2          // In ecx - additional exponent k of the result
    vucomisd xmm2,xmm0          // cf=1 if k<0; pf=1 if y=+-Inf or the result is NaN
    jnp res_no_nan              // Goto if |y|<Inf and the result is not NaN
    vaddsd xmm2,xmm0,xmm3       // xmm2 = lb(x)
      pwr_bad_calc:             // Entry point for handling situations x=NaN, x<=0 or x=Inf
    vmulsd xmm0,xmm2,xmm5       // xmm0 = y*lb(x)
    vucomisd xmm0,[edx]         // Identify the infinity sign or xmm0 is NaN
    jp res_done                 // Exit if result is NaN
      res_no_nan:               // Checking for k overflow provided the result is not NaN
    vmovapd xmm4,[edx+80]       // Prepare coefficients for exponent correction
    dec ecx                     // of=1 if |k| is too big
    jo res_0_inf                // Go to set zero or infinity on overflow
    add ecx,1023                // Get binary representation of exponent of value 2^(k-1)
    jg test_big_exp             // If exponent is not too small, check for overflow
    vunpckhpd xmm4,xmm4,xmm4    // Take the coefficient -2^-54 for the denormalized result
    sub ecx,-55                 // For a denormalized result increase the exponent by 54
    jle res_0_inf               // Go to set zero if k<<0
      test_big_exp:             // Checking the upper bound of the exponent
    cmp ecx,2046                // Compare exponent with limit value
    jbe calc_exp                // Goto if there is no upward exponent overflow
      res_0_inf:                // Set result is 0 if cf=1, and is Inf if cf=0
    vxorpd xmm0,xmm0,xmm0       // xmm0 = 0
    jbe res_done                // Exit with result 0 if k<<0
    mov cx,0x07FF               // Place the high 12 bits of the Inf value into the low bits of ecx
      calc_exp:                 // Form the exponent of the factor - additional exponent
    vunpckhpd xmm1,xmm2,xmm2    // xmm1 = t : t
    vmovapd xmm3,[edx+96]       // xmm3 = b3 : a3 - initialize sums
    vmulpd xmm2,xmm1,xmm1       // xmm2 = t^2 : t^2
    vmovd xmm0,ecx              // Move additional exponent to xmm0
      exp_loop:                 // Cycle for calculating polynomials using Horner's scheme
    add eax,8                   // In eax - half-offset of the next pair of coefficients relative to ct
    vfmadd213pd xmm3,xmm2,[edx+2*eax] // Multiply current sums by t^2 and add coefficients
    jnp exp_loop                // Repeat the loop body 3 times
    vunpcklpd xmm2,xmm3,xmm3    // xmm2 = P : P
    vunpckhpd xmm3,xmm3,xmm3    // xmm3 = Q : Q
    vpsllq xmm0,xmm0,52         // xmm0 = 2^k or Inf
    vfmaddsub213pd xmm3,xmm1,xmm2 // xmm3 = t*Q+P : t*Q-P
    vunpckhpd xmm2,xmm3,xmm3    // xmm3 = t*Q-P; xmm2 = t*Q+P
    vdivsd xmm2,xmm2,xmm3       // xmm2 = -(P+t*Q)/(P-t*Q)
    vmulsd xmm2,xmm2,xmm4       // Multiply the denormalized result by -2^-54, the normalized result by -2
    vmulsd xmm0,xmm0,xmm2       // Consider additional exponent: xmm0 = 2^k*(P+t*Q)/(P-t*Q)
      res_done:                 // Result is ready in xmm0
    ret                         // Return
  }
}

对实数求幂最快的方法是什么？

问题描述投票：0回答：2

2个回答

最新问题

对实数求幂最快的方法是什么？

问题描述 投票：0回答：2

2个回答

最新问题

问题描述投票：0回答：2