快速实现5位精度的互补误差函数

问题描述 投票:0回答:1

[虽然这是一个自我回答的问题,但我会很乐意投票并接受任何替代答案,该答案要么为相同数量的计算工作提供卓越的准确性,要么在保持相同准确性水平的同时减少计算工作量。]

我之前已经演示了如何计算最大误差小于 3 ulp 的互补误差函数

erfcf()
。这可以作为其他函数的构建块,例如标准正态分布 Φ(x) = ½ erfc (-√½ x) 的 CDF 或高斯 Q 函数 Q(x) = 1-Φ(x ) = 1/2 erfc (√1/2 x)。然而,对于某些用例,不需要完全精确到单精度的计算,而
erfc()
评估对总运行时间的贡献不可忽略。

文献提供了互补误差函数的各种低精度近似,但它们要么仅限于完整输入域的子集,要么针对绝对误差进行优化,要么计算过于复杂,例如需要多次调用超越函数。如何在整个输入域中以高性能和约 5 个小数位的相对精度来实现

erfcf()

c algorithm floating-point floating-accuracy
1个回答
0
投票
float

映射到 IEEE-754

binary32
格式,并且在 32 位整数和 32 位整数之间使用相同的字节序。
float
。进一步假设 C 工具链可用(如果需要,通过设置适当的命令行开关)保留 IEEE-754 语义。我使用带有开关的 Intel C/C++ 编译器
-march=skylake-avx152 -O3 -fp-model=precise
由于互补误差函数关于 (0,1) 对称,因此可以关注正半平面中的函数输入。这里函数的衰减大致类似于 exp(-x

2

),并且通过 float 计算,当参数 x > 10.5 时,它会下溢到零。如果在 [0, 10.5] 上绘制 erfc(x) / exp(-x

2
) ,则该形状表明用多项式近似有点困难,但应该很容易用有理函数(即比率)近似两个多项式。一些初步实验表明,两个 3 次多项式应该足以达到五位数的精度。 虽然有许多工具可以生成多项式近似值,但不幸的是,有理近似值并非如此。我使用了 Remez 算法的修改来生成初始极小极大近似 R(x) = P(x)/Q(x) 到 erfc(x) / exp(-x

2

),但必须跟进进行相当广泛的启发式搜索,以获得一个近似值,该近似值提供“接近”到误差峰值的等振荡,并且“几乎”实现了 10-5 的相对误差,而对于我的需求而言,其余差异可以忽略不计。 通过计算 erfc(x) = exp(-x2) R(x),所达到的精度显然取决于平台的 expf() 实现的精度。根据我的经验,该函数的忠实实现(最大误差

my_expf()

,误差较大,并且观察到对fast_erfcf()的准确性只有非常小的影响。

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define USE_FMA          (1)
#define USE_BUILTIN_EXP  (0)

#if !USE_BUILTIN_EXP
float my_expf (float a);
#endif // USE_BUILTIN_EXP

/* Fast computation of the complementary error function. For argument x > 0
   erfc(x) = exp(-x*x) * P(x) / Q(x), where P(x) and Q(x) are polynomials. 
   If expf() is faithfully rounded, the following error bounds should hold:
   Maximum relative error < 1.065e-5, maximum absolute error < 9.50e-6, and 
   maximum ulp error < 176.5
*/
float fast_erfcf (float x)
{
    float a, c, e, p, q, r, s;
    a = fabsf (x);
    c = fminf (a, 10.5f);
    s = -c * c;
#if USE_BUILTIN_EXP
    e = expf (s);
#else // USE_BUILTIN_EXP
    e = my_expf (s);
#endif // USE_BUILTIN_EXP
#if USE_FMA
    q =             3.82346243e-1f;  //  0x1.8785c6p-2
    p =            -4.38094139e-5f;  // -0x1.6f8000p-15
    q = fmaf (q, c, 1.30382288e+0f); //  0x1.4dc756p+0
    p = fmaf (p, c, 2.16852024e-1f); //  0x1.bc1ceap-3
    q = fmaf (q, c, 1.85278833e+0f); //  0x1.da5056p+0
    p = fmaf (p, c, 7.23953605e-1f); //  0x1.72aa0cp-1
    q = fmaf (q, c, 9.99991655e-1f); //  0x1.fffee8p-1
    p = fmaf (p, c, 1.00000000e+0f); //  0x1.000000p+0
#else // USE_FMA
    q =         3.82346272e-1f; //  0x1.8785c8p-2f
    p =        -4.38243151e-5f; // -0x1.6fa000p-15
    q = q * c + 1.30382371e+0f; //  0x1.4dc764p+0
    p = p * c + 2.16852218e-1f; //  0x1.bc1d04p-3
    q = q * c + 1.85278797e+0f; //  0x1.da5050p+0
    p = p * c + 7.23953605e-1f; //  0x1.72aa0cp-1
    q = q * c + 9.99991596e-1f; //  0x1.fffee6p-1
    p = p * c + 1.00000000e+0f; //  0x1.000000p+0
#endif // USE_FMA
    r = e / q;
    r = r * p;
    if (x < 0.0f) r = 2.0f - r;
    if (isnan(x)) r = x + x;
    return r;
}

float uint32_as_float (uint32_t a)
{
    float r;
    memcpy (&r, &a, sizeof r);
    return r;
}

/* Exponential function. Maximum error 0.86565 ulps */
float my_expf (float a)
{
    float f, r, j, s, t;
    int i;
    unsigned int ia;

    // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
    j = fmaf (1.442695f, a, 12582912.f); // 0x1.715476p0 // log2(e)
    j = j - 12582912.f; // 0x1.8p23 // 2**23+2**22
    f = fmaf (j, -6.93145752e-1f, a); // -0x1.62e400p-1  // log_2_hi 
    f = fmaf (j, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo 
    i = (int)j;
    // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2]
    r =             1.37805939e-3f;  // 0x1.694000p-10
    r = fmaf (r, f, 8.37312452e-3f); // 0x1.125edcp-7
    r = fmaf (r, f, 4.16695364e-2f); // 0x1.555b5ap-5
    r = fmaf (r, f, 1.66664720e-1f); // 0x1.555450p-3
    r = fmaf (r, f, 4.99999851e-1f); // 0x1.fffff6p-2
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    r = fmaf (r, f, 1.00000000e+0f); // 0x1.000000p+0
    // exp(a) = 2**i * r
    ia = (i > 0) ? 0 : 0x83000000;
    s = uint32_as_float (0x7f000000 + ia);
    t = uint32_as_float ((i << 23) - ia);
    r = r * s;
    r = r * t;
    // handle special cases: severe overflow / underflow
    if (fabsf (a) >= 104.0f) r = (a < 0) ? 0.0f : INFINITY;
    return r;
}

uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

uint64_t double_as_uint64 (double a)
{
    uint64_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

double floatUlpErr (float res, double ref)
{
    uint64_t refi, i, j, err;
    int expoRef;
    
    /* ulp error cannot be computed if either operand is NaN, infinity, zero */
    if (isnan (res) || isnan (ref) || isinf(res) || isinf (ref) ||
        (res == 0.0f) || (ref == 0.0)) {
        return 0.0;
    }
    i = ((int64_t)float_as_uint32 (res)) << 32;
    expoRef = (int)(((double_as_uint64 (ref) >> 52) & 0x7ff) - 1023);
    refi = double_as_uint64 (ref);
    if (expoRef >= 129) {
        j = (refi & 0x8000000000000000ULL) | 0x7fffffffffffffffULL;
    } else if (expoRef < -126) {
        j = ((refi << 11) | 0x8000000000000000ULL) >> 8;
        j = j >> (-(expoRef + 126));
        j = j | (refi & 0x8000000000000000ULL);
    } else {
        j = ((refi << 11) & 0x7fffffffffffffffULL) >> 8;
        j = j | ((uint64_t)(expoRef + 127) << 55);
        j = j | (refi & 0x8000000000000000ULL);
    }
    err = (i < j) ? (j - i) : (i - j);
    return err / 4294967296.0;
}

int main (void)
{
    uint32_t argi, resi, refi, diff;
    float arg, res, reff, abserrloc = NAN, relerrloc = NAN, ulperrloc = NAN;
    double ref, relerr, abserr, ulperr;
    double maxabserr = 0, maxrelerr = 0, maxulperr = 0;

    argi = 0;
    do {
        arg = uint32_as_float (argi);
        ref = erfc ((double)arg);
        res = fast_erfcf (arg);
        reff = (float)ref;
        resi = float_as_uint32 (res);
        refi = float_as_uint32 (reff);
        ulperr = floatUlpErr (res, ref);
        if (ulperr > maxulperr) {
            ulperrloc = arg;
            maxulperr = ulperr;
        }
        abserr = fabs (res - ref);
        if (abserr > maxabserr) {
            abserrloc = arg;
            maxabserr = abserr;
        }
        if (fabs (ref) >= 0x1.0p-126) {
            relerr = fabs ((res - ref) / ref);
            if (relerr > maxrelerr) {
                relerrloc = arg;
                maxrelerr = relerr;
            }
        }
        diff = (resi > refi) ? (resi - refi) : (refi - resi);
        if (diff > 200) {
            printf ("diff=%u @ %15.8e : res=% 15.8e  ref=% 15.8e\n", 
                    diff, arg, res, ref);
            return EXIT_FAILURE;
        }
        argi++;
    } while (argi);

    printf ("max rel err = %.6e @ % 15.8e\n"
            "max abs err = %.6e @ % 15.8e\n"
            "max ulp err = %.6e @ % 15.8e\n",
            maxrelerr, relerrloc, 
            maxabserr, abserrloc,
            maxulperr, ulperrloc);
    return EXIT_SUCCESS;
}
<= 1 ulp) are common. While the Intel compiler comes with a highly accurate math library that provides a nearly correctly-rounded implementation (maximum error very close to 0.5 ulps), I also tried my own faithfully-rounded alternative 
    

© www.soinside.com 2019 - 2024. All rights reserved.