如何使用 SIMD 优化这个“点积”函数？它是 Mat4x4 * Vec4，但具有巨大的跨步访问

Question

我在尝试为该函数获得最佳加速时遇到了一个大问题，但我无法编写击败自动矢量化器的有效 SIMD 代码。我需要写一些 SIMD 来击败它，但我现在完全陷入困境：

static int  c_coef_4tap[4][4] = {
     0, 64,  0,  0,
    -4, 54, 16, -2,
    -4, 36, 36, -4,
    -2, 16, 54, -4,
};

static void s_read_qpel_ref(int list, int ref_pos_x, int ref_pos_y, int blk_size, int ref_buf[])
{
    int  int_buf[35 * 35];                              // 1+32+2 = 35
    int  int_x, int_y, frac_x, frac_y;
    int  pixel[4][4], hor[4], ver;
    int  x, y, h, v;

    int_x = ref_pos_x >> 2;
    int_y = ref_pos_y >> 2;
    frac_x = ref_pos_x & 3;
    frac_y = ref_pos_y & 3;

   //int_buf fill-up function is here. it's too long but is not the bottleneck


    for (y=0; y<blk_size; y++) {
        for (x=0; x<blk_size; x++) {
            for (v=0; v<4; v++) {
                for (h=0; h<4; h++) {
                    pixel[v][h] = int_buf[(y + v) * (blk_size+3) + (x + h)];
                }
            }
            // 4x4 filtering
            for (v=0; v<4; v++) {
                hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
                          pixel[v][1] * c_coef_4tap[frac_x][1] +
                          pixel[v][2] * c_coef_4tap[frac_x][2] +
                          pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
                hor[v] = I_CLIP3(0, 255, hor[v]);
            }
            ver = (hor[0] * c_coef_4tap[frac_y][0] +
                   hor[1] * c_coef_4tap[frac_y][1] +
                   hor[2] * c_coef_4tap[frac_y][2] +
                   hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;
            ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
        }
    }
}

我当前使用的优化标志是：

-O3 -march=native -funroll-loops -flto

这些给了我最好的加速，没问题。但我必须让它更快。

目前，我已经分析了代码并发现“hor[v]”计算花费的时间最多。我尝试过水平添加，但它让它变得更慢。我还查看了自动矢量化器转储，它显示 x 循环正在被矢量化。我的 SIMD 内部代码根本没有对 x 循环进行矢量化。事实上，当我引入内在函数时，它会停止对 x 循环进行矢量化。有人可以告诉我在这里做什么吗？

我会尝试转置和使用点积，但是 x 循环再次没有被矢量化。我认为，对 int_buf[] 的跨步访问来填充像素数组也会导致巨大的问题。

目前，我已经分析了代码并发现“hor[v]”计算花费的时间最多。我尝试过水平添加，但它让它变得更慢。我还查看了自动矢量化器转储，它显示 x 循环正在被矢量化。我的 SIMD 内部代码根本没有对 x 循环进行矢量化。事实上，当我引入内在函数时，它会停止对 x 循环进行矢量化。有人可以告诉我在这里做什么吗？

我会尝试转置和使用点积，但是 x 循环再次没有被矢量化。我认为，对 int_buf[] 的跨步访问来填充像素数组也导致了巨大的问题。

即使这些是整数。像素数组不超过 10 位整数。思考使用 16 位乘法是否会有帮助。

Answer 1

以热门评论为序...

好的，我综合了一个基准程序。

将

pixel

和

hor

更改为标量的一个原因是它使转换为

openmp

更容易。

第一次天真的尝试使用

-fopenmp

导致原始函数结果不正确。我的“清理”功能更容易转换为 OMP。

详细基准如下，但是...

OMP 带来 2 倍的加速。
使用
```
-O2
```
，我的版本速度更快（17％）。对于
```
-O3
```
，版本几乎相同。

这是重构后的代码：

// bench.c -- benchmark program

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define inline_always static inline __attribute__((always_inline))

double
tscgetf(void)
{
    struct timespec ts;
    double sec;

    clock_gettime(CLOCK_MONOTONIC,&ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;

    return sec;
}

double t_beg;
double t_end;

#define BEG \
    t_beg = tscgetf()

#define END \
    t_end = tscgetf()

#if 0
static int c_coef_4tap[4][4] = {
    0, 64, 0, 0,
    -4, 54, 16, -2,
    -4, 36, 36, -4,
    -2, 16, 54, -4,
};
#else
const int c_coef_4tap[4][4] = {
    { 0, 64, 0, 0 },
    { -4, 54, 16, -2 },
    { -4, 36, 36, -4 },
    { -2, 16, 54, -4 },
};
#endif

typedef void
(*fncptr)(int list, int ref_pos_x, int ref_pos_y, int blk_size, int ref_buf[]);

struct fncdef {
    fncptr fnc_ptr;
    const char *fnc_sym;
    const char *fnc_reason;
    int *fnc_ref;
    double fnc_elap;
};

inline_always int
I_CLIP3(int lo,int hi,int val)
{

    do {
        if (val < lo) {
            val = lo;
            break;
        }

        if (val > hi) {
            val = hi;
            break;
        }
    } while (0);

    return val;
}

void
int_fill(int *int_buf)
{

    for (int idx = 0;  idx < (35 * 35);  ++idx)
        int_buf[idx] = idx;
}

void
s_orig(int list, int ref_pos_x, int ref_pos_y, int blk_size,
    int ref_buf[])
{
    int int_buf[35 * 35];               // 1+32+2 = 35
#if 0
    int int_x, int_y;
#endif
    int frac_x, frac_y;
    int pixel[4][4];
    int hor[4];
    int ver;
    int x, y, h, v;

#if 0
    int_x = ref_pos_x >> 2;
    int_y = ref_pos_y >> 2;
#endif
    frac_x = ref_pos_x & 3;
    frac_y = ref_pos_y & 3;

    // int_buf fill-up function is here. it's too long but is not the bottleneck
    int_fill(int_buf);

    BEG;

    for (y = 0; y < blk_size; y++) {
        for (x = 0; x < blk_size; x++) {
            for (v = 0; v < 4; v++) {
                for (h = 0; h < 4; h++) {
                    pixel[v][h] = int_buf[(y + v) * (blk_size + 3) + (x + h)];
                }
            }

            // 4x4 filtering
            for (v = 0; v < 4; v++) {
                hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
                    pixel[v][1] * c_coef_4tap[frac_x][1] +
                    pixel[v][2] * c_coef_4tap[frac_x][2] +
                    pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
                hor[v] = I_CLIP3(0, 255, hor[v]);
            }

            ver = (hor[0] * c_coef_4tap[frac_y][0] +
                hor[1] * c_coef_4tap[frac_y][1] +
                hor[2] * c_coef_4tap[frac_y][2] +
                hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;

            ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
        }
    }

    END;
}

void
s_orig_omp1(int list, int ref_pos_x, int ref_pos_y, int blk_size,
    int ref_buf[])
{
    int int_buf[35 * 35];               // 1+32+2 = 35
#if 0
    int int_x, int_y;
#endif
    int frac_x, frac_y;
    int pixel[4][4];
    int hor[4];
    int ver;
    int x, y, h, v;

#if 0
    int_x = ref_pos_x >> 2;
    int_y = ref_pos_y >> 2;
#endif
    frac_x = ref_pos_x & 3;
    frac_y = ref_pos_y & 3;

    // int_buf fill-up function is here. it's too long but is not the bottleneck
    int_fill(int_buf);

    BEG;

#pragma omp parallel for
    for (y = 0; y < blk_size; y++) {
        for (x = 0; x < blk_size; x++) {
            for (v = 0; v < 4; v++) {
                for (h = 0; h < 4; h++) {
                    pixel[v][h] = int_buf[(y + v) * (blk_size + 3) + (x + h)];
                }
            }

            // 4x4 filtering
            for (v = 0; v < 4; v++) {
                hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
                    pixel[v][1] * c_coef_4tap[frac_x][1] +
                    pixel[v][2] * c_coef_4tap[frac_x][2] +
                    pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
                hor[v] = I_CLIP3(0, 255, hor[v]);
            }

            ver = (hor[0] * c_coef_4tap[frac_y][0] +
                hor[1] * c_coef_4tap[frac_y][1] +
                hor[2] * c_coef_4tap[frac_y][2] +
                hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;

            ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
        }
    }

    END;
}

void
s_orig_omp2(int list, int ref_pos_x, int ref_pos_y, int blk_size,
    int ref_buf[])
{
    int int_buf[35 * 35];               // 1+32+2 = 35
#if 0
    int int_x, int_y;
#endif
    int frac_x, frac_y;
    int pixel[4][4];
    int hor[4];

#if 0
    int_x = ref_pos_x >> 2;
    int_y = ref_pos_y >> 2;
#endif
    frac_x = ref_pos_x & 3;
    frac_y = ref_pos_y & 3;

    // int_buf fill-up function is here. it's too long but is not the bottleneck
    int_fill(int_buf);

    BEG;

#pragma omp parallel for private(pixel,hor) shared(int_buf,frac_x,frac_y)
    for (int y = 0; y < blk_size; y++) {
        for (int x = 0; x < blk_size; x++) {
            for (int v = 0; v < 4; v++) {
                for (int h = 0; h < 4; h++) {
                    pixel[v][h] = int_buf[(y + v) * (blk_size + 3) + (x + h)];
                }
            }

            // 4x4 filtering
            for (int v = 0; v < 4; v++) {
                hor[v] = (pixel[v][0] * c_coef_4tap[frac_x][0] +
                    pixel[v][1] * c_coef_4tap[frac_x][1] +
                    pixel[v][2] * c_coef_4tap[frac_x][2] +
                    pixel[v][3] * c_coef_4tap[frac_x][3] + 32) >> 6;
                hor[v] = I_CLIP3(0, 255, hor[v]);
            }

            int ver = (hor[0] * c_coef_4tap[frac_y][0] +
                hor[1] * c_coef_4tap[frac_y][1] +
                hor[2] * c_coef_4tap[frac_y][2] +
                hor[3] * c_coef_4tap[frac_y][3] + 32) >> 6;

            ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
        }
    }

    END;
}

void
s_fix1(int list, int ref_pos_x, int ref_pos_y, int blk_size,
    int ref_buf[])
{
    int int_buf[35 * 35];               // 1+32+2 = 35
#if 0
    int int_x, int_y;
#endif
    int frac_x, frac_y;
    int x, y, h, v;

#if 0
    int_x = ref_pos_x >> 2;
    int_y = ref_pos_y >> 2;
#endif
    frac_x = ref_pos_x & 3;
    frac_y = ref_pos_y & 3;

    // int_buf fill-up function is here. it's too long but is not the bottleneck
    int_fill(int_buf);

    BEG;

    for (y = 0; y < blk_size; y++) {
        for (x = 0; x < blk_size; x++) {
            int ver = 0;

            for (v = 0; v < 4; v++) {
                int hor_h = 0;

                for (h = 0; h < 4; h++) {
                    int pixel = int_buf[(y + v) * (blk_size + 3) + (x + h)];
                    hor_h += pixel * c_coef_4tap[frac_x][h];
                }

                hor_h += 32;
                hor_h >>= 6;
                hor_h = I_CLIP3(0, 255, hor_h);

                ver += hor_h * c_coef_4tap[frac_y][v];
            }

            ver += 32;
            ver >>= 6;

            ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
        }
    }

    END;
}

void
s_fix1_omp(int list, int ref_pos_x, int ref_pos_y, int blk_size,
    int ref_buf[])
{
    int int_buf[35 * 35];               // 1+32+2 = 35
#if 0
    int int_x, int_y;
#endif
    int frac_x, frac_y;
#if 0
    int x, y, h, v;
#endif

#if 0
    int_x = ref_pos_x >> 2;
    int_y = ref_pos_y >> 2;
#endif
    frac_x = ref_pos_x & 3;
    frac_y = ref_pos_y & 3;

    // int_buf fill-up function is here. it's too long but is not the bottleneck
    int_fill(int_buf);

    BEG;

#pragma omp parallel for shared(int_buf,frac_x,frac_y)
    for (int y = 0; y < blk_size; y++) {
        for (int x = 0; x < blk_size; x++) {
            int ver = 0;

            for (int v = 0; v < 4; v++) {
                int hor_h = 0;

                for (int h = 0; h < 4; h++) {
                    int pixel = int_buf[(y + v) * (blk_size + 3) + (x + h)];
                    hor_h += pixel * c_coef_4tap[frac_x][h];
                }

                hor_h += 32;
                hor_h >>= 6;
                hor_h = I_CLIP3(0, 255, hor_h);

                ver += hor_h * c_coef_4tap[frac_y][v];
            }

            ver += 32;
            ver >>= 6;

            ref_buf[y * blk_size + x] = I_CLIP3(0, 255, ver);
        }
    }

    END;
}

#define FNCDEF(_sym,_reason) \
    .fnc_ptr = _sym, .fnc_sym = #_sym, .fnc_reason = _reason

struct fncdef fnclist[] = {
    { FNCDEF(s_orig,"OP's original code") },
#ifdef _OPENMP
    { FNCDEF(s_orig_omp1,"OP's -fopenmp code [BROKEN]") },
    { FNCDEF(s_orig_omp2,"OP's -fopenmp code (with fixes)") },
#endif
    { FNCDEF(s_fix1,"Craig's cleanup") },
#ifdef _OPENMP
    { FNCDEF(s_fix1_omp,"Craig's -fopenmp") },
#endif
    { .fnc_ptr = NULL }
};

#define FNCALL \
    struct fncdef *fnc = fnclist;  fnc->fnc_ptr != NULL;  ++fnc

void
dofnc(struct fncdef *fnc,
    int list, int ref_pos_x, int ref_pos_y, int blk_size)
{
    double t_dif;

    fnc->fnc_elap = 1e9;

    for (int iter = 1;  iter <= 5;  ++iter) {
        fnc->fnc_ptr(list,ref_pos_x,ref_pos_y,blk_size,fnc->fnc_ref);
        t_dif = t_end - t_beg;
        if (t_dif < fnc->fnc_elap)
            fnc->fnc_elap = t_dif;
    }

    printf("ELAPSED: %.9f %s (%s)\n",
        fnc->fnc_elap,fnc->fnc_sym,fnc->fnc_reason);
}

void
dotest(int list,int ref_pos_x,int ref_pos_y,int blk_size)
{

    for (FNCALL) {
        fnc->fnc_ref = malloc(sizeof(int) * blk_size * blk_size);
        dofnc(fnc,list,ref_pos_x,ref_pos_y,blk_size);
    }

    struct fncdef *ref = NULL;
    for (FNCALL) {
        if (ref == NULL) {
            ref = fnc;
            continue;
        }

        const int *ref_orig = ref->fnc_ref;
        const int *ref_fast = fnc->fnc_ref;

        printf("dotest: COMPARE %s\n",fnc->fnc_sym);

        for (int y = 0;  y < blk_size;  ++y) {
            for (int x = 0;  x < blk_size;  ++x) {
                int orig = ref_orig[(y * blk_size) + x];
                int fast = ref_fast[(y * blk_size) + x];
                if (fast != orig) {
                    printf("dotest: [%3d][%3d] %4d %4d\n",y,x,orig,fast);
                    //exit(1);
                }
            }
        }
    }

    for (FNCALL)
        free(fnc->fnc_ref);
}

int
main(void)
{

    //dotest(0,35,0,16);
    dotest(0,35,0,32);

    return 0;
}

在上面的代码中，我使用

cpp

条件来表示旧代码与新代码：

#if 0
// old code
#else
// new code
#endif

#if 1
// new code
#endif

注意：这可以通过运行文件来清理

unifdef -k

以下是我使用的命令：

cc -o sO0 bench.c -O0 -Wall -Werror -Wno-unknown-pragmas
./sO0
ELAPSED: 0.000141978 s_orig (OP's original code)
ELAPSED: 0.000132095 s_fix1 (Craig's cleanup)
dotest: COMPARE s_fix1

cc -o sO2 bench.c -O2 -Wall -Werror -Wno-unknown-pragmas
./sO2
ELAPSED: 0.000027120 s_orig (OP's original code)
ELAPSED: 0.000023231 s_fix1 (Craig's cleanup)
dotest: COMPARE s_fix1

cc -o std bench.c -O3 -Wall -Werror -Wno-unknown-pragmas
./std
ELAPSED: 0.000010765 s_orig (OP's original code)
ELAPSED: 0.000010280 s_fix1 (Craig's cleanup)
dotest: COMPARE s_fix1

cc -o omp bench.c -O3 -Wall -Werror -fopenmp
./omp
ELAPSED: 0.000010361 s_orig (OP's original code)
ELAPSED: 0.000022512 s_orig_omp1 (OP's -fopenmp code [BROKEN])
ELAPSED: 0.000004398 s_orig_omp2 (OP's -fopenmp code (with fixes))
ELAPSED: 0.000015385 s_fix1 (Craig's cleanup)
ELAPSED: 0.000004432 s_fix1_omp (Craig's -fopenmp)
dotest: COMPARE s_orig_omp1
dotest: [  0][ 14]   51  255
dotest: [  3][ 17]  159  255
dotest: [  3][ 23]  165  255
dotest: [  5][  0]  212  255
dotest: [  8][ 18]  255  190
dotest: [ 12][  0]  255  181
dotest: [ 16][  0]  255   73
dotest: [ 16][ 13]  255  241
dotest: [ 20][  8]  255   38
dotest: [ 22][  0]  255  100
dotest: [ 24][ 12]  255   44
dotest: [ 25][ 21]  255  241
dotest: [ 25][ 22]  255  247
dotest: [ 28][ 21]  255   66
dotest: [ 28][ 22]  255  219
dotest: [ 28][ 23]  255  221
dotest: [ 28][ 25]  255  228
dotest: [ 29][  0]  255   89
dotest: [ 30][  0]  255  125
dotest: [ 31][  1]  255  169
dotest: COMPARE s_orig_omp2
dotest: COMPARE s_fix1
dotest: COMPARE s_fix1_omp

如何使用 SIMD 优化这个“点积”函数？它是 Mat4x4 * Vec4，但具有巨大的跨步访问

问题描述投票：0回答：1

1个回答

最新问题

如何使用 SIMD 优化这个“点积”函数？它是 Mat4x4 * Vec4，但具有巨大的跨步访问

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1