Speeding up a nested loop that computes the product of the popcounts of the pairwise element intersections of 3 arrays


I have a seemingly innocent function f, called in a tight loop, that is causing a speed bottleneck. Any insights on how to improve it?

#include <stdint.h>

#define N 48
// N = 47 is also relevant
int f(const uint16_t * __restrict A,
      const uint16_t * __restrict B,
      const uint16_t * __restrict C) {
    int E = 0;
    for (int r = 0; r < N; ++r) {
        for (int s = r; s < N; ++s) {
            int pa = __builtin_popcount(A[r] & A[s]);
            int pb = __builtin_popcount(B[r] & B[s]);
            int pc = __builtin_popcount(C[r] & C[s]);
            E += pa*pb*pc;
        }
    }
    return E;
}

What I have tried:

(a) Preprocessing the A, B, C arrays in linear time and feeding only those triples for which pa*pb*pc is nonzero into the double loop. However, since the bits in A, B, C are uniformly distributed, almost nothing gets filtered out.
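
For illustration, the effect (a) is after can be approximated by a cheap zero-test ahead of the popcounts. This per-pair early-out is my sketch, not the OP's actual linear-time preprocessing (and the test itself is O(N^2), unlike (a)):

// Hedged sketch: skip pairs whose product is certainly zero.
for (int r = 0; r < N; ++r)
    for (int s = r; s < N; ++s) {
        if (!(A[r] & A[s]) || !(B[r] & B[s]) || !(C[r] & C[s]))
            continue;  // pa*pb*pc == 0, skip the popcounts
        E += __builtin_popcount(A[r] & A[s])
           * __builtin_popcount(B[r] & B[s])
           * __builtin_popcount(C[r] & C[s]);
    }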

(b) Blocking (loop tiling) to minimize cache misses.

(c) Repacking A, B, C into uint64_t and reworking popcount so that it processes 4 inputs at a time; one possible reading of this is sketched below.
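
A minimal sketch of one way to read (c), assuming A[r], B[r] and C[r] are packed into the low three 16-bit lanes of a uint64_t; the helper name and packing scheme are mine, not the OP's:

#include <stdint.h>

// SWAR popcount per 16-bit lane: one AND plus one call yields
// the counts for A, B and C together.
static inline uint64_t popcnt_per_u16(uint64_t v) {
    v = v - ((v >> 1) & 0x5555555555555555ULL);
    v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
    v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    v = (v + (v >> 8)) & 0x00FF00FF00FF00FFULL;  // per-16-bit totals
    return v;
}

// Usage: with p = popcnt_per_u16(packed[r] & packed[s]),
// pa = p & 0xFFFF, pb = (p >> 16) & 0xFFFF, pc = (p >> 32) & 0xFFFF.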

None of these helped. It seems that the sheer number of iterations (~1000) is the main problem. Is there something I am missing here?

Edit. I can assume that AVX2 is available on the target processor. Relevant compiler options currently include

-O3 -mpopcnt -funroll-loops -mtune=native -march=native

Tags: c, optimization, x86, bit-manipulation, avx2
2 Answers

3 votes

Using the lower triangle of the matrix instead of the upper one, and splitting off the diagonal, gives an optimization that is 2.7x faster for me (Clang 14.0.3 with -O3, on an Apple M1). It lets the compiler use vector (NEON) instructions and some loop unrolling:

int f2(const uint16_t * __restrict a,
       const uint16_t * __restrict b,
       const uint16_t * __restrict c) {
    int e = 0;
    for (int r = 1; r < N; ++r)
        for (int s = 0; s < r; ++s) {
            int pa = __builtin_popcount(a[r] & a[s]);
            int pb = __builtin_popcount(b[r] & b[s]);
            int pc = __builtin_popcount(c[r] & c[s]);
            e += pa * pb * pc;
        }
    for (int r = 0; r < N; ++r) {
        int pa = __builtin_popcount(a[r]);
        int pb = __builtin_popcount(b[r]);
        int pc = __builtin_popcount(c[r]);
        e += pa * pb * pc;
    }
    return e;
}

I also tried using a table lookup instead of the popcount instruction, but that was slower both on the M1 and on an Intel i7 (with -march=native). Clang 11 on the i7 did not do much better with this version than with the original (only about a 10% improvement).
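
The table-lookup variant referred to would be along these lines; this is an assumed reconstruction, since the exact code benchmarked is not shown:

// 256-entry byte-popcount table; a uint16_t is split into two bytes.
static uint8_t popcnt_lut[256];

static void init_lut(void) {
    for (int i = 0; i < 256; ++i)
        popcnt_lut[i] = (uint8_t)__builtin_popcount((unsigned)i);
}

static inline int popcnt16_lut(uint16_t x) {
    return popcnt_lut[x & 0xFF] + popcnt_lut[x >> 8];
}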


3 votes

The OP did not originally specify what hardware support can be assumed, and in a case like this the hardware makes a big difference. To support older hardware, here is a function that does the job requiring only SSE2, with no hardware popcount support. The SSE2 version uses the popcnt8() function that I found in Fast counting the number of set bits in __m128i register.

static inline __m128i popcnt8(__m128i x) {
    const __m128i popcount_mask1 = _mm_set1_epi8(0x77);
    const __m128i popcount_mask2 = _mm_set1_epi8(0x0F);
    __m128i n;
    // Count bits in each 4-bit field:
    // popcount(nibble) = x - (x>>1 & 7) - (x>>2 & 3) - (x>>3 & 1).
    n = _mm_srli_epi64(x, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    n = _mm_srli_epi64(n, 1);
    n = _mm_and_si128(popcount_mask1, n);
    x = _mm_sub_epi8(x, n);
    // Sum adjacent nibbles to get a per-byte popcount.
    x = _mm_add_epi8(x, _mm_srli_epi16(x, 4));
    x = _mm_and_si128(popcount_mask2, x);
    return x;
}
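
For reference, a scalar byte-sized equivalent of the same nibble trick (my illustration, not from the linked answer):

static inline unsigned popcnt8_scalar(uint8_t x) {
    unsigned n = (x >> 1) & 0x77;  // (x>>1) masked per nibble
    unsigned r = x - n;
    n = (n >> 1) & 0x77;           // equals (x>>2) & 0x33
    r -= n;
    n = (n >> 1) & 0x77;           // equals (x>>3) & 0x11
    r -= n;
    return (r + (r >> 4)) & 0x0F;  // sum the two nibble counts
}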

Unless hardware popcount is unavailable, the following function is very slow compared with the OP's function. On my Nehalem i5 CPU, I measured the OP's function at roughly 8000-9000 rdtsc "cycles" with hardware popcount enabled and roughly 40000 without it. The SSE2 (gcc -msse2) function measured at about 19000 cycles. And it does work unchanged for different values of N.

int f6(const uint16_t * __restrict A,
       const uint16_t * __restrict B,
       const uint16_t * __restrict C) {
    int r, s, E = 0;
    __m128i ABCr, ABCs, ABC[N];
    union {
      __m128i v;
      uint8_t u[16];
    } popcounts;
    // Pack A[r], B[r], C[r] into 16-bit lanes 0..2 of one vector.
    for (r = 0; r < N; ++r) {
      ABC[r] = _mm_setr_epi16(A[r], B[r], C[r], 0, 0, 0, 0, 0);
    }
    for (r = 0; r < N; ++r) {
        ABCr = ABC[r];
        // Diagonal term s == r: x & x == x, so no AND is needed.
        ABCs = popcnt8(ABCr);
        // popcnt8 yields per-byte counts; shifting left one byte and
        // adding puts each 16-bit lane's total in its high byte,
        // i.e. in u[1], u[3] and u[5].
        popcounts.v = _mm_bslli_si128(ABCs, 1);
        popcounts.v = _mm_add_epi8(popcounts.v, ABCs);
        E += (popcounts.u[1])*(popcounts.u[3])*(popcounts.u[5]);
        for (s = r+1; s < N; s++) {
            ABCs = ABC[s];
            ABCs = _mm_and_si128(ABCs, ABCr);
            ABCs = popcnt8(ABCs);
            popcounts.v = _mm_bslli_si128(ABCs, 1);
            popcounts.v = _mm_add_epi8(popcounts.v, ABCs);
            E += (popcounts.u[1])*(popcounts.u[3])*(popcounts.u[5]);
        }
    }
    return E;
}

While this function's performance won't impress anyone, I found it interesting to write and benchmark, and thought others might find it interesting too. Since assuming SSE2 support as a baseline is very common, and coding for multiple architectures can get quite involved, I thought there was some value in sharing what I did. If the OP had asked for broadly compatible code assuming no more than SSE2 support, then I think this would be a worthwhile improvement.

Edit:

By reordering the computation, I made a considerably faster SSE2 version of this function. It runs slightly faster than the function with hardware popcount enabled, at about 5900 cycles. I know the OP wanted AVX2, but I think this approach is interesting if someone wants to create a hand-vectorized version with AVX2 or AVX512.

SSE2 + AVX512:

_mm_popcnt_epi16 is an AVX512 intrinsic (AVX512-BITALG plus AVX512VL for the 128-bit form). Substituting it for X_mm_popcnt_epi16 should bring a decent performance gain, I think, reducing the number of vector instructions per call from over 10,000 to under 3,000, but I don't have any AVX512-capable hardware on which to provide benchmarks.

static inline __m128i X_mm_popcnt_epi16(__m128i v) {
// Adapted from scalar 32 bit https://stackoverflow.com/questions/109023/count-the-number-of-set-bits-in-a-32-bit-integer
  v = _mm_sub_epi16(v, _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi16(0x5555)));  // 2-bit sums
  v = _mm_add_epi16(_mm_and_si128(v, _mm_set1_epi16(0x3333)), _mm_and_si128(_mm_srli_epi16(v, 2), _mm_set1_epi16(0x3333)));  // 4-bit sums
  v = _mm_and_si128(_mm_add_epi16(v, _mm_srli_epi16(v, 4)), _mm_set1_epi16(0x0f0f));  // 8-bit sums
  return _mm_srli_epi16(_mm_mullo_epi16(v, _mm_set1_epi16(0x101)), 8);  // add the two bytes of each lane
}
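
The substitution itself would be a one-liner; a minimal sketch, assuming an AVX512-BITALG + AVX512VL build (untested here, as noted above):

#include <immintrin.h>

// Drop-in replacement for X_mm_popcnt_epi16 where VPOPCNTW exists
// (compile with e.g. -mavx512bitalg -mavx512vl):
static inline __m128i popcnt_epi16_hw(__m128i v) {
    return _mm_popcnt_epi16(v);  // hardware per-16-bit-lane popcount
}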

static inline __m128i getBrs(
const uint16_t * __restrict A,
const uint16_t * __restrict B,
const uint16_t * __restrict C, uint32_t r, uint32_t s) {
  uint32_t i;
  __m128i Evec, popA, popB, popC, temp, temp2, Ar, Br, Cr, As, Bs, Cs;
  Evec = _mm_setzero_si128();
  /* getBrs computes something like...
  uint32_t EB = 0;
  uint32_t j;
  for (i = r; i<r+8; i++) {
    for (j = s; j<s+8; j++) {
      EB += __builtin_popcount(A[i] & A[j])*__builtin_popcount(B[i] & B[j])*__builtin_popcount(C[i] & C[j]);
    }
  }
  */
  Ar = _mm_loadu_si128( (const __m128i*)&A[r]);
  Br = _mm_loadu_si128( (const __m128i*)&B[r]);
  Cr = _mm_loadu_si128( (const __m128i*)&C[r]);
  As = _mm_loadu_si128( (const __m128i*)&A[s]);
  Bs = _mm_loadu_si128( (const __m128i*)&B[s]);
  Cs = _mm_loadu_si128( (const __m128i*)&C[s]);
  for (i=0; i<8; i++) {
    // Rotate the s-side vectors by one 16-bit lane per iteration
    // (shift lane 0 out, reinsert it at lane 7), so that over 8
    // iterations every r-lane meets every s-lane exactly once.
    As = _mm_bsrli_si128(As, 2);
    As = _mm_insert_epi16(As, A[s], 7);
    Bs = _mm_bsrli_si128(Bs, 2);
    Bs = _mm_insert_epi16(Bs, B[s], 7);
    Cs = _mm_bsrli_si128(Cs, 2);
    Cs = _mm_insert_epi16(Cs, C[s], 7);
    temp = _mm_and_si128(Ar, As);
    popA = X_mm_popcnt_epi16(temp);
    temp = _mm_and_si128(Br, Bs);
    popB = X_mm_popcnt_epi16(temp);
    temp = _mm_and_si128(Cr, Cs);
    popC = X_mm_popcnt_epi16(temp);
    // Accumulate pa*pb*pc per 16-bit lane (8 products of at most
    // 4096 per lane, so only the all-bits-set extreme can overflow).
    temp = _mm_mullo_epi16(popA, popB);
    temp2 = _mm_mullo_epi16(temp, popC);
    Evec = _mm_add_epi16(Evec, temp2);
    s++;
  }
  // Widen the eight 16-bit partial sums to four 32-bit sums.
  return _mm_madd_epi16(Evec, _mm_set1_epi16(1));
}

int f8(
const uint16_t * __restrict A,
const uint16_t * __restrict B,
const uint16_t * __restrict C) {
  uint32_t r, i, j, Ediag = 0, E = 0;
  __m128i Evec, popA, popB, popC, temp, temp2, Ar, Br, Cr;
  Evec = _mm_setzero_si128();
  union {
    __m128i v;
    uint32_t u32[4];
  } popcounts;
  /* The first loop computes the diagonal terms, i.e.
  for (i = 0; i<N; i++) {
    Ediag += __builtin_popcount(A[i] & A[i])*__builtin_popcount(B[i] & B[i])*__builtin_popcount(C[i] & C[i]);
  }
  */
  for (r = 0; r < N; r+=8) {  // N is assumed to be a multiple of 8
      Ar = _mm_loadu_si128( (const __m128i*)&A[r]);
      Br = _mm_loadu_si128( (const __m128i*)&B[r]);
      Cr = _mm_loadu_si128( (const __m128i*)&C[r]);
      popA = X_mm_popcnt_epi16(Ar);
      popB = X_mm_popcnt_epi16(Br);
      popC = X_mm_popcnt_epi16(Cr);
      temp = _mm_mullo_epi16(popA, popB);
      temp2 = _mm_mullo_epi16(temp, popC);
      Evec = _mm_add_epi16(Evec, temp2);
  }
  popcounts.v = _mm_madd_epi16(Evec, _mm_set1_epi16(1));
  Ediag = popcounts.u32[0] + popcounts.u32[1] + popcounts.u32[2] + popcounts.u32[3];
  Evec = _mm_setzero_si128();
  // Sum 8x8 blocks: diagonal blocks once, off-diagonal blocks twice,
  // which covers the full N x N matrix.
  for (i = 0; i<N; i+=8) {
    Evec = _mm_add_epi32(Evec, getBrs(A,B,C,i,i));
    for (j = i+8; j<N; j+=8) {
      temp = getBrs(A,B,C,i,j);
      temp = _mm_add_epi32(temp, temp);
      Evec = _mm_add_epi32(Evec, temp);
    }
  }
  popcounts.v = Evec;
  E = popcounts.u32[0] + popcounts.u32[1] + popcounts.u32[2] + popcounts.u32[3];
  // E counts every off-diagonal pair twice; adding Ediag doubles the
  // diagonal too, so halving yields the sum over r <= s.
  return (Ediag + E)/2;
}
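
Finally, a minimal harness for sanity-checking the vector versions against the scalar original; this is my sketch, and the random test data is an assumption, not from the post:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    uint16_t A[N], B[N], C[N];
    srand(1);
    for (int i = 0; i < N; ++i) {
        A[i] = (uint16_t)rand();
        B[i] = (uint16_t)rand();
        C[i] = (uint16_t)rand();
    }
    printf("f  = %d\n", f(A, B, C));   // scalar reference
    printf("f6 = %d\n", f6(A, B, C));  // SSE2 version
    printf("f8 = %d\n", f8(A, B, C));  // reordered SSE2 version
    return 0;
}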