如何使用 SSE/SSE2/AVX/... 对 3、5、7、9 个输入进行有效的按位多数投票?

问题描述 投票:0回答:1

我有几个(例如 3、5、7 或 9 个)大小相同的巨大数据块(例如 100KB-100MB),并且想要进行按位多数投票,以获得每一位最常用值的一个块。为了加快速度,我想使用 SSE/SSE2/AVX/NEON/... CPU 扩展。

我只是手动按位尝试了一下,结果很慢。

assembly sse avx neon avx512
1个回答
0
投票

一种选择是使用便携式 std::experimental::simd 库并以直接的方式总结投票计数:

#include <cstdint>
#include <iostream>
#include <array>
#include <experimental/simd>

template<size_t R>
__attribute__((noinline))
void positional_popcount(uint8_t* majority_votes, size_t n_votes, std::array<uint8_t const*, R> vote_rows) {
    using namespace std::experimental;

    using V = simd<uint8_t>; // Use default platform vector size.
    V const ones{1};

    using counter_t = int8_t;
    static_assert(R == counter_t{R}, "Number of vote rows must fit into counter_t.");
    using C = simd<counter_t>;
    C max_minority_votes{counter_t{R / 2}};

    for(size_t c = 0; c < n_votes; c += V::size()) {
        // Count the number of votes in each bit position.
        C counts[8]{};
        for(auto r: vote_rows) {
            V vr{r + c, element_aligned};
            for(int b = 0; b < 8; ++b)
                counts[b] += static_simd_cast<counter_t>(min(vr & ones << b, ones));
        }
        // Convert vote counts into majority votes.
        V votes{};
        for(int b = 0; b < 8; ++b)
            votes |= static_simd_cast<uint8_t>(clamp(counts[b] - max_minority_votes, C{0}, C{1})) << b;
        votes.copy_to(majority_votes + c, element_aligned);
    }
}

void print_bits(uint8_t const* bits, size_t n_votes) {
    for(size_t c = 0; c < n_votes; ++c) {
        unsigned b = bits[c];
        for(unsigned mask = 0x80; mask; mask >>= 1) {
            char c = '0' + static_cast<bool>(b & mask);
            std::cout.put(c);
        }
        std::cout.put(' ');
    }
    std::cout.put('\n');
}

int main() {
    constexpr int R = 3, C = 256, C2 = 16;

    uint8_t votes[R][C];
    for(int r = 0; r < R; ++r) {
        for(int c = 0; c < C; ++c) {
            unsigned v = ((2u << r) - 1) << (c & 7);
            votes[r][c] = v;
        }
    }

    std::cout << "votes:\n";
    for(int r = 0; r < R; ++r)
        print_bits(votes[r], C2);

    uint8_t majority_votes[C] = {};
    std::array<uint8_t const*, R> vote_rows;
    for(int r = 0; r < R; ++r)
        vote_rows[r] = votes[r];
    positional_popcount(majority_votes, C, vote_rows);

    std::cout << "majority_votes:\n";
    print_bits(majority_votes, C2);
}

输出:

votes:
00000001 00000010 00000100 00001000 00010000 00100000 01000000 10000000 00000001 00000010 00000100 00001000 00010000 00100000 01000000 10000000 
00000011 00000110 00001100 00011000 00110000 01100000 11000000 10000000 00000011 00000110 00001100 00011000 00110000 01100000 11000000 10000000 
00000111 00001110 00011100 00111000 01110000 11100000 11000000 10000000 00000111 00001110 00011100 00111000 01110000 11100000 11000000 10000000 
majority_votes:
00000011 00000110 00001100 00011000 00110000 01100000 11000000 10000000 00000011 00000110 00001100 00011000 00110000 01100000 11000000 10000000 

生成的组件很不错,循环展开,没有额外的负载或存储超出所需的最小值。

© www.soinside.com 2019 - 2024. All rights reserved.