正如标题所说,我需要一种方法,把 256 位 AVX 寄存器中的所有元素整体移动/洗牌 N 个位置。我找到的相关资料全都针对 32 位或 64 位的值(例如 __builtin_ia32_permvarsf256)。非常感谢任何帮助。
Example: {2,4,4,2,4,5,0,0,0,0,...} shift right by 4 -> {0,0,0,0,2,4,4,2,4,5,...}
如果移位距离在编译时已知,实现起来相对容易,而且相当快。唯一需要注意的是:32 字节的字节移位指令是在两个 16 字节通道内各自独立执行的,因此当移位量小于 16 字节时,需要手动把那几个字节跨通道传递过去。下面是左移的实现:
// Places the given 16-byte vector into the upper half of a 32-byte vector; the lower half of the result is zero.
// v16: the 16 bytes that should end up in bytes [ 16 .. 31 ] of the result.
inline __m256i setHigh( __m128i v16 )
{
    // Control byte for vperm2i128: bit 3 zeroes the low lane of the output,
    // bits [ 5:4 ] == 0 copy the low lane of the first source into the high lane.
    constexpr int zeroLowThenLowToHigh = 0x08;
    // The cast leaves the upper 16 bytes undefined, but the permute below never reads them.
    const __m256i widened = _mm256_castsi128_si256( v16 );
    return _mm256_permute2x128_si256( widened, widened, zeroLowThenLowToHigh );
}
// Shifts the complete 32-byte vector towards higher byte indices by a compile-time distance.
// i:   shift distance in bytes, must be in [ 0 .. 31 ]
// src: the vector to shift
// Returns a vector where output byte k is source byte ( k - i ) for k >= i, and zero for k < i.
template<int i>
inline __m256i shiftLeftBytes( __m256i src )
{
    static_assert( i >= 0 && i < 32 );

    if constexpr( i == 0 )
    {
        // Nothing to move
        return src;
    }
    else if constexpr( i == 16 )
    {
        // Exactly one lane: the low half becomes the high half, the low half becomes zero
        return setHigh( _mm256_castsi256_si128( src ) );
    }
    else if constexpr( i % 8 == 0 )
    {
        // Remaining multiples of 8 bytes ( 8 and 24 ): faster with a 64-bit permute followed by a blend with zero
        constexpr int qwords = i / 8;
        // Identity permutation pushed up by `qwords` 2-bit slots; the vacated slots select element 0, which the blend discards
        constexpr int permuteControl = ( _MM_SHUFFLE( 3, 2, 1, 0 ) << ( qwords * 2 ) ) & 0xFF;
        // One bit per 32-bit element; the lowest ( qwords * 2 ) elements come from the zero vector
        constexpr int keepMask = ( 0xFF << ( qwords * 2 ) ) & 0xFF;
        const __m256i moved = _mm256_permute4x64_epi64( src, permuteControl );
        return _mm256_blend_epi32( _mm256_setzero_si256(), moved, keepMask );
    }
    else if constexpr( i > 16 )
    {
        // More than half of the register: only the low half survives.
        // Shift it by the remaining ( i - 16 ) bytes, then move it into the high half.
        const __m128i shiftedLow = _mm_slli_si128( _mm256_castsi256_si128( src ), i - 16 );
        return setHigh( shiftedLow );
    }
    else
    {
        // Less than half of the register: vpalignr works within each 16-byte lane,
        // so supply [ zero, low half ] as the bytes which are shifted in from below.
        const __m256i carry = setHigh( _mm256_castsi256_si128( src ) );
        return _mm256_alignr_epi8( src, carry, 16 - i );
    }
}
但是,如果移位距离在编译时未知,事情就相当棘手了。下面是一种做法。它用了相当多的洗牌指令,但我预计它仍然比显而易见的做法要快一些——后者需要两次 32 字节存储(其中一次写入全零),然后再做一次 32 字节加载。
// Builds the 48-byte lookup table for `vpshufb`:
//   bytes [  0 .. 15 ] are 0xFF, which makes `vpshufb` output zero for that position
//   bytes [ 16 .. 31 ] are the identity shuffle 0, 1, .. 15
//   bytes [ 32 .. 47 ] are 0xFF again
// A 16-byte window into that table shifts a 16-byte vector by a runtime-variable count of bytes in [ -16 .. +16 ] range
inline std::array<uint8_t, 48> makeShuffleConstants()
{
    std::array<uint8_t, 48> table;
    // Start with "produce zero" everywhere, then overwrite the middle 16 bytes
    table.fill( 0xFF );
    for( size_t lane = 0; lane < 16; lane++ )
        table[ 16 + lane ] = static_cast<uint8_t>( lane );
    return table;
}
// The shuffle lookup table produced by makeShuffleConstants().
// Aligned by 64 bytes so the complete 48-byte array stays within one cache line (assuming 64-byte cache lines).
// `alignas` is written first because the C++ grammar does not allow an alignment specifier between other decl-specifiers.
alignas( 64 ) static const std::array<uint8_t, 48> shuffleConstants = makeShuffleConstants();
// Loads a 16-byte `vpshufb` control vector from the shuffle table, starting `offset` bytes away from the identity shuffle.
// offset: byte offset in [ -16 .. +16 ]; 0 gives the identity shuffle.
// Counterintuitively, a positive offset shifts the shuffled output to the right (towards lower byte indices),
// and a negative offset shifts it to the left.
inline __m128i loadShuffleConstant( int offset )
{
    assert( offset >= -16 && offset <= 16 );
    // The identity shuffle starts 16 bytes into the table
    const uint8_t* const identity = shuffleConstants.data() + 16;
    const uint8_t* const window = identity + offset;
    return _mm_loadu_si128( reinterpret_cast<const __m128i*>( window ) );
}
// Builds a 32-byte vector whose upper 16 bytes are `v16` and whose lower 16 bytes are zero.
inline __m256i setHigh( __m128i v16 )
{
    // Only the low lane of the widened value is defined, and only that lane is selected below
    const __m256i wide = _mm256_castsi128_si256( v16 );
    // 0x08: bit 3 clears the low output lane; a zero in bits [ 5:4 ] puts the low input lane into the high output lane
    constexpr int control = 0x08;
    return _mm256_permute2x128_si256( wide, wide, control );
}
// Shifts the complete 32-byte vector towards higher byte indices by a runtime-variable distance.
// src: the vector to shift
// i:   shift distance in bytes, must be in [ 0 .. 31 ]
// Returns a vector where output byte k is source byte ( k - i ) for k >= i, and zero for k < i.
inline __m256i shiftLeftBytes( __m256i src, int i )
{
    assert( i >= 0 && i < 32 );

    // The bytes of the low half which end up in the high half of the result.
    // For i >= 16 the offset ( 16 - i ) is <= 0: the low half shifted left by ( i - 16 ) bytes.
    // For i < 16 the offset is > 0: the top i bytes of the low half, moved down to the bottom.
    const __m128i lowHalf = _mm256_castsi256_si128( src );
    const __m128i crossing = _mm_shuffle_epi8( lowHalf, loadShuffleConstant( 16 - i ) );
    const __m256i upper = setHigh( crossing );

    // Shifting by at least half of the register: nothing else survives
    if( i >= 16 )
        return upper;

    // Shifting by less than half of the register.
    // Just like _mm256_slli_si256, _mm256_shuffle_epi8 can't move data across 16-byte lanes:
    // shift both lanes independently, then merge the bytes which crossed the lane boundary.
    const __m256i perLaneControl = _mm256_broadcastsi128_si256( loadShuffleConstant( -i ) );
    const __m256i withinLanes = _mm256_shuffle_epi8( src, perLaneControl );
    return _mm256_or_si256( upper, withinLanes );
}
也许我迟到了,但这段代码要简单得多。
// Zero-initialized 96-byte scratch buffer: 32 bytes before and 32 bytes after the middle 32-byte work area.
int8_t ShiftBox[96] = {};
// Points at the middle 32 bytes of ShiftBox.
int8_t* WorkArea = ShiftBox + 32;
// Sample input: the values 1 .. 32.
int8_t TestOriginal[32] = {
     1,  2,  3,  4,  5,  6,  7,  8,
     9, 10, 11, 12, 13, 14, 15, 16,
    17, 18, 19, 20, 21, 22, 23, 24,
    25, 26, 27, 28, 29, 30, 31, 32
};
// Presumably the output buffer for the sample; nothing in this snippet writes to it.
int8_t TestResult[32];
void ShiftL(int8_t* InA, int8_t* OutA, int8_t N) {
_mm256_storeu_epi8(WorkArea, _mm256_loadu_epi8(InA));
_mm256_storeu_epi8(OutA, _mm256_loadu_epi8(WorkArea - N*sizeof(int8_t)));
}