我有两个图像 A 和 B,它们存储为 ARGB 数据的字节数组:
Image A: [a0, r0, g0, b0, a1, r1, g1, b1, ...]
Image B: [a0, r0, g0, b0, a1, r1, g1, b1, ...]
我想使用 alpha 混合公式将图像 B 叠加在 A 之上。
如何使用一次对多个像素进行操作的 AVX512 指令来实现此目的?
我不介意在计算中使用 256 而不是 255,如果这能让事情变得更简单的话。
编辑:
我尝试根据另一个stackoverflow答案来实现这一点。然而,它似乎比一次运行一个像素的非 AVX512 代码慢。我做错了什么?
我尝试不使用lazy_static! (因为我认为它使用锁定数据结构)并将常量传递到函数中,但它仍然较慢。这难道不是 AVX512 解决的好问题吗?看来应该如此。
// First (broken) attempt: alpha-blend one 64-byte chunk (16 ARGB pixels) of
// `image_chunk` over `this_chunk` in place, approximating /255 with /256.
//
// NOTE(review): as the surrounding text explains, this version produces
// garbage pixels. Two concrete problems are flagged inline below.
//
// Safety: caller must guarantee both slices are at least 64 bytes long and
// that the CPU supports the AVX-512 features used here.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn overlay_row_avx512(this_chunk: &mut [u8], image_chunk: &[u8]) {
    use std::arch::x86_64::*;
    let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
    let image_ptr = image_chunk.as_ptr() as *const i8;
    // Unaligned 512-bit loads of the destination and source pixel rows.
    let this_argb = _mm512_loadu_epi8(this_ptr);
    let image_argb = _mm512_loadu_epi8(image_ptr);
    // Pick out the upper 256-bits and calculate inv_alpha.
    // NOTE(review): `_mm512_shuffle_epi8` (VPSHUFB) shuffles bytes only
    // within each 128-bit lane; the absolute 0..63 byte indices in these
    // masks cannot move bytes across lanes, so the widening these tables
    // intend never happens — confirm against the Intel intrinsics guide.
    // This is a root cause of the corrupted output.
    let this_upper = _mm512_shuffle_epi8(this_argb, *UPPER_TO_U16);
    let image_upper = _mm512_shuffle_epi8(image_argb, *UPPER_TO_U16);
    let alpha_upper = _mm512_shuffle_epi8(image_argb, *UPPER_ALPHA_TO_U16);
    // inv_alpha = 255 - alpha (saturating subtract; cannot underflow here).
    let inv_alpha_upper = _mm512_subs_epu8(*U8_MAX_VALUE, alpha_upper);
    // Apply the blend function and store the result in blended_upper_u8.
    let this_blended_upper = _mm512_mullo_epi16(this_upper, inv_alpha_upper);
    let image_blended_upper = _mm512_mullo_epi16(image_upper, alpha_upper); // TODO: premultiply alpha
    let blended_upper = _mm512_add_epi16(this_blended_upper, image_blended_upper);
    let blended_upper_u8 = _mm512_shuffle_epi8(blended_upper, *UPPER_U16_TO_U8);
    // Repeat for the lower 256-bits.
    let this_lower = _mm512_shuffle_epi8(this_argb, *LOWER_TO_U16);
    let image_lower = _mm512_shuffle_epi8(image_argb, *LOWER_TO_U16);
    let alpha_lower = _mm512_shuffle_epi8(image_argb, *LOWER_ALPHA_TO_U16);
    let inv_alpha_lower = _mm512_subs_epu8(*U8_MAX_VALUE, alpha_lower);
    let this_blended_lower = _mm512_mullo_epi16(this_lower, inv_alpha_lower);
    let image_blended_lower = _mm512_mullo_epi16(image_lower, alpha_lower); // TODO: premultiply alpha
    let blended_lower = _mm512_add_epi16(this_blended_lower, image_blended_lower);
    // NOTE(review): bug — the upper half uses `_mm512_shuffle_epi8` at this
    // step, but this line ADDS the shuffle mask to the pixel data instead of
    // shuffling with it. It should be `_mm512_shuffle_epi8`.
    let blended_lower_u8 = _mm512_add_epi16(blended_lower, *LOWER_U16_TO_U8);
    // OR together the upper and lower 256-bits.
    let blended = _mm512_or_si512(blended_upper_u8, blended_lower_u8);
    _mm512_storeu_epi8(this_ptr, blended);
}
// Shuffle-control masks for `overlay_row_avx512`, built lazily on first use.
// `X` is -1 (high bit set), which makes `_mm512_shuffle_epi8` write a zero
// byte at that position. `_mm512_set_epi8` lists bytes from the most
// significant (index 63, leftmost) down to the least significant (index 0).
// NOTE(review): these tables assume VPSHUFB can address all 64 bytes with
// absolute indices; it actually shuffles within 128-bit lanes only — see
// the caveat in the function above.
lazy_static! {
    // 64 x 0xFF: minuend for inv_alpha = 255 - alpha.
    static ref U8_MAX_VALUE: __m512i = unsafe { _mm512_set1_epi8(-1) };
    // Intended: spread source bytes 63..32 into the low byte of each u16
    // slot (zero-extended widening of the upper 256 bits).
    static ref UPPER_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 63, X, 62, X, 61, X, 60, X, 59, X, 58, X, 57, X, 56,
            X, 55, X, 54, X, 53, X, 52, X, 51, X, 50, X, 49, X, 48,
            X, 47, X, 46, X, 45, X, 44, X, 43, X, 42, X, 41, X, 40,
            X, 39, X, 38, X, 37, X, 36, X, 35, X, 34, X, 33, X, 32,
        )
    };
    // Intended: same widening for source bytes 31..0 (the lower 256 bits).
    static ref LOWER_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 31, X, 30, X, 29, X, 28, X, 27, X, 26, X, 25, X, 24,
            X, 23, X, 22, X, 21, X, 20, X, 19, X, 18, X, 17, X, 16,
            X, 15, X, 14, X, 13, X, 12, X, 11, X, 10, X, 9, X, 8,
            X, 7, X, 6, X, 5, X, 4, X, 3, X, 2, X, 1, X, 0,
        )
    };
    // Intended: broadcast each pixel's alpha byte (every 4th byte) into the
    // low byte of all four of that pixel's u16 slots (upper 256 bits).
    static ref UPPER_ALPHA_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 63, X, 63, X, 63, X, 63, X, 59, X, 59, X, 59, X, 59,
            X, 55, X, 55, X, 55, X, 55, X, 51, X, 51, X, 51, X, 51,
            X, 47, X, 47, X, 47, X, 47, X, 43, X, 43, X, 43, X, 43,
            X, 39, X, 39, X, 39, X, 39, X, 35, X, 35, X, 35, X, 35,
        )
    };
    // Intended: same alpha broadcast for the lower 256 bits.
    static ref LOWER_ALPHA_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 31, X, 31, X, 31, X, 31, X, 27, X, 27, X, 27, X, 27,
            X, 23, X, 23, X, 23, X, 23, X, 19, X, 19, X, 19, X, 19,
            X, 15, X, 15, X, 15, X, 15, X, 11, X, 11, X, 11, X, 11,
            X, 7, X, 7, X, 7, X, 7, X, 3, X, 3, X, 3, X, 3,
        )
    };
    // Pick out the upper 8-bits of each 16-bit u16.
    // This effectively divides by 256.
    static ref UPPER_U16_TO_U8: __m512i = unsafe {
        _mm512_set_epi8(
            63, X, 62, X, 61, X, 60, X, 59, X, 58, X, 57, X, 56, X,
            55, X, 54, X, 53, X, 52, X, 51, X, 50, X, 49, X, 48, X,
            47, X, 46, X, 45, X, 44, X, 43, X, 42, X, 41, X, 40, X,
            39, X, 38, X, 37, X, 36, X, 35, X, 34, X, 33, X, 32, X,
        )
    };
    // Same high-byte extraction for the lower 256 bits.
    static ref LOWER_U16_TO_U8: __m512i = unsafe {
        _mm512_set_epi8(
            31, X, 30, X, 29, X, 28, X, 27, X, 26, X, 25, X, 24, X,
            23, X, 22, X, 21, X, 20, X, 19, X, 18, X, 17, X, 16, X,
            15, X, 14, X, 13, X, 12, X, 11, X, 10, X, 9, X, 8, X,
            7, X, 6, X, 5, X, 4, X, 3, X, 2, X, 1, X, 0, X,
        )
    };
}
为了进行比较,这是我一次运行一个像素的代码:
// A chunk is just 4 bytes in this case rather than 64 bytes.
/// Scalar reference implementation: alpha-blends one source ARGB pixel
/// (`image_chunk`, 4 bytes `[a, r, g, b]`) over one destination pixel
/// (`this_chunk`) in place, using the exact /255 divisor.
///
/// The destination alpha byte (`this_chunk[0]`) is left untouched.
fn overlay_row_without_simd(this_chunk: &mut [u8], image_chunk: &[u8]) {
    let alpha = u32::from(image_chunk[0]);
    let inv_alpha = 255 - alpha;
    // Blend the R, G and B channels (indices 1..=3); index 0 is alpha.
    for channel in 1..4 {
        let dst = u32::from(this_chunk[channel]);
        let src = u32::from(image_chunk[channel]);
        this_chunk[channel] = ((dst * inv_alpha + src * alpha) / 255) as u8;
    }
}
我设法弄清楚发生了什么事!
基本上,我的问题中的实现是错误的,最终生成了一个带有大量随机像素的视频。就我而言,我将每个帧的输出通过管道传输到 ffmpeg,由于所有随机颜色,压缩帧非常困难,这就是我的程序运行速度慢两倍的原因——问题出在这里,而与 AVX512 代码本身无关。
我花了很长时间弄清楚如何在 AVX512 中正确执行此操作并修复我的代码。然后我直接对该函数进行了基准测试,发现它的运行速度比单像素代码快 5.38 倍。我故意编写它,以便只依赖 avx512f 和 avx512bw 功能来获得更好的 CPU 兼容性。可以使用
_mm512_permutexvar_epi8
节省一些指令,但这需要 avx512vbmi。
我的工作实现在这里,每次调用处理 32 个字节(即 8 个像素):
// Working implementation: alpha-blends 8 ARGB pixels (32 bytes) of
// `image_chunk` over `this_chunk` in place. Instead of the broken in-lane
// shuffles above, it widens with `_mm512_cvtepu8_epi16`, which does move
// data across lanes. `c` holds the two precomputed constant vectors.
//
// Safety: caller must guarantee both slices are at least 32 bytes long and
// that the CPU supports avx512f and avx512bw.
unsafe fn overlay_chunk(this_chunk: &mut [u8], image_chunk: &[u8], c: &AVX512Constants) {
    let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
    let image_ptr = image_chunk.as_ptr() as *const i8;
    // 256-bit unaligned loads: 8 ARGB pixels from each image.
    let this_argb = _mm256_loadu_epi8(this_ptr);
    let image_argb = _mm256_loadu_epi8(image_ptr);
    // Extend each 8-bit integer into a 16-bit integer (zero filled).
    // Little-endian per pixel, the u16 slots are A, R, G, B from low to high.
    let this_u16 = _mm512_cvtepu8_epi16(this_argb);
    let image_u16 = _mm512_cvtepu8_epi16(image_argb);
    // Copy the alpha channel over each rgb channel. The `X` (-1) entries in
    // the mask write zero, so each pixel's alpha slot itself becomes 0.
    let image_alpha = _mm512_shuffle_epi8(image_u16, c.copy_alpha_to_rgb);
    // Calculate (255 - alpha) and set each u16 alpha value to 256.
    // We shift right by 8 bits later and 256 >> 8 equals 1.
    // Byte-wise subtraction is fine here: 0xFF - alpha never borrows, and
    // the alpha slots compute 0x0100 - 0x0000 = 256.
    let image_inv_alpha = _mm512_sub_epi8(c.inv_alpha_minuend, image_alpha);
    // Apply the alpha blending formula (https://graphics.fandom.com/wiki/Alpha_blending).
    let this_blended = _mm512_mullo_epi16(this_u16, image_inv_alpha);
    let image_blended = _mm512_mullo_epi16(image_u16, image_alpha); // TODO: premultiply alpha
    let blended = _mm512_add_epi16(this_blended, image_blended);
    // Shift the u16 values right by 8 bits which divides by 256. We should
    // divide by 255 but this is faster and is close enough. The alpha value
    // of this_argb is preserved because of the 1 bits in the minuend.
    let divided = _mm512_srli_epi16(blended, 8);
    // Convert back to 8-bit integers, discarding the high bits that are zero.
    let divided_u8 = _mm512_cvtepi16_epi8(divided);
    _mm256_storeu_epi8(this_ptr, divided_u8);
}
// Precomputed constant vectors for `overlay_chunk`, built once by the
// caller and passed in by reference (avoiding lazy_static locking concerns).
struct AVX512Constants {
    // Shuffle mask that copies each pixel's alpha byte into the low byte of
    // its R, G and B u16 slots; the alpha slot itself maps to zero.
    copy_alpha_to_rgb: __m512i,
    // Per-u16 minuend: 255 (0x00FF) for colour slots, 256 (0x0100) for
    // alpha slots, so inv_alpha = 255 - alpha for colours while the
    // destination alpha survives the later >> 8 unchanged.
    inv_alpha_minuend: __m512i,
}
const X: i8 = -1;
impl AVX512Constants {
    // Builds the two constant vectors. `_mm512_set_epi8` takes its
    // arguments from the most significant byte (leftmost) down to byte 0
    // (rightmost) — hence the "right to left" notes below.
    fn new() -> Self {
        unsafe {
            Self {
                // Each pixel occupies four u16 slots (A, R, G, B from low
                // to high after zero-extension). Index 0/8/16/... is the
                // low byte of that pixel's alpha u16; it is copied into the
                // low byte of the R, G and B slots, while the alpha slot
                // itself (the X, X pair) is zeroed.
                copy_alpha_to_rgb: _mm512_set_epi8(
                    X, 56, X, 56, X, 56, X, X, X, 48, X, 48, X, 48, X, X,
                    X, 40, X, 40, X, 40, X, X, X, 32, X, 32, X, 32, X, X,
                    X, 24, X, 24, X, 24, X, X, X, 16, X, 16, X, 16, X, X,
                    X, 8, X, 8, X, 8, X, X, X, 0, X, 0, X, 0, X, X, // right to left
                    // v v
                    // high low
                ),
                // u16 values: 0x00FF (255) in the R, G and B slots and
                // 0x0100 (256) in the alpha slot, so the colour channels
                // get 255 - alpha while (dst_alpha * 256) >> 8 preserves
                // the destination alpha.
                inv_alpha_minuend: _mm512_set_epi8(
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, // right to left
                    // v v
                    // high low
                ),
            }
        }
    }
}