我有两个图像 A 和 B,它们存储为 ARGB 数据的字节数组:
Image A: [a0, r0, g0, b0, a1, r1, g1, b1, ...]
Image B: [a0, r0, g0, b0, a1, r1, g1, b1, ...]
我想使用 alpha 混合公式将图像 B 叠加在 A 之上。
如何使用一次对多个像素进行操作的 AVX512 指令来实现此目的?
我不介意在计算中使用 256 而不是 255,如果这能让事情变得更简单的话。
编辑:
我尝试根据另一个stackoverflow答案来实现这一点。然而,它似乎比一次运行一个像素的非 AVX512 代码慢。我做错了什么?
我尝试不使用lazy_static! (因为我认为它使用锁定数据结构)并将常量传递到函数中,但它仍然较慢。这难道不是 AVX512 解决的好问题吗?看来应该如此。
// First (broken) attempt: alpha-blend one 64-byte chunk (16 ARGB pixels) of
// `image_chunk` over `this_chunk` in place, approximating /255 with /256.
//
// NOTE(review): as the surrounding text explains, this version produces
// garbage pixels. Two concrete problems are flagged inline below.
//
// Safety: caller must guarantee both slices are at least 64 bytes long and
// that the CPU supports the AVX-512 features used here.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn overlay_row_avx512(this_chunk: &mut [u8], image_chunk: &[u8]) {
    use std::arch::x86_64::*;
    let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
    let image_ptr = image_chunk.as_ptr() as *const i8;
    // Unaligned 512-bit loads of the destination and source pixel rows.
    let this_argb = _mm512_loadu_epi8(this_ptr);
    let image_argb = _mm512_loadu_epi8(image_ptr);
    // Pick out the upper 256-bits and calculate inv_alpha.
    // NOTE(review): `_mm512_shuffle_epi8` (VPSHUFB) shuffles bytes only
    // within each 128-bit lane; the absolute 0..63 byte indices in these
    // masks cannot move bytes across lanes, so the widening these tables
    // intend never happens — confirm against the Intel intrinsics guide.
    // This is a root cause of the corrupted output.
    let this_upper = _mm512_shuffle_epi8(this_argb, *UPPER_TO_U16);
    let image_upper = _mm512_shuffle_epi8(image_argb, *UPPER_TO_U16);
    let alpha_upper = _mm512_shuffle_epi8(image_argb, *UPPER_ALPHA_TO_U16);
    // inv_alpha = 255 - alpha (saturating subtract; cannot underflow here).
    let inv_alpha_upper = _mm512_subs_epu8(*U8_MAX_VALUE, alpha_upper);
    // Apply the blend function and store the result in blended_upper_u8.
    let this_blended_upper = _mm512_mullo_epi16(this_upper, inv_alpha_upper);
    let image_blended_upper = _mm512_mullo_epi16(image_upper, alpha_upper); // TODO: premultiply alpha
    let blended_upper = _mm512_add_epi16(this_blended_upper, image_blended_upper);
    let blended_upper_u8 = _mm512_shuffle_epi8(blended_upper, *UPPER_U16_TO_U8);
    // Repeat for the lower 256-bits.
    let this_lower = _mm512_shuffle_epi8(this_argb, *LOWER_TO_U16);
    let image_lower = _mm512_shuffle_epi8(image_argb, *LOWER_TO_U16);
    let alpha_lower = _mm512_shuffle_epi8(image_argb, *LOWER_ALPHA_TO_U16);
    let inv_alpha_lower = _mm512_subs_epu8(*U8_MAX_VALUE, alpha_lower);
    let this_blended_lower = _mm512_mullo_epi16(this_lower, inv_alpha_lower);
    let image_blended_lower = _mm512_mullo_epi16(image_lower, alpha_lower); // TODO: premultiply alpha
    let blended_lower = _mm512_add_epi16(this_blended_lower, image_blended_lower);
    // NOTE(review): bug — the upper half uses `_mm512_shuffle_epi8` at this
    // step, but this line ADDS the shuffle mask to the pixel data instead of
    // shuffling with it. It should be `_mm512_shuffle_epi8`.
    let blended_lower_u8 = _mm512_add_epi16(blended_lower, *LOWER_U16_TO_U8);
    // OR together the upper and lower 256-bits.
    let blended = _mm512_or_si512(blended_upper_u8, blended_lower_u8);
    _mm512_storeu_epi8(this_ptr, blended);
}
// Shuffle-control masks for `overlay_row_avx512`, built lazily on first use.
// `X` is -1 (high bit set), which makes `_mm512_shuffle_epi8` write a zero
// byte at that position. `_mm512_set_epi8` lists bytes from the most
// significant (index 63, leftmost) down to the least significant (index 0).
// NOTE(review): these tables assume VPSHUFB can address all 64 bytes with
// absolute indices; it actually shuffles within 128-bit lanes only — see
// the caveat in the function above.
lazy_static! {
    // 64 x 0xFF: minuend for inv_alpha = 255 - alpha.
    static ref U8_MAX_VALUE: __m512i = unsafe { _mm512_set1_epi8(-1) };
    // Intended: spread source bytes 63..32 into the low byte of each u16
    // slot (zero-extended widening of the upper 256 bits).
    static ref UPPER_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 63, X, 62, X, 61, X, 60, X, 59, X, 58, X, 57, X, 56,
            X, 55, X, 54, X, 53, X, 52, X, 51, X, 50, X, 49, X, 48,
            X, 47, X, 46, X, 45, X, 44, X, 43, X, 42, X, 41, X, 40,
            X, 39, X, 38, X, 37, X, 36, X, 35, X, 34, X, 33, X, 32,
        )
    };
    // Intended: same widening for source bytes 31..0 (the lower 256 bits).
    static ref LOWER_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 31, X, 30, X, 29, X, 28, X, 27, X, 26, X, 25, X, 24,
            X, 23, X, 22, X, 21, X, 20, X, 19, X, 18, X, 17, X, 16,
            X, 15, X, 14, X, 13, X, 12, X, 11, X, 10, X, 9, X, 8,
            X, 7, X, 6, X, 5, X, 4, X, 3, X, 2, X, 1, X, 0,
        )
    };
    // Intended: broadcast each pixel's alpha byte (every 4th byte) into the
    // low byte of all four of that pixel's u16 slots (upper 256 bits).
    static ref UPPER_ALPHA_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 63, X, 63, X, 63, X, 63, X, 59, X, 59, X, 59, X, 59,
            X, 55, X, 55, X, 55, X, 55, X, 51, X, 51, X, 51, X, 51,
            X, 47, X, 47, X, 47, X, 47, X, 43, X, 43, X, 43, X, 43,
            X, 39, X, 39, X, 39, X, 39, X, 35, X, 35, X, 35, X, 35,
        )
    };
    // Intended: same alpha broadcast for the lower 256 bits.
    static ref LOWER_ALPHA_TO_U16: __m512i = unsafe {
        _mm512_set_epi8(
            X, 31, X, 31, X, 31, X, 31, X, 27, X, 27, X, 27, X, 27,
            X, 23, X, 23, X, 23, X, 23, X, 19, X, 19, X, 19, X, 19,
            X, 15, X, 15, X, 15, X, 15, X, 11, X, 11, X, 11, X, 11,
            X, 7, X, 7, X, 7, X, 7, X, 3, X, 3, X, 3, X, 3,
        )
    };
    // Pick out the upper 8-bits of each 16-bit u16.
    // This effectively divides by 256.
    static ref UPPER_U16_TO_U8: __m512i = unsafe {
        _mm512_set_epi8(
            63, X, 62, X, 61, X, 60, X, 59, X, 58, X, 57, X, 56, X,
            55, X, 54, X, 53, X, 52, X, 51, X, 50, X, 49, X, 48, X,
            47, X, 46, X, 45, X, 44, X, 43, X, 42, X, 41, X, 40, X,
            39, X, 38, X, 37, X, 36, X, 35, X, 34, X, 33, X, 32, X,
        )
    };
    // Same high-byte extraction for the lower 256 bits.
    static ref LOWER_U16_TO_U8: __m512i = unsafe {
        _mm512_set_epi8(
            31, X, 30, X, 29, X, 28, X, 27, X, 26, X, 25, X, 24, X,
            23, X, 22, X, 21, X, 20, X, 19, X, 18, X, 17, X, 16, X,
            15, X, 14, X, 13, X, 12, X, 11, X, 10, X, 9, X, 8, X,
            7, X, 6, X, 5, X, 4, X, 3, X, 2, X, 1, X, 0, X,
        )
    };
}
为了进行比较,这是我一次运行一个像素的代码:
// A chunk is just 4 bytes in this case rather than 64 bytes.
/// Scalar reference implementation: alpha-blends one source ARGB pixel
/// (`image_chunk`, 4 bytes `[a, r, g, b]`) over one destination pixel
/// (`this_chunk`) in place, using the exact /255 divisor.
///
/// The destination alpha byte (`this_chunk[0]`) is left untouched.
fn overlay_row_without_simd(this_chunk: &mut [u8], image_chunk: &[u8]) {
    let alpha = u32::from(image_chunk[0]);
    let inv_alpha = 255 - alpha;
    // Blend the R, G and B channels (indices 1..=3); index 0 is alpha.
    for channel in 1..4 {
        let dst = u32::from(this_chunk[channel]);
        let src = u32::from(image_chunk[channel]);
        this_chunk[channel] = ((dst * inv_alpha + src * alpha) / 255) as u8;
    }
}
我设法弄清楚发生了什么事!
基本上,我的问题中的实现是错误的,最终生成了一个带有大量随机像素的视频。就我而言,我将每个帧的输出通过管道传输到 ffmpeg,由于所有随机颜色,压缩帧非常困难,这就是我的程序运行速度慢两倍的原因——问题出在这里,而与 AVX512 代码本身无关。
我花了很长时间弄清楚如何在 AVX512 中正确执行此操作并修复我的代码。然后我直接对该函数进行了基准测试,发现它的运行速度比单像素代码快 5.38 倍。我故意编写它,以便只依赖 avx512f 和 avx512bw 功能来获得更好的 CPU 兼容性。可以使用
_mm512_permutexvar_epi8
节省一些指令,但这需要 avx512vbmi。
我的工作实现在这里,每次调用处理 32 个字节(即 8 个像素):
// Working implementation: alpha-blends 8 ARGB pixels (32 bytes) of
// `image_chunk` over `this_chunk` in place. Instead of the broken in-lane
// shuffles above, it widens with `_mm512_cvtepu8_epi16`, which does move
// data across lanes. `c` holds the two precomputed constant vectors.
//
// Safety: caller must guarantee both slices are at least 32 bytes long and
// that the CPU supports avx512f and avx512bw.
unsafe fn overlay_chunk(this_chunk: &mut [u8], image_chunk: &[u8], c: &AVX512Constants) {
    let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
    let image_ptr = image_chunk.as_ptr() as *const i8;
    // 256-bit unaligned loads: 8 ARGB pixels from each image.
    let this_argb = _mm256_loadu_epi8(this_ptr);
    let image_argb = _mm256_loadu_epi8(image_ptr);
    // Extend each 8-bit integer into a 16-bit integer (zero filled).
    // Little-endian per pixel, the u16 slots are A, R, G, B from low to high.
    let this_u16 = _mm512_cvtepu8_epi16(this_argb);
    let image_u16 = _mm512_cvtepu8_epi16(image_argb);
    // Copy the alpha channel over each rgb channel. The `X` (-1) entries in
    // the mask write zero, so each pixel's alpha slot itself becomes 0.
    let image_alpha = _mm512_shuffle_epi8(image_u16, c.copy_alpha_to_rgb);
    // Calculate (255 - alpha) and set each u16 alpha value to 256.
    // We shift right by 8 bits later and 256 >> 8 equals 1.
    // Byte-wise subtraction is fine here: 0xFF - alpha never borrows, and
    // the alpha slots compute 0x0100 - 0x0000 = 256.
    let image_inv_alpha = _mm512_sub_epi8(c.inv_alpha_minuend, image_alpha);
    // Apply the alpha blending formula (https://graphics.fandom.com/wiki/Alpha_blending).
    let this_blended = _mm512_mullo_epi16(this_u16, image_inv_alpha);
    let image_blended = _mm512_mullo_epi16(image_u16, image_alpha); // TODO: premultiply alpha
    let blended = _mm512_add_epi16(this_blended, image_blended);
    // Shift the u16 values right by 8 bits which divides by 256. We should
    // divide by 255 but this is faster and is close enough. The alpha value
    // of this_argb is preserved because of the 1 bits in the minuend.
    let divided = _mm512_srli_epi16(blended, 8);
    // Convert back to 8-bit integers, discarding the high bits that are zero.
    let divided_u8 = _mm512_cvtepi16_epi8(divided);
    _mm256_storeu_epi8(this_ptr, divided_u8);
}
// Precomputed constant vectors for `overlay_chunk`, built once by the
// caller and passed in by reference (avoiding lazy_static locking concerns).
struct AVX512Constants {
    // Shuffle mask that copies each pixel's alpha byte into the low byte of
    // its R, G and B u16 slots; the alpha slot itself maps to zero.
    copy_alpha_to_rgb: __m512i,
    // Per-u16 minuend: 255 (0x00FF) for colour slots, 256 (0x0100) for
    // alpha slots, so inv_alpha = 255 - alpha for colours while the
    // destination alpha survives the later >> 8 unchanged.
    inv_alpha_minuend: __m512i,
}
const X: i8 = -1;
impl AVX512Constants {
    // Builds the two constant vectors. `_mm512_set_epi8` takes its
    // arguments from the most significant byte (leftmost) down to byte 0
    // (rightmost) — hence the "right to left" notes below.
    fn new() -> Self {
        unsafe {
            Self {
                // Each pixel occupies four u16 slots (A, R, G, B from low
                // to high after zero-extension). Index 0/8/16/... is the
                // low byte of that pixel's alpha u16; it is copied into the
                // low byte of the R, G and B slots, while the alpha slot
                // itself (the X, X pair) is zeroed.
                copy_alpha_to_rgb: _mm512_set_epi8(
                    X, 56, X, 56, X, 56, X, X, X, 48, X, 48, X, 48, X, X,
                    X, 40, X, 40, X, 40, X, X, X, 32, X, 32, X, 32, X, X,
                    X, 24, X, 24, X, 24, X, X, X, 16, X, 16, X, 16, X, X,
                    X, 8, X, 8, X, 8, X, X, X, 0, X, 0, X, 0, X, X, // right to left
                    // v v
                    // high low
                ),
                // u16 values: 0x00FF (255) in the R, G and B slots and
                // 0x0100 (256) in the alpha slot, so the colour channels
                // get 255 - alpha while (dst_alpha * 256) >> 8 preserves
                // the destination alpha.
                inv_alpha_minuend: _mm512_set_epi8(
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0,
                    0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, // right to left
                    // v v
                    // high low
                ),
            }
        }
    }
}