如何用Neon将8位YUV420图像转换为RGB?

问题描述 投票:0回答:1

我是 Neon 新手。我想编写一个代码,用 Neon 将 YUV420 转换为 RGB。像素为 8 位深度,我需要将它们转换为 int32_t 或 float(但仍限制在 0-255 范围内)。但是,我还没有找到任何 Neon 指令来执行此操作。顺便说一句,似乎没有指令可以在 int16 和 float 等不同类型之间进行乘法。我是否必须使用 C 代码进行数据类型转换才能满足 Neon 的要求?

下面是YUV转RGB的算法,参数范围是[0, 255],我在外面用C代码进行转换:

// Saturates a signed 32-bit value to the range [0, 255] and returns it as a byte.
inline std::uint8_t clamp(std::int32_t n) {
    if (n < 0) {
        return 0;      // below range: pin to 0
    }
    if (n > 255) {
        return 255;    // above range: pin to 255
    }
    return static_cast<std::uint8_t>(n);
}

// Converts one Y/U/V sample triple to R/G/B.
//
// The inputs are first re-centred (16 is subtracted from y, 128 from u and v),
// then combined with floating-point weights. Each result is truncated to an
// integer and saturated to [0, 255] by clamp() before being written to the
// output references.
inline void yuv2rgb_i32(std::int32_t y, std::int32_t u, std::int32_t v, std::int32_t &r, std::int32_t &g, std::int32_t &b) {
    const std::int32_t luma = y - 16;
    const std::int32_t cb = u - 128;
    const std::int32_t cr = v - 128;

    // Each expression is kept in one piece so the floating-point evaluation
    // order (and therefore the truncated result) is unchanged.
    r = clamp(static_cast<std::int32_t>(1.164 * luma + 1.596 * cr));
    g = clamp(static_cast<std::int32_t>(1.164 * luma - 0.392 * cb - 0.813 * cr));
    b = clamp(static_cast<std::int32_t>(1.164 * luma + 2.017 * cb));
}
c++ image simd neon yuv
1个回答
1
投票

下面是一个转换平面(planar)YUV 的示例(BT.601 全范围,即 full swing,例如 JPEG 中使用的格式):

/*------------------------------------------------------------------------------
* Global declarations (for use with linkage to C/C++ world)
* ----------------------------------------------------------------------------*/

/* Make the routine linkable from other object files, mark it with ELF
 * "internal" visibility and tag the symbol as a function. */
.global neonYUV420ToRGBAFullSwing
.internal neonYUV420ToRGBAFullSwing
.type neonYUV420ToRGBAFullSwing, %function

/*------------------------------------------------------------------------------
* Definitions (internal)
*
* The .struct chain below defines absolute symbols that serve as byte offsets
* into a block of local variables. neonYUV420ToRGBAFullSwing addresses these
* as [sp,#yr_xxx]. Each ".struct <previous>+<size>" advances the offset by the
* size of the previous field, so the resulting values are:
*
*   yr_stackptr      =  0   (8 bytes; not referenced by neonYUV420ToRGBAFullSwing)
*   yr_width         =  8   (4 bytes)
*   yr_height        = 12   (4 bytes)
*   yr_outputstride  = 16   (4 bytes)
*   yr_lumastride    = 20   (4 bytes)
*   yr_chromastride  = 24   (4 bytes)
*   yr_transbuf      = 28   (16 bytes; not referenced by neonYUV420ToRGBAFullSwing)
*
* NOTE: .struct switches the assembler to the absolute section, so a section
* directive (such as .text) must follow before any code or data is emitted.
* ----------------------------------------------------------------------------*/


.struct 0
yr_stackptr:
.struct yr_stackptr+8
yr_width:
.struct yr_width+4
yr_height:
.struct yr_height+4
yr_outputstride:
.struct yr_outputstride+4
yr_lumastride:
.struct yr_lumastride+4
yr_chromastride:
.struct yr_chromastride+4
yr_transbuf:
.struct yr_transbuf+16


/*------------------------------------------------------------------------------
* Text segment
* ----------------------------------------------------------------------------*/

.text


/**
* neonYUV420ToRGBAFullSwing:
*
* \param X0 Pointer to input Y data
* \param X1 Pointer to input U data
* \param X2 Pointer to input V data
* \param X3 Pointer to output RGBA data
* \param W4 width of block to process (in output RGBA quadruplets)
* \param W5 height of block to process (in pixels)
* \param W6 row-stride (in bytes) of output data block (rgba stride)
* \param W7 row-stride (in bytes) of input Y block (luma stride)
* \param stack row-stride (in bytes) of input U/V blocks (chroma stride)
*
*
* This function converts YUV420 (planar) in BT.601 (full-swing) format to
* 32-Bit RGBA (8888) with alpha channel set to opaque.
*
* Fixed-point arithmetic is used here and RGB is computed as follows:
*
* R = [ ( 128*Y +               179*(V-128) ) >> 7 ]
* G = [ ( 128*Y -  44*(U-128) -  91*(V-128) ) >> 7 ]
* B = [ ( 128*Y + 227*(U-128)               ) >> 7 ]
*
* A is set to 255 statically. Every channel is saturated to [0,255].
*
* Processing scheme:
*   - Two rows are converted per outer iteration; both rows share one row of
*     U/V samples.
*   - Within a row pair, 8 pixels per iteration are handled by the vector loop
*     and any remainder by a scalar loop that handles 2 pixels per iteration.
*   - Consequently width and height have to be even. Odd values make the loops
*     touch one pixel column / one row beyond the block.
*   - If width or height is <= 0 the function returns without touching memory.
*
* All three strides are treated as signed 32-bit values and are sign-extended
* before they are added to a pointer.
*
* Clobbers: x0-x12, x17, v0-v7, v30, v31, NZCV.
* Saved and restored: x18-x20, x28-x30, v8-v15.
*
*-----------------------------------------------------------------------------*/

        .align 2
neonYUV420ToRGBAFullSwing:
        /*----------------------------------------------------------------------
        * Degenerate block: the row loop below is a do-while loop, so an empty
        * block has to be rejected before anything is processed.
        * --------------------------------------------------------------------*/
        cmp w4,#0
        ble 9f
        cmp w5,#0
        ble 9f
        /*----------------------------------------------------------------------
        * Reserve the stack frame and save the callee-saved registers.
        *
        * Frame layout relative to the new SP:
        *   [  0,256)  local variables (see yr_* offsets)
        *   [256,320)  v12-v15
        *   [320,384)  v8-v11
        *   [384,400)  x18,x19
        *   [400,416)  x20,x28
        *   [416,432)  x29,x30
        *
        * X17 walks downwards from the incoming SP while storing. X29 is saved
        * but not re-pointed at this frame.
        * --------------------------------------------------------------------*/
        mov x17,sp
        sub sp,sp,#(3*16+8*16+256)
        ldr w9,[x17]                                        // W9: chroma stride (first stack argument)
        stp x29,x30,[x17,#-16]!                             // Push return address (link register) and framepointer
        stp x20,x28,[x17,#-16]!
        stp x18,x19,[x17,#-16]!
        sub x17,x17,#4*16
        st4 {v8.16b,v9.16b,v10.16b,v11.16b},[x17]           // Interleaved store; undone by the matching ld4 in the epilogue
        sub x17,x17,#4*16
        st4 {v12.16b,v13.16b,v14.16b,v15.16b},[x17]
        str w4,[sp,#yr_width]
        str w5,[sp,#yr_height]
        str w6,[sp,#yr_outputstride]
        str w7,[sp,#yr_lumastride]
        str w9,[sp,#yr_chromastride]
        /*----------------------------------------------------------------------
        * Broadcast the fixed-point coefficients:
        *   V9: 128 (chroma offset)   V10: 128 (luma scale)   V11: 179
        *   V12: -44                  V13: -91                V14: 227
        * --------------------------------------------------------------------*/
        adr x9,yuvrgb_full_multiplier1
        ld1 {v30.4h},[x9]
        adr x9,yuvrgb_full_multiplier2
        ld1 {v31.4h},[x9]
        movi v9.8h,#128
        dup v10.8h,v30.4h[0]
        dup v11.8h,v30.4h[1]
        dup v12.8h,v30.4h[2]
        dup v13.8h,v30.4h[3]
        dup v14.8h,v31.4h[0]
        sxtw x6,w6                                          // W6 is a 32-bit argument: upper half of X6 is unspecified on entry
        sxtw x7,w7                                          // Same for the luma stride in W7
        add x4,x3,x6                                        // X4:  RGBA data (next row)
        add x5,x0,x7                                        // X5:  luma data (next row)
0:
        ldr w6,[sp,#yr_width]
        /*----------------------------------------------------------------------
        * Horizontal conversion loop (8 pixel per iteration, vectorized)
        *
        * X0: input Y (current row)       X1: input U (current row)
        * X2: input V (current row)       X3: output RGB (current row)
        * X4: output RGB (next row)       X5: input Y (next row)
        * W6: loop counter (pixels left in this row pair)
        *
        * The row-start pointers are pushed here and popped at label 5, so the
        * loops are free to advance X0-X5.
        *---------------------------------------------------------------------*/
        stp x0,x1,[sp,#-16]!
        stp x2,x3,[sp,#-16]!
        stp x4,x5,[sp,#-16]!
        cmp w6,#8
        blt 3f
1:
        ld1 {v0.8b},[x0],#8                                 // V0: y7 y6 y5 y4 y3 y2 y1 y0 (8-bit)
        ld1r {v1.4s},[x1],#4                                // V1: u6 u4 u2 u0 u6 u4 u2 u0 (4 U bytes, replicated)
        ld1r {v2.4s},[x2],#4                                // V2: v6 v4 v2 v0 v6 v4 v2 v0 (4 V bytes, replicated)
        uxtl v3.8h,v0.8b                                    // V3: y7 y6 y5 y4 y3 y2 y1 y0 (16-bit)
        mov v4.8b,v1.8b
        mov v5.8b,v2.8b
        ld1 {v0.8b},[x5],#8                                 // V0: y7 y6 y5 y4 y3 y2 y1 y0 (8-bit) (next row)
        sub w6,w6,#8
        zip1 v1.8b,v1.8b,v4.8b                              // V1: u6 u6 u4 u4 u2 u2 u0 u0
        zip2 v2.8b,v2.8b,v5.8b                              // V2: v6 v6 v4 v4 v2 v2 v0 v0 (upper half holds the same replicated bytes)
        uxtl v4.8h,v0.8b                                    // V4: y7 y6 y5 y4 y3 y2 y1 y0 (next row, 16-bit)
        uxtl v5.8h,v1.8b                                    // V5: U per pixel (16-bit)
        uxtl v6.8h,v2.8b                                    // V6: V per pixel (16-bit)
        sub v5.8h,v5.8h,v9.8h                               // V5: U-128
        sub v6.8h,v6.8h,v9.8h                               // V6: V-128
        mul v3.8h,v3.8h,v10.8h                              // V3: 128*Y
        mul v4.8h,v4.8h,v10.8h                              // V4: 128*Y (next row)
        mul v0.8h,v6.8h,v11.8h                              // V0: 179*(V-128)
        mul v7.8h,v5.8h,v14.8h                              // V7: 227*(U-128)
        mul v1.8h,v5.8h,v12.8h                              // V1: -44*(U-128)
        sqadd v0.8h,v0.8h,v3.8h                             // V0: R << 7 (saturating 16-bit add)
        mla v1.8h,v6.8h,v13.8h                              // V1: -44*(U-128)-91*(V-128)
        sqadd v7.8h,v7.8h,v3.8h                             // V7: B << 7
        sqadd v1.8h,v1.8h,v3.8h                             // V1: G << 7
        sqshrun v0.8b,v0.8h,#7                              // V0: R (8-bit, saturated to 0..255)
        sqshrun v1.8b,v1.8h,#7                              // V1: G (8-bit)
        cmp w6,#8                                           // Flags for the bge at the loop end (vector ops leave NZCV alone)
        sqshrun v2.8b,v7.8h,#7                              // V2: B (8-bit)
        mul v7.8h,v6.8h,v11.8h                              // V7: 179*(V-128)
        movi v3.8b,#255                                     // V3: A (V3 is free now, 128*Y was consumed above)
        mul v6.8h,v6.8h,v13.8h                              // V6: -91*(V-128)
        st4 {v0.8b,v1.8b,v2.8b,v3.8b},[x3],#32              // Store 8 interleaved RGBA pixels (current row)
        sqadd v3.8h,v7.8h,v4.8h                             // V3: R << 7 (next row)
        mla v6.8h,v5.8h,v12.8h                              // V6: -44*(U-128) - 91*(V-128)
        mul v7.8h,v5.8h,v14.8h                              // V7: 227*(U-128)
        sqadd v6.8h,v6.8h,v4.8h                             // V6: G << 7 (next row)
        sqadd v4.8h,v4.8h,v7.8h                             // V4: B << 7 (next row)
        sqshrun v0.8b,v3.8h,#7                              // V0: R (8-bit, next row)
        sqshrun v1.8b,v6.8h,#7                              // V1: G (8-bit, next row)
        movi v3.8b,#255                                     // V3: A
        sqshrun v2.8b,v4.8h,#7                              // V2: B (8-bit, next row)
        st4 {v0.8b,v1.8b,v2.8b,v3.8b},[x4],#32              // Store 8 interleaved RGBA pixels (next row)
        bge 1b
3:
        mov w19,#255                                        // W19: upper clamp bound, also used as alpha
        cmp w6,#0
        ble 5f
        /*----------------------------------------------------------------------
        * Horizontal conversion loop (scalar, one 2x2 pixel quad per iteration)
        *
        * X0: input Y (current row)          X1: input U (current row)
        * X2: input V (current row)          X3: output RGB (current row)
        * X4: output RGB (next row)          X5: input Y (next row)
        * W6: loop counter (pixels left)     W19: constant 255
        *
        * W8, W10 and W11 hold the chroma terms and are shared by all four
        * pixels of the quad.
        *---------------------------------------------------------------------*/
4:
        ldrb w7,[x0],#1                                     // W7: Y
        ldrb w8,[x1],#1                                     // W8: U
        ldrb w9,[x2],#1                                     // W9: V
        mov w12,#179
        sub w8,w8,#128
        sub w9,w9,#128
        lsl w7,w7,#7                                        // W7: 128*Y
        mov w11,#227
        mul w10,w9,w12                                      // W10: 179*(V-128) (re-usable)
        mov w12,#-44
        mul w11,w8,w11                                      // W11: 227*(U-128) (re-usable)
        mul w8,w8,w12                                       // W8:  -44*(U-128)
        mov w12,#-91
        madd w8,w9,w12,w8                                   // W8:  -44*(U-128) - 91*(V-128) (re-usable)
        add w9,w7,w10                                       // W9:  R << 7
        add w12,w7,w8                                       // W12: G << 7
        add w7,w7,w11                                       // W7:  B << 7
        asr w9,w9,#7                                        // W9:  R (unclipped)
        cmp w9,#0
        csel w9,wzr,w9,mi                                   // Clamp negative values to 0
        asr w12,w12,#7                                      // W12: G (unclipped)
        cmp w12,#0
        csel w12,wzr,w12,mi
        asr w7,w7,#7                                        // W7:  B (unclipped)
        cmp w7,#0
        csel w7,wzr,w7,mi
        cmp w9,w19
        csel w9,w19,w9,gt                                   // Clamp values above 255 to 255
        cmp w12,w19
        csel w12,w19,w12,gt
        cmp w7,w19
        csel w7,w19,w7,gt
        strb w9,[x3],#1
        strb w12,[x3],#1
        strb w7,[x3],#1
        strb w19,[x3],#1
        /* next pixel */
        ldrb w7,[x0],#1                                     // W7:  Y
        lsl w7,w7,#7
        add w9,w7,w10                                       // W9:  R << 7
        add w12,w7,w8                                       // W12: G << 7
        add w7,w7,w11                                       // W7:  B << 7
        asr w9,w9,#7                                        // W9:  R (unclipped)
        cmp w9,#0
        csel w9,wzr,w9,mi
        asr w12,w12,#7                                      // W12: G (unclipped)
        cmp w12,#0
        csel w12,wzr,w12,mi
        asr w7,w7,#7                                        // W7:  B (unclipped)
        cmp w7,#0
        csel w7,wzr,w7,mi
        cmp w9,w19
        csel w9,w19,w9,gt
        cmp w12,w19
        csel w12,w19,w12,gt
        cmp w7,w19
        csel w7,w19,w7,gt
        strb w9,[x3],#1
        strb w12,[x3],#1
        strb w7,[x3],#1
        strb w19,[x3],#1
        /* next row */
        ldrb w7,[x5],#1                                     // W7:  Y
        lsl w7,w7,#7
        add w9,w7,w10                                       // W9:  R << 7
        add w12,w7,w8                                       // W12: G << 7
        add w7,w7,w11                                       // W7:  B << 7
        asr w9,w9,#7                                        // W9:  R (unclipped)
        cmp w9,#0
        csel w9,wzr,w9,mi
        asr w12,w12,#7                                      // W12: G (unclipped)
        cmp w12,#0
        csel w12,wzr,w12,mi
        asr w7,w7,#7                                        // W7:  B (unclipped)
        cmp w7,#0
        csel w7,wzr,w7,mi
        cmp w9,w19
        csel w9,w19,w9,gt
        cmp w12,w19
        csel w12,w19,w12,gt
        cmp w7,w19
        csel w7,w19,w7,gt
        strb w9,[x4],#1
        strb w12,[x4],#1
        strb w7,[x4],#1
        strb w19,[x4],#1
        /* next pixel */
        ldrb w7,[x5],#1                                     // W7:  Y
        lsl w7,w7,#7
        add w9,w7,w10                                       // W9:  R << 7
        add w12,w7,w8                                       // W12: G << 7
        add w7,w7,w11                                       // W7:  B << 7
        asr w9,w9,#7                                        // W9:  R (unclipped)
        cmp w9,#0
        csel w9,wzr,w9,mi
        asr w12,w12,#7                                      // W12: G (unclipped)
        cmp w12,#0
        csel w12,wzr,w12,mi
        asr w7,w7,#7                                        // W7:  B (unclipped)
        cmp w7,#0
        csel w7,wzr,w7,mi
        cmp w9,w19
        csel w9,w19,w9,gt
        cmp w12,w19
        csel w12,w19,w12,gt
        cmp w7,w19
        csel w7,w19,w7,gt
        subs w6,w6,#2                                       // Two pixels per row done; flags survive the stores below
        strb w9,[x4],#1
        strb w12,[x4],#1
        strb w7,[x4],#1
        strb w19,[x4],#1
        bgt 4b
5:
        /*----------------------------------------------------------------------
        * Advance to the next row pair: restore the row-start pointers, then
        * step Y and RGBA by two rows and U/V by one row.
        *---------------------------------------------------------------------*/
        ldp x4,x5,[sp],#16
        ldp x2,x3,[sp],#16
        ldp x0,x1,[sp],#16
        ldr w7,[sp,#yr_height]
        ldrsw x6,[sp,#yr_outputstride]
        subs w7,w7,#2                                       // Flags are consumed by the bgt below (add/ldr/str keep NZCV)
        ldrsw x8,[sp,#yr_lumastride]
        add x3,x3,x6,lsl #1
        ldrsw x9,[sp,#yr_chromastride]
        add x4,x4,x6,lsl #1
        add x0,x0,x8,lsl #1
        add x5,x5,x8,lsl #1
        add x1,x1,x9
        add x2,x2,x9
        str w7,[sp,#yr_height]
        bgt 0b
        /*----------------------------------------------------------------------
        * Epilogue and out...
        *---------------------------------------------------------------------*/
        add x17,sp,#256                                     // X17: start of the register save area
        ld4 {v12.16b,v13.16b,v14.16b,v15.16b},[x17],#64
        ld4 {v8.16b,v9.16b,v10.16b,v11.16b},[x17],#64
        ldp x18,x19,[x17],#16
        ldp x20,x28,[x17],#16
        ldp x29,x30,[x17],#16
        add sp,sp,#(3*16+8*16+256)
        ret                                   // Restore frame-pointer and return
9:
        ret                                   // Empty block: nothing was saved, nothing to restore




/*------------------------------------------------------------------------------
* Constants
* ----------------------------------------------------------------------------*/

        /* Signed 16-bit fixed-point coefficients (scaled by 128), one per lane.
         * Each table starts on a 16-byte boundary. */
        .p2align 4
yuvrgb_full_multiplier1:
        .hword 128                                          // lane 0: luma scale
        .hword 179                                          // lane 1: (V-128) weight for R
        .hword -44                                          // lane 2: (U-128) weight for G
        .hword -91                                          // lane 3: (V-128) weight for G

        .p2align 4
yuvrgb_full_multiplier2:
        .hword 227                                          // lane 0: (U-128) weight for B
        .hword 0,0,0                                        // lanes 1-3: padding

这是一个独立的文件,可以用面向 aarch64 的 GNU 汇编器进行汇编。汇编成目标文件后,只需将其与其余代码链接即可。在 C 中,可以通过如下声明来引用该函数:

extern void neonYUV420ToRGBAFullSwing(const uint8_t *yInput,const uint8_t *uInput,const uint8_t *vInput,uint8_t *rgbaOutput,int width,int height,int rgbaStride,int lumaStride, int chromaStride);

这段代码未必是最优的实现,但速度也不慢。它要求图像的宽度和高度均为偶数(每次处理 2×2 的像素块),如果不满足,请务必先对图像进行填充。

© www.soinside.com 2019 - 2024. All rights reserved.