[FFmpeg-devel] [PATCH 3/3] lavc/h264dsp: optimise R-V V biweight for shorter heights

Rémi Denis-Courmont remi at remlab.net
Sun Sep 1 19:17:44 EEST 2024


T-Head C908:
h264_biweight2_8_c:                                    313.7 ( 1.00x)
h264_biweight2_8_rvv_i32:              before          239.5 ( 1.23x)
h264_biweight2_8_rvv_i32:              after            72.7 ( 4.31x)
h264_biweight4_8_c:                                    582.0 ( 1.00x)
h264_biweight4_8_rvv_i32:              before          471.0 ( 1.16x)
h264_biweight4_8_rvv_i32:              after            91.5 ( 6.36x)
h264_biweight8_8_c:                                   1110.0 ( 1.00x)
h264_biweight8_8_rvv_i32:              before          943.3 ( 1.10x)
h264_biweight8_8_rvv_i64:              after           147.0 ( 7.55x)

SpacemiT X60:
h264_biweight2_8_c:                                    311.4 ( 1.00x)
h264_biweight2_8_rvv_i32:              before          363.1 ( 0.83x)
h264_biweight2_8_rvv_i32:              after           103.1 ( 3.02x)
h264_biweight4_8_c:                                    571.9 ( 1.00x)
h264_biweight4_8_rvv_i32:              before          717.4 ( 0.78x)
h264_biweight4_8_rvv_i32:              after            71.8 ( 7.96x)
h264_biweight8_8_c:                                   1103.1 ( 1.00x)
h264_biweight8_8_rvv_i32:              before         1415.2 ( 0.76x)
h264_biweight8_8_rvv_i64:              after            92.8 (11.88x)
---
 libavcodec/riscv/h264dsp_init.c | 23 ++++++---
 libavcodec/riscv/h264dsp_rvv.S  | 89 +++++++++++----------------------
 2 files changed, 44 insertions(+), 68 deletions(-)

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 643673d1d6..30dd272d6e 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -97,23 +97,30 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
         const bool zvl128b = ff_rv_vlen_least(128);
 
         if (bit_depth == 8) {
-            if (zvl128b && (flags & AV_CPU_FLAG_RVB))
-                dsp->weight_h264_pixels_tab[0] =
-                    ff_h264_weight_funcs_8_rvv[0].weight;
-            if (flags & AV_CPU_FLAG_RVV_I64)
+            if (zvl128b) {
+                if (flags & AV_CPU_FLAG_RVB)
+                    dsp->weight_h264_pixels_tab[0] =
+                        ff_h264_weight_funcs_8_rvv[0].weight;
+                dsp->biweight_h264_pixels_tab[0] =
+                    ff_h264_weight_funcs_8_rvv[0].biweight;
+            }
+            if (flags & AV_CPU_FLAG_RVV_I64) {
                 dsp->weight_h264_pixels_tab[1] =
                     ff_h264_weight_funcs_8_rvv[1].weight;
+                dsp->biweight_h264_pixels_tab[1] =
+                    ff_h264_weight_funcs_8_rvv[1].biweight;
+            }
             dsp->weight_h264_pixels_tab[2] =
                  ff_h264_weight_funcs_8_rvv[2].weight;
+            dsp->biweight_h264_pixels_tab[2] =
+                 ff_h264_weight_funcs_8_rvv[2].biweight;
             dsp->weight_h264_pixels_tab[3] =
                  ff_h264_weight_funcs_8_rvv[3].weight;
+            dsp->biweight_h264_pixels_tab[3] =
+                 ff_h264_weight_funcs_8_rvv[3].biweight;
         }
 
         if (bit_depth == 8 && zvl128b) {
-            for (int i = 0; i < 4; i++)
-                dsp->biweight_h264_pixels_tab[i] =
-                    ff_h264_weight_funcs_8_rvv[i].biweight;
-
             dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma_mbaff =
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index 9e2f3bc038..b1e11f92b4 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -76,20 +76,16 @@ func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x, b
 
         ret
 endfunc
-.endm
-
-h264_weight 8, 2, 16
-h264_weight 8, 4, 32
-h264_weight 8, 8, 64
-h264_weight 8, 16
 
-        .variant_cc ff_h264_biweight_pixels_simple_8_rvv
-func ff_h264_biweight_pixels_simple_8_rvv, zve32x
+func ff_h264_biweight_pixels\w\()_8_rvv, zve64x
+        lpad    0
         csrwi   vxrm, 2
         addi    a7, a7, 1
         ori     a7, a7, 1
         sll     a7, a7, a4
         addi    a4, a4, 1
+        .ifb    \b
+        li      t6, \w
 1:
         vsetvli zero, t6, e16, m2, ta, ma
         vle8.v  v8, (a0)
@@ -106,65 +102,38 @@ func ff_h264_biweight_pixels_simple_8_rvv, zve32x
         vnclipu.wx  v8, v16, a4
         vse8.v  v8, (a0)
         add     a0, a0, a2
-        bnez    a3, 1b
-
-        ret
-endfunc
-
-        .variant_cc ff_h264_biweight_pixels_8_rvv
-func ff_h264_biweight_pixels_8_rvv, zve32x
-        csrwi   vxrm, 2
-        addi    a7, a7, 1
-        ori     a7, a7, 1
-        sll     a7, a7, a4
-        addi    a4, a4, 1
+        .else
+        li      t6, \w
 1:
-        mv      t0, a0
-        mv      t1, a1
-        mv      t5, t6
-2:
-        vsetvli     t2, a3, e16, m8, ta, ma
-        vlsseg2e8.v v0, (t0), a2
-        vlsseg2e8.v v8, (t1), a2
-        addi    t5, t5, -2
-        vmv.v.x v16, a7
-        vmv.v.x v24, a7
-        vsetvli     zero, zero, e8, m4, ta, ma
-        vwmaccsu.vx v16, a5, v0
-        vwmaccsu.vx v24, a5, v4
-        vwmaccsu.vx v16, a6, v8
-        vwmaccsu.vx v24, a6, v12
-        vsetvli     zero, zero, e16, m8, ta, ma
+        vsetvli     t1, a3, e\b, m2, ta, ma
+        vlse\b\().v v8, (a0), a2
+        sub     a3, a3, t1
+        vlse\b\().v v12, (a1), a2
+        mul     t2, t1, a2
+        vsetvli     t0, zero, e16, m4, ta, ma
+        vmv.v.x     v16, a7
+        vsetvli     zero, zero, e8, m2, ta, ma
+        vwmaccsu.vx v16, a5, v8
+        add     a1, a1, t2
+        vwmaccsu.vx v16, a6, v12
+        vsetvli     zero, zero, e16, m4, ta, ma
         vmax.vx     v16, v16, zero
-        vmax.vx     v24, v24, zero
-        vsetvli     zero, zero, e8, m4, ta, ma
-        vnclipu.wx  v0, v16, a4
-        vnclipu.wx  v4, v24, a4
-        vssseg2e8.v v0, (t0), a2
-        addi    t0, t0, 2
-        addi    t1, t1, 2
-        bnez    t5, 2b
-
-        mul     t3, a2, t2
-        sub     a3, a3, t2
-        add     a0, a0, t3
-        add     a1, a1, t3
+        vsetvli     zero, zero, e8, m2, ta, ma
+        vnclipu.wx  v8, v16, a4
+        vsetvli     zero, t1, e\b, m2, ta, ma
+        vsse\b\().v v8, (a0), a2
+        add     a0, a0, t2
+        .endif
         bnez    a3, 1b
 
         ret
 endfunc
+.endm
 
-.irp    w, 16, 8, 4, 2
-func ff_h264_biweight_pixels\w\()_8_rvv, zve32x
-        lpad    0
-        li      t6, \w
-        .if     \w == 16
-        j       ff_h264_biweight_pixels_simple_8_rvv
-        .else
-        j       ff_h264_biweight_pixels_8_rvv
-        .endif
-endfunc
-.endr
+h264_weight 8, 2, 16
+h264_weight 8, 4, 32
+h264_weight 8, 8, 64
+h264_weight 8, 16
 
         .global ff_h264_weight_funcs_8_rvv
         .hidden ff_h264_weight_funcs_8_rvv
-- 
2.45.2



More information about the ffmpeg-devel mailing list