[FFmpeg-devel] [PATCH 2/2] lavc/h264dsp: R-V V 8-bit MBAFF loop filter

Sun Jun 30 12:05:27 EEST 2024

Performance is (unfortunately) the same as with non-MBAFF, even though
only half as many elements are processed, since the hardware under test
does not short-circuit vector tail calculations.
(IMO, a generic solution or work-around should be agreed on, rather
than bespoke approaches all over the place.)
---
 libavcodec/riscv/h264dsp_init.c |  4 ++++
 libavcodec/riscv/h264dsp_rvv.S  | 19 ++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 0d4d541992..ab412a9924 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -31,6 +31,8 @@ void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
                                       int alpha, int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
+                                            int alpha, int beta, int8_t *tc0);
 
 extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
 extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
@@ -48,6 +50,8 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
         if (bit_depth == 8 && ff_rv_vlen_least(128)) {
             dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv;
             dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv;
+            dsp->h264_h_loop_filter_luma_mbaff =
+                ff_h264_h_loop_filter_luma_mbaff_8_rvv;
         }
         dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
     }
diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S
index ea9dfb1a7e..c5d21ba607 100644
--- a/libavcodec/riscv/h264dsp_rvv.S
+++ b/libavcodec/riscv/h264dsp_rvv.S
@@ -26,6 +26,15 @@
 
 #include "libavutil/riscv/asm.S"
 
+        .macro  .mbaff
+        .irp    type,,_mbaff # two passes: empty suffix, then _mbaff
+        .ifb    \type
+        .equ    IS_MBAFF, 0 # empty suffix: non-MBAFF variant
+        .else
+        .equ    IS_MBAFF, 1 # _mbaff suffix: MBAFF variant
+        .endif
+        .endm # no .endr in this macro: the user of .mbaff supplies it
+
         .variant_cc ff_h264_loop_filter_luma_8_rvv
 func ff_h264_loop_filter_luma_8_rvv, zve32x
         # p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13
@@ -33,7 +42,7 @@ func ff_h264_loop_filter_luma_8_rvv, zve32x
         csrwi        vxrm, 0
         vid.v        v0
         vaaddu.vv    v14, v10, v11 # (p0 + q0 + 1) / 2
-        vsrl.vi      v0, v0, 2     # v0[i] = i / inner_iters
+        vsrl.vx      v0, v0, a4    # v0[i] = i / inner_iters
         vwsubu.vv    v16, v9, v12
         vrgather.vv  v6, v4, v0    # tc_orig
         vwaddu.vv    v18, v8, v14
@@ -116,6 +125,7 @@ func ff_h264_v_loop_filter_luma_8_rvv, zve32x
         add      t6, t5, a1
         vle8.v   v12, (t5)
         vle8.v   v13, (t6)
+        li       a4, 2 # log2(inner_iters)
         jal      t0, ff_h264_loop_filter_luma_8_rvv
         vse8.v   v9, (t2)
         vse8.v   v10, (t3)
@@ -124,13 +134,16 @@ func ff_h264_v_loop_filter_luma_8_rvv, zve32x
         ret
 endfunc
 
-func ff_h264_h_loop_filter_luma_8_rvv, zve32x
+.mbaff # starts the .irp that sets IS_MBAFF for each function variant
+func ff_h264_h_loop_filter_luma\type\()_8_rvv, zve32x
         vsetivli zero, 4, e8, mf4, ta, ma
         vle8.v   v4, (a4)
         addi     a0, a0, -3
-        vsetivli zero, 16, e8, m1, ta, ma
+        vsetivli zero, 16 >> IS_MBAFF, e8, m1, ta, ma # VL = 16, or 8 for MBAFF
         vlsseg6e8.v v8, (a0), a1
+        li       a4, 2 >> IS_MBAFF # log2(inner_iters): 2, or 1 for MBAFF
         jal      t0, ff_h264_loop_filter_luma_8_rvv
         vssseg6e8.v v8, (a0), a1
         ret
 endfunc
+.endr # ends the .irp started by .mbaff
-- 
2.45.2