[FFmpeg-devel] [PATCH] lavc/h264dsp: R-V V high-depth idct_add{, intra}16, idct8_add4
Rémi Denis-Courmont
remi at remlab.net
Mon Jul 15 22:11:21 EEST 2024
As with 8-bit, this tends to be faster, but the results vary widely
because of the variable distribution of non-zero coefficients.
---
libavcodec/riscv/h264dsp_init.c | 77 +++++++++--------
libavcodec/riscv/h264idct_rvv.S | 147 +++++++++++++++++++++++++-------
2 files changed, 154 insertions(+), 70 deletions(-)
diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 4fc695f158..14eea29892 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -40,26 +40,25 @@ void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
void ff_h264_h_loop_filter_luma_mbaff_8_rvv(uint8_t *pix, ptrdiff_t stride,
int alpha, int beta, int8_t *tc0);
-void ff_h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_8_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add16_8_rvv(uint8_t *dst, const int *blockoffset,
- int16_t *block, int stride,
- const uint8_t nnzc[5 * 8]);
-void ff_h264_idct_add16intra_8_rvv(uint8_t *dst, const int *blockoffset,
- int16_t *block, int stride,
- const uint8_t nnzc[5 * 8]);
-void ff_h264_idct8_add4_8_rvv(uint8_t *dst, const int *blockoffset,
- int16_t *block, int stride,
- const uint8_t nnzc[5 * 8]);
-
-void ff_h264_idct_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_9_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_10_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_12_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add_14_rvv(uint8_t *dst, int16_t *block, int stride);
+#define IDCT_DEPTH(depth) \
+void ff_h264_idct_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
+void ff_h264_idct8_add_##depth##_rvv(uint8_t *d, int16_t *s, int stride); \
+void ff_h264_idct_add16_##depth##_rvv(uint8_t *d, const int *soffset, \
+ int16_t *s, int stride, \
+ const uint8_t nnzc[5 * 8]); \
+void ff_h264_idct_add16intra_##depth##_rvv(uint8_t *d, const int *soffset, \
+ int16_t *s, int stride, \
+ const uint8_t nnzc[5 * 8]); \
+void ff_h264_idct8_add4_##depth##_rvv(uint8_t *d, const int *soffset, \
+ int16_t *s, int stride, \
+ const uint8_t nnzc[5 * 8]);
+
+IDCT_DEPTH(8)
+IDCT_DEPTH(9)
+IDCT_DEPTH(10)
+IDCT_DEPTH(12)
+IDCT_DEPTH(14)
+#undef IDCT_DEPTH
void ff_h264_add_pixels8_8_rvv(uint8_t *dst, int16_t *block, int stride);
void ff_h264_add_pixels4_8_rvv(uint8_t *dst, int16_t *block, int stride);
@@ -106,26 +105,26 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
dsp->h264_add_pixels4_clear = ff_h264_add_pixels4_8_rvv;
}
- if (bit_depth == 9) {
- if (zvl128b)
- dsp->h264_idct_add = ff_h264_idct_add_9_rvv;
- dsp->h264_idct8_add = ff_h264_idct8_add_9_rvv;
- }
- if (bit_depth == 10) {
- if (zvl128b)
- dsp->h264_idct_add = ff_h264_idct_add_10_rvv;
- dsp->h264_idct8_add = ff_h264_idct8_add_10_rvv;
- }
- if (bit_depth == 12) {
- if (zvl128b)
- dsp->h264_idct_add = ff_h264_idct_add_12_rvv;
- dsp->h264_idct8_add = ff_h264_idct8_add_12_rvv;
- }
- if (bit_depth == 14) {
- if (zvl128b)
- dsp->h264_idct_add = ff_h264_idct_add_14_rvv;
- dsp->h264_idct8_add = ff_h264_idct8_add_14_rvv;
+#define IDCT_DEPTH(depth) \
+ if (bit_depth == depth) { \
+ if (zvl128b) \
+ dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
+ if (flags & AV_CPU_FLAG_RVB_ADDR) \
+ dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
+ if (__riscv_xlen == 64 && zvl128b) { \
+ dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
+ dsp->h264_idct_add16intra = \
+ ff_h264_idct_add16intra_##depth##_rvv; \
+ } \
+ if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB_ADDR)) \
+ dsp->h264_idct8_add4 = ff_h264_idct8_add4_##depth##_rvv; \
}
+
+ IDCT_DEPTH(9)
+ IDCT_DEPTH(10)
+ IDCT_DEPTH(12)
+ IDCT_DEPTH(14)
+
if (bit_depth > 8 && zvl128b) {
dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_16_rvv;
if (flags & AV_CPU_FLAG_RVV_I64)
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 7dd0a524fe..48de65ec0b 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -107,6 +107,7 @@ endfunc
func ff_h264_idct_add_16_rvv, zve32x
csrwi vxrm, 0
+.Lidct_add4_16_rvv:
vsetivli zero, 4, e32, m1, ta, ma
addi t1, a1, 1 * 4 * 4
vle32.v v0, (a1)
@@ -147,7 +148,7 @@ func ff_h264_idct_add_16_rvv, zve32x
vmax.vx v\n, v\n, zero
.endr
.irp n,0,1,2,3
- vmin.vx v\n, v\n, a3
+ vmin.vx v\n, v\n, a5
.endr
vsetvli zero, zero, e16, mf2, ta, ma
vncvt.x.x.w v4, v0
@@ -295,9 +296,10 @@ func ff_h264_idct8_add_8_rvv, zve32x
endfunc
func ff_h264_idct8_add_16_rvv, zve32x
- li a4, 8
csrwi vxrm, 0
- vsetivli a5, 8, e32, m1, ta, ma
+.Lidct8_add_16_rvv:
+ li a4, 8
+ vsetivli a3, 8, e32, m1, ta, ma
1:
addi t1, a1, 1 * 8 * 4
vle32.v v0, (a1)
@@ -313,11 +315,11 @@ func ff_h264_idct8_add_16_rvv, zve32x
vle32.v v5, (t5)
addi a7, a1, 7 * 8 * 4
vle32.v v6, (t6)
- sub a4, a4, a5
+ sub a4, a4, a3
vle32.v v7, (a7)
jal t0, ff_h264_idct8_rvv
vse32.v v0, (a1)
- sh2add a1, a5, a1
+ sh2add a1, a3, a1
vse32.v v1, (t1)
vse32.v v2, (t2)
vse32.v v3, (t3)
@@ -329,7 +331,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
addi a1, a1, -8 * 4
li a4, 8
- slli a6, a5, 3 + 2
+ slli a6, a3, 3 + 2
2:
vsetvli zero, zero, e32, m1, ta, ma
vlseg8e32.v v0, (a1)
@@ -348,7 +350,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
vle16.v v21, (t5)
add a7, t6, a2
vle16.v v22, (t6)
- sub a4, a4, a5
+ sub a4, a4, a3
vle16.v v23, (a7)
.irp n,0,1,2,3,4,5,6,7
vssra.vi v\n, v\n, 6
@@ -368,7 +370,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
vmax.vx v\n, v\n, zero
.endr
.irp n,0,1,2,3,4,5,6,7
- vmin.vx v\n, v\n, a3
+ vmin.vx v\n, v\n, a5
.endr
vsetvli zero, zero, e16, mf2, ta, ma
vncvt.x.x.w v16, v0
@@ -380,7 +382,7 @@ func ff_h264_idct8_add_16_rvv, zve32x
vncvt.x.x.w v22, v6
vncvt.x.x.w v23, v7
vse16.v v16, (a0)
- sh1add a0, a5, a0
+ sh1add a0, a3, a0
vse16.v v17, (t1)
vse16.v v18, (t2)
vse16.v v19, (t3)
@@ -400,12 +402,12 @@ endfunc
.irp depth, 9, 10, 12, 14
func ff_h264_idct_add_\depth\()_rvv, zve32x
- li a3, (1 << \depth) - 1
+ li a5, (1 << \depth) - 1
j ff_h264_idct_add_16_rvv
endfunc
func ff_h264_idct8_add_\depth\()_rvv, zve32x
- li a3, (1 << \depth) - 1
+ li a5, (1 << \depth) - 1
j ff_h264_idct8_add_16_rvv
endfunc
.endr
@@ -416,13 +418,13 @@ const ff_h264_scan8
endconst
#if (__riscv_xlen == 64)
-.irp depth, 8
+.irp depth, 8, 16
func ff_h264_idct_add16_\depth\()_rvv, zve32x
csrwi vxrm, 0
- addi sp, sp, -80
+ addi sp, sp, -96
lla t0, ff_h264_scan8
sd s0, (sp)
- li t1, 32 << (\depth > 8)
+ li t1, 32 * (\depth / 8)
mv s0, sp
sd ra, 8(sp)
sd s1, 16(sp)
@@ -432,9 +434,19 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
sd s5, 48(sp)
sd s6, 56(sp)
sd s7, 64(sp)
+.if \depth > 8
+ sd s8, 72(sp)
+ sd s9, 80(sp)
+ mv s8, a5
+ mv s9, a6
+.endif
vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0)
+.if \depth == 8
vlse16.v v16, (a2), t1
+.else
+ vlse32.v v16, (a2), t1
+.endif
vluxei8.v v12, (a4), v8
.if \depth == 8
vsetvli zero, zero, e16, m2, ta, ma
@@ -464,17 +476,28 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
mv a1, s6
mv a2, s7
add a0, s4, t2
- beqz t1, 2f # if (nnz == 1 && block[i * 16])
- call ff_h264_idct_dc_add_\depth\()_c
+.if \depth > 8
+ mv a5, s8
+.endif
+ bnez t1, 2f # if (nnz == 1 && block[i * 16])
+ jal .Lidct_add4_\depth\()_rvv
j 3f
2:
- call .Lidct_add4_\depth\()_rvv
+.if \depth == 8
+ call ff_h264_idct_dc_add_\depth\()_c
+.else
+ jalr s9
+.endif
3:
srli s3, s3, 1
addi s5, s5, 4
- addi s6, s6, 16 * 2 << (\depth > 8)
+ addi s6, s6, 16 * 2 * (\depth / 8)
bnez s1, 1b
+.if \depth > 8
+ ld s9, 80(sp)
+ ld s8, 72(sp)
+.endif
ld s7, 64(sp)
ld s6, 56(sp)
ld s5, 48(sp)
@@ -484,16 +507,16 @@ func ff_h264_idct_add16_\depth\()_rvv, zve32x
ld s1, 16(sp)
ld ra, 8(sp)
ld s0, 0(sp)
- addi sp, sp, 80
+ addi sp, sp, 96
ret
endfunc
func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
csrwi vxrm, 0
- addi sp, sp, -80
+ addi sp, sp, -96
lla t0, ff_h264_scan8
sd s0, (sp)
- li t1, 32 << (\depth > 8)
+ li t1, 32 * (\depth / 8)
mv s0, sp
sd ra, 8(sp)
sd s1, 16(sp)
@@ -503,9 +526,19 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
sd s5, 48(sp)
sd s6, 56(sp)
sd s7, 64(sp)
+.if \depth > 8
+ sd s8, 72(sp)
+ sd s9, 80(sp)
+ mv s8, a5
+ mv s9, a6
+.endif
vsetivli zero, 16, e8, m1, ta, ma
vle8.v v8, (t0)
+.if \depth == 8
vlse16.v v16, (a2), t1
+.else
+ vlse32.v v16, (a2), t1
+.endif
vluxei8.v v12, (a4), v8
.if \depth == 8
vsetvli zero, zero, e16, m2, ta, ma
@@ -532,18 +565,29 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
mv a1, s6
mv a2, s7
add a0, s4, t2
+.if \depth > 8
+ mv a5, s8
+.endif
beqz t0, 2f # if (nnzc[scan8[i]])
- call .Lidct_add4_\depth\()_rvv
+ jal .Lidct_add4_\depth\()_rvv
j 3f
2:
beqz t1, 3f # if (block[i * 16])
+.if \depth == 8
call ff_h264_idct_dc_add_\depth\()_c
+.else
+ jalr s9
+.endif
3:
srli s3, s3, 1
addi s5, s5, 4
- addi s6, s6, 16 * 2 << (\depth > 8)
+ addi s6, s6, 16 * 2 * (\depth / 8)
bnez s1, 1b
+.if \depth > 8
+ ld s9, 80(sp)
+ ld s8, 72(sp)
+.endif
ld s7, 64(sp)
ld s6, 56(sp)
ld s5, 48(sp)
@@ -553,16 +597,16 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
ld s1, 16(sp)
ld ra, 8(sp)
ld s0, 0(sp)
- addi sp, sp, 80
+ addi sp, sp, 96
ret
endfunc
func ff_h264_idct8_add4_\depth\()_rvv, zve32x
csrwi vxrm, 0
- addi sp, sp, -80
+ addi sp, sp, -96
lla t0, ff_h264_scan8
sd s0, (sp)
- li t1, 4 * 32 << (\depth > 8)
+ li t1, 4 * 32 * (\depth / 8)
mv s0, sp
li t2, 4
sd ra, 8(sp)
@@ -573,9 +617,19 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
sd s5, 48(sp)
sd s6, 56(sp)
sd s7, 64(sp)
+.if \depth > 8
+ sd s8, 72(sp)
+ sd s9, 80(sp)
+ mv s8, a5
+ mv s9, a6
+.endif
vsetivli zero, 4, e8, mf4, ta, ma
vlse8.v v8, (t0), t2
+.if \depth == 8
vlse16.v v16, (a2), t1
+.else
+ vlse32.v v16, (a2), t1
+.endif
vluxei8.v v12, (a4), v8
.if \depth == 8
vsetvli zero, zero, e16, mf2, ta, ma
@@ -604,17 +658,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
mv a1, s6
mv a2, s7
add a0, s4, t2
- beqz t1, 2f # if (nnz == 1 && block[i * 16])
- call ff_h264_idct8_dc_add_\depth\()_c
+.if \depth > 8
+ mv a5, s8
+.endif
+ bnez t1, 2f # if (nnz == 1 && block[i * 16])
+ jal .Lidct8_add_\depth\()_rvv
j 3f
2:
- call .Lidct8_add_\depth\()_rvv
+.if \depth == 8
+ call ff_h264_idct8_dc_add_\depth\()_c
+.else
+ jalr s9
+.endif
3:
srli s3, s3, 1
addi s5, s5, 4 * 4
- addi s6, s6, 4 * 16 * 2 << (\depth > 8)
+ addi s6, s6, 4 * 16 * 2 * (\depth / 8)
bnez s1, 1b
+.if \depth > 8
+ ld s9, 80(sp)
+ ld s8, 72(sp)
+.endif
ld s7, 64(sp)
ld s6, 56(sp)
ld s5, 48(sp)
@@ -624,8 +689,28 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
ld s1, 16(sp)
ld ra, 8(sp)
ld s0, 0(sp)
- addi sp, sp, 80
+ addi sp, sp, 96
ret
endfunc
.endr
+
+.irp depth, 9, 10, 12, 14
+func ff_h264_idct_add16_\depth\()_rvv, zve32x
+ li a5, (1 << \depth) - 1
+ lla a6, ff_h264_idct_dc_add_\depth\()_c
+ j ff_h264_idct_add16_16_rvv
+endfunc
+
+func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
+ li a5, (1 << \depth) - 1
+ lla a6, ff_h264_idct_dc_add_\depth\()_c
+ j ff_h264_idct_add16intra_16_rvv
+endfunc
+
+func ff_h264_idct8_add4_\depth\()_rvv, zve32x
+ li a5, (1 << \depth) - 1
+ lla a6, ff_h264_idct8_dc_add_\depth\()_c
+ j ff_h264_idct8_add4_16_rvv
+endfunc
+.endr
#endif
--
2.45.2
More information about the ffmpeg-devel mailing list