[FFmpeg-devel] [PATCH] lavc/rv34dsp: optimise R-V V idct_dc_add
    Rémi Denis-Courmont 
    remi at remlab.net
       
    Wed May 22 23:28:54 EEST 2024
    
    
  
This removes one stray LI and reworks the vector arithmetic to avoid
changing the vector configuration. On K230, this takes the cycle
count down from 46.5 to 43.5.
---
 libavcodec/riscv/rv34dsp_rvv.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/libavcodec/riscv/rv34dsp_rvv.S b/libavcodec/riscv/rv34dsp_rvv.S
index f1f6345012..e8aff7e570 100644
--- a/libavcodec/riscv/rv34dsp_rvv.S
+++ b/libavcodec/riscv/rv34dsp_rvv.S
@@ -36,16 +36,15 @@ func ff_rv34_idct_dc_add_rvv, zve32x
         vsetivli      zero, 4, e8, mf4, ta, ma
         vlse32.v      v0, (a0), a1
         li            t1, 169
+        li            t2, 128
         mul           t1, t1, a2
-        li            a2, 255
+        vsetivli      zero, 4*4, e8, m1, ta, ma
+        vwsubu.vx     v2, v0, t2
         addi          t1, t1, 512
         srai          t1, t1, 10
-        vsetivli      zero, 4*4, e16, m2, ta, ma
-        vzext.vf2     v2, v0
-        vadd.vx       v2, v2, t1
-        vmax.vx       v2, v2, zero
-        vsetvli       zero, zero, e8, m1, ta, ma
-        vnclipu.wi    v0, v2, 0
+        vwadd.wx      v2, v2, t1
+        vnclip.wi     v0, v2, 0
+        vxor.vx       v0, v0, t2
         vsetivli      zero, 4, e8, mf4, ta, ma
         vsse32.v      v0, (a0), a1
 
-- 
2.45.1
    
    
More information about the ffmpeg-devel
mailing list