[FFmpeg-devel] [PATCH 5/5] lavc/aarch64: new optimization for 8-bit hevc_epel_uni_w_hv

Logan.Lyu Logan.Lyu at myais.com.cn
Thu Jul 13 17:54:27 EEST 2023


Hi, Martin,

Thanks for your comments.

I have now fixed the problematic ldp/stp usages that I found,
and I have updated patch 3 and patch 5. (Although I have attached all 5 patches)
In addition, I previously thought that the full q8-q15 registers had to be
saved according to the calling convention, but I have since confirmed that
only the lower 64 bits (d8-d15) need to be preserved — thank you for reminding me.

Please take a look. If there are some small mistakes, please correct 
them directly. If there are still many problems, please remind me again, 
thank you!


On 2023/7/2 5:28, Martin Storsjö wrote:
> On Sun, 18 Jun 2023, Logan.Lyu wrote:
>
>> Hi, Martin,
>>
>> I modified it according to your comments. Please review again.
>
>> From 47b7f7af634add7680b56a216fff7dbe1f08cd11 Mon Sep 17 00:00:00 2001
>> From: Logan Lyu <Logan.Lyu at myais.com.cn>
>> Date: Sun, 28 May 2023 10:35:43 +0800
>> Subject: [PATCH 5/5] lavc/aarch64: new optimization for 8-bit
>>  hevc_epel_uni_w_hv
>>
>> Signed-off-by: Logan Lyu <Logan.Lyu at myais.com.cn>
>> ---
>>  libavcodec/aarch64/hevcdsp_epel_neon.S    | 694 ++++++++++++++++++++++
>>  libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
>>  2 files changed, 700 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
>> b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> index 8b6f396a0b..355679af29 100644
>> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
>> @@ -717,6 +717,700 @@ function 
>> ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
>>          ret
>>  endfunc
>>
>> +.macro epel_uni_w_hv_start
>> +        mov             x15, x5         //denom
>> +        mov             x16, x6         //wx
>> +        mov             x17, x7         //ox
>> +        add             w15, w15, #6    //shift = denom+6
>> +
>> +
>> +        ldp             x5, x6, [sp]
>> +        ldr             x7, [sp, #16]
>> +
>> +        stp             q12, q13, [sp, #-128]!
>> +        stp             q14, q15, [sp, #32]
>> +        stp             q8, q9,   [sp, #64]
>> +        stp             q10, q11, [sp, #96]
>
> Only need to back up 64 bytes, by backing up d8-d15. Also, the order
> is quite weird here, why not keep them in e.g. linear order?
>
>> +function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
>> +        epel_uni_w_hv_start
>> +        sxtw            x4, w4
>> +
>> +        add             x10, x4, #3
>> +        lsl             x10, x10, #7
>> +        sub             sp, sp, x10     // tmp_array
>> +        stp             xzr, x30, [sp, #-48]!
>
> As mentioned already in the previous review - why do you back up and
> restore xzr here? That's not necessary. Yes, you should keep the stack
> 16 byte aligned, but you can just leave an empty slot, and just do
> "str x30, [sp, #-48]!" here, and vice versa with "ldr" instead of ldp
> when restoring.
>
> The same goes in all functions here.
>
>> +2:
>> +        ldp             q14, q15, [sp, #32]
>> +        ldp             q8, q9,   [sp, #64]
>> +        ldp             q10, q11, [sp, #96]
>> +        ldp             q12, q13, [sp], #128
>
> Only need d8-d15, and weird register order here, and elsewhere.
>
>> +function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
>> +        epel_uni_w_hv_start
>> +        sxtw            x4, w4
>
> FWIW, it's unusual to need an explicit sxtw instruction, but I guess
> if you use it in the form "add x10, x4, #3" it might be needed.
>
>> +function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
>> +        ldp             x15, x16, [sp]
>> +        stp             x0, x30, [sp, #-16]!
>> +        stp             x1, x2, [sp, #-16]!
>> +        stp             x3, x4, [sp, #-16]!
>> +        stp             x5, x6, [sp, #-16]!
>
> Don't do consecutive stack pointer updates like this, but merge it
> into one large stack decrement followed by positive offsets, like in
> all the other cases of stp/ldp.
>
>> +        mov             x17, #16
>> +        stp             x17, x7, [sp, #-16]!
>> +        stp             x15, x16, [sp, #-16]!
>> +        bl X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
>> +        ldp             x15, x16, [sp], #16
>> +        ldp             x17, x7, [sp], #16
>> +        ldp             x5, x6, [sp], #16
>> +        ldp             x3, x4, [sp], #16
>> +        ldp             x1, x2, [sp], #16
>> +        ldr             x0, [sp]
>> +        add             x0, x0, #16
>> +        add             x2, x2, #16
>> +        mov             x17, #16
>> +        stp             x17, xzr, [sp, #-16]!
>> +        stp             x15, x16, [sp, #-16]!
>
> Don't do multiple stack decrements, don't needlessly store xzr here.
>
> The same goes for all the other functions in this patch.
>
> // Martin
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
-------------- next part --------------
From c7959c64da41d2e6a14cbd3afa019fa1792d9767 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Sat, 27 May 2023 09:42:07 +0800
Subject: [PATCH v1 3/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 503 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 509 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0411de9864..0e3bf74953 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -375,3 +375,506 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
 endfunc
 
 #endif
+
+
+// Common setup for all epel_uni_w_v* functions.
+// Register arguments (hevc epel_uni_w prototype):
+//   x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+//   w4 = height, w5 = denom, w6 = wx, w7 = ox, [sp, #8] = my.
+.macro EPEL_UNI_W_V_HEADER
+        ldr             x12, [sp, #8]          // my: vertical filter index
+        movrel          x9, epel_filters
+        add             x9, x9, x12, lsl #2    // 4 bytes per filter entry
+        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b}, [x9] // filter
+        neg             v0.16b, v0.16b         // outer taps are negative; negate them so the
+        neg             v3.16b, v3.16b         // CALC macros can umlsl/umlal on unsigned pixels
+        mov             w10, #-6
+        sub             w10, w10, w5           // w10 = -(denom + 6): right shift done via sqrshl
+        dup             v30.8h, w6             // v30 = wx (weight)
+        dup             v31.4s, w10            // v31 = negative shift amount
+        dup             v29.4s, w7             // v29 = ox (offset)
+        sub             x2, x2, x3             // rewind src one row: the 4-tap filter reads row -1
+.endm
+
+// Vertical 4-tap filter + uni_w scaling for 4 pixels:
+// d0.8b = sat_u8(sat32((filter(s0..s3) * wx) >> (denom+6)) + ox)
+.macro EPEL_UNI_W_V4_CALC d0, s0, s1, s2, s3
+        movi            \d0\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b    // taps were pre-negated in the header,
+        umlal           \d0\().8h, \s1\().8b, v1.8b    // so umlsl restores the negative sign
+        umlal           \d0\().8h, \s2\().8b, v2.8b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b
+        smull           \d0\().4s, \d0\().4h, v30.4h   // * wx
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s   // rounding shift right by denom+6
+        sqadd           \d0\().4s, \d0\().4s, v29.4s   // + ox, saturating
+        sqxtn           \d0\().4h, \d0\().4s
+        sqxtun          \d0\().8b, \d0\().8h           // narrow to unsigned 8-bit, saturating
+.endm
+
+// 4-pixel-wide weighted vertical epel filter. The loop is unrolled 4x,
+// rotating v4-v7 as a sliding window of source rows so that only one new
+// row is loaded per output row.
+function ff_hevc_put_hevc_epel_uni_w_v4_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             s4, [x2]               // preload the first 3 source rows
+        ldr             s5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s6, [x2]
+1:
+        ldr             s7, [x2, x3]
+        subs            w4, w4, #1             // height--
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V4_CALC v16, v4, v5, v6, v7
+        str             s16, [x0]
+        b.eq            2f
+        add             x0, x0, x1
+        ldr             s4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V4_CALC v17, v5, v6, v7, v4
+        str             s17, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             s5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V4_CALC v18, v6, v7, v4, v5
+        str             s18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             s6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V4_CALC v19, v7, v4, v5, v6
+        str             s19, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+// 8-pixel variant of EPEL_UNI_W_V4_CALC; t0/t1 are 32-bit scratch
+// registers holding the widened low/high halves of the product.
+.macro EPEL_UNI_W_V8_CALC d0, s0, s1, s2, s3, t0, t1
+        movi            \d0\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b
+        umlal           \d0\().8h, \s1\().8b, v1.8b
+        umlal           \d0\().8h, \s2\().8b, v2.8b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b
+        smull           \t0\().4s, \d0\().4h, v30.4h   // * wx, low half
+        smull2          \t1\().4s, \d0\().8h, v30.8h   // * wx, high half
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s   // rounding shift right by denom+6
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s   // + ox
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqxtn           \d0\().4h, \t0\().4s
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtun          \d0\().8b, \d0\().8h           // narrow to unsigned 8-bit, saturating
+.endm
+
+// 6-pixel-wide variant: computes 8 pixels per row but stores only 6
+// (4 bytes + 2 bytes). x1 is reduced by 4 because the str post-increment
+// already advances dst by 4 within each row.
+function ff_hevc_put_hevc_epel_uni_w_v6_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        sub             x1, x1, #4             // compensate for the two-part store below
+        ldr             d4, [x2]               // preload the first 3 source rows
+        ldr             d5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d6, [x2]
+1:
+        ldr             d7, [x2, x3]
+        subs            w4, w4, #1             // height--
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21
+        str             s16, [x0], #4          // pixels 0-3
+        st1             {v16.h}[2], [x0], x1   // pixels 4-5
+        b.eq            2f
+        ldr             d4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21
+        str             s17, [x0], #4
+        st1             {v17.h}[2], [x0], x1
+        b.eq            2f
+        ldr             d5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21
+        str             s18, [x0], #4
+        st1             {v18.h}[2], [x0], x1
+        b.eq            2f
+        ldr             d6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21
+        str             s19, [x0], #4
+        st1             {v19.h}[2], [x0], x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+// 8-pixel-wide variant; same 4x-unrolled sliding-window scheme as the
+// 4-pixel function, with d-register loads/stores.
+function ff_hevc_put_hevc_epel_uni_w_v8_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             d4, [x2]               // preload the first 3 source rows
+        ldr             d5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d6, [x2]
+1:
+        ldr             d7, [x2, x3]
+        subs            w4, w4, #1             // height--
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v16, v4, v5, v6, v7, v20, v21
+        str             d16, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v17, v5, v6, v7, v4, v20, v21
+        str             d17, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V8_CALC v18, v6, v7, v4, v5, v20, v21
+        str             d18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             d6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V8_CALC v19, v7, v4, v5, v6, v20, v21
+        str             d19, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+// 12-pixel variant: filters 16 pixels in d0/d1 but only widens and scales
+// the 12 that are stored. \t3 is accepted for symmetry with the 16-pixel
+// macro but is unused here (only three 4s scratch registers are needed).
+.macro EPEL_UNI_W_V12_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b
+        umlsl2          \d1\().8h, \s0\().16b, v0.16b
+        umlal           \d0\().8h, \s1\().8b, v1.8b
+        umlal2          \d1\().8h, \s1\().16b, v1.16b
+        umlal           \d0\().8h, \s2\().8b, v2.8b
+        umlal2          \d1\().8h, \s2\().16b, v2.16b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b
+        umlsl2          \d1\().8h, \s3\().16b, v3.16b
+
+        smull           \t0\().4s, \d0\().4h, v30.4h   // * wx (pixels 0-3)
+        smull2          \t1\().4s, \d0\().8h, v30.8h   // * wx (pixels 4-7)
+        smull           \t2\().4s, \d1\().4h, v30.4h   // * wx (pixels 8-11)
+
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s   // rounding shift right by denom+6
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqrshl          \t2\().4s, \t2\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s   // + ox
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqadd           \t2\().4s, \t2\().4s, v29.4s
+
+        sqxtn           \d0\().4h, \t0\().4s
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtn           \d1\().4h, \t2\().4s
+        sqxtun          \d0\().8b,  \d0\().8h          // narrow to unsigned 8-bit, saturating
+        sqxtun2         \d0\().16b, \d1\().8h
+.endm
+
+// 12-pixel-wide variant: stores 8 bytes + 4 bytes per row; x1 is reduced
+// by 8 because the str post-increment already advances dst within the row.
+function ff_hevc_put_hevc_epel_uni_w_v12_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             q4, [x2]               // preload the first 3 source rows
+        ldr             q5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q6, [x2]
+        sub             x1, x1, #8             // compensate for the two-part store below
+1:
+        ldr             q7, [x2, x3]
+        subs            w4, w4, #1             // height--
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V12_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27
+        str             d16, [x0], #8          // pixels 0-7
+        st1             {v16.s}[2], [x0]       // pixels 8-11
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V12_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27
+        str             d18, [x0], #8
+        st1             {v18.s}[2], [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V12_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27
+        str             d20, [x0], #8
+        st1             {v20.s}[2], [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V12_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27
+        str             d22, [x0], #8
+        st1             {v22.s}[2], [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+// Full 16-pixel variant: d1 is a scratch high-half accumulator that is
+// folded back into d0 by the final sqxtun2, so only d0 holds the result.
+// t0-t3 are 32-bit scratch registers for the four widened quarters.
+.macro EPEL_UNI_W_V16_CALC d0, d1, s0, s1, s2, s3, t0, t1, t2, t3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        umlsl           \d0\().8h, \s0\().8b, v0.8b
+        umlsl2          \d1\().8h, \s0\().16b, v0.16b
+        umlal           \d0\().8h, \s1\().8b, v1.8b
+        umlal2          \d1\().8h, \s1\().16b, v1.16b
+        umlal           \d0\().8h, \s2\().8b, v2.8b
+        umlal2          \d1\().8h, \s2\().16b, v2.16b
+        umlsl           \d0\().8h, \s3\().8b, v3.8b
+        umlsl2          \d1\().8h, \s3\().16b, v3.16b
+
+        smull           \t0\().4s, \d0\().4h, v30.4h   // * wx (pixels 0-3)
+        smull2          \t1\().4s, \d0\().8h, v30.8h   // * wx (pixels 4-7)
+        smull           \t2\().4s, \d1\().4h, v30.4h   // * wx (pixels 8-11)
+        smull2          \t3\().4s, \d1\().8h, v30.8h   // * wx (pixels 12-15)
+
+        sqrshl          \t0\().4s, \t0\().4s, v31.4s   // rounding shift right by denom+6
+        sqrshl          \t1\().4s, \t1\().4s, v31.4s
+        sqrshl          \t2\().4s, \t2\().4s, v31.4s
+        sqrshl          \t3\().4s, \t3\().4s, v31.4s
+        sqadd           \t0\().4s, \t0\().4s, v29.4s   // + ox
+        sqadd           \t1\().4s, \t1\().4s, v29.4s
+        sqadd           \t2\().4s, \t2\().4s, v29.4s
+        sqadd           \t3\().4s, \t3\().4s, v29.4s
+
+        sqxtn           \d0\().4h, \t0\().4s
+        sqxtn2          \d0\().8h, \t1\().4s
+        sqxtn           \d1\().4h, \t2\().4s
+        sqxtn2          \d1\().8h, \t3\().4s
+        sqxtun          \d0\().8b,  \d0\().8h          // narrow to unsigned 8-bit, saturating
+        sqxtun2         \d0\().16b, \d1\().8h
+.endm
+
+
+// 16-pixel-wide variant; same 4x-unrolled sliding-window scheme as the
+// 4-pixel function, with q-register loads/stores.
+function ff_hevc_put_hevc_epel_uni_w_v16_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldr             q4, [x2]               // preload the first 3 source rows
+        ldr             q5, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q6, [x2]
+1:
+        ldr             q7, [x2, x3]
+        subs            w4, w4, #1             // height--
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V16_CALC v16, v17, v4, v5, v6, v7, v24, v25, v26, v27 // fixed: comma was missing after v17
+        str             q16, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q4, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v18, v19, v5, v6, v7, v4, v24, v25, v26, v27
+        str             q18, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q5, [x2, x3]
+        subs            w4, w4, #1
+        add             x2, x2, x3, lsl #1
+        EPEL_UNI_W_V16_CALC v20, v21, v6, v7, v4, v5, v24, v25, v26, v27
+        str             q20, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+        ldr             q6, [x2]
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v22, v23, v7, v4, v5, v6, v24, v25, v26, v27
+        str             q22, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+
+
+// 24-pixel-wide variant: 16 pixels via the V16 macro + 8 via the V8 macro
+// per row. Loads 32 bytes per row (ldp) even though only 24 are used.
+function ff_hevc_put_hevc_epel_uni_w_v24_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldp             q16, q17, [x2]         // preload the first 3 source rows
+        add             x2, x2, x3
+        ldp             q18, q19, [x2]
+        add             x2, x2, x3
+        ldp             q20, q21, [x2]
+        add             x2, x2, x3
+1:
+        ldp             q22, q23, [x2]
+        subs            w4, w4, #1             // height--
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v17, v19, v21, v23, v24, v25
+        str             q4, [x0]               // pixels 0-15
+        str             d6, [x0, #16]          // pixels 16-23
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q16, q17, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v19, v21, v23, v17, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q18, q19, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18,  v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v21, v23, v17, v19, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q20, q21, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27
+        EPEL_UNI_W_V8_CALC  v6, v23, v17, v19, v21, v24, v25
+        str             q4, [x0]
+        str             d6, [x0, #16]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+// 32-pixel-wide variant: two V16 computations per row.
+function ff_hevc_put_hevc_epel_uni_w_v32_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+
+        ldp             q16, q17, [x2]         // preload the first 3 source rows
+        add             x2, x2, x3
+        ldp             q18, q19, [x2]
+        add             x2, x2, x3
+        ldp             q20, q21, [x2]
+        add             x2, x2, x3
+1:
+        ldp             q22, q23, [x2]
+        subs            w4, w4, #1             // height--
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v16, v18, v20, v22, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v17, v19, v21, v23, v24, v25, v26, v27
+        str             q4, [x0]               // pixels 0-15
+        str             q6, [x0, #16]          // pixels 16-31
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q16, q17, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v18, v20, v22, v16, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v19, v21, v23, v17, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q18, q19, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v20, v22, v16, v18,  v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v21, v23, v17, v19, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.eq            2f
+        ldp             q20, q21, [x2]
+        subs            w4, w4, #1
+        add             x2, x2, x3
+        EPEL_UNI_W_V16_CALC v4, v5, v22, v16, v18, v20, v24, v25, v26, v27
+        EPEL_UNI_W_V16_CALC v6, v7, v23, v17, v19, v21, v24, v25, v26, v27
+        str             q4, [x0]
+        str             q6, [x0, #16]
+        add             x0, x0, x1
+        b.hi            1b
+2:
+        ret
+endfunc
+
+// 48-pixel-wide variant: three V16 computations per row. v8-v11 are used
+// as scratch, so the callee-saved d8-d11 (low 64 bits of v8-v11, per
+// AAPCS64) are saved/restored around the body.
+function ff_hevc_put_hevc_epel_uni_w_v48_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+        stp             d8, d9, [sp, #-32]!    // save callee-saved d8-d11
+        stp             d10, d11, [sp, #16]
+
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3  // preload 3 rows
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+1:
+        ld1             {v25.16b, v26.16b, v27.16b}, [x2], x3
+        subs            w4, w4, #1             // height--
+        EPEL_UNI_W_V16_CALC v4, v6, v16, v19, v22, v25, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v17, v20, v23, v26, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v18, v21, v24, v27, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v19, v22, v25, v16, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v20, v23, v26, v17, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v21, v24, v27, v18, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v19.16b, v20.16b, v21.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6,  v22, v25, v16, v19, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7,  v23, v26, v17, v20, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7,  v24, v27, v18, v21, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v22.16b, v23.16b, v24.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6,  v25, v16, v19, v22, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7,  v26, v17, v20, v23, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7,  v27, v18, v21, v24, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b}, [x0], x1
+        b.hi            1b
+2:
+        ldp             d10, d11, [sp, #16]    // restore callee-saved d8-d11
+        ldp             d8, d9, [sp], #32
+        ret
+endfunc
+
+// 64-pixel-wide variant: four V16 computations per row. v8-v15 are used
+// (v8-v11 as scratch, v12-v15 as a row buffer), so the callee-saved
+// d8-d15 (low 64 bits, per AAPCS64) are saved/restored around the body.
+function ff_hevc_put_hevc_epel_uni_w_v64_8_neon, export=1
+        EPEL_UNI_W_V_HEADER
+        stp             d8, d9, [sp, #-64]!    // save callee-saved d8-d15
+        stp             d10, d11, [sp, #16]
+        stp             d12, d13, [sp, #32]
+        stp             d14, d15, [sp, #48]
+
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3  // preload 3 rows
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+1:
+        ld1             {v12.16b, v13.16b, v14.16b, v15.16b}, [x2], x3
+        subs            w4, w4, #1             // height--
+        EPEL_UNI_W_V16_CALC v4, v6, v16, v20, v24, v12, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v17, v21, v25, v13, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v18, v22, v26, v14, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7,v28, v19, v23, v27, v15, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v20, v24, v12, v16, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v21, v25, v13, v17, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v22, v26, v14, v18, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7,v28, v23, v27, v15, v19, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v24, v12, v16, v20, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v25, v13, v17, v21, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v26, v14, v18, v22, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7,v28, v27, v15, v19, v23, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.eq            2f
+        ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], x3
+        subs            w4, w4, #1
+        EPEL_UNI_W_V16_CALC v4, v6, v12, v16, v20, v24, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v5, v7, v13, v17, v21, v25, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v6, v7, v14, v18, v22, v26, v8, v9, v10, v11
+        EPEL_UNI_W_V16_CALC v7,v28, v15, v19, v23, v27, v8, v9, v10, v11
+        st1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0], x1
+        b.hi            1b
+2:
+        ldp             d10, d11, [sp, #16]    // restore callee-saved d8-d15
+        ldp             d12, d13, [sp, #32]
+        ldp             d14, d15, [sp, #48]
+        ldp             d8, d9, [sp], #64
+        ret
+endfunc
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 8af0a2b4b9..4a260e1d9a 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -161,6 +161,11 @@ NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
 NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -274,6 +279,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 0, epel_uni_w_v,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
-- 
2.38.0.windows.1

-------------- next part --------------
From f07eee2c6cdeb0260c00a1ec49a0dddb6b9df9db Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Sun, 28 May 2023 10:30:28 +0800
Subject: [PATCH v1 4/5] lavc/aarch64: new optimization for 8-bit hevc_epel_h

---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 343 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 348 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0e3bf74953..8942a41cbf 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -33,6 +33,349 @@ const epel_filters, align=4
 endconst
 
 #if HAVE_I8MM
+
+// Shared prologue for the put_hevc_epel_h* functions below.
+// Register arguments (see the epel_h prototype): x0 = int16_t *dst,
+// x1 = src, x2 = srcstride, w3 = height, x4 = mx (filter fraction).
+.macro EPEL_H_HEADER
+        movrel          x5, epel_filters
+        add             x5, x5, x4, lsl #2      // x5 = &epel_filters[mx] (4 int8 taps per entry)
+        ld1r            {v30.4s}, [x5]          // replicate the 4 taps into every 32-bit lane (for usdot)
+        sub             x1, x1, #1              // back src up to the first filter tap
+        mov             x10, #(MAX_PB_SIZE * 2) // dst row stride in bytes
+.endm
+
+// 4-wide horizontal EPEL filter, 8-bit input, 16-bit intermediate output.
+// Uses the i8mm usdot instruction: unsigned source bytes dotted with the
+// signed 4-tap filter, one 4-byte window per 32-bit lane.
+function ff_hevc_put_hevc_epel_h4_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.8b}, [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v5.8b, v4.8b, v4.8b, #1
+        ext             v6.8b, v4.8b, v4.8b, #2
+        ext             v7.8b, v4.8b, v4.8b, #3
+        trn1            v4.2s, v4.2s, v5.2s     // {window 0, window 1}
+        trn1            v6.2s, v6.2s, v7.2s     // {window 2, window 3}
+        trn1            v4.2d, v4.2d, v6.2d     // one 4-byte window per 32-bit lane
+        movi            v16.2d, #0
+        usdot           v16.4s, v4.16b, v30.16b // 4 filtered pixels as 32-bit sums
+        xtn             v16.4h, v16.4s          // narrow to int16
+        st1             {v16.4h}, [x0], x10
+        b.ne            1b
+        ret
+endfunc
+
+
+// 6-wide horizontal EPEL filter: pixels 0-3 via a full 4-lane usdot,
+// pixels 4-5 via a 2-lane usdot; stored as one d + one s register.
+function ff_hevc_put_hevc_epel_h6_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b},  [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.8b, v4.8b, v4.8b, #2
+        ext             v7.8b, v4.8b, v4.8b, #3
+        trn1            v16.2s, v4.2s, v5.2s    // windows 0, 1
+        trn2            v17.2s, v4.2s, v5.2s    // windows 4, 5
+        trn1            v6.2s, v6.2s, v7.2s     // windows 2, 3
+        trn1            v16.2d, v16.2d, v6.2d   // windows 0..3, one per 32-bit lane
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v18.4s, v16.16b, v30.16b
+        usdot           v19.2s, v17.8b, v30.8b  // 64-bit form: only the 2 extra pixels
+        xtn             v18.4h, v18.4s
+        xtn             v19.4h, v19.4s
+        str             d18, [x0]               // pixels 0..3
+        str             s19, [x0, #8]           // pixels 4..5
+        add             x0, x0, x10
+        b.ne            1b
+        ret
+endfunc
+
+// 8-wide horizontal EPEL filter. The 32-bit lanes of the raw load are
+// already the even-offset windows (0,4), so a zip against the +2/+3
+// shifted copies groups even vs odd windows; st2 re-interleaves the
+// narrowed results back into pixel order.
+function ff_hevc_put_hevc_epel_h8_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        zip1            v20.4s, v4.4s, v6.4s    // even windows 0,2,4,6
+        zip1            v21.4s, v5.4s, v7.4s    // odd windows 1,3,5,7
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b
+        usdot           v17.4s, v21.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn             v17.4h, v17.4s
+        st2             {v16.4h, v17.4h}, [x0], x10     // interleave -> pixels 0..7 in order
+        b.ne            1b
+        ret
+endfunc
+
+// 12-wide horizontal EPEL filter. The trn shuffles rearrange the four
+// shifted copies so that v4/v5/v6 hold windows 0..3, 4..7 and 8..11,
+// letting the narrowed results be stored sequentially as q + d.
+function ff_hevc_put_hevc_epel_h12_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v4.16b}, [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v4.16b, v4.16b, #1
+        ext             v6.16b, v4.16b, v4.16b, #2
+        ext             v7.16b, v4.16b, v4.16b, #3
+        trn1            v20.2d, v4.2d, v6.2d    // windows {0,4,2,6}
+        trn2            v22.2d, v4.2d, v6.2d    // windows {8,12,10,14}
+        trn1            v21.2d, v5.2d, v7.2d    // windows {1,5,3,7}
+        trn2            v23.2d, v5.2d, v7.2d    // windows {9,13,11,15}
+        trn1            v4.4s, v20.4s, v21.4s   // windows 0..3
+        trn2            v5.4s, v20.4s, v21.4s   // windows 4..7
+        trn1            v6.4s, v22.4s, v23.4s   // windows 8..11
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v4.16b, v30.16b
+        usdot           v17.4s, v5.16b, v30.16b
+        usdot           v18.4s, v6.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v17.4s
+        xtn             v18.4h, v18.4s
+        str             q16, [x0]               // pixels 0..7
+        str             d18, [x0, #16]          // pixels 8..11
+        add             x0, x0, x10
+        b.ne            1b
+        ret
+endfunc
+
+// 16-wide horizontal EPEL filter: same even/odd zip scheme as the
+// 8-wide version, doubled up, with st2 restoring pixel order.
+function ff_hevc_put_hevc_epel_h16_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        zip1            v20.4s, v0.4s, v6.4s    // even windows 0,2,4,6
+        zip2            v22.4s, v0.4s, v6.4s    // even windows 8,10,12,14
+        zip1            v21.4s, v5.4s, v7.4s    // odd windows 1,3,5,7
+        zip2            v23.4s, v5.4s, v7.4s    // odd windows 9,11,13,15
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v20.16b, v30.16b
+        usdot           v17.4s, v21.16b, v30.16b
+        usdot           v18.4s, v22.16b, v30.16b
+        usdot           v19.4s, v23.16b, v30.16b
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v18.4s          // even results 0,2,..,14
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v19.4s          // odd results 1,3,..,15
+        st2             {v16.8h, v17.8h}, [x0], x10     // interleave -> pixels 0..15
+        b.ne            1b
+        ret
+endfunc
+
+// 24-wide horizontal EPEL filter: 8 usdots over the raw and shifted
+// copies (each yielding every-4th window), with zips afterwards to put
+// the 24 results back into pixel order (16 via st2, last 8 via st1).
+function ff_hevc_put_hevc_epel_h24_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b}, [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v1.16b, #1
+        ext             v27.16b, v1.16b, v1.16b, #2
+        ext             v28.16b, v1.16b, v1.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b         // windows 0,4,8,12
+        usdot           v17.4s, v5.16b, v30.16b         // windows 1,5,9,13
+        usdot           v18.4s, v6.16b, v30.16b         // windows 2,6,10,14
+        usdot           v19.4s, v7.16b, v30.16b         // windows 3,7,11,15
+        usdot           v20.4s, v1.16b, v30.16b         // windows 16,20,..
+        usdot           v21.4s, v26.16b, v30.16b        // windows 17,21,..
+        usdot           v22.4s, v27.16b, v30.16b        // windows 18,22,..
+        usdot           v23.4s, v28.16b, v30.16b        // windows 19,23,..
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s
+        zip1            v20.8h, v16.8h, v18.8h  // pixels 0,2,..,14 in even/odd split
+        zip1            v21.8h, v17.8h, v19.8h  // pixels 1,3,..,15
+        zip2            v22.8h, v16.8h, v18.8h
+        zip2            v23.8h, v17.8h, v19.8h
+        zip1            v22.8h, v22.8h, v23.8h  // pixels 16..23 in order
+        add             x7, x0, #32             // tail address computed before x0 is advanced
+        st2             {v20.8h, v21.8h}, [x0], x10     // pixels 0..15
+        st1             {v22.8h}, [x7]                  // pixels 16..23
+        b.ne            1b
+        ret
+endfunc
+
+// 32-wide horizontal EPEL filter. Each of v16..v19 ends up holding every
+// 4th result (offsets 0/1/2/3 mod 4), so a single st4 interleaves all
+// 32 pixels back into sequential order.
+function ff_hevc_put_hevc_epel_h32_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b}, [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v5.16b, v0.16b, v1.16b, #1
+        ext             v6.16b, v0.16b, v1.16b, #2
+        ext             v7.16b, v0.16b, v1.16b, #3
+        ext             v26.16b, v1.16b, v2.16b, #1
+        ext             v27.16b, v1.16b, v2.16b, #2
+        ext             v28.16b, v1.16b, v2.16b, #3
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v16.4s, v0.16b, v30.16b         // windows 0,4,8,12
+        usdot           v17.4s, v5.16b, v30.16b         // windows 1,5,9,13
+        usdot           v18.4s, v6.16b, v30.16b         // windows 2,6,10,14
+        usdot           v19.4s, v7.16b, v30.16b         // windows 3,7,11,15
+        usdot           v20.4s, v1.16b, v30.16b         // windows 16,20,24,28
+        usdot           v21.4s, v26.16b, v30.16b        // windows 17,21,25,29
+        usdot           v22.4s, v27.16b, v30.16b        // windows 18,22,26,30
+        usdot           v23.4s, v28.16b, v30.16b        // windows 19,23,27,31
+        xtn             v16.4h, v16.4s
+        xtn2            v16.8h, v20.4s
+        xtn             v17.4h, v17.4s
+        xtn2            v17.8h, v21.4s
+        xtn             v18.4h, v18.4s
+        xtn2            v18.8h, v22.4s
+        xtn             v19.4h, v19.4s
+        xtn2            v19.8h, v23.4s
+        st4             {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x10     // interleave -> pixels 0..31
+        b.ne            1b
+        ret
+endfunc
+
+// 48-wide horizontal EPEL filter: pixels 0..31 handled like the 32-wide
+// version (st4 interleave), pixels 32..47 like the 16-wide version
+// (zip + st2 interleave).
+// Fixes vs the previous revision: the tail address x7 must be computed
+// BEFORE st4 post-increments x0 by a full row (x10 = MAX_PB_SIZE*2), as
+// the h24 function does, and the tail needs the same zip1/zip2
+// pre-shuffle as the h16 function so st2 emits pixels 32..47 in order.
+function ff_hevc_put_hevc_epel_h48_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
+        subs            w3, w3, #1   // height
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b         // windows 0,4,8,12
+        usdot           v21.4s, v4.16b, v30.16b         // windows 1,5,9,13
+        usdot           v22.4s, v5.16b, v30.16b         // windows 2,6,10,14
+        usdot           v23.4s, v6.16b, v30.16b         // windows 3,7,11,15
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b         // windows 16,20,24,28
+        usdot           v25.4s, v16.16b, v30.16b        // windows 17,21,25,29
+        usdot           v26.4s, v17.16b, v30.16b        // windows 18,22,26,30
+        usdot           v27.4s, v18.16b, v30.16b        // windows 19,23,27,31
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        add             x7, x0, #64             // address of pixels 32..47 in THIS row
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x10     // pixels 0..31
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        zip1            v16.4s, v2.4s, v5.4s    // even windows 32,34,36,38
+        zip2            v18.4s, v2.4s, v5.4s    // even windows 40,42,44,46
+        zip1            v17.4s, v4.4s, v6.4s    // odd windows 33,35,37,39
+        zip2            v19.4s, v4.4s, v6.4s    // odd windows 41,43,45,47
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v16.16b, v30.16b
+        usdot           v21.4s, v17.16b, v30.16b
+        usdot           v22.4s, v18.16b, v30.16b
+        usdot           v23.4s, v19.16b, v30.16b
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v22.4s          // even results 32,34,..,46
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v23.4s          // odd results 33,35,..,47
+        st2             {v20.8h, v21.8h}, [x7]  // interleave -> pixels 32..47 in order
+        b.ne            1b
+        ret
+endfunc
+
+// 64-wide horizontal EPEL filter: two 32-pixel halves, each stored with
+// st4 interleave as in the 32-wide version. The row is loaded as 64
+// bytes plus an 8-byte overhang (for the last windows), so srcstride is
+// reduced by 64 up front to compensate for the split post-increments.
+function ff_hevc_put_hevc_epel_h64_8_neon_i8mm, export=1
+        EPEL_H_HEADER
+        sub             x2, x2, #64     // stride adjusted for the ld1 ..., #64 below
+1:      ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], #64
+        subs            w3, w3, #1   // height
+        ext             v4.16b, v0.16b, v1.16b, #1
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v0.16b, v30.16b
+        usdot           v21.4s, v4.16b, v30.16b
+        usdot           v22.4s, v5.16b, v30.16b
+        usdot           v23.4s, v6.16b, v30.16b
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v1.16b, v30.16b
+        usdot           v25.4s, v16.16b, v30.16b
+        usdot           v26.4s, v17.16b, v30.16b
+        usdot           v27.4s, v18.16b, v30.16b
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64     // pixels 0..31
+        ld1             {v7.8b}, [x1], x2       // overhang bytes 64..71, then advance to next row
+        ext             v4.16b, v2.16b, v3.16b, #1
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        ext             v16.16b, v3.16b, v7.16b, #1
+        ext             v17.16b, v3.16b, v7.16b, #2
+        ext             v18.16b, v3.16b, v7.16b, #3
+        movi            v20.2d, #0
+        movi            v21.2d, #0
+        movi            v22.2d, #0
+        movi            v23.2d, #0
+        usdot           v20.4s, v2.16b, v30.16b
+        usdot           v21.4s, v4.16b, v30.16b
+        usdot           v22.4s, v5.16b, v30.16b
+        usdot           v23.4s, v6.16b, v30.16b
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        movi            v26.2d, #0
+        movi            v27.2d, #0
+        usdot           v24.4s, v3.16b, v30.16b
+        usdot           v25.4s, v16.16b, v30.16b
+        usdot           v26.4s, v17.16b, v30.16b
+        usdot           v27.4s, v18.16b, v30.16b
+        xtn             v20.4h, v20.4s
+        xtn2            v20.8h, v24.4s
+        xtn             v21.4h, v21.4s
+        xtn2            v21.8h, v25.4s
+        xtn             v22.4h, v22.4s
+        xtn2            v22.8h, v26.4s
+        xtn             v23.4h, v23.4s
+        xtn2            v23.8h, v27.4s
+        st4             {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64     // pixels 32..63; x0 advanced 128/row total
+        b.ne            1b
+        ret
+endfunc
+
 .macro EPEL_UNI_W_H_HEADER
         ldr             x12, [sp]
         sub             x2, x2, #1
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 4a260e1d9a..b448d755b9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -171,6 +171,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -283,6 +287,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
+            NEON8_FNASSIGN(c->put_hevc_epel, 0, 1, epel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
-- 
2.38.0.windows.1

-------------- next part --------------
From 7c86c8aef2b718bf8a163614764943aa2a62df0c Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Sun, 28 May 2023 10:35:43 +0800
Subject: [PATCH v1 5/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_hv

---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 668 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   6 +
 2 files changed, 674 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 8942a41cbf..93fb69cc24 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -717,6 +717,674 @@ function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+// Common prologue for the epel_uni_w_hv* functions.
+// Incoming args: x0 = dst, x1 = dststride, x2 = src, x3 = srcstride,
+// w4 = height, x5 = denom, x6 = wx, x7 = ox; mx/my/width on the stack.
+// Reloads x5/x6/x7 with mx/my/width and materialises the weighting
+// constants: v13 = wx, v14 = ox, v15 = 1 << (shift-1) (rounding offset),
+// v12 = -shift (sshl with a negative count = arithmetic right shift).
+// Saves d8-d15: per AAPCS64 only the low 64 bits of v8-v15 are
+// callee-saved, hence d-registers rather than q-registers.
+.macro epel_uni_w_hv_start
+        mov             x15, x5         //denom
+        mov             x16, x6         //wx
+        mov             x17, x7         //ox
+        add             w15, w15, #6    //shift = denom+6
+
+
+        ldp             x5, x6, [sp]    // x5 = mx, x6 = my
+        ldr             x7, [sp, #16]   // x7 = width
+
+        stp             d14, d15, [sp, #-64]!
+        stp             d8, d9, [sp, #16]
+        stp             d10, d11, [sp, #32]
+        stp             d12, d13, [sp, #48]
+
+        dup             v13.8h, w16     //wx
+        dup             v14.4s, w17     //ox
+
+        mov             w17, #1
+        lsl             w17, w17, w15
+        lsr             w17, w17, #1    // offset = 1 << (shift - 1)
+        dup             v15.4s, w17
+
+        neg             w15, w15        // -shift
+        dup             v12.4s, w15     //shift
+.endm
+
+// Apply the uni_w weighting to the 8 filtered samples in v4:
+// v4 = sat16(((v4 * wx + offset) >> shift) + ox)
+// using the constants set up by epel_uni_w_hv_start (v13/v15/v12/v14).
+.macro epel_uni_w_hv_end
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        sshl            v28.4s, v28.4s, v12.4s  // >> shift (negative shift count)
+        sshl            v29.4s, v29.4s, v12.4s
+        add             v28.4s, v28.4s, v14.4s  // + ox
+        add             v29.4s, v29.4s, v14.4s
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+.endm
+
+// As epel_uni_w_hv_end, but weighting 16 samples held in v4 and v5.
+.macro epel_uni_w_hv_end2
+        smull           v28.4s, v4.4h, v13.4h
+        smull2          v29.4s, v4.8h, v13.8h
+        smull           v30.4s, v5.4h, v13.4h
+        smull2          v31.4s, v5.8h, v13.8h
+        add             v28.4s, v28.4s, v15.4s  // + offset
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v28.4s, v28.4s, v12.4s  // >> shift
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+
+        add             v28.4s, v28.4s, v14.4s  // + ox
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v28.4s
+        sqxtn2          v4.8h, v29.4s
+        sqxtn           v5.4h, v30.4s
+        sqxtn2          v5.8h, v31.4s
+.endm
+
+// As epel_uni_w_hv_end, but weighting 24 samples held in v4, v5 and v6.
+.macro epel_uni_w_hv_end3
+        smull           v1.4s,  v4.4h, v13.4h
+        smull2          v2.4s,  v4.8h, v13.8h
+        smull           v28.4s, v5.4h, v13.4h
+        smull2          v29.4s, v5.8h, v13.8h
+        smull           v30.4s, v6.4h, v13.4h
+        smull2          v31.4s, v6.8h, v13.8h
+        add             v1.4s, v1.4s, v15.4s    // + offset
+        add             v2.4s, v2.4s, v15.4s
+        add             v28.4s, v28.4s, v15.4s
+        add             v29.4s, v29.4s, v15.4s
+        add             v30.4s, v30.4s, v15.4s
+        add             v31.4s, v31.4s, v15.4s
+
+        sshl            v1.4s, v1.4s, v12.4s    // >> shift
+        sshl            v2.4s, v2.4s, v12.4s
+        sshl            v28.4s, v28.4s, v12.4s
+        sshl            v29.4s, v29.4s, v12.4s
+        sshl            v30.4s, v30.4s, v12.4s
+        sshl            v31.4s, v31.4s, v12.4s
+        add             v1.4s, v1.4s, v14.4s    // + ox
+        add             v2.4s, v2.4s, v14.4s
+        add             v28.4s, v28.4s, v14.4s
+        add             v29.4s, v29.4s, v14.4s
+        add             v30.4s, v30.4s, v14.4s
+        add             v31.4s, v31.4s, v14.4s
+
+        sqxtn           v4.4h, v1.4s
+        sqxtn2          v4.8h, v2.4s
+        sqxtn           v5.4h, v28.4s
+        sqxtn2          v5.8h, v29.4s
+        sqxtn           v6.4h, v30.4s
+        sqxtn2          v6.8h, v31.4s
+.endm
+
+// Vertical 4-tap filter over the low halves of four 16-bit rows, taps
+// in v0.h[0..3] (set by load_epel_filterh):
+// \dst.4h = sat16((src0*f0 + src1*f1 + src2*f2 + src3*f3) >> 6)
+.macro calc_epelh dst, src0, src1, src2, src3
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        sqshrn          \dst\().4h, \dst\().4s, #6
+.endm
+
+// As calc_epelh, for the high halves of the source rows; \tmp is a
+// scratch register and the narrowed result lands in the top half of \dst.
+.macro calc_epelh2 dst, tmp, src0, src1, src2, src3
+        smull2          \tmp\().4s, \src0\().8h, v0.h[0]
+        smlal2          \tmp\().4s, \src1\().8h, v0.h[1]
+        smlal2          \tmp\().4s, \src2\().8h, v0.h[2]
+        smlal2          \tmp\().4s, \src3\().8h, v0.h[3]
+        sqshrn2         \dst\().8h, \tmp\().4s, #6
+.endm
+
+// Load the 4-tap epel filter for fraction index \freg and sign-extend
+// the int8 taps to int16 in v0 (clobbers \xreg).
+.macro load_epel_filterh freg, xreg
+        movrel          \xreg, epel_filters
+        add             \xreg, \xreg, \freg, lsl #2
+        ld1             {v0.8b}, [\xreg]
+        sxtl            v0.8h, v0.8b
+.endm
+
+// 4-wide uni_w hv filter, two passes: the horizontal pass is delegated
+// to ff_hevc_put_hevc_epel_h4_8_neon_i8mm, writing height+3 rows into a
+// temporary array on the stack; the vertical 4-tap filter plus the
+// uni_w weighting is then applied here, 4x unrolled with v16-v19
+// rotating as the sliding row window.
+function ff_hevc_put_hevc_epel_uni_w_hv4_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4          // height arrives as int in w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7    // (height + 3) * MAX_PB_SIZE * 2 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]       // save height, my across the call
+        stp             x0, x1, [sp, #32]       // save dst, dststride
+        add             x0, sp, #48     // dst of h pass = tmp_array
+        sub             x1, x2, x3      // start one row above for the 4-tap filter
+        mov             x2, x3
+        add             x3, x4, #3      // height + 3 rows
+        mov             x4, x5          // mx
+        bl              X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at tmp_array
+        load_epel_filterh x6, x5        // v0 = vertical taps for my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.4h}, [sp], x10     // prime the first 3 rows; loads below
+        ld1             {v17.4h}, [sp], x10     // consume the tmp array and thereby
+        ld1             {v18.4h}, [sp], x10     // walk sp back to the saved d-regs
+1:      ld1             {v19.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19  // vertical 4-tap
+        epel_uni_w_hv_end                       // wx/offset/shift/ox weighting
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v16.4h}, [sp], x10     // rotate the row window
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v17.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.eq            2f
+
+        ld1             {v18.4h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        str             s4, [x0]
+        add             x0, x0, x1
+        b.ne            1b
+2:      // height+3 rows consumed: sp is back at the saved d8-d15 block
+        ldp             d8, d9, [sp, #16]
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+// 6-wide uni_w hv filter: same two-pass scheme as the 4-wide function
+// above, using the h6 horizontal pass. 8 columns are filtered; 6 bytes
+// per row are stored (a 4-byte plus a 2-byte store, hence dststride-4).
+function ff_hevc_put_hevc_epel_uni_w_hv6_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4          // height arrives as int in w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7    // (height + 3) * MAX_PB_SIZE * 2 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]       // save height, my
+        stp             x0, x1, [sp, #32]       // save dst, dststride
+        add             x0, sp, #48
+        sub             x1, x2, x3      // one row above for the 4-tap filter
+        mov             x2, x3
+        add             x3, x4, #3      // height + 3 rows
+        mov             x4, x5          // mx
+        bl              X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at tmp_array
+        load_epel_filterh x6, x5        // v0 = vertical taps for my
+        sub             x1, x1, #4      // dststride minus the 4 bytes stored first
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10     // prime the first 3 rows
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4     // bytes 0..3
+        st1             {v4.h}[2], [x0], x1     // bytes 4..5
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10     // rotate the row window
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.s}[0], [x0], #4
+        st1             {v4.h}[2], [x0], x1
+        b.ne            1b
+2:      // tmp array consumed: sp is back at the saved d8-d15 block
+        ldp             d8, d9, [sp, #16]
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+// 8-wide uni_w hv filter: same two-pass scheme as the 4-wide function
+// above, using the h8 horizontal pass and full 8-lane vertical filtering.
+function ff_hevc_put_hevc_epel_uni_w_hv8_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4          // height arrives as int in w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7    // (height + 3) * MAX_PB_SIZE * 2 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]       // save height, my
+        stp             x0, x1, [sp, #32]       // save dst, dststride
+        add             x0, sp, #48
+        sub             x1, x2, x3      // one row above for the 4-tap filter
+        mov             x2, x3
+        add             x3, x4, #3      // height + 3 rows
+        mov             x4, x5          // mx
+        bl              X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at tmp_array
+        load_epel_filterh x6, x5        // v0 = vertical taps for my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h}, [sp], x10     // prime the first 3 rows
+        ld1             {v17.8h}, [sp], x10
+        ld1             {v18.8h}, [sp], x10
+1:      ld1             {v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v17, v18, v19
+        calc_epelh2     v4, v5, v16, v17, v18, v19
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h}, [sp], x10     // rotate the row window
+        subs            x4, x4, #1
+        calc_epelh      v4, v17, v18, v19, v16
+        calc_epelh2     v4, v5, v17, v18, v19, v16
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v19, v16, v17
+        calc_epelh2     v4, v5, v18, v19, v16, v17
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v16, v17, v18
+        calc_epelh2     v4, v5, v19, v16, v17, v18
+        epel_uni_w_hv_end
+        sqxtun          v4.8b, v4.8h
+        st1             {v4.8b}, [x0], x1
+        b.ne            1b
+2:      // tmp array consumed: sp is back at the saved d8-d15 block
+        ldp             d8, d9, [sp, #16]
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+// 12-wide uni_w hv filter: same two-pass scheme as the 4-wide function
+// above, using the h12 horizontal pass. Rows are kept as register pairs
+// (v16/v17 .. v22/v23); 16 columns are filtered, 12 bytes stored per
+// row (an 8-byte plus a 4-byte store, hence dststride-8).
+function ff_hevc_put_hevc_epel_uni_w_hv12_8_neon_i8mm, export=1
+        epel_uni_w_hv_start
+        sxtw            x4, w4          // height arrives as int in w4
+
+        add             x10, x4, #3
+        lsl             x10, x10, #7    // (height + 3) * MAX_PB_SIZE * 2 bytes
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!
+        stp             x4, x6, [sp, #16]       // save height, my
+        stp             x0, x1, [sp, #32]       // save dst, dststride
+        add             x0, sp, #48
+        sub             x1, x2, x3      // one row above for the 4-tap filter
+        mov             x2, x3
+        add             x3, x4, #3      // height + 3 rows
+        mov             x4, x5          // mx
+        bl              X(ff_hevc_put_hevc_epel_h12_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48  // sp now points at tmp_array
+        load_epel_filterh x6, x5        // v0 = vertical taps for my
+        sub             x1, x1, #8      // dststride minus the 8 bytes stored first
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10     // prime the first 3 rows
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v18, v20, v22  // columns 0..7
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23  // columns 8..11
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8       // bytes 0..7
+        st1             {v4.s}[2], [x0], x1     // bytes 8..11
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10     // rotate the row-pair window
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.8b}, [x0], #8
+        st1             {v4.s}[2], [x0], x1
+        b.ne            1b
+2:      // tmp array consumed: sp is back at the saved d8-d15 block
+        ldp             d8, d9, [sp, #16]
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm, export=1
+        epel_uni_w_hv_start             // x15/x16/x17 = denom/wx/ox; reloads mx, my, width from caller stack (presumably also saves d8-d15, restored at 2: — confirm against macro)
+        sxtw            x4, w4          // x4 = height, sign-extended for address math
+
+        add             x10, x4, #3     // height + 3: a 4-tap vertical filter needs 3 extra rows
+        lsl             x10, x10, #7    // * 128 = MAX_PB_SIZE * 2 bytes per int16 row
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!        // save lr + args live across the horizontal-pass call
+        stp             x4, x6, [sp, #16]       // height, my
+        stp             x0, x1, [sp, #32]       // dst, dststride
+        add             x0, sp, #48     // dst = tmp_array
+        sub             x1, x2, x3      // src - srcstride: start one row above the block
+        mov             x2, x3
+        add             x3, x4, #3      // horizontal pass over height + 3 rows
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h16_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48
+        load_epel_filterh x6, x5        // vertical filter taps selected by my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h}, [sp], x10     // prime the first 3 rows; sp walks through tmp_array
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        ld1             {v20.8h, v21.8h}, [sp], x10
+1:      ld1             {v22.8h, v23.8h}, [sp], x10     // 4x unrolled vertical loop, rotating v16-v23
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v18, v20, v22
+        calc_epelh2     v4, v5, v16, v18, v20, v22
+        calc_epelh      v5, v17, v19, v21, v23
+        calc_epelh2     v5, v6, v17, v19, v21, v23
+        epel_uni_w_hv_end2              // apply wx, shift by denom+6, add ox
+        sqxtun          v4.8b, v4.8h    // saturate-narrow to unsigned 8-bit pixels
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v18, v20, v22, v16
+        calc_epelh2     v4, v5, v18, v20, v22, v16
+        calc_epelh      v5, v19, v21, v23, v17
+        calc_epelh2     v5, v6, v19, v21, v23, v17
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v18.8h, v19.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v20, v22, v16, v18
+        calc_epelh2     v4, v5, v20, v22, v16, v18
+        calc_epelh      v5, v21, v23, v17, v19
+        calc_epelh2     v5, v6, v21, v23, v17, v19
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v16, v18, v20
+        calc_epelh2     v4, v5, v22, v16, v18, v20
+        calc_epelh      v5, v23, v17, v19, v21
+        calc_epelh2     v5, v6, v23, v17, v19, v21
+        epel_uni_w_hv_end2
+        sqxtun          v4.8b, v4.8h
+        sqxtun2         v4.16b, v5.8h
+        st1             {v4.16b}, [x0], x1
+        b.ne            1b
+2:                                      // (height + 3) rows consumed, so sp is back at the save area
+        ldp             d8, d9, [sp, #16]
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64     // restore callee-saved low halves of v8-v15
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm, export=1
+        epel_uni_w_hv_start             // x15/x16/x17 = denom/wx/ox; reloads mx, my, width from caller stack
+        sxtw            x4, w4          // x4 = height
+
+        add             x10, x4, #3     // 3 extra rows for the 4-tap vertical filter
+        lsl             x10, x10, #7    // * MAX_PB_SIZE * 2 bytes per int16 row
+        sub             sp, sp, x10     // tmp_array
+        str             x30, [sp, #-48]!        // save lr + args live across the horizontal pass
+        stp             x4, x6, [sp, #16]
+        stp             x0, x1, [sp, #32]
+        add             x0, sp, #48     // dst = tmp_array
+        sub             x1, x2, x3      // start one row above the block
+        mov             x2, x3
+        add             x3, x4, #3
+        mov             x4, x5
+        bl              X(ff_hevc_put_hevc_epel_h24_8_neon_i8mm)
+        ldp             x4, x6, [sp, #16]
+        ldp             x0, x1, [sp, #32]
+        ldr             x30, [sp], #48
+        load_epel_filterh x6, x5        // vertical taps selected by my
+        mov             x10, #(MAX_PB_SIZE * 2)
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10    // 24 wide = 3 x 8h; prime 3 rows
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+1:      ld1             {v25.8h, v26.8h, v27.8h}, [sp], x10    // 4x unrolled loop rotating v16-v27
+        subs            x4, x4, #1
+        calc_epelh      v4, v16, v19, v22, v25
+        calc_epelh2     v4, v5, v16, v19, v22, v25
+        calc_epelh      v5, v17, v20, v23, v26
+        calc_epelh2     v5, v6, v17, v20, v23, v26
+        calc_epelh      v6, v18, v21, v24, v27
+        calc_epelh2     v6, v7, v18, v21, v24, v27
+
+        epel_uni_w_hv_end3              // apply wx, shift by denom+6, add ox (3 vectors)
+        sqxtun          v4.8b, v4.8h    // narrow to 3 x 8 unsigned pixels
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v16.8h, v17.8h, v18.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v19, v22, v25, v16
+        calc_epelh2     v4, v5, v19, v22, v25, v16
+        calc_epelh      v5, v20, v23, v26, v17
+        calc_epelh2     v5, v6, v20, v23, v26, v17
+        calc_epelh      v6, v21, v24, v27, v18
+        calc_epelh2     v6, v7, v21, v24, v27, v18
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v19.8h, v20.8h, v21.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v22, v25, v16, v19
+        calc_epelh2     v4, v5, v22, v25, v16, v19
+        calc_epelh      v5, v23, v26, v17, v20
+        calc_epelh2     v5, v6, v23, v26, v17, v20
+        calc_epelh      v6, v24, v27, v18, v21
+        calc_epelh2     v6, v7, v24, v27, v18, v21
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.eq            2f
+
+        ld1             {v22.8h, v23.8h, v24.8h}, [sp], x10
+        subs            x4, x4, #1
+        calc_epelh      v4, v25, v16, v19, v22
+        calc_epelh2     v4, v5, v25, v16, v19, v22
+        calc_epelh      v5, v26, v17, v20, v23
+        calc_epelh2     v5, v6, v26, v17, v20, v23
+        calc_epelh      v6, v27, v18, v21, v24
+        calc_epelh2     v6, v7, v27, v18, v21, v24
+        epel_uni_w_hv_end3
+
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        sqxtun          v6.8b, v6.8h
+        st1             {v4.8b, v5.8b, v6.8b}, [x0], x1
+        b.ne            1b
+2:                                      // sp has consumed the whole tmp_array; restore saved regs
+        ldp             d8, d9, [sp, #16]
+        ldp             d10, d11, [sp, #32]
+        ldp             d12, d13, [sp, #48]
+        ldp             d14, d15, [sp], #64
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]  // mx, my from the caller's stack args
+        mov             x17, #16        // width = 16 for each half
+        stp             x15, x16, [sp, #-96]!   // re-push mx/my at the new sp for the callee
+        stp             x0, x30, [sp, #16]      // NOTE(review): [sp, #16] is also the callee's stacked width slot; harmless only if the 16-wide core ignores width — confirm
+        stp             x1, x2, [sp, #32]
+        stp             x3, x4, [sp, #48]
+        stp             x5, x6, [sp, #64]
+        stp             x17, x7, [sp, #80]
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x0, x30, [sp, #16]      // restore all args for the second half
+        ldp             x1, x2, [sp, #32]
+        ldp             x3, x4, [sp, #48]
+        ldp             x5, x6, [sp, #64]
+        ldp             x17, x7, [sp, #80]
+        ldp             x15, x16, [sp], #96
+        add             x0, x0, #16     // advance dst and src to the right 16-pixel half
+        add             x2, x2, #16
+        mov             x17, #16
+        stp             x15, x16, [sp, #-32]!   // mx/my at [sp], width at [sp, #16] for the tail call
+        stp             x17, x30, [sp, #16]
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv16_8_neon_i8mm)
+        ldp             x17, x30, [sp, #16]
+        ldp             x15, x16, [sp], #32
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv48_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]  // mx, my from the caller's stack args
+        mov             x17, #24        // width 48 = two 24-wide passes
+        stp             x15, x16, [sp, #-96]!   // re-push mx/my at the new sp for the callee
+        stp             x0, x30, [sp, #16]
+        stp             x1, x2, [sp, #32]
+        stp             x3, x4, [sp, #48]
+        stp             x5, x6, [sp, #64]
+        stp             x17, x7, [sp, #80]
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x0, x30, [sp, #16]      // restore args for the second half
+        ldp             x1, x2, [sp, #32]
+        ldp             x3, x4, [sp, #48]
+        ldp             x5, x6, [sp, #64]
+        ldp             x17, x7, [sp, #80]
+        ldp             x15, x16, [sp], #96
+        add             x0, x0, #24     // advance dst and src by 24 pixels
+        add             x2, x2, #24
+        mov             x17, #24
+        stp             x15, x16, [sp, #-32]!   // mx/my at [sp], width at [sp, #16]
+        stp             x17, x30, [sp, #16]
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv24_8_neon_i8mm)
+        ldp             x17, x30, [sp, #16]
+        ldp             x15, x16, [sp], #32
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1
+        ldp             x15, x16, [sp]  // mx, my from the caller's stack args
+        mov             x17, #32        // width 64 = two 32-wide passes
+        stp             x15, x16, [sp, #-96]!   // re-push mx/my at the new sp for the callee
+        stp             x0, x30, [sp, #16]
+        stp             x1, x2, [sp, #32]
+        stp             x3, x4, [sp, #48]
+        stp             x5, x6, [sp, #64]
+        stp             x17, x7, [sp, #80]
+
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm)
+        ldp             x0, x30, [sp, #16]      // restore args for the second half
+        ldp             x1, x2, [sp, #32]
+        ldp             x3, x4, [sp, #48]
+        ldp             x5, x6, [sp, #64]
+        ldp             x17, x7, [sp, #80]
+        ldp             x15, x16, [sp], #96
+        add             x0, x0, #32     // advance dst and src by 32 pixels
+        add             x2, x2, #32
+        mov             x17, #32
+        stp             x15, x16, [sp, #-32]!   // mx/my at [sp], width at [sp, #16]
+        stp             x17, x30, [sp, #16]
+        bl              X(ff_hevc_put_hevc_epel_uni_w_hv32_8_neon_i8mm)
+        ldp             x17, x30, [sp, #16]
+        ldp             x15, x16, [sp], #32
+        ret
+endfunc
+
+
 #endif
 
 
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index b448d755b9..e125b0cfb2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -189,6 +189,11 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO(epel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -291,6 +296,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
             NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
 
-- 
2.38.0.windows.1

-------------- next part --------------
From a654b41fd8b100f631db49bd419ef65594ef32b3 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Sun, 7 May 2023 16:58:30 +0800
Subject: [PATCH v1 1/5] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_pixels

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 104 ++++++++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 483a9d5253..5a1d520eec 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -152,6 +152,9 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+NEON8_FNPROTO(pel_uni_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width),);
 
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
@@ -263,6 +266,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
 
+        NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,);
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index ed659cfe9b..ed5b5027db 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -490,6 +490,110 @@ put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
 
+function ff_hevc_put_hevc_pel_uni_pixels4_8_neon, export=1
+1:                                      // straight copy, 4 pixels x 2 rows per iteration
+        ldr             s0, [x2]        // x2 = src, x3 = srcstride
+        ldr             s1, [x2, x3]
+        subs            w4, w4, #2      // w4 = height, two rows per pass
+        add             x2, x2, x3, lsl #1
+        str             s0, [x0]        // x0 = dst, x1 = dststride
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels6_8_neon, export=1
+        sub             x1, x1, #4      // stores advance dst by 4 before the strided store
+1:                                      // copy 6 pixels x 2 rows per iteration
+        ldr             d0, [x2]        // over-read to 8 bytes, store only 6
+        ldr             d1, [x2, x3]
+        subs            w4, w4, #2
+        add             x2, x2, x3, lsl #1
+        str             s0, [x0], #4    // first 4 bytes
+        st1             {v0.h}[2], [x0], x1     // remaining 2 bytes
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels8_8_neon, export=1
+1:                                      // copy 8 pixels x 2 rows per iteration
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        subs            w4, w4, #2      // w4 = height
+        add             x2, x2, x3, lsl #1
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels12_8_neon, export=1
+        sub             x1, x1, #8      // stores advance dst by 8 before the strided store
+1:                                      // copy 12 pixels x 2 rows per iteration
+        ldr             q0, [x2]        // over-read to 16 bytes, store only 12
+        ldr             q1, [x2, x3]
+        subs            w4, w4, #2
+        add             x2, x2, x3, lsl #1
+        str             d0, [x0], #8    // first 8 bytes
+        st1             {v0.s}[2], [x0], x1     // remaining 4 bytes
+        str             d1, [x0], #8
+        st1             {v1.s}[2], [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels16_8_neon, export=1
+1:                                      // copy 16 pixels x 2 rows per iteration
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        subs            w4, w4, #2      // w4 = height
+        add             x2, x2, x3, lsl #1
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels24_8_neon, export=1
+1:                                      // copy 24 pixels per row, one row per iteration
+        ld1             {v0.8b, v1.8b, v2.8b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels32_8_neon, export=1
+1:                                      // copy 32 pixels per row, one row per iteration
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height
+        st1             {v0.16b, v1.16b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels48_8_neon, export=1
+1:                                      // copy 48 pixels per row, one row per iteration
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_pixels64_8_neon, export=1
+1:                                      // copy 64 pixels per row, one row per iteration
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        b.hi            1b
+        ret
+endfunc
 
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
         mov             w10, #-6
-- 
2.38.0.windows.1

-------------- next part --------------
From 9985cbcc0aa402d9920dd690b6f6a71392d62f79 Mon Sep 17 00:00:00 2001
From: Logan Lyu <Logan.Lyu at myais.com.cn>
Date: Sun, 28 May 2023 10:07:28 +0800
Subject: [PATCH v1 2/5] lavc/aarch64: new optimization for 8-bit
 hevc_epel_uni_w_h

---
 libavcodec/aarch64/Makefile               |   1 +
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 377 ++++++++++++++++++++++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   7 +-
 3 files changed, 384 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 216191640c..cb428b49e0 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -69,4 +69,5 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_deblock_neon.o      \
                                            aarch64/hevcdsp_idct_neon.o         \
                                            aarch64/hevcdsp_init_aarch64.o      \
                                            aarch64/hevcdsp_qpel_neon.o         \
+                                           aarch64/hevcdsp_epel_neon.o         \
                                            aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..0411de9864
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,377 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const epel_filters, align=4             // 4-tap EPEL filters, one 4-byte row per fractional position
+        .byte  0,  0,  0,  0            // position 0: unused (no interpolation)
+        .byte -2, 58, 10, -2
+        .byte -4, 54, 16, -2
+        .byte -6, 46, 28, -4
+        .byte -4, 36, 36, -4            // position 4: half-pel, symmetric taps
+        .byte -4, 28, 46, -6
+        .byte -2, 16, 54, -4
+        .byte -2, 10, 58, -2
+endconst
+
+#if HAVE_I8MM
+.macro EPEL_UNI_W_H_HEADER              // shared setup for the uni_w horizontal functions
+        ldr             x12, [sp]       // x12 = mx (first stacked arg)
+        sub             x2, x2, #1      // back src up by 1: taps cover [-1..2]
+        movrel          x9, epel_filters
+        add             x9, x9, x12, lsl #2     // select the 4-byte filter row for mx
+        ld1r            {v28.4s}, [x9]  // broadcast the 4 taps across all lanes
+        mov             w10, #-6
+        sub             w10, w10, w5    // w10 = -(denom + 6): right shift done via sqrshl
+        dup             v30.4s, w6      // v30 = wx (weight)
+        dup             v31.4s, w10     // v31 = negative shift amount
+        dup             v29.4s, w7      // v29 = ox (offset)
+.endm
+
+
+function ff_hevc_put_hevc_epel_uni_w_h4_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+1:
+        ld1             {v0.8b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height, one row per iteration
+        ext             v1.8b, v0.8b, v0.8b, #1 // build the 4 sliding 4-byte windows
+        ext             v2.8b, v0.8b, v0.8b, #2
+        ext             v3.8b, v0.8b, v0.8b, #3
+        trn1            v0.2s, v0.2s, v2.2s     // pack windows 0..3 into one 16-byte vector
+        trn1            v1.2s, v1.2s, v3.2s
+        zip1            v0.4s, v0.4s, v1.4s
+        movi            v16.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b // one 4-tap dot product per output pixel
+        mul             v16.4s, v16.4s, v30.4s  // * wx
+        sqrshl          v16.4s, v16.4s, v31.4s  // rounding shift right by denom+6
+        sqadd           v16.4s, v16.4s, v29.4s  // + ox, saturating
+        sqxtn           v16.4h, v16.4s
+        sqxtun          v16.8b, v16.8h  // narrow to unsigned 8-bit pixels
+        str             s16, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_uni_w_h6_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+        sub             x1, x1, #4      // stores advance dst by 4 before the strided store
+1:
+        ld1             {v0.16b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height
+        ext             v1.16b, v0.16b, v0.16b, #1      // sliding windows for 6 outputs
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        trn1            v4.2s, v0.2s, v1.2s     // pixels 0..3 packed for a 4-wide usdot
+        trn2            v6.2s, v0.2s, v1.2s     // pixels 4..5 packed for a 2-wide usdot
+        trn1            v5.2s, v2.2s, v3.2s
+        zip1            v4.2d, v4.2d, v5.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v4.16b, v28.16b
+        usdot           v17.2s, v6.8b, v28.8b
+        mul             v16.4s, v16.4s, v30.4s  // weight / shift / offset as in h4
+        mul             v17.2s, v17.2s, v30.2s
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v17.2s, v17.2s, v31.2s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v17.2s, v17.2s, v29.2s
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v17.4s
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0], #4   // first 4 pixels
+        st1             {v16.h}[2], [x0], x1    // remaining 2 pixels
+        b.hi            1b
+        ret
+endfunc
+
+.macro  EPEL_UNI_W_H_CALC s0, s1, d0, d1        // 8 pixels: dot-product + weight + shift + offset
+        movi            \d0\().2d, #0   // usdot accumulates, so clear first
+        movi            \d1\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b  // 4-tap filter per 32-bit lane
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        mul             \d0\().4s, \d0\().4s, v30.4s    // * wx
+        mul             \d1\().4s, \d1\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s    // rounding shift right by denom+6
+        sqrshl          \d1\().4s, \d1\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s    // + ox, saturating
+        sqadd           \d1\().4s, \d1\().4s, v29.4s
+.endm
+
+function ff_hevc_put_hevc_epel_uni_w_h8_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+1:
+        ld1             {v0.16b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height
+        ext             v1.16b, v0.16b, v0.16b, #1      // sliding windows for 8 outputs
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v4.4s, v0.4s, v2.4s     // even output windows
+        zip1            v5.4s, v1.4s, v3.4s     // odd output windows
+        EPEL_UNI_W_H_CALC v4, v5, v16, v17
+        sqxtn           v16.4h, v16.4s
+        sqxtn           v17.4h, v17.4s
+        zip1            v16.8h, v16.8h, v17.8h  // re-interleave even/odd results
+        sqxtun          v16.8b, v16.8h
+        str             d16, [x0]
+        add             x0, x0, x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h12_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+1:
+        ld1             {v0.16b}, [x2], x3
+        subs            w4, w4, #1      // w4 = height
+        ext             v1.16b, v0.16b, v0.16b, #1      // sliding windows for 12 outputs
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v4.4s, v0.4s, v2.4s     // pixels 0..7 (even/odd split)
+        zip1            v5.4s, v1.4s, v3.4s
+        zip2            v6.4s, v0.4s, v2.4s     // pixels 8..11
+        zip2            v7.4s, v1.4s, v3.4s
+        zip1            v6.4s, v6.4s, v7.4s
+        EPEL_UNI_W_H_CALC v4, v5, v16, v17
+        movi            v18.2d, #0      // extra 4-pixel tail done inline
+        usdot           v18.4s, v6.16b, v28.16b
+        mul             v18.4s, v18.4s, v30.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqxtn           v16.4h, v16.4s
+        sqxtn           v17.4h, v17.4s
+        sqxtn           v18.4h, v18.4s
+        zip1            v16.8h, v16.8h, v17.8h  // re-interleave even/odd results
+        sqxtun          v16.8b, v16.8h
+        sqxtun          v18.8b, v18.8h
+        str             d16, [x0]       // 8 + 4 pixels per row
+        str             s18, [x0, #8]
+        add             x0, x0, x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h16_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3      // 16 outputs need 19 source bytes
+        subs            w4, w4, #1      // w4 = height
+        ext             v4.16b, v0.16b, v1.16b, #1      // sliding windows across the pair
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        zip1            v20.4s, v0.4s, v5.4s    // even/odd window packing
+        zip1            v21.4s, v4.4s, v6.4s
+        zip2            v22.4s, v0.4s, v5.4s
+        zip2            v23.4s, v4.4s, v6.4s
+        EPEL_UNI_W_H_CALC v20, v21, v16, v17
+        EPEL_UNI_W_H_CALC v22, v23, v18, v19
+        sqxtn           v16.4h, v16.4s
+        sqxtn           v17.4h, v17.4s
+        sqxtn2          v16.8h, v18.4s
+        sqxtn2          v17.8h, v19.4s
+        sqxtun          v16.8b, v16.8h
+        sqxtun          v17.8b, v17.8h
+        st2             {v16.8b, v17.8b}, [x0], x1      // st2 re-interleaves even/odd results
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h24_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3      // 24 outputs need 27 source bytes
+        subs            w4, w4, #1      // w4 = height
+        ext             v2.16b, v0.16b, v1.16b, #1      // windows for pixels 0..15
+        ext             v3.16b, v0.16b, v1.16b, #2
+        ext             v4.16b, v0.16b, v1.16b, #3
+        ext             v5.16b, v1.16b, v1.16b, #1      // windows for pixels 16..23
+        ext             v6.16b, v1.16b, v1.16b, #2
+        ext             v7.16b, v1.16b, v1.16b, #3
+        zip1            v20.4s, v0.4s, v3.4s
+        zip1            v21.4s, v2.4s, v4.4s
+        zip2            v22.4s, v0.4s, v3.4s
+        zip2            v23.4s, v2.4s, v4.4s
+        zip1            v24.4s, v1.4s, v6.4s
+        zip1            v25.4s, v5.4s, v7.4s
+        EPEL_UNI_W_H_CALC v20, v21, v16, v17
+        EPEL_UNI_W_H_CALC v22, v23, v18, v19
+        EPEL_UNI_W_H_CALC v24, v25, v26, v27
+        sqxtn           v16.4h, v16.4s
+        sqxtn           v17.4h, v17.4s
+        sqxtn           v18.4h, v18.4s
+        sqxtn           v19.4h, v19.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn           v27.4h, v27.4s
+        zip1            v16.8h, v16.8h, v17.8h  // re-interleave even/odd results
+        zip1            v18.8h, v18.8h, v19.8h
+        zip1            v26.8h, v26.8h, v27.8h
+        sqxtun          v16.8b, v16.8h
+        sqxtun2         v16.16b, v18.8h
+        sqxtun          v26.8b, v26.8h
+        str             q16, [x0]       // 16 + 8 pixels per row
+        str             d26, [x0, #16]
+        add             x0, x0, x1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_epel_uni_w_h32_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3      // 32 outputs need 35 source bytes
+        subs            w4, w4, #1      // w4 = height
+        ext             v3.16b, v0.16b, v1.16b, #1      // windows for pixels 0..15
+        ext             v4.16b, v0.16b, v1.16b, #2
+        ext             v5.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1     // windows for pixels 16..31
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        EPEL_UNI_W_H_CALC v0, v3, v6, v7        // results land 4-way strided; st4 fixes order
+        EPEL_UNI_W_H_CALC v4, v5, v19, v20
+        EPEL_UNI_W_H_CALC v1, v16, v21, v22
+        EPEL_UNI_W_H_CALC v17, v18, v23, v24
+        sqxtn           v6.4h, v6.4s
+        sqxtn2          v6.8h, v21.4s
+        sqxtn           v7.4h, v7.4s
+        sqxtn2          v7.8h, v22.4s
+        sqxtn           v19.4h, v19.4s
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtun          v0.8b, v6.8h
+        sqxtun          v1.8b, v7.8h
+        sqxtun          v2.8b, v19.8h
+        sqxtun          v3.8b, v20.8h
+        st4             {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x1  // 4-way interleave restores pixel order
+        b.hi            1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_epel_uni_w_h48_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+        sub             x1, x1, #32     // first store advances dst by 32
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3      // 48 outputs need 51 source bytes
+        subs            w4, w4, #1      // w4 = height
+        ext             v4.16b, v0.16b, v1.16b, #1      // windows for pixels 0..15
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1     // windows for pixels 16..31
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        EPEL_UNI_W_H_CALC v0, v4, v19, v20
+        EPEL_UNI_W_H_CALC v5, v6, v21, v22
+        EPEL_UNI_W_H_CALC v1, v16, v23, v24
+        EPEL_UNI_W_H_CALC v17, v18, v25, v26
+        sqxtn           v19.4h, v19.4s
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v25.4s
+        sqxtn           v22.4h, v22.4s
+        sqxtn2          v22.8h, v26.4s
+        sqxtun          v19.8b, v19.8h
+        sqxtun          v20.8b, v20.8h
+        sqxtun          v21.8b, v21.8h
+        sqxtun          v22.8b, v22.8h
+        st4             {v19.8b, v20.8b, v21.8b, v22.8b}, [x0], #32     // first 32 pixels
+        ext             v5.16b, v2.16b, v3.16b, #1      // windows for pixels 32..47
+        ext             v6.16b, v2.16b, v3.16b, #2
+        ext             v7.16b, v2.16b, v3.16b, #3
+        EPEL_UNI_W_H_CALC v2, v5, v19, v20
+        EPEL_UNI_W_H_CALC v6, v7, v21, v22
+        sqxtn           v19.4h, v19.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn           v22.4h, v22.4s
+        zip1            v4.8h, v19.8h, v21.8h   // re-interleave the 16-pixel tail
+        zip1            v5.8h, v20.8h, v22.8h
+        sqxtun          v4.8b, v4.8h
+        sqxtun          v5.8b, v5.8h
+        st2             {v4.8b, v5.8b}, [x0], x1        // last 16 pixels
+        b.hi            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_epel_uni_w_h64_8_neon_i8mm, export=1
+        EPEL_UNI_W_H_HEADER             // v28 = taps, v30 = wx, v31 = -shift, v29 = ox
+        sub             x1, x1, #32     // first store advances dst by 32
+        sub             x3, x3, #64     // src load below advances x2 by 64 first
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
+        subs            w4, w4, #1      // w4 = height
+        ext             v4.16b, v0.16b, v1.16b, #1      // windows for pixels 0..15
+        ext             v5.16b, v0.16b, v1.16b, #2
+        ext             v6.16b, v0.16b, v1.16b, #3
+        ext             v16.16b, v1.16b, v2.16b, #1     // windows for pixels 16..31
+        ext             v17.16b, v1.16b, v2.16b, #2
+        ext             v18.16b, v1.16b, v2.16b, #3
+        EPEL_UNI_W_H_CALC v0, v4, v19, v20
+        EPEL_UNI_W_H_CALC v5, v6, v21, v22
+        EPEL_UNI_W_H_CALC v1, v16, v23, v24
+        EPEL_UNI_W_H_CALC v17, v18, v25, v26
+        sqxtn           v19.4h, v19.4s
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v25.4s
+        sqxtn           v22.4h, v22.4s
+        sqxtn2          v22.8h, v26.4s
+        sqxtun          v19.8b, v19.8h
+        sqxtun          v20.8b, v20.8h
+        sqxtun          v21.8b, v21.8h
+        sqxtun          v22.8b, v22.8h
+        st4             {v19.8b, v20.8b, v21.8b, v22.8b}, [x0], #32     // first 32 pixels
+        ld1             {v7.8b}, [x2], x3       // 3 bytes past the 64 are needed; also steps to the next row
+        ext             v4.16b, v2.16b, v3.16b, #1      // windows for pixels 32..47
+        ext             v5.16b, v2.16b, v3.16b, #2
+        ext             v6.16b, v2.16b, v3.16b, #3
+        ext             v16.16b, v3.16b, v7.16b, #1     // windows for pixels 48..63
+        ext             v17.16b, v3.16b, v7.16b, #2
+        ext             v18.16b, v3.16b, v7.16b, #3
+        EPEL_UNI_W_H_CALC v2, v4, v19, v20
+        EPEL_UNI_W_H_CALC v5, v6, v21, v22
+        EPEL_UNI_W_H_CALC v3, v16, v23, v24
+        EPEL_UNI_W_H_CALC v17, v18, v25, v26
+        sqxtn           v19.4h, v19.4s
+        sqxtn2          v19.8h, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v24.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v25.4s
+        sqxtn           v22.4h, v22.4s
+        sqxtn2          v22.8h, v26.4s
+        sqxtun          v19.8b, v19.8h
+        sqxtun          v20.8b, v20.8h
+        sqxtun          v21.8b, v21.8h
+        sqxtun          v22.8b, v22.8h
+        st4             {v19.8b, v20.8b, v21.8b, v22.8b}, [x0], x1      // last 32 pixels
+        b.hi            1b
+        ret
+endfunc
+
+#endif
diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 5a1d520eec..8af0a2b4b9 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -166,6 +166,10 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(epel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
 
 NEON8_FNPROTO(qpel_h, (int16_t *dst,
         const uint8_t *_src, ptrdiff_t _srcstride,
@@ -273,8 +277,9 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
         if (have_i8mm(cpu_flags)) {
-            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 1, epel_uni_w_h ,_i8mm);
             NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
             NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
 
-- 
2.38.0.windows.1



More information about the ffmpeg-devel mailing list