[FFmpeg-devel] [PATCH] lavc/aarch64: new optimization for 8-bit hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv and qpel_h

myais Logan.Lyu at myais.com.cn
Wed May 3 05:14:19 EEST 2023


Hello,

- I splited this patch, Do I need to resubmit or just attach them as 
attachments? (I attached those patches.  If I need to resummit, please 
let me know.)

- Those functions are checked by checkasm already.

Thanks.


在 2023/5/2 20:32, Jean-Baptiste Kempf 写道:
> Hello,
>
> Just 2 questions:
> - could you split this patch into several (3,4 or 5)
> - are all those functions checked by chekasm?
>
> Thanks,
>
> jb
>
> On Sun, 30 Apr 2023, at 10:57, myais wrote:
>> Hi,
>> This is a patch for the aarch64, which completes the neon versions of
>> the hevc_pel_uni_w_pixels, qpel_uni_w_h, qpel_uni_w_v, qpel_uni_w_hv
>> interfaces.
>>
>> put_hevc_pel_uni_w_pixels4_8_c: 54.3
>> put_hevc_pel_uni_w_pixels4_8_neon: 24.1
>> put_hevc_pel_uni_w_pixels6_8_c: 105.3
>> put_hevc_pel_uni_w_pixels6_8_neon: 53.1
>> put_hevc_pel_uni_w_pixels8_8_c: 176.6
>> put_hevc_pel_uni_w_pixels8_8_neon: 63.8
>> put_hevc_pel_uni_w_pixels12_8_c: 391.1
>> put_hevc_pel_uni_w_pixels12_8_neon: 193.3
>> put_hevc_pel_uni_w_pixels16_8_c: 688.1
>> put_hevc_pel_uni_w_pixels16_8_neon: 226.1
>> put_hevc_pel_uni_w_pixels24_8_c: 1542.3
>> put_hevc_pel_uni_w_pixels24_8_neon: 536.8
>> put_hevc_pel_uni_w_pixels32_8_c: 2753.1
>> put_hevc_pel_uni_w_pixels32_8_neon: 875.8
>> put_hevc_pel_uni_w_pixels48_8_c: 6251.1
>> put_hevc_pel_uni_w_pixels48_8_neon: 1966.1
>> put_hevc_pel_uni_w_pixels64_8_c: 11047.1
>> put_hevc_pel_uni_w_pixels64_8_neon: 3449.8
>>
>> put_hevc_qpel_uni_w_h4_8_c: 156.6
>> put_hevc_qpel_uni_w_h4_8_neon: 44.6
>> put_hevc_qpel_uni_w_h6_8_c: 324.6
>> put_hevc_qpel_uni_w_h6_8_neon: 103.1
>> put_hevc_qpel_uni_w_h8_8_c: 549.3
>> put_hevc_qpel_uni_w_h8_8_neon: 138.6
>> put_hevc_qpel_uni_w_h12_8_c: 1240.3
>> put_hevc_qpel_uni_w_h12_8_neon: 277.3
>> put_hevc_qpel_uni_w_h16_8_c: 2161.8
>> put_hevc_qpel_uni_w_h16_8_neon: 394.1
>> put_hevc_qpel_uni_w_h24_8_c: 4874.8
>> put_hevc_qpel_uni_w_h24_8_neon: 972.6
>> put_hevc_qpel_uni_w_h32_8_c: 8517.8
>> put_hevc_qpel_uni_w_h32_8_neon: 1517.3
>> put_hevc_qpel_uni_w_h48_8_c: 19856.1
>> put_hevc_qpel_uni_w_h48_8_neon: 3429.8
>> put_hevc_qpel_uni_w_h64_8_c: 35159.3
>> put_hevc_qpel_uni_w_h64_8_neon: 6018.1
>>
>> put_hevc_qpel_uni_w_v4_8_c: 180.6
>> put_hevc_qpel_uni_w_v4_8_neon: 63.8
>> put_hevc_qpel_uni_w_v6_8_c: 318.6
>> put_hevc_qpel_uni_w_v6_8_neon: 117.8
>> put_hevc_qpel_uni_w_v8_8_c: 547.6
>> put_hevc_qpel_uni_w_v8_8_neon: 132.1
>> put_hevc_qpel_uni_w_v12_8_c: 1202.8
>> put_hevc_qpel_uni_w_v12_8_neon: 350.1
>> put_hevc_qpel_uni_w_v16_8_c: 2109.6
>> put_hevc_qpel_uni_w_v16_8_neon: 442.1
>> put_hevc_qpel_uni_w_v24_8_c: 4748.8
>> put_hevc_qpel_uni_w_v24_8_neon: 1287.1
>> put_hevc_qpel_uni_w_v32_8_c: 8487.3
>> put_hevc_qpel_uni_w_v32_8_neon: 1704.3
>> put_hevc_qpel_uni_w_v48_8_c: 18798.8
>> put_hevc_qpel_uni_w_v48_8_neon: 3790.8
>> put_hevc_qpel_uni_w_v64_8_c: 35614.6
>> put_hevc_qpel_uni_w_v64_8_neon: 6725.6
>>
>>
>> put_hevc_qpel_uni_w_hv4_8_c: 498.8
>> put_hevc_qpel_uni_w_hv4_8_neon: 139.3
>> put_hevc_qpel_uni_w_hv6_8_c: 874.6
>> put_hevc_qpel_uni_w_hv6_8_neon: 295.3
>> put_hevc_qpel_uni_w_hv8_8_c: 1372.1
>> put_hevc_qpel_uni_w_hv8_8_neon: 387.1
>> put_hevc_qpel_uni_w_hv12_8_c: 2721.8
>> put_hevc_qpel_uni_w_hv12_8_neon: 804.8
>> put_hevc_qpel_uni_w_hv16_8_c: 4503.1
>> put_hevc_qpel_uni_w_hv16_8_neon: 1038.1
>> put_hevc_qpel_uni_w_hv24_8_c: 9321.8
>> put_hevc_qpel_uni_w_hv24_8_neon: 2962.1
>> put_hevc_qpel_uni_w_hv32_8_c: 15926.8
>> put_hevc_qpel_uni_w_hv32_8_neon: 3858.6
>> put_hevc_qpel_uni_w_hv48_8_c: 35051.1
>> put_hevc_qpel_uni_w_hv48_8_neon: 9301.1
>> put_hevc_qpel_uni_w_hv64_8_c: 61215.3
>> put_hevc_qpel_uni_w_hv64_8_neon: 14920.1
>>
>> put_hevc_qpel_uni_h4_8_c: 143.3
>> put_hevc_qpel_uni_h4_8_neon: 55.3
>> put_hevc_qpel_uni_h6_8_c: 304.6
>> put_hevc_qpel_uni_h6_8_neon: 82.3
>> put_hevc_qpel_uni_h8_8_c: 557.8
>> put_hevc_qpel_uni_h8_8_neon: 99.3
>> put_hevc_qpel_uni_h12_8_c: 1228.3
>> put_hevc_qpel_uni_h12_8_neon: 251.6
>> put_hevc_qpel_uni_h16_8_c: 2210.3
>> put_hevc_qpel_uni_h16_8_neon: 324.6
>> put_hevc_qpel_uni_h24_8_c: 4859.1
>> put_hevc_qpel_uni_h24_8_neon: 962.3
>> put_hevc_qpel_uni_h32_8_c: 8728.6
>> put_hevc_qpel_uni_h32_8_neon: 1249.6
>> put_hevc_qpel_uni_h48_8_c: 20346.3
>> put_hevc_qpel_uni_h48_8_neon: 2824.1
>> put_hevc_qpel_uni_h64_8_c: 36702.6
>> put_hevc_qpel_uni_h64_8_neon: 5012.1
>>
>>
>>
>>
>> Signed-off-by: myais <Logan.Lyu at myais.com.cn>
>> ---
>>    libavcodec/aarch64/hevcdsp_init_aarch64.c |   96 +
>>    libavcodec/aarch64/hevcdsp_qpel_neon.S    | 2223 +++++++++++++++++++++
>>    2 files changed, 2319 insertions(+)
>>
>> diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> index be1049a2ec..42b8e9169d 100644
>> --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
>> @@ -128,6 +128,91 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t
>> *_dst, ptrdiff_t _dststride, co
>>                                             ptrdiff_t _srcstride, const
>> int16_t *src2, int height, intptr_t
>>                                             mx, intptr_t my, int width);
>>    +#define NEON8_FNPROTO(fn, args, ext) \
>> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>> +
>> +#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
>> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>> +
>> +#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
>> +    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
>> +    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
>> +
>> +
>> +NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox, +        intptr_t mx,
>> intptr_t my, int width),);
>> +
>> +NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t
>> _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width),);
>> +
>> +#if defined(__ARM_FEATURE_DOTPROD)
>> +NEON8_FNPROTO(qpel_h, (int16_t *dst,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, intptr_t mx, intptr_t my, int width), _dotprod);
>> +
>> +NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width), _dotprod);
>> +
>> +NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t
>> _dststride,
>> +        const uint8_t *_src, ptrdiff_t _srcstride,
>> +        int height, int denom, int wx, int ox,
>> +        intptr_t mx, intptr_t my, int width), _dotprod);
>> +
>> +#endif
>> +
>> +#define NEON8_FNASSIGN(member, v, h, fn, ext) \
>> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
>> +        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
>> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
>> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
>> +        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
>> +        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
>> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
>> +
>> +#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
>> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
>> +        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[6][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
>> +
>> +#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
>> +        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
>> +        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
>> +        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
>> +        member[6][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
>> +        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
>> +        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
>> +        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
>> +
>>    av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int
>> bit_depth)
>>    {
>>        if (!have_neon(av_get_cpu_flags())) return;
>> @@ -185,6 +270,17 @@ av_cold void
>> ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
>>            c->put_hevc_qpel_bi[7][0][1]   =
>>            c->put_hevc_qpel_bi[8][0][1]   =
>>            c->put_hevc_qpel_bi[9][0][1]   =
>> ff_hevc_put_hevc_qpel_bi_h16_8_neon;
>> +
>> +        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
>> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0,
>> pel_uni_w_pixels,); +
>> NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
>> +
>> +    #if defined(__ARM_FEATURE_DOTPROD)
>> +        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _dotprod);
>> +        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h,
>> _dotprod);
>> +        NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1,
>> qpel_uni_w_hv, _dotprod);
>> +
>> +    #endif
>>        }
>>        if (bit_depth == 10) {
>>            c->hevc_h_loop_filter_chroma   =
>> ff_hevc_h_loop_filter_chroma_10_neon;
>> diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> b/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> index 0e7b912678..e30ac1b465 100644
>> --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
>> @@ -30,6 +30,13 @@ const qpel_filters, align=4
>>            .byte           0,  1, -5, 17, 58,-10, 4, -1
>>    endconst
>>    +const qpel_filters_abs, align=4
>> +        .byte           0,  0,  0,  0,  0,  0, 0,  0
>> +        .byte           1,  4, 10, 58, 17,  5, 1,  0
>> +        .byte           1,  4, 11, 40, 40, 11, 4,  1
>> +        .byte           0,  1,  5, 17, 58, 10, 4,  1
>> +endconst
>> +
>>    .macro load_filter m
>>            movrel          x15, qpel_filters
>>            add             x15, x15, \m, lsl #3
>> @@ -482,3 +489,2219 @@ endfunc
>>    put_hevc qpel
>>    put_hevc qpel_uni
>>    put_hevc qpel_bi
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.8h, w7
>> +1:
>> +        ldr     s0, [x2]
>> +        ldr     s1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v0.8h, v0.8b, #6
>> +        ushll   v1.8h, v1.8b, #6
>> +        smull   v0.4s, v0.4h, v30.4h
>> +        smull   v1.4s, v1.4h, v30.4h
>> +        sqrshl  v0.4s, v0.4s, v31.4s
>> +        sqrshl  v1.4s, v1.4s, v31.4s
>> +        sqadd   v0.4s, v0.4s, v29.4s
>> +        sqadd   v1.4s, v1.4s, v29.4s
>> +        sqxtn  v0.4h, v0.4s
>> +        sqxtn  v1.4h, v1.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        str     s0, [x0]
>> +        str     s1, [x0, x1]
>> +        add     x0, x0, x1, lsl 1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +        sub     x1, x1, #4
>> +1:
>> +        ldr     d0, [x2]
>> +        ldr     d1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v0.8h, v0.8b, #6
>> +        ushll   v1.8h, v1.8b, #6
>> +        smull   v4.4s, v0.4h, v30.4h
>> +        smull2  v5.4s, v0.8h, v30.8h
>> +        smull   v6.4s, v1.4h, v30.4h
>> +        smull2  v7.4s, v1.8h, v30.8h
>> +        sqrshl  v4.4s, v4.4s, v31.4s
>> +        sqrshl  v5.4s, v5.4s, v31.4s
>> +        sqrshl  v6.4s, v6.4s, v31.4s
>> +        sqrshl  v7.4s, v7.4s, v31.4s
>> +        sqadd   v4.4s, v4.4s, v29.4s
>> +        sqadd   v5.4s, v5.4s, v29.4s
>> +        sqadd   v6.4s, v6.4s, v29.4s
>> +        sqadd   v7.4s, v7.4s, v29.4s
>> +        sqxtn   v0.4h, v4.4s
>> +        sqxtn2  v0.8h, v5.4s
>> +        sqxtn   v1.4h, v6.4s
>> +        sqxtn2  v1.8h, v7.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        str     s0, [x0], #4
>> +        st1     {v0.h}[2], [x0], x1
>> +        str     s1, [x0], #4
>> +        st1     {v1.h}[2], [x0], x1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ldr     d0, [x2]
>> +        ldr     d1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v0.8h, v0.8b, #6
>> +        ushll   v1.8h, v1.8b, #6
>> +        smull   v4.4s, v0.4h, v30.4h
>> +        smull2  v5.4s, v0.8h, v30.8h
>> +        smull   v6.4s, v1.4h, v30.4h
>> +        smull2  v7.4s, v1.8h, v30.8h
>> +        sqrshl  v4.4s, v4.4s, v31.4s
>> +        sqrshl  v5.4s, v5.4s, v31.4s
>> +        sqrshl  v6.4s, v6.4s, v31.4s
>> +        sqrshl  v7.4s, v7.4s, v31.4s
>> +        sqadd   v4.4s, v4.4s, v29.4s
>> +        sqadd   v5.4s, v5.4s, v29.4s
>> +        sqadd   v6.4s, v6.4s, v29.4s
>> +        sqadd   v7.4s, v7.4s, v29.4s
>> +        sqxtn   v0.4h, v4.4s
>> +        sqxtn2  v0.8h, v5.4s
>> +        sqxtn   v1.4h, v6.4s
>> +        sqxtn2  v1.8h, v7.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        str     d0, [x0]
>> +        str     d1, [x0, x1]
>> +        add     x0, x0, x1, lsl 1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +        sub     x1, x1, #8
>> +1:
>> +        ldr     q0, [x2]
>> +        ldr     q1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        ushll   v4.8h, v0.8b, #6
>> +        ushll2  v5.8h, v0.16b, #6
>> +        ushll   v6.8h, v1.8b, #6
>> +        ushll2  v7.8h, v1.16b, #6
>> +        smull   v16.4s, v4.4h, v30.4h
>> +        smull2  v17.4s, v4.8h, v30.8h
>> +        smull   v18.4s, v5.4h, v30.4h
>> +        smull2  v19.4s, v5.8h, v30.8h
>> +        smull   v20.4s, v6.4h, v30.4h
>> +        smull2  v21.4s, v6.8h, v30.8h
>> +        smull   v22.4s, v7.4h, v30.4h
>> +        smull2  v23.4s, v7.8h, v30.8h
>> +        +        sqrshl  v16.4s, v16.4s, v31.4s
>> +        sqrshl  v17.4s, v17.4s, v31.4s
>> +        sqrshl  v18.4s, v18.4s, v31.4s
>> +        sqrshl  v19.4s, v19.4s, v31.4s
>> +        sqrshl  v20.4s, v20.4s, v31.4s
>> +        sqrshl  v21.4s, v21.4s, v31.4s
>> +        sqrshl  v22.4s, v22.4s, v31.4s
>> +        sqrshl  v23.4s, v23.4s, v31.4s
>> +        sqadd   v16.4s, v16.4s, v29.4s
>> +        sqadd   v17.4s, v17.4s, v29.4s
>> +        sqadd   v18.4s, v18.4s, v29.4s
>> +        sqadd   v19.4s, v19.4s, v29.4s
>> +        sqadd   v20.4s, v20.4s, v29.4s
>> +        sqadd   v21.4s, v21.4s, v29.4s
>> +        sqadd   v22.4s, v22.4s, v29.4s
>> +        sqadd   v23.4s, v23.4s, v29.4s
>> +        sqxtn   v0.4h, v16.4s
>> +        sqxtn2  v0.8h, v17.4s
>> +        sqxtn   v1.4h, v18.4s
>> +        sqxtn2  v1.8h, v19.4s
>> +        sqxtn   v2.4h, v20.4s
>> +        sqxtn2  v2.8h, v21.4s
>> +        sqxtn   v3.4h, v22.4s
>> +        sqxtn2  v3.8h, v23.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun2 v0.16b, v1.8h
>> +        sqxtun  v2.8b, v2.8h
>> +        sqxtun2 v2.16b, v3.8h
>> +        str     d0, [x0], #8
>> +        st1     {v0.s}[2], [x0], x1
>> +        str     d2, [x0], #8
>> +        st1     {v2.s}[2], [x0], x1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
>> +        ushll   \t0\().8h, \s0\().8b, #6
>> +        ushll2  \t1\().8h, \s0\().16b, #6
>> +        smull   \d0\().4s, \t0\().4h, v30.4h
>> +        smull2  \d1\().4s, \t0\().8h, v30.8h
>> +        smull   \d2\().4s, \t1\().4h, v30.4h
>> +        smull2  \d3\().4s, \t1\().8h, v30.8h
>> +        sqrshl  \d0\().4s, \d0\().4s, v31.4s
>> +        sqrshl  \d1\().4s, \d1\().4s, v31.4s
>> +        sqrshl  \d2\().4s, \d2\().4s, v31.4s
>> +        sqrshl  \d3\().4s, \d3\().4s, v31.4s
>> +        sqadd   \d0\().4s, \d0\().4s, v29.4s
>> +        sqadd   \d1\().4s, \d1\().4s, v29.4s
>> +        sqadd   \d2\().4s, \d2\().4s, v29.4s
>> +        sqadd   \d3\().4s, \d3\().4s, v29.4s
>> +        sqxtn   \t0\().4h, \d0\().4s
>> +        sqxtn2  \t0\().8h, \d1\().4s
>> +        sqxtn   \t1\().4h, \d2\().4s
>> +        sqxtn2  \t1\().8h, \d3\().4s
>> +        sqxtun  \s0\().8b,  \t0\().8h
>> +        sqxtun2 \s0\().16b, \t1\().8h
>> +.endm
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ldr     q0, [x2]
>> +        ldr     q1, [x2, x3]
>> +        add     x2, x2, x3, lsl 1
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        str     q0, [x0]
>> +        str     q1, [x0, x1]
>> +        add     x0, x0, x1, lsl 1
>> +        subs    w4, w4, #2
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b}, [x2], x3
>> +        ushll   v4.8h, v0.8b, #6
>> +        ushll2  v5.8h, v0.16b, #6
>> +        ushll   v6.8h, v1.8b, #6
>> +        smull   v16.4s, v4.4h, v30.4h
>> +        smull2  v17.4s, v4.8h, v30.8h
>> +        smull   v18.4s, v5.4h, v30.4h
>> +        smull2  v19.4s, v5.8h, v30.8h
>> +        smull   v20.4s, v6.4h, v30.4h
>> +        smull2  v21.4s, v6.8h, v30.8h
>> +        sqrshl  v16.4s, v16.4s, v31.4s
>> +        sqrshl  v17.4s, v17.4s, v31.4s
>> +        sqrshl  v18.4s, v18.4s, v31.4s
>> +        sqrshl  v19.4s, v19.4s, v31.4s
>> +        sqrshl  v20.4s, v20.4s, v31.4s
>> +        sqrshl  v21.4s, v21.4s, v31.4s
>> +        sqadd   v16.4s, v16.4s, v29.4s
>> +        sqadd   v17.4s, v17.4s, v29.4s
>> +        sqadd   v18.4s, v18.4s, v29.4s
>> +        sqadd   v19.4s, v19.4s, v29.4s
>> +        sqadd   v20.4s, v20.4s, v29.4s
>> +        sqadd   v21.4s, v21.4s, v29.4s
>> +        sqxtn   v0.4h, v16.4s
>> +        sqxtn2  v0.8h, v17.4s
>> +        sqxtn   v1.4h, v18.4s
>> +        sqxtn2  v1.8h, v19.4s
>> +        sqxtn   v2.4h, v20.4s
>> +        sqxtn2  v2.8h, v21.4s
>> +        sqxtun  v0.8b, v0.8h
>> +        sqxtun  v1.8b, v1.8h
>> +        sqxtun  v2.8b, v2.8h
>> +        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b}, [x2], x3
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        st1     {v0.16b, v1.16b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
>> +        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
>> +        mov     w10, #-6
>> +        sub     w10, w10, w5
>> +        dup     v30.8h, w6
>> +        dup     v31.4s, w10
>> +        dup     v29.4s, w7
>> +1:
>> +        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
>> +        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
>> +        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
>> +        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
>> +        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
>> +        subs    w4, w4, #1
>> +        b.ne    1b
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_V_HEADER
>> +        ldur            x12, [sp, #8]          // my
>> +        sub             x2, x2, x3, lsl #1
>> +        sub             x2, x2, x3
>> +        movrel          x9, qpel_filters_abs
>> +        add             x9, x9, x12, lsl 3
>> +        ldr             d28, [x9]
>> +        dup             v0.16b, v28.b[0]
>> +        dup             v1.16b, v28.b[1]
>> +        dup             v2.16b, v28.b[2]
>> +        dup             v3.16b, v28.b[3]
>> +        dup             v4.16b, v28.b[4]
>> +        dup             v5.16b, v28.b[5]
>> +        dup             v6.16b, v28.b[6]
>> +        dup             v7.16b, v28.b[7]
>> +
>> +        mov             w10, #-6
>> +        sub             w10, w10, w5
>> +        dup             v30.8h, w6              // wx
>> +        dup             v31.4s, w10             // shift
>> +        dup             v29.4s, w7              // ox
>> +.endm
>> +
>> +.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
>> +        umull           \dst\().8h, \src1\().8b, v1.8b
>> +        umlsl           \dst\().8h, \src0\().8b, v0.8b
>> +        umlsl           \dst\().8h, \src2\().8b, v2.8b
>> +        umlal           \dst\().8h, \src3\().8b, v3.8b
>> +        umlal           \dst\().8h, \src4\().8b, v4.8b
>> +        umlsl           \dst\().8h, \src5\().8b, v5.8b
>> +        umlal           \dst\().8h, \src6\().8b, v6.8b
>> +        umlsl           \dst\().8h, \src7\().8b, v7.8b
>> +.endm
>> +
>> +.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
>> +        umull2          \dst\().8h, \src1\().16b, v1.16b
>> +        umlsl2          \dst\().8h, \src0\().16b, v0.16b
>> +        umlsl2          \dst\().8h, \src2\().16b, v2.16b
>> +        umlal2          \dst\().8h, \src3\().16b, v3.16b
>> +        umlal2          \dst\().8h, \src4\().16b, v4.16b
>> +        umlsl2          \dst\().8h, \src5\().16b, v5.16b
>> +        umlal2          \dst\().8h, \src6\().16b, v6.16b
>> +        umlsl2          \dst\().8h, \src7\().16b, v7.16b
>> +.endm
>> +
>> +.macro  QPEL_UNI_W_V_4
>> +        smull           v24.4s, v24.4h, v30.4h
>> +        sqrshl          v24.4s, v24.4s, v31.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.s}[0], [x0], x1
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldr             s16, [x2]
>> +        ldr             s17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             s18, [x2]
>> +        ldr             s19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             s20, [x2]
>> +        ldr             s21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             s22, [x2]
>> +
>> +1:      ldr             s23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s16, [x2]
>> +        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s18, [x2]
>> +        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s20, [x2]
>> +        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             s22, [x2]
>> +        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_4
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_V_8
>> +        smull           v24.4s, v26.4h, v30.4h
>> +        smull2          v25.4s, v26.8h, v30.8h
>> +        sqrshl          v24.4s, v24.4s, v31.4s
>> +        sqrshl          v25.4s, v25.4s, v31.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.d}[0], [x0], x1
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldr             d16, [x2]
>> +        ldr             d17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             d18, [x2]
>> +        ldr             d19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             d20, [x2]
>> +        ldr             d21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             d22, [x2]
>> +
>> +1:      ldr             d23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d16, [x2]
>> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d18, [x2]
>> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d20, [x2]
>> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             d22, [x2]
>> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_8
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_V_16
>> +        smull           v24.4s, v26.4h, v30.4h
>> +        smull2          v25.4s, v26.8h, v30.8h
>> +        smull           v26.4s, v27.4h, v30.4h
>> +        smull2          v27.4s, v27.8h, v30.8h
>> +        sqrshl          v24.4s, v24.4s, v31.4s
>> +        sqrshl          v25.4s, v25.4s, v31.4s
>> +        sqrshl          v26.4s, v26.4s, v31.4s
>> +        sqrshl          v27.4s, v27.4s, v31.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqadd           v26.4s, v26.4s, v29.4s
>> +        sqadd           v27.4s, v27.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtn           v26.4h, v26.4s
>> +        sqxtn2          v26.8h, v27.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        sqxtun2         v24.16b, v26.8h
>> +        st1             {v24.16b}, [x0], x1
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldr             q16, [x2]
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q18, [x2]
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q20, [x2]
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q22, [x2]
>> +
>> +1:      ldr             q23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q16, [x2]
>> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q18, [x2]
>> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q20, [x2]
>> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q22, [x2]
>> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
>> +        QPEL_UNI_W_V_HEADER
>> +        ldur            w13, [sp, #16]
>> +        mov             x14, x0
>> +        mov             x15, x2
>> +        mov             w11, w4
>> +
>> +3:
>> +        ldr             q16, [x2]
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q18, [x2]
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q20, [x2]
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        ldr             q22, [x2]
>> +
>> +
>> +1:      ldr             q23, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q16, [x2]
>> +        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q17, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q18, [x2]
>> +        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q19, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q20, [x2]
>> +        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q21, [x2, x3]
>> +        add             x2, x2, x3, lsl 1
>> +        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.eq            2f
>> +
>> +        ldr             q22, [x2]
>> +        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_V_16
>> +        subs            w4, w4, #1
>> +        b.ne            1b
>> +2:
>> +        subs            w13, w13, #16
>> +        add             x14, x14, #16
>> +        add             x15, x15, #16
>> +        mov             x0, x14
>> +        mov             x2, x15
>> +        mov             w4, w11
>> +        b.hi            3b
>> +        ret
>> +endfunc
>> +
>> +#if __ARM_FEATURE_DOTPROD
>> +.macro QPEL_UNI_W_H_HEADER
>> +        ldr             x12, [sp]
>> +        sub             x2, x2, #3
>> +        movrel          x9, qpel_filters
>> +        add             x9, x9, x12, lsl 3
>> +        ldr             x11, [x9]
>> +        dup             v28.2d, x11
>> +        mov             w10, #-6
>> +        sub             w10, w10, w5
>> +        dup             v30.4s, w6              // wx
>> +        dup             v31.4s, w10             // shift
>> +        dup             v29.4s, w7              // ox
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v0.16b}, [x2], x3
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        zip1            v0.2d, v0.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        usdot           v16.4s, v0.16b, v28.16b
>> +        usdot           v17.4s, v2.16b, v28.16b
>> +        addp            v16.4s, v16.4s, v17.4s
>> +        mul             v16.4s, v16.4s, v30.4s
>> +        sqrshl          v16.4s, v16.4s, v31.4s
>> +        sqadd           v16.4s, v16.4s, v29.4s
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtun          v16.8b, v16.8h
>> +        str             s16, [x0]
>> +        add             x0, x0, x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        sub             x1, x1, #4
>> +1:
>> +        ld1             {v0.16b}, [x2], x3
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        ext             v4.16b, v0.16b, v0.16b, #4
>> +        ext             v5.16b, v0.16b, v0.16b, #5
>> +        zip1            v0.2d, v0.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        movi            v18.2d, #0
>> +        usdot           v16.4s, v0.16b, v28.16b
>> +        usdot           v17.4s, v2.16b, v28.16b
>> +        usdot           v18.4s, v4.16b, v28.16b
>> +        addp            v16.4s, v16.4s, v17.4s
>> +        addp            v18.4s, v18.4s, v18.4s
>> +        mul             v16.4s, v16.4s, v30.4s
>> +        mul             v18.2s, v18.2s, v30.2s
>> +        sqrshl          v16.4s, v16.4s, v31.4s
>> +        sqrshl          v18.2s, v18.2s, v31.2s
>> +        sqadd           v16.4s, v16.4s, v29.4s
>> +        sqadd           v18.2s, v18.2s, v29.2s
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtn2          v16.8h, v18.4s
>> +        sqxtun          v16.8b, v16.8h
>> +        str             s16, [x0], #4
>> +        st1             {v16.h}[2], [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
>> +        movi            \d0\().2d, #0
>> +        movi            \d1\().2d, #0
>> +        movi            \d2\().2d, #0
>> +        movi            \d3\().2d, #0
>> +        usdot           \d0\().4s, \s0\().16b, v28.16b
>> +        usdot           \d1\().4s, \s1\().16b, v28.16b
>> +        usdot           \d2\().4s, \s2\().16b, v28.16b
>> +        usdot           \d3\().4s, \s3\().16b, v28.16b
>> +        addp            \d0\().4s, \d0\().4s, \d1\().4s
>> +        addp            \d2\().4s, \d2\().4s, \d3\().4s
>> +        mul             \d0\().4s, \d0\().4s, v30.4s
>> +        mul             \d2\().4s, \d2\().4s, v30.4s
>> +        sqrshl          \d0\().4s, \d0\().4s, v31.4s
>> +        sqrshl          \d2\().4s, \d2\().4s, v31.4s
>> +        sqadd           \d0\().4s, \d0\().4s, v29.4s
>> +        sqadd           \d2\().4s, \d2\().4s, v29.4s
>> +.endm
>> +
>> +.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
>> +        movi            \d0\().2d, #0
>> +        movi            \d1\().2d, #0
>> +        usdot           \d0\().4s, \s0\().16b, v28.16b
>> +        usdot           \d1\().4s, \s1\().16b, v28.16b
>> +        addp            \d0\().4s, \d0\().4s, \d1\().4s
>> +        mul             \d0\().4s, \d0\().4s, v30.4s
>> +        sqrshl          \d0\().4s, \d0\().4s, v31.4s
>> +        sqadd           \d0\().4s, \d0\().4s, v29.4s
>> +.endm
>> +
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        zip1            v0.2d, v16.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21
>> +        sqxtn           v18.4h, v18.4s
>> +        sqxtn2          v18.8h, v20.4s
>> +        sqxtun          v18.8b, v18.8h
>> +        str             d18, [x0]
>> +        add             x0, x0, x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        add             x13, x0, #8
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        zip1            v18.2d, v16.2d, v1.2d
>> +        zip1            v19.2d, v2.2d, v3.2d
>> +        zip1            v20.2d, v4.2d, v5.2d
>> +        zip1            v21.2d, v6.2d, v7.2d
>> +        zip2            v22.2d, v16.2d, v1.2d
>> +        zip2            v23.2d, v2.2d, v3.2d
>> +        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6
>> +        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
>> +        sqxtn           v0.4h, v0.4s
>> +        sqxtn2          v0.8h, v4.4s
>> +        sqxtn           v1.4h, v24.4s
>> +        sqxtun          v0.8b, v0.8h
>> +        sqxtun          v1.8b, v1.8h
>> +
>> +        str             d0, [x0]
>> +        str             s1, [x13]
>> +        add             x0, x0, x1
>> +        add             x13, x13, x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   //
>> v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    //
>> v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
>> +        sqxtn           v0.4h, v18.4s
>> +        sqxtn2          v0.8h, v22.4s
>> +        sqxtn           v1.4h, v20.4s
>> +        sqxtn2          v1.8h, v24.4s
>> +        trn1            v2.8h, v0.8h, v1.8h
>> +        trn2            v3.8h, v0.8h, v1.8h
>> +        sqxtun          v0.8b, v2.8h
>> +        sqxtun2         v0.16b, v3.8h
>> +        st1             {v0.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        sub             x1, x1, #16
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
>> +        sqxtn           v18.4h, v18.4s
>> +        sqxtn2          v18.8h, v22.4s
>> +        sqxtn           v19.4h, v20.4s
>> +        sqxtn2          v19.8h, v24.4s
>> +        trn1            v20.8h, v18.8h, v19.8h
>> +        trn2            v21.8h, v18.8h, v19.8h
>> +        sqxtun          v26.8b, v20.8h
>> +        sqxtun2         v26.16b, v21.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v17.16b, #1
>> +        ext             v2.16b, v17.16b, v17.16b, #2
>> +        ext             v3.16b, v17.16b, v17.16b, #3
>> +        ext             v4.16b, v17.16b, v17.16b, #4
>> +        ext             v5.16b, v17.16b, v17.16b, #5
>> +        ext             v6.16b, v17.16b, v17.16b, #6
>> +        ext             v7.16b, v17.16b, v17.16b, #7
>> +        zip1            v0.2d, v17.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
>> +        sqxtn           v18.4h, v18.4s
>> +        sqxtn2          v18.8h, v20.4s
>> +        sqxtun          v27.8b, v18.8h
>> +
>> +        st1             {v26.16b}, [x0], #16
>> +        st1             {v27.8b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
>> +        sqxtn           v0.4h, v0.4s
>> +        sqxtn2          v0.8h, v22.4s
>> +        sqxtn           v19.4h, v20.4s
>> +        sqxtn2          v19.8h, v24.4s
>> +        trn1            v20.8h, v0.8h, v19.8h
>> +        trn2            v21.8h, v0.8h, v19.8h
>> +        sqxtun          v26.8b, v20.8h
>> +        sqxtun2         v26.16b, v21.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
>> +        sqxtn           v0.4h, v0.4s
>> +        sqxtn2          v0.8h, v22.4s
>> +        sqxtn           v19.4h, v20.4s
>> +        sqxtn2          v19.8h, v24.4s
>> +        trn1            v20.8h, v0.8h, v19.8h
>> +        trn2            v21.8h, v0.8h, v19.8h
>> +        sqxtun          v27.8b, v20.8h
>> +        sqxtun2         v27.16b, v21.8h                         // 16-31
>> +        st1             {v26.16b, v27.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v25.8b, v22.8h
>> +        sqxtun2         v25.16b, v23.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v26.8b, v22.8h
>> +        sqxtun2         v26.16b, v23.8h                         // 16-31
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v27.8b, v22.8h
>> +        sqxtun2         v27.16b, v23.8h                         // 32-47
>> +        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_H_HEADER
>> +        sub             x3, x3, #64
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v16.8b, v22.8h
>> +        sqxtun2         v16.16b, v23.8h                         // 0-15
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v17.8b, v22.8h
>> +        sqxtun2         v17.16b, v23.8h                         // 16-31
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        ld1             {v0.16b}, [x2], x3
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v18.8b, v22.8h
>> +        sqxtun2         v18.16b, v23.8h                         // 32-47
>> +        ext             v1.16b, v19.16b, v0.16b, #1
>> +        ext             v2.16b, v19.16b, v0.16b, #2
>> +        ext             v3.16b, v19.16b, v0.16b, #3
>> +        ext             v4.16b, v19.16b, v0.16b, #4
>> +        ext             v5.16b, v19.16b, v0.16b, #5
>> +        ext             v6.16b, v19.16b, v0.16b, #6
>> +        ext             v7.16b, v19.16b, v0.16b, #7
>> +        QPEL_UNI_W_H_CALC  v19, v2, v1, v3, v20, v24, v21, v0
>> +        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        sqxtn           v21.4h, v21.4s
>> +        sqxtn2          v21.8h, v23.4s
>> +        trn1            v22.8h, v20.8h, v21.8h
>> +        trn2            v23.8h, v20.8h, v21.8h
>> +        sqxtun          v19.8b, v22.8h
>> +        sqxtun2         v19.16b, v23.8h                         // 48-63
>> +
>> +        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
>> +        subs            w4, w4, #1
>> +        b.hi            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +.macro QPEL_H_HEADER
>> +        movrel          x9, qpel_filters
>> +        add             x9, x9, x4, lsl 3
>> +        ldr             x11, [x9]
>> +        dup             v31.2d, x11
>> +        sub             x1, x1, #3
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_h4_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +1:
>> +        ld1             {v0.16b}, [x1], x2
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        zip1            v0.2d, v0.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        usdot           v16.4s, v0.16b, v31.16b
>> +        usdot           v17.4s, v2.16b, v31.16b
>> +        addp            v16.4s, v16.4s, v17.4s
>> +        sqxtn           v16.4h, v16.4s
>> +        str             d16, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h6_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #8
>> +1:
>> +        ld1             {v0.16b}, [x1], x2
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        ext             v4.16b, v0.16b, v0.16b, #4
>> +        ext             v5.16b, v0.16b, v0.16b, #5
>> +        zip1            v0.2d, v0.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        movi            v18.2d, #0
>> +        usdot           v16.4s, v0.16b, v31.16b
>> +        usdot           v17.4s, v2.16b, v31.16b
>> +        usdot           v18.4s, v4.16b, v31.16b
>> +        addp            v16.4s, v16.4s, v17.4s
>> +        addp            v18.4s, v18.4s, v18.4s
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtn           v18.4h, v18.4s
>> +        str             d16, [x0]
>> +        str             s18, [x15]
>> +        add             x0, x0, x10
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h8_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +1:
>> +        ld1             {v0.16b}, [x1], x2
>> +        ext             v1.16b, v0.16b, v0.16b, #1
>> +        ext             v2.16b, v0.16b, v0.16b, #2
>> +        ext             v3.16b, v0.16b, v0.16b, #3
>> +        ext             v4.16b, v0.16b, v0.16b, #4
>> +        ext             v5.16b, v0.16b, v0.16b, #5
>> +        ext             v6.16b, v0.16b, v0.16b, #6
>> +        ext             v7.16b, v0.16b, v0.16b, #7
>> +        zip1            v0.2d, v0.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        movi            v16.2d, #0
>> +        movi            v17.2d, #0
>> +        movi            v18.2d, #0
>> +        movi            v19.2d, #0
>> +        usdot           v16.4s, v0.16b, v31.16b
>> +        usdot           v17.4s, v2.16b, v31.16b
>> +        usdot           v18.4s, v4.16b, v31.16b
>> +        usdot           v19.4s, v6.16b, v31.16b
>> +        addp            v16.4s, v16.4s, v17.4s
>> +        addp            v18.4s, v18.4s, v19.4s
>> +        sqxtn           v16.4h, v16.4s
>> +        sqxtn2          v16.8h, v18.4s
>> +        str             q16, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3 +        movi
>>      \d0\().2d, #0
>> +        movi            \d1\().2d, #0
>> +        movi            \d2\().2d, #0
>> +        movi            \d3\().2d, #0
>> +        usdot           \d0\().4s, \s0\().16b, v31.16b
>> +        usdot           \d1\().4s, \s1\().16b, v31.16b
>> +        usdot           \d2\().4s, \s2\().16b, v31.16b
>> +        usdot           \d3\().4s, \s3\().16b, v31.16b
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_h12_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #16
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        zip1            v18.2d, v4.2d, v5.2d
>> +        zip1            v19.2d, v6.2d, v7.2d
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        movi            v24.2d, #0
>> +        movi            v25.2d, #0
>> +        usdot           v24.4s, v18.16b, v31.16b
>> +        usdot           v25.4s, v19.16b, v31.16b
>> +        addp            v24.4s, v24.4s, v25.4s
>> +        trn1            v26.4s, v20.4s, v21.4s
>> +        trn2            v27.4s, v20.4s, v21.4s
>> +        sqxtn           v26.4h, v26.4s
>> +        sqxtn           v27.4h, v27.4s
>> +        sqxtn2          v26.8h, v24.4s
>> +
>> +        str             q26, [x0]
>> +        str             d27, [x15]
>> +        add             x0, x0, x10
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h16_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +
>> +        sqxtn           v18.4h, v22.4s
>> +        sqxtn2          v18.8h, v26.4s
>> +        sqxtn           v19.4h, v23.4s
>> +        sqxtn2          v19.8h, v27.4s
>> +        +        stp             q18, q19, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h24_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #32
>> +1:
>> +        ld1             {v16.16b, v17.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v18.4h, v22.4s
>> +        sqxtn2          v18.8h, v26.4s
>> +        sqxtn           v19.4h, v23.4s
>> +        sqxtn2          v19.8h, v27.4s
>> +        stp             q18, q19, [x0]
>> +        add             x0, x0, x10
>> +        ext             v1.16b, v17.16b, v17.16b, #1
>> +        ext             v2.16b, v17.16b, v17.16b, #2
>> +        ext             v3.16b, v17.16b, v17.16b, #3
>> +        ext             v4.16b, v17.16b, v17.16b, #4
>> +        ext             v5.16b, v17.16b, v17.16b, #5
>> +        ext             v6.16b, v17.16b, v17.16b, #6
>> +        ext             v7.16b, v17.16b, v17.16b, #7
>> +        zip1            v0.2d, v17.2d, v1.2d
>> +        zip1            v2.2d, v2.2d, v3.2d
>> +        zip1            v4.2d, v4.2d, v5.2d
>> +        zip1            v6.2d, v6.2d, v7.2d
>> +        QPEL_H_CALC     v0, v2, v4, v5, v20, v21, v22, v23
>> +        addp            v20.4s, v20.4s, v21.4s
>> +        addp            v22.4s, v22.4s, v23.4s
>> +        sqxtn           v20.4h, v20.4s
>> +        sqxtn2          v20.8h, v22.4s
>> +        str             q20, [x15]
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h32_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2
>> +        add             x15, x0, #32
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0]
>> +        add             x0, x0, x10
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x15]
>> +        add             x15, x15, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h48_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        mov             x10, #MAX_PB_SIZE * 2 - 64
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0]
>> +        add             x0, x0, x10
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_h64_8_neon_dotprod, export=1
>> +        QPEL_H_HEADER
>> +        sub             x2, x2, #64
>> +1:
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
>> +        ext             v1.16b, v16.16b, v17.16b, #1
>> +        ext             v2.16b, v16.16b, v17.16b, #2
>> +        ext             v3.16b, v16.16b, v17.16b, #3
>> +        ext             v4.16b, v16.16b, v17.16b, #4
>> +        ext             v5.16b, v16.16b, v17.16b, #5
>> +        ext             v6.16b, v16.16b, v17.16b, #6
>> +        ext             v7.16b, v16.16b, v17.16b, #7
>> +        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +
>> +        ext             v1.16b, v17.16b, v18.16b, #1
>> +        ext             v2.16b, v17.16b, v18.16b, #2
>> +        ext             v3.16b, v17.16b, v18.16b, #3
>> +        ext             v4.16b, v17.16b, v18.16b, #4
>> +        ext             v5.16b, v17.16b, v18.16b, #5
>> +        ext             v6.16b, v17.16b, v18.16b, #6
>> +        ext             v7.16b, v17.16b, v18.16b, #7
>> +        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        ext             v1.16b, v18.16b, v19.16b, #1
>> +        ext             v2.16b, v18.16b, v19.16b, #2
>> +        ext             v3.16b, v18.16b, v19.16b, #3
>> +        ext             v4.16b, v18.16b, v19.16b, #4
>> +        ext             v5.16b, v18.16b, v19.16b, #5
>> +        ext             v6.16b, v18.16b, v19.16b, #6
>> +        ext             v7.16b, v18.16b, v19.16b, #7
>> +        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        ld1             {v28.8b}, [x1], x2
>> +        ext             v1.16b, v19.16b, v28.16b, #1
>> +        ext             v2.16b, v19.16b, v28.16b, #2
>> +        ext             v3.16b, v19.16b, v28.16b, #3
>> +        ext             v4.16b, v19.16b, v28.16b, #4
>> +        ext             v5.16b, v19.16b, v28.16b, #5
>> +        ext             v6.16b, v19.16b, v28.16b, #6
>> +        ext             v7.16b, v19.16b, v28.16b, #7
>> +        QPEL_H_CALC     v19, v1, v2, v3, v20, v21, v22, v23
>> +        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
>> +        addp            v20.4s, v20.4s, v22.4s
>> +        addp            v21.4s, v21.4s, v23.4s
>> +        addp            v24.4s, v24.4s, v26.4s
>> +        addp            v25.4s, v25.4s, v27.4s
>> +        trn1            v22.4s, v20.4s, v21.4s
>> +        trn2            v23.4s, v20.4s, v21.4s
>> +        trn1            v26.4s, v24.4s, v25.4s
>> +        trn2            v27.4s, v24.4s, v25.4s
>> +        sqxtn           v20.4h, v22.4s
>> +        sqxtn2          v20.8h, v26.4s
>> +        sqxtn           v21.4h, v23.4s
>> +        sqxtn2          v21.8h, v27.4s
>> +        stp             q20, q21, [x0], #32
>> +        subs            w3, w3, #1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_HV_HEADER width
>> +        ldp             x14, x15, [sp]          // mx, my
>> +        ldr             w13, [sp, #16]          // width
>> +        stp             x20, x21, [sp, #-16]!
>> +        stp             x22, x23, [sp, #-16]!
>> +        stp             x24, x25, [sp, #-16]!
>> +        stp             x26, x27, [sp, #-16]!
>> +        stp             x28, x30, [sp, #-16]!
>> +        mov             x28, sp
>> +        mov             x11, #9088
>> +        sub             sp, sp, x11
>> +        mov             x20, x0
>> +        mov             x21, x1
>> +        mov             x0, sp
>> +        sub             x1, x2, x3, lsl 1
>> +        sub             x1, x1, x3
>> +        mov             x2, x3
>> +        add             w3, w4, #7
>> +        mov             w22, w4                 // height
>> +        mov             x4, x14                 // mx
>> +        mov             x23, x15                // my
>> +        mov             w24, w6                 // wx
>> +        mov             w25, w7                 // ox
>> +        mov             w26, #-6
>> +        sub             w26, w26, w5            // -shift
>> +        mov             w27, w13                // width
>> +        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_dotprod)
>> +        movrel          x9, qpel_filters
>> +        add             x9, x9, x23, lsl 3
>> +        ld1             {v0.8b}, [x9]
>> +        sxtl            v0.8h, v0.8b
>> +        mov             x10, #(MAX_PB_SIZE * 2)
>> +        dup             v28.4s, w24
>> +        dup             v29.4s, w25
>> +        dup             v30.4s, w26
>> +.endm
>> +
>> +.macro QPEL_UNI_W_HV_END
>> +        mov             sp, x28
>> +        ldp             x28, x30, [sp], #16
>> +        ldp             x26, x27, [sp], #16
>> +        ldp             x24, x25, [sp], #16
>> +        ldp             x22, x23, [sp], #16
>> +        ldp             x20, x21, [sp], #16
>> +.endm
>> +
>> +.macro QPEL_UNI_W_HV_4
>> +        sshr            v26.4s, v26.4s, #6
>> +        mul             v24.4s, v26.4s, v28.4s
>> +        sqrshl          v24.4s, v24.4s, v30.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.s}[0], [x20], x21
>> +.endm
>> +
>> +.macro QPEL_FILTER_H    dst, src0, src1, src2, src3, src4, src5, src6, src7
>> +        smull           \dst\().4s, \src0\().4h, v0.h[0]
>> +        smlal           \dst\().4s, \src1\().4h, v0.h[1]
>> +        smlal           \dst\().4s, \src2\().4h, v0.h[2]
>> +        smlal           \dst\().4s, \src3\().4h, v0.h[3]
>> +        smlal           \dst\().4s, \src4\().4h, v0.h[4]
>> +        smlal           \dst\().4s, \src5\().4h, v0.h[5]
>> +        smlal           \dst\().4s, \src6\().4h, v0.h[6]
>> +        smlal           \dst\().4s, \src7\().4h, v0.h[7]
>> +.endm
>> +
>> +.macro QPEL_FILTER_H2    dst, src0, src1, src2, src3, src4, src5, src6,
>> src7
>> +        smull2          \dst\().4s, \src0\().8h, v0.h[0]
>> +        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
>> +        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
>> +        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
>> +        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
>> +        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
>> +        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
>> +        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 4
>> +        ldr             d16, [sp]
>> +        ldr             d17, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             d18, [sp]
>> +        ldr             d19, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             d20, [sp]
>> +        ldr             d21, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             d22, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldr             d23, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d16, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d17, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d18, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d19, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d20, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d21, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             d22, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_HV_4
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +
>> +2:
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_HV_8
>> +        sshr            v26.4s, v26.4s, #6
>> +        sshr            v27.4s, v27.4s, #6
>> +        mul             v24.4s, v26.4s, v28.4s
>> +        mul             v25.4s, v27.4s, v28.4s
>> +        sqrshl          v24.4s, v24.4s, v30.4s
>> +        sqrshl          v25.4s, v25.4s, v30.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        st1             {v24.d}[0], [x20], x21
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 8
>> +        ldr             q16, [sp]
>> +        ldr             q17, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             q18, [sp]
>> +        ldr             q19, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             q20, [sp]
>> +        ldr             q21, [sp, x10]
>> +        add             sp, sp, x10, lsl 1
>> +        ldr             q22, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldr             q23, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q16, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q17, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q18, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q19, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q20, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q21, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldr             q22, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_UNI_W_HV_8
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +
>> +2:
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +.macro QPEL_UNI_W_HV_16
>> +        sshr            v24.4s, v24.4s, #6
>> +        sshr            v25.4s, v25.4s, #6
>> +        sshr            v26.4s, v26.4s, #6
>> +        sshr            v27.4s, v27.4s, #6
>> +        mul             v24.4s, v24.4s, v28.4s
>> +        mul             v25.4s, v25.4s, v28.4s
>> +        mul             v26.4s, v26.4s, v28.4s
>> +        mul             v27.4s, v27.4s, v28.4s
>> +        sqrshl          v24.4s, v24.4s, v30.4s
>> +        sqrshl          v25.4s, v25.4s, v30.4s
>> +        sqrshl          v26.4s, v26.4s, v30.4s
>> +        sqrshl          v27.4s, v27.4s, v30.4s
>> +        sqadd           v24.4s, v24.4s, v29.4s
>> +        sqadd           v25.4s, v25.4s, v29.4s
>> +        sqadd           v26.4s, v26.4s, v29.4s
>> +        sqadd           v27.4s, v27.4s, v29.4s
>> +        sqxtn           v24.4h, v24.4s
>> +        sqxtn2          v24.8h, v25.4s
>> +        sqxtn           v26.4h, v26.4s
>> +        sqxtn2          v26.8h, v27.4s
>> +        sqxtun          v24.8b, v24.8h
>> +        sqxtun2         v24.16b, v26.8h
>> +
>> +        st1             {v24.16b}, [x20], x21
>> +.endm
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 16
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldp             q23, q31, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +
>> +2:
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 32
>> +        mov             x11, sp
>> +        mov             w12, w22
>> +        mov             x13, x20
>> +3:
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldp             q23, q31, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +2:
>> +        subs            w27, w27, #16
>> +        add             sp, x11, #32
>> +        add             x20, x13, #16
>> +        mov             w22, w12
>> +        mov             x11, sp
>> +        mov             x13, x20
>> +        b.hi            3b
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_dotprod, export=1
>> +        QPEL_UNI_W_HV_HEADER 64
>> +        mov             x11, sp
>> +        mov             w12, w22
>> +        mov             x13, x20
>> +3:
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +1:
>> +        ldp             q23, q31, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
>> +        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q16, q1, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
>> +        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q17, q2, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
>> +        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q18, q3, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
>> +        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q19, q4, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
>> +        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q20, q5, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
>> +        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q21, q6, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
>> +        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.eq            2f
>> +
>> +        ldp             q22, q7, [sp]
>> +        add             sp, sp, x10
>> +        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
>> +        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
>> +        QPEL_UNI_W_HV_16
>> +        subs            w22, w22, #1
>> +        b.hi            1b
>> +2:
>> +        subs            w27, w27, #16
>> +        add             sp, x11, #32
>> +        add             x20, x13, #16
>> +        mov             w22, w12
>> +        mov             x11, sp
>> +        mov             x13, x20
>> +        b.hi            3b
>> +        QPEL_UNI_W_HV_END
>> +        ret
>> +endfunc
>> +
>> +#endif // __ARM_FEATURE_DOTPROD
>> \ No newline at end of file
>> -- 
>> 2.38.0.windows.1
>>
>> _______________________________________________
>> ffmpeg-devel mailing list
>> ffmpeg-devel at ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
-------------- next part --------------
From 10924c4552031b9a35b514cdf11d48e122e0326a Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu at myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

Signed-off-by: myais <Logan.Lyu at myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  55 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 765 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..fd96819b5e 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,57 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox, 
+        intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+#endif
+
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +236,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..9e83bc0e01 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+const qpel_filters_abs, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.8h, w7
+1:
+        ldr     s0, [x2]
+        ldr     s1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v0.4s, v0.4h, v30.4h
+        smull   v1.4s, v1.4h, v30.4h
+        sqrshl  v0.4s, v0.4s, v31.4s
+        sqrshl  v1.4s, v1.4s, v31.4s
+        sqadd   v0.4s, v0.4s, v29.4s
+        sqadd   v1.4s, v1.4s, v29.4s
+        sqxtn  v0.4h, v0.4s
+        sqxtn  v1.4h, v1.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0]
+        str     s1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+        sub     x1, x1, #4
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     s0, [x0], #4
+        st1     {v0.h}[2], [x0], x1
+        str     s1, [x0], #4
+        st1     {v1.h}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret 
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ldr     d0, [x2]
+        ldr     d1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v0.8h, v0.8b, #6
+        ushll   v1.8h, v1.8b, #6
+        smull   v4.4s, v0.4h, v30.4h
+        smull2  v5.4s, v0.8h, v30.8h
+        smull   v6.4s, v1.4h, v30.4h
+        smull2  v7.4s, v1.8h, v30.8h
+        sqrshl  v4.4s, v4.4s, v31.4s
+        sqrshl  v5.4s, v5.4s, v31.4s
+        sqrshl  v6.4s, v6.4s, v31.4s
+        sqrshl  v7.4s, v7.4s, v31.4s
+        sqadd   v4.4s, v4.4s, v29.4s
+        sqadd   v5.4s, v5.4s, v29.4s
+        sqadd   v6.4s, v6.4s, v29.4s
+        sqadd   v7.4s, v7.4s, v29.4s
+        sqxtn   v0.4h, v4.4s
+        sqxtn2  v0.8h, v5.4s
+        sqxtn   v1.4h, v6.4s
+        sqxtn2  v1.8h, v7.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        str     d0, [x0]
+        str     d1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+        sub     x1, x1, #8
+1:
+        ldr     q0, [x2]
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        ushll   v4.8h, v0.8b, #6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        ushll2  v7.8h, v1.16b, #6
+        smull   v16.4s, v4.4h, v30.4h
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        smull   v22.4s, v7.4h, v30.4h
+        smull2  v23.4s, v7.8h, v30.8h
+        
+        sqrshl  v16.4s, v16.4s, v31.4s
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqrshl  v22.4s, v22.4s, v31.4s
+        sqrshl  v23.4s, v23.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqadd   v22.4s, v22.4s, v29.4s
+        sqadd   v23.4s, v23.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtn   v3.4h, v22.4s
+        sqxtn2  v3.8h, v23.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun2 v0.16b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        sqxtun2 v2.16b, v3.8h
+        str     d0, [x0], #8
+        st1     {v0.s}[2], [x0], x1
+        str     d2, [x0], #8
+        st1     {v2.s}[2], [x0], x1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
+        ushll   \t0\().8h, \s0\().8b, #6
+        ushll2  \t1\().8h, \s0\().16b, #6
+        smull   \d0\().4s, \t0\().4h, v30.4h
+        smull2  \d1\().4s, \t0\().8h, v30.8h
+        smull   \d2\().4s, \t1\().4h, v30.4h
+        smull2  \d3\().4s, \t1\().8h, v30.8h
+        sqrshl  \d0\().4s, \d0\().4s, v31.4s
+        sqrshl  \d1\().4s, \d1\().4s, v31.4s
+        sqrshl  \d2\().4s, \d2\().4s, v31.4s
+        sqrshl  \d3\().4s, \d3\().4s, v31.4s
+        sqadd   \d0\().4s, \d0\().4s, v29.4s
+        sqadd   \d1\().4s, \d1\().4s, v29.4s
+        sqadd   \d2\().4s, \d2\().4s, v29.4s
+        sqadd   \d3\().4s, \d3\().4s, v29.4s
+        sqxtn   \t0\().4h, \d0\().4s
+        sqxtn2  \t0\().8h, \d1\().4s
+        sqxtn   \t1\().4h, \d2\().4s
+        sqxtn2  \t1\().8h, \d3\().4s
+        sqxtun  \s0\().8b,  \t0\().8h
+        sqxtun2 \s0\().16b, \t1\().8h
+.endm
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ldr     q0, [x2]
+        ldr     q1, [x2, x3]
+        add     x2, x2, x3, lsl 1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str     q0, [x0]
+        str     q1, [x0, x1]
+        add     x0, x0, x1, lsl 1
+        subs    w4, w4, #2
+        b.ne    1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        ushll   v4.8h, v0.8b, #6
+        ushll2  v5.8h, v0.16b, #6
+        ushll   v6.8h, v1.8b, #6
+        smull   v16.4s, v4.4h, v30.4h
+        smull2  v17.4s, v4.8h, v30.8h
+        smull   v18.4s, v5.4h, v30.4h
+        smull2  v19.4s, v5.8h, v30.8h
+        smull   v20.4s, v6.4h, v30.4h
+        smull2  v21.4s, v6.8h, v30.8h
+        sqrshl  v16.4s, v16.4s, v31.4s
+        sqrshl  v17.4s, v17.4s, v31.4s
+        sqrshl  v18.4s, v18.4s, v31.4s
+        sqrshl  v19.4s, v19.4s, v31.4s
+        sqrshl  v20.4s, v20.4s, v31.4s
+        sqrshl  v21.4s, v21.4s, v31.4s
+        sqadd   v16.4s, v16.4s, v29.4s
+        sqadd   v17.4s, v17.4s, v29.4s
+        sqadd   v18.4s, v18.4s, v29.4s
+        sqadd   v19.4s, v19.4s, v29.4s
+        sqadd   v20.4s, v20.4s, v29.4s
+        sqadd   v21.4s, v21.4s, v29.4s
+        sqxtn   v0.4h, v16.4s
+        sqxtn2  v0.8h, v17.4s
+        sqxtn   v1.4h, v18.4s
+        sqxtn2  v1.8h, v19.4s
+        sqxtn   v2.4h, v20.4s
+        sqxtn2  v2.8h, v21.4s
+        sqxtun  v0.8b, v0.8h
+        sqxtun  v1.8b, v1.8h
+        sqxtun  v2.8b, v2.8h
+        st1     {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1     {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov     w10, #-6
+        sub     w10, w10, w5
+        dup     v30.8h, w6
+        dup     v31.4s, w10
+        dup     v29.4s, w7
+1:
+        ld1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1     {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs    w4, w4, #1
+        b.ne    1b
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl 3
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
+
+.macro  QPEL_UNI_W_V_4
+        smull           v24.4s, v24.4h, v30.4h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.s}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B     v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B     v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B     v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B     v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_8
+        smull           v24.4s, v26.4h, v30.4h
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.d}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_16
+        smull           v24.4s, v26.4h, v30.4h
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]
+        mov             x14, x0
+        mov             x15, x2
+        mov             w11, w4
+
+3:
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        ldr             q22, [x2]
+
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B     v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2    v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2    v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B     v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2    v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2    v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B     v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2    v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl 1
+        QPEL_FILTER_B     v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2    v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B     v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2    v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        subs            w13, w13, #16
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14
+        mov             x2, x15
+        mov             w4, w11
+        b.hi            3b
+        ret
+endfunc
-- 
2.38.0.windows.1

-------------- next part --------------
From d4e7e3c6a38fb8b13688694c5580ffd0baba55cb Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu at myais.com.cn>
Date: Wed, 3 May 2023 09:58:14 +0800
Subject: [PATCH 2/3] lavc/aarch64: new optimization for 8-bit
 hevc_qpel_uni_w_h

Signed-off-by: myais <Logan.Lyu at myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  20 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 434 ++++++++++++++++++++++
 2 files changed, 454 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index fd96819b5e..1462ca35df 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,14 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox, 
@@ -155,6 +163,13 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+#if defined(__ARM_FEATURE_DOTPROD)
+
+NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _dotprod);
+
 #endif
 
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
@@ -179,6 +194,7 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
         member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -240,6 +256,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
+    #if defined(__ARM_FEATURE_DOTPROD)
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _dotprod);
+
+    #endif
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 9e83bc0e01..c6eb13ad98 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1192,3 +1192,437 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
         b.hi            3b
         ret
 endfunc
+
+#if __ARM_FEATURE_DOTPROD
+.macro QPEL_UNI_W_H_HEADER
+        ldr             x12, [sp]
+        sub             x2, x2, #3
+        movrel          x9, qpel_filters
+        add             x9, x9, x12, lsl 3
+        ldr             x11, [x9]
+        dup             v28.2d, x11
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.4s, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b
+        usdot           v17.4s, v2.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s
+        mul             v16.4s, v16.4s, v30.4s
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqxtn           v16.4h, v16.4s
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0]
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #4
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b
+        usdot           v17.4s, v2.16b, v28.16b
+        usdot           v18.4s, v4.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s
+        addp            v18.4s, v18.4s, v18.4s
+        mul             v16.4s, v16.4s, v30.4s
+        mul             v18.2s, v18.2s, v30.2s
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v18.2s, v18.2s, v31.2s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v18.2s, v18.2s, v29.2s
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v18.4s
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0], #4
+        st1             {v16.h}[2], [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        usdot           \d2\().4s, \s2\().16b, v28.16b
+        usdot           \d3\().4s, \s3\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s
+        addp            \d2\().4s, \d2\().4s, \d3\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s
+        mul             \d2\().4s, \d2\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+.endm
+
+.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+.endm
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v0.2d, v16.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v18.8b, v18.8h
+        str             d18, [x0]
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        add             x13, x0, #8
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v18.2d, v16.2d, v1.2d
+        zip1            v19.2d, v2.2d, v3.2d
+        zip1            v20.2d, v4.2d, v5.2d
+        zip1            v21.2d, v6.2d, v7.2d
+        zip2            v22.2d, v16.2d, v1.2d
+        zip2            v23.2d, v2.2d, v3.2d
+        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6
+        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v4.4s
+        sqxtn           v1.4h, v24.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+
+        str             d0, [x0]
+        str             s1, [x13]
+        add             x0, x0, x1
+        add             x13, x13, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
+        sqxtn           v0.4h, v18.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v1.4h, v20.4s
+        sqxtn2          v1.8h, v24.4s
+        trn1            v2.8h, v0.8h, v1.8h
+        trn2            v3.8h, v0.8h, v1.8h
+        sqxtun          v0.8b, v2.8h
+        sqxtun2         v0.16b, v3.8h
+        st1             {v0.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #16
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v18.8h, v19.8h
+        trn2            v21.8h, v18.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        ext             v1.16b, v17.16b, v17.16b, #1
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v27.8b, v18.8h
+
+        st1             {v26.16b}, [x0], #16
+        st1             {v27.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v27.8b, v20.8h
+        sqxtun2         v27.16b, v21.8h                         // 16-31
+        st1             {v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v25.8b, v22.8h
+        sqxtun2         v25.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v26.8b, v22.8h
+        sqxtun2         v26.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v27.8b, v22.8h
+        sqxtun2         v27.16b, v23.8h                         // 32-47
+        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_dotprod, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x3, x3, #64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v16.8b, v22.8h
+        sqxtun2         v16.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v17.8b, v22.8h
+        sqxtun2         v17.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        ld1             {v0.16b}, [x2], x3
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v18.8b, v22.8h
+        sqxtun2         v18.16b, v23.8h                         // 32-47
+        ext             v1.16b, v19.16b, v0.16b, #1
+        ext             v2.16b, v19.16b, v0.16b, #2
+        ext             v3.16b, v19.16b, v0.16b, #3
+        ext             v4.16b, v19.16b, v0.16b, #4
+        ext             v5.16b, v19.16b, v0.16b, #5
+        ext             v6.16b, v19.16b, v0.16b, #6
+        ext             v7.16b, v19.16b, v0.16b, #7
+        QPEL_UNI_W_H_CALC  v19, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v19.8b, v22.8h
+        sqxtun2         v19.16b, v23.8h                         // 48-63
+
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+#endif // __ARM_FEATURE_DOTPROD
\ No newline at end of file
-- 
2.38.0.windows.1

-------------- next part --------------
From dc1e4dde1ed46305224440820480d0b56e0b44db Mon Sep 17 00:00:00 2001
From: myais <Logan.Lyu at myais.com.cn>
Date: Wed, 3 May 2023 10:01:11 +0800
Subject: [PATCH 3/3] lavc/aarch64: new optimization for 8-bit hevc_qpel_h
 hevc_qpel_uni_w_hv

Signed-off-by: myais <Logan.Lyu at myais.com.cn>
---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   20 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 1079 +++++++++++++++++++++
 2 files changed, 1099 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 1462ca35df..1182e4bcd2 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -164,12 +164,20 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         intptr_t mx, intptr_t my, int width),);
 
 #if defined(__ARM_FEATURE_DOTPROD)
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _dotprod);
 
 NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _dotprod);
 
+NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _dotprod);
+
 #endif
 
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
@@ -194,6 +202,16 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
         member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
+#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -257,7 +275,9 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
     #if defined(__ARM_FEATURE_DOTPROD)
+        NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _dotprod);
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _dotprod);
+        NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _dotprod);
 
     #endif
     }
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index c6eb13ad98..e30ac1b465 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1625,4 +1625,1083 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_dotprod, export=1
         ret
 endfunc
 
+
+.macro QPEL_H_HEADER
+        movrel          x9, qpel_filters
+        add             x9, x9, x4, lsl 3
+        ldr             x11, [x9]
+        dup             v31.2d, x11
+        sub             x1, x1, #3
+.endm
+
+function ff_hevc_put_hevc_qpel_h4_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s
+        sqxtn           v16.4h, v16.4s
+        str             d16, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #8
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        usdot           v18.4s, v4.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s
+        addp            v18.4s, v18.4s, v18.4s
+        sqxtn           v16.4h, v16.4s
+        sqxtn           v18.4h, v18.4s
+        str             d16, [x0]
+        str             s18, [x15]
+        add             x0, x0, x10
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        ext             v6.16b, v0.16b, v0.16b, #6
+        ext             v7.16b, v0.16b, v0.16b, #7
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        usdot           v18.4s, v4.16b, v31.16b
+        usdot           v19.4s, v6.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s
+        addp            v18.4s, v18.4s, v19.4s
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v18.4s
+        str             q16, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3 
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v31.16b
+        usdot           \d1\().4s, \s1\().16b, v31.16b
+        usdot           \d2\().4s, \s2\().16b, v31.16b
+        usdot           \d3\().4s, \s3\().16b, v31.16b
+.endm
+
+function ff_hevc_put_hevc_qpel_h12_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #16
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v18.2d, v4.2d, v5.2d
+        zip1            v19.2d, v6.2d, v7.2d
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        usdot           v24.4s, v18.16b, v31.16b
+        usdot           v25.4s, v19.16b, v31.16b
+        addp            v24.4s, v24.4s, v25.4s
+        trn1            v26.4s, v20.4s, v21.4s
+        trn2            v27.4s, v20.4s, v21.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn           v27.4h, v27.4s
+        sqxtn2          v26.8h, v24.4s
+
+        str             q26, [x0]
+        str             d27, [x15]
+        add             x0, x0, x10
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+
+        sqxtn           v18.4h, v22.4s
+        sqxtn2          v18.8h, v26.4s
+        sqxtn           v19.4h, v23.4s
+        sqxtn2          v19.8h, v27.4s
+        
+        stp             q18, q19, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #32
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v18.4h, v22.4s
+        sqxtn2          v18.8h, v26.4s
+        sqxtn           v19.4h, v23.4s
+        sqxtn2          v19.8h, v27.4s
+        stp             q18, q19, [x0]
+        add             x0, x0, x10
+        ext             v1.16b, v17.16b, v17.16b, #1
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_H_CALC     v0, v2, v4, v5, v20, v21, v22, v23
+        addp            v20.4s, v20.4s, v21.4s
+        addp            v22.4s, v22.4s, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        str             q20, [x15]
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #32
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0]
+        add             x0, x0, x10
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x15]
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2 - 64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon_dotprod, export=1
+        QPEL_H_HEADER
+        sub             x2, x2, #64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        ld1             {v28.8b}, [x1], x2
+        ext             v1.16b, v19.16b, v28.16b, #1
+        ext             v2.16b, v19.16b, v28.16b, #2
+        ext             v3.16b, v19.16b, v28.16b, #3
+        ext             v4.16b, v19.16b, v28.16b, #4
+        ext             v5.16b, v19.16b, v28.16b, #5
+        ext             v6.16b, v19.16b, v28.16b, #6
+        ext             v7.16b, v19.16b, v28.16b, #7
+        QPEL_H_CALC     v19, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+.macro QPEL_UNI_W_HV_HEADER width
+        ldp             x14, x15, [sp]          // mx, my
+        ldr             w13, [sp, #16]          // width
+        stp             x20, x21, [sp, #-16]!
+        stp             x22, x23, [sp, #-16]!
+        stp             x24, x25, [sp, #-16]!
+        stp             x26, x27, [sp, #-16]!
+        stp             x28, x30, [sp, #-16]!
+        mov             x28, sp
+        mov             x11, #9088
+        sub             sp, sp, x11
+        mov             x20, x0
+        mov             x21, x1
+        mov             x0, sp
+        sub             x1, x2, x3, lsl 1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             w22, w4                 // height
+        mov             x4, x14                 // mx
+        mov             x23, x15                // my
+        mov             w24, w6                 // wx
+        mov             w25, w7                 // ox
+        mov             w26, #-6
+        sub             w26, w26, w5            // -shift
+        mov             w27, w13                // width
+        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_dotprod)
+        movrel          x9, qpel_filters
+        add             x9, x9, x23, lsl 3
+        ld1             {v0.8b}, [x9]
+        sxtl            v0.8h, v0.8b
+        mov             x10, #(MAX_PB_SIZE * 2)
+        dup             v28.4s, w24
+        dup             v29.4s, w25
+        dup             v30.4s, w26
+.endm
+
+.macro QPEL_UNI_W_HV_END
+        mov             sp, x28
+        ldp             x28, x30, [sp], #16
+        ldp             x26, x27, [sp], #16
+        ldp             x24, x25, [sp], #16
+        ldp             x22, x23, [sp], #16
+        ldp             x20, x21, [sp], #16
+.endm
+
+.macro QPEL_UNI_W_HV_4
+        sshr            v26.4s, v26.4s, #6
+        mul             v24.4s, v26.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.s}[0], [x20], x21
+.endm
+
+.macro QPEL_FILTER_H    dst, src0, src1, src2, src3, src4, src5, src6, src7
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        smlal           \dst\().4s, \src4\().4h, v0.h[4]
+        smlal           \dst\().4s, \src5\().4h, v0.h[5]
+        smlal           \dst\().4s, \src6\().4h, v0.h[6]
+        smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.endm
+
+.macro QPEL_FILTER_H2    dst, src0, src1, src2, src3, src4, src5, src6, src7
+        smull2          \dst\().4s, \src0\().8h, v0.h[0]
+        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
+        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
+        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
+        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
+        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
+        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
+        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 4
+        ldr             d16, [sp]
+        ldr             d17, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             d22, [sp]
+        add             sp, sp, x10
+1:
+        ldr             d23, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d16, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d17, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d18, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d19, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d20, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d21, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d22, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+.macro QPEL_UNI_W_HV_8
+        sshr            v26.4s, v26.4s, #6
+        sshr            v27.4s, v27.4s, #6
+        mul             v24.4s, v26.4s, v28.4s
+        mul             v25.4s, v27.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.d}[0], [x20], x21
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 8
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x10]
+        add             sp, sp, x10, lsl 1
+        ldr             q22, [sp]
+        add             sp, sp, x10
+1:
+        ldr             q23, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q16, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q17, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q18, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q19, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q20, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q21, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q22, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+.macro QPEL_UNI_W_HV_16
+        sshr            v24.4s, v24.4s, #6
+        sshr            v25.4s, v25.4s, #6
+        sshr            v26.4s, v26.4s, #6
+        sshr            v27.4s, v27.4s, #6
+        mul             v24.4s, v24.4s, v28.4s
+        mul             v25.4s, v25.4s, v28.4s
+        mul             v26.4s, v26.4s, v28.4s
+        mul             v27.4s, v27.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqrshl          v26.4s, v26.4s, v30.4s
+        sqrshl          v27.4s, v27.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v26.8h
+
+        st1             {v24.16b}, [x20], x21
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 16
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 32
+        mov             x11, sp
+        mov             w12, w22
+        mov             x13, x20
+3:
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+2:
+        subs            w27, w27, #16
+        add             sp, x11, #32
+        add             x20, x13, #16
+        mov             w22, w12
+        mov             x11, sp
+        mov             x13, x20
+        b.hi            3b
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_dotprod, export=1
+        QPEL_UNI_W_HV_HEADER 64
+        mov             x11, sp
+        mov             w12, w22
+        mov             x13, x20
+3:
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+2:
+        subs            w27, w27, #16
+        add             sp, x11, #32
+        add             x20, x13, #16
+        mov             w22, w12
+        mov             x11, sp
+        mov             x13, x20
+        b.hi            3b
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
 #endif // __ARM_FEATURE_DOTPROD
\ No newline at end of file
-- 
2.38.0.windows.1



More information about the ffmpeg-devel mailing list