[FFmpeg-devel] [PATCH 2/2] lavc/aarch64: add hevc epel/qpel assembly

chen chenm003 at 163.com
Thu Apr 29 04:40:31 EEST 2021


My inline comments below are prefixed with [MC].

At 2021-04-29 03:50:26, "Josh Dekker" <josh at itanimul.li> wrote:
>From: Rafal Dabrowa <fatwildcat at gmail.com>
>
>Benchmarked on Apple M1:
>
>put_hevc_epel_bi_h4_8_c: 69.9
>put_hevc_epel_bi_h4_8_neon: 15.4
>put_hevc_epel_bi_h6_8_c: 137.1
>put_hevc_epel_bi_h6_8_neon: 31.9
>put_hevc_epel_bi_h8_8_c: 124.6
>put_hevc_epel_bi_h8_8_neon: 40.9
>put_hevc_epel_bi_h12_8_c: 331.9
>put_hevc_epel_bi_h12_8_neon: 72.4
>put_hevc_epel_bi_h16_8_c: 383.4
>put_hevc_epel_bi_h16_8_neon: 124.9
>put_hevc_epel_bi_h24_8_c: 771.6
>put_hevc_epel_bi_h24_8_neon: 209.6
>put_hevc_epel_bi_h32_8_c: 1324.4
>put_hevc_epel_bi_h32_8_neon: 389.4
>put_hevc_epel_bi_h48_8_c: 2869.6
>put_hevc_epel_bi_h48_8_neon: 730.1
>put_hevc_epel_bi_h64_8_c: 4992.6
>put_hevc_epel_bi_h64_8_neon: 1490.4
>put_hevc_epel_bi_hv4_8_c: 163.4
>put_hevc_epel_bi_hv4_8_neon: 38.4
>put_hevc_epel_bi_hv6_8_c: 292.4
>put_hevc_epel_bi_hv6_8_neon: 66.4
>put_hevc_epel_bi_hv8_8_c: 375.6
>put_hevc_epel_bi_hv8_8_neon: 62.4
>put_hevc_epel_bi_hv12_8_c: 831.6
>put_hevc_epel_bi_hv12_8_neon: 134.9
>put_hevc_epel_bi_hv16_8_c: 1257.9
>put_hevc_epel_bi_hv16_8_neon: 214.1
>put_hevc_epel_bi_hv24_8_c: 2666.6
>put_hevc_epel_bi_hv24_8_neon: 391.1
>put_hevc_epel_bi_hv32_8_c: 4722.4
>put_hevc_epel_bi_hv32_8_neon: 734.1
>put_hevc_epel_bi_hv48_8_c: 10100.4
>put_hevc_epel_bi_hv48_8_neon: 1570.4
>put_hevc_epel_bi_hv64_8_c: 17613.4
>put_hevc_epel_bi_hv64_8_neon: 2810.6
>put_hevc_epel_bi_v4_8_c: 77.4
>put_hevc_epel_bi_v4_8_neon: 18.6
>put_hevc_epel_bi_v6_8_c: 142.1
>put_hevc_epel_bi_v6_8_neon: 27.1
>put_hevc_epel_bi_v8_8_c: 192.9
>put_hevc_epel_bi_v8_8_neon: 9.1
>put_hevc_epel_bi_v12_8_c: 415.6
>put_hevc_epel_bi_v12_8_neon: 55.6
>put_hevc_epel_bi_v16_8_c: 487.6
>put_hevc_epel_bi_v16_8_neon: 61.9
>put_hevc_epel_bi_v24_8_c: 957.4
>put_hevc_epel_bi_v24_8_neon: 131.1
>put_hevc_epel_bi_v32_8_c: 1540.4
>put_hevc_epel_bi_v32_8_neon: 210.4
>put_hevc_epel_bi_v48_8_c: 3242.9
>put_hevc_epel_bi_v48_8_neon: 465.6
>put_hevc_epel_bi_v64_8_c: 5441.1
>put_hevc_epel_bi_v64_8_neon: 818.1
>put_hevc_epel_h4_8_c: 41.6
>put_hevc_epel_h4_8_neon: 8.4
>put_hevc_epel_h6_8_c: 110.1
>put_hevc_epel_h6_8_neon: 24.4
>put_hevc_epel_h8_8_c: 41.6
>put_hevc_epel_h8_8_neon: 17.6
>put_hevc_epel_h12_8_c: 183.1
>put_hevc_epel_h12_8_neon: 58.1
>put_hevc_epel_h16_8_c: 146.6
>put_hevc_epel_h16_8_neon: 83.4
>put_hevc_epel_h24_8_c: 240.4
>put_hevc_epel_h24_8_neon: 157.1
>put_hevc_epel_h32_8_c: 431.1
>put_hevc_epel_h32_8_neon: 292.1
>put_hevc_epel_h48_8_c: 858.6
>put_hevc_epel_h48_8_neon: 557.4
>put_hevc_epel_h64_8_c: 1536.6
>put_hevc_epel_h64_8_neon: 1116.6
>put_hevc_epel_hv4_8_c: 152.6
>put_hevc_epel_hv4_8_neon: 34.9
>put_hevc_epel_hv6_8_c: 269.6
>put_hevc_epel_hv6_8_neon: 61.6
>put_hevc_epel_hv8_8_c: 307.4
>put_hevc_epel_hv8_8_neon: 76.9
>put_hevc_epel_hv12_8_c: 702.6
>put_hevc_epel_hv12_8_neon: 113.1
>put_hevc_epel_hv16_8_c: 1081.4
>put_hevc_epel_hv16_8_neon: 190.6
>put_hevc_epel_hv24_8_c: 2276.1
>put_hevc_epel_hv24_8_neon: 345.1
>put_hevc_epel_hv32_8_c: 4068.6
>put_hevc_epel_hv32_8_neon: 780.4
>put_hevc_epel_hv48_8_c: 8754.1
>put_hevc_epel_hv48_8_neon: 1394.4
>put_hevc_epel_hv64_8_c: 15402.1
>put_hevc_epel_hv64_8_neon: 2616.6
>put_hevc_epel_uni_hv4_8_c: 142.1
>put_hevc_epel_uni_hv4_8_neon: 46.6
>put_hevc_epel_uni_hv6_8_c: 298.4
>put_hevc_epel_uni_hv6_8_neon: 72.4
>put_hevc_epel_uni_hv8_8_c: 352.9
>put_hevc_epel_uni_hv8_8_neon: 75.1
>put_hevc_epel_uni_hv12_8_c: 776.6
>put_hevc_epel_uni_hv12_8_neon: 125.9
>put_hevc_epel_uni_hv16_8_c: 1216.1
>put_hevc_epel_uni_hv16_8_neon: 199.1
>put_hevc_epel_uni_hv24_8_c: 2577.9
>put_hevc_epel_uni_hv24_8_neon: 386.6
>put_hevc_epel_uni_hv32_8_c: 4554.9
>put_hevc_epel_uni_hv32_8_neon: 710.9
>put_hevc_epel_uni_hv48_8_c: 9869.1
>put_hevc_epel_uni_hv48_8_neon: 1499.4
>put_hevc_epel_uni_hv64_8_c: 17307.1
>put_hevc_epel_uni_hv64_8_neon: 2750.6
>put_hevc_epel_uni_v4_8_c: 59.9
>put_hevc_epel_uni_v4_8_neon: 21.9
>put_hevc_epel_uni_v6_8_c: 136.1
>put_hevc_epel_uni_v6_8_neon: 19.6
>put_hevc_epel_uni_v8_8_c: 222.4
>put_hevc_epel_uni_v8_8_neon: 17.1
>put_hevc_epel_uni_v12_8_c: 481.6
>put_hevc_epel_uni_v12_8_neon: 42.4
>put_hevc_epel_uni_v16_8_c: 424.4
>put_hevc_epel_uni_v16_8_neon: 63.4
>put_hevc_epel_uni_v24_8_c: 1184.1
>put_hevc_epel_uni_v24_8_neon: 109.9
>put_hevc_epel_uni_v32_8_c: 1401.1
>put_hevc_epel_uni_v32_8_neon: 182.9
>put_hevc_epel_uni_v48_8_c: 2933.9
>put_hevc_epel_uni_v48_8_neon: 388.9
>put_hevc_epel_uni_v64_8_c: 5044.9
>put_hevc_epel_uni_v64_8_neon: 701.1
>put_hevc_epel_v4_8_c: 31.9
>put_hevc_epel_v4_8_neon: 13.4
>put_hevc_epel_v6_8_c: 95.1
>put_hevc_epel_v6_8_neon: 16.4
>put_hevc_epel_v8_8_c: 98.9
>put_hevc_epel_v8_8_neon: 26.1
>put_hevc_epel_v12_8_c: 283.9
>put_hevc_epel_v12_8_neon: 36.9
>put_hevc_epel_v16_8_c: 229.6
>put_hevc_epel_v16_8_neon: 41.9
>put_hevc_epel_v24_8_c: 376.4
>put_hevc_epel_v24_8_neon: 90.4
>put_hevc_epel_v32_8_c: 577.4
>put_hevc_epel_v32_8_neon: 188.4
>put_hevc_epel_v48_8_c: 1058.4
>put_hevc_epel_v48_8_neon: 350.6
>put_hevc_epel_v64_8_c: 1647.4
>put_hevc_epel_v64_8_neon: 647.9
>put_hevc_pel_bi_pixels4_8_c: 39.1
>put_hevc_pel_bi_pixels4_8_neon: 36.4
>put_hevc_pel_bi_pixels6_8_c: 78.6
>put_hevc_pel_bi_pixels6_8_neon: 0.-6
>put_hevc_pel_bi_pixels8_8_c: 60.6
>put_hevc_pel_bi_pixels8_8_neon: 14.1
>put_hevc_pel_bi_pixels12_8_c: 186.1
>put_hevc_pel_bi_pixels12_8_neon: 30.4
>put_hevc_pel_bi_pixels16_8_c: 231.9
>put_hevc_pel_bi_pixels16_8_neon: 32.1
>put_hevc_pel_bi_pixels24_8_c: 454.1
>put_hevc_pel_bi_pixels24_8_neon: 70.1
>put_hevc_pel_bi_pixels32_8_c: 774.1
>put_hevc_pel_bi_pixels32_8_neon: 102.1
>put_hevc_pel_bi_pixels48_8_c: 1632.9
>put_hevc_pel_bi_pixels48_8_neon: 220.4
>put_hevc_pel_bi_pixels64_8_c: 2812.9
>put_hevc_pel_bi_pixels64_8_neon: 402.4
>put_hevc_pel_pixels4_8_c: 41.1
>put_hevc_pel_pixels4_8_neon: 6.4
>put_hevc_pel_pixels6_8_c: 45.1
>put_hevc_pel_pixels6_8_neon: 5.4
>put_hevc_pel_pixels8_8_c: 94.6
>put_hevc_pel_pixels8_8_neon: 15.6
>put_hevc_pel_pixels12_8_c: 198.6
>put_hevc_pel_pixels12_8_neon: 15.4
>put_hevc_pel_pixels16_8_c: 87.9
>put_hevc_pel_pixels16_8_neon: 18.1
>put_hevc_pel_pixels24_8_c: 310.6
>put_hevc_pel_pixels24_8_neon: 39.6
>put_hevc_pel_pixels32_8_c: 198.6
>put_hevc_pel_pixels32_8_neon: 78.1
>put_hevc_pel_pixels48_8_c: 372.4
>put_hevc_pel_pixels48_8_neon: 173.1
>put_hevc_pel_pixels64_8_c: 569.1
>put_hevc_pel_pixels64_8_neon: 324.4
>put_hevc_qpel_bi_h4_8_c: 101.4
>put_hevc_qpel_bi_h4_8_neon: 34.6
>put_hevc_qpel_bi_h6_8_c: 270.1
>put_hevc_qpel_bi_h6_8_neon: 61.6
>put_hevc_qpel_bi_h8_8_c: 165.6
>put_hevc_qpel_bi_h8_8_neon: 62.9
>put_hevc_qpel_bi_h12_8_c: 546.4
>put_hevc_qpel_bi_h12_8_neon: 124.1
>put_hevc_qpel_bi_h16_8_c: 536.9
>put_hevc_qpel_bi_h16_8_neon: 178.6
>put_hevc_qpel_bi_h24_8_c: 1151.6
>put_hevc_qpel_bi_h24_8_neon: 316.6
>put_hevc_qpel_bi_h32_8_c: 1981.4
>put_hevc_qpel_bi_h32_8_neon: 575.4
>put_hevc_qpel_bi_h48_8_c: 4336.6
>put_hevc_qpel_bi_h48_8_neon: 1189.6
>put_hevc_qpel_bi_h64_8_c: 7591.6
>put_hevc_qpel_bi_h64_8_neon: 2184.9
>put_hevc_qpel_bi_hv4_8_c: 438.9
>put_hevc_qpel_bi_hv4_8_neon: 97.6
>put_hevc_qpel_bi_hv6_8_c: 829.1
>put_hevc_qpel_bi_hv6_8_neon: 131.4
>put_hevc_qpel_bi_hv8_8_c: 983.9
>put_hevc_qpel_bi_hv8_8_neon: 146.1
>put_hevc_qpel_bi_hv12_8_c: 2050.9
>put_hevc_qpel_bi_hv12_8_neon: 364.6
>put_hevc_qpel_bi_hv16_8_c: 3028.4
>put_hevc_qpel_bi_hv16_8_neon: 432.6
>put_hevc_qpel_bi_hv24_8_c: 6294.9
>put_hevc_qpel_bi_hv24_8_neon: 910.1
>put_hevc_qpel_bi_hv32_8_c: 10583.4
>put_hevc_qpel_bi_hv32_8_neon: 1345.9
>put_hevc_qpel_bi_hv48_8_c: 22412.4
>put_hevc_qpel_bi_hv48_8_neon: 2852.6
>put_hevc_qpel_bi_hv64_8_c: 38653.9
>put_hevc_qpel_bi_hv64_8_neon: 5094.1
>put_hevc_qpel_bi_v4_8_c: 143.9
>put_hevc_qpel_bi_v4_8_neon: 25.9
>put_hevc_qpel_bi_v6_8_c: 296.6
>put_hevc_qpel_bi_v6_8_neon: 35.1
>put_hevc_qpel_bi_v8_8_c: 515.4
>put_hevc_qpel_bi_v8_8_neon: 31.6
>put_hevc_qpel_bi_v12_8_c: 1175.6
>put_hevc_qpel_bi_v12_8_neon: 81.1
>put_hevc_qpel_bi_v16_8_c: 2051.6
>put_hevc_qpel_bi_v16_8_neon: 111.1
>put_hevc_qpel_bi_v24_8_c: 4556.9
>put_hevc_qpel_bi_v24_8_neon: 208.6
>put_hevc_qpel_bi_v32_8_c: 8048.1
>put_hevc_qpel_bi_v32_8_neon: 351.6
>put_hevc_qpel_bi_v48_8_c: 18009.9
>put_hevc_qpel_bi_v48_8_neon: 773.1
>put_hevc_qpel_bi_v64_8_c: 31784.9
>put_hevc_qpel_bi_v64_8_neon: 1370.6
>put_hevc_qpel_h4_8_c: 120.1
>put_hevc_qpel_h4_8_neon: 33.1
>put_hevc_qpel_h6_8_c: 241.6
>put_hevc_qpel_h6_8_neon: 29.1
>put_hevc_qpel_h8_8_c: 70.6
>put_hevc_qpel_h8_8_neon: 52.6
>put_hevc_qpel_h12_8_c: 347.4
>put_hevc_qpel_h12_8_neon: 111.1
>put_hevc_qpel_h16_8_c: 180.4
>put_hevc_qpel_h16_8_neon: 149.9
>put_hevc_qpel_h24_8_c: 333.4
>put_hevc_qpel_h24_8_neon: 289.1
>put_hevc_qpel_h32_8_c: 597.1
>put_hevc_qpel_h32_8_neon: 478.9
>put_hevc_qpel_h48_8_c: 1262.6
>put_hevc_qpel_h48_8_neon: 975.6
>put_hevc_qpel_h64_8_c: 2212.4
>put_hevc_qpel_h64_8_neon: 1831.9
>put_hevc_qpel_hv4_8_c: 430.9
>put_hevc_qpel_hv4_8_neon: 77.4
>put_hevc_qpel_hv6_8_c: 785.9
>put_hevc_qpel_hv6_8_neon: 122.9
>put_hevc_qpel_hv8_8_c: 921.9
>put_hevc_qpel_hv8_8_neon: 150.1
>put_hevc_qpel_hv12_8_c: 1943.4
>put_hevc_qpel_hv12_8_neon: 245.4
>put_hevc_qpel_hv16_8_c: 2886.9
>put_hevc_qpel_hv16_8_neon: 375.4
>put_hevc_qpel_hv24_8_c: 5954.6
>put_hevc_qpel_hv24_8_neon: 711.4
>put_hevc_qpel_hv32_8_c: 9967.1
>put_hevc_qpel_hv32_8_neon: 1161.1
>put_hevc_qpel_hv48_8_c: 21173.1
>put_hevc_qpel_hv48_8_neon: 2593.9
>put_hevc_qpel_hv64_8_c: 37378.1
>put_hevc_qpel_hv64_8_neon: 4470.4
>put_hevc_qpel_uni_h4_8_c: 108.4
>put_hevc_qpel_uni_h4_8_neon: 38.9
>put_hevc_qpel_uni_h6_8_c: 237.9
>put_hevc_qpel_uni_h6_8_neon: 54.6
>put_hevc_qpel_uni_h8_8_c: 432.4
>put_hevc_qpel_uni_h8_8_neon: 64.9
>put_hevc_qpel_uni_h12_8_c: 1019.4
>put_hevc_qpel_uni_h12_8_neon: 116.1
>put_hevc_qpel_uni_h16_8_c: 463.6
>put_hevc_qpel_uni_h16_8_neon: 153.1
>put_hevc_qpel_uni_h24_8_c: 1919.4
>put_hevc_qpel_uni_h24_8_neon: 292.1
>put_hevc_qpel_uni_h32_8_c: 1800.6
>put_hevc_qpel_uni_h32_8_neon: 496.9
>put_hevc_qpel_uni_h48_8_c: 4056.1
>put_hevc_qpel_uni_h48_8_neon: 1071.1
>put_hevc_qpel_uni_h64_8_c: 7149.9
>put_hevc_qpel_uni_h64_8_neon: 1820.6
>put_hevc_qpel_uni_hv4_8_c: 444.6
>put_hevc_qpel_uni_hv4_8_neon: 86.6
>put_hevc_qpel_uni_hv6_8_c: 810.6
>put_hevc_qpel_uni_hv6_8_neon: 121.9
>put_hevc_qpel_uni_hv8_8_c: 949.6
>put_hevc_qpel_uni_hv8_8_neon: 137.6
>put_hevc_qpel_uni_hv12_8_c: 2021.6
>put_hevc_qpel_uni_hv12_8_neon: 261.1
>put_hevc_qpel_uni_hv16_8_c: 3004.6
>put_hevc_qpel_uni_hv16_8_neon: 367.1
>put_hevc_qpel_uni_hv24_8_c: 6204.9
>put_hevc_qpel_uni_hv24_8_neon: 813.1
>put_hevc_qpel_uni_hv32_8_c: 10447.4
>put_hevc_qpel_uni_hv32_8_neon: 1216.4
>put_hevc_qpel_uni_hv48_8_c: 22322.9
>put_hevc_qpel_uni_hv48_8_neon: 2531.6
>put_hevc_qpel_uni_hv64_8_c: 38859.9
>put_hevc_qpel_uni_hv64_8_neon: 4528.9
>put_hevc_qpel_uni_v4_8_c: 124.6
>put_hevc_qpel_uni_v4_8_neon: 33.9
>put_hevc_qpel_uni_v6_8_c: 260.6
>put_hevc_qpel_uni_v6_8_neon: 28.6
>put_hevc_qpel_uni_v8_8_c: 480.4
>put_hevc_qpel_uni_v8_8_neon: 30.4
>put_hevc_qpel_uni_v12_8_c: 1101.4
>put_hevc_qpel_uni_v12_8_neon: 72.1
>put_hevc_qpel_uni_v16_8_c: 720.4
>put_hevc_qpel_uni_v16_8_neon: 87.4
>put_hevc_qpel_uni_v24_8_c: 2443.4
>put_hevc_qpel_uni_v24_8_neon: 253.9
>put_hevc_qpel_uni_v32_8_c: 2328.6
>put_hevc_qpel_uni_v32_8_neon: 311.4
>put_hevc_qpel_uni_v48_8_c: 4856.9
>put_hevc_qpel_uni_v48_8_neon: 692.6
>put_hevc_qpel_uni_v64_8_c: 8169.9
>put_hevc_qpel_uni_v64_8_neon: 1203.4
>put_hevc_qpel_v4_8_c: 123.6
>put_hevc_qpel_v4_8_neon: 26.1
>put_hevc_qpel_v6_8_c: 259.9
>put_hevc_qpel_v6_8_neon: 22.6
>put_hevc_qpel_v8_8_c: 197.4
>put_hevc_qpel_v8_8_neon: 24.9
>put_hevc_qpel_v12_8_c: 561.4
>put_hevc_qpel_v12_8_neon: 53.6
>put_hevc_qpel_v16_8_c: 474.9
>put_hevc_qpel_v16_8_neon: 75.4
>put_hevc_qpel_v24_8_c: 799.9
>put_hevc_qpel_v24_8_neon: 159.1
>put_hevc_qpel_v32_8_c: 1214.1
>put_hevc_qpel_v32_8_neon: 267.9
>put_hevc_qpel_v48_8_c: 2217.6
>put_hevc_qpel_v48_8_neon: 639.1
>put_hevc_qpel_v64_8_c: 3495.4
>put_hevc_qpel_v64_8_neon: 1081.1
>
>Signed-off-by: Josh Dekker <josh at itanimul.li>
>---
> libavcodec/aarch64/Makefile               |    4 +-
> libavcodec/aarch64/hevcdsp_epel_neon.S    | 3931 ++++++++++++++
> libavcodec/aarch64/hevcdsp_init_aarch64.c |  118 +
> libavcodec/aarch64/hevcdsp_qpel_neon.S    | 5646 +++++++++++++++++++++
> 4 files changed, 9698 insertions(+), 1 deletion(-)
> create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S
> create mode 100644 libavcodec/aarch64/hevcdsp_qpel_neon.S
>
>diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
>index 954461f81d..ebedc03bfa 100644
>--- a/libavcodec/aarch64/Makefile
>+++ b/libavcodec/aarch64/Makefile
>@@ -61,6 +61,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9itxfm_16bpp_neon.o       \
>                                            aarch64/vp9lpf_neon.o               \
>                                            aarch64/vp9mc_16bpp_neon.o          \
>                                            aarch64/vp9mc_neon.o
>-NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_idct_neon.o         \
>+NEON-OBJS-$(CONFIG_HEVC_DECODER)        += aarch64/hevcdsp_epel_neon.o         \
>+                                           aarch64/hevcdsp_idct_neon.o         \
>                                            aarch64/hevcdsp_init_aarch64.o      \
>+                                           aarch64/hevcdsp_qpel_neon.o         \
>                                            aarch64/hevcdsp_sao_neon.o
>diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S
>new file mode 100644
>index 0000000000..0366fe8ae3
>--- /dev/null
>+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
>@@ -0,0 +1,3931 @@
>+/* -*-arm64-*-
>+ * vim: syntax=arm64asm
>+ *
>+ * This file is part of FFmpeg.
>+ *
>+ * FFmpeg is free software; you can redistribute it and/or
>+ * modify it under the terms of the GNU Lesser General Public
>+ * License as published by the Free Software Foundation; either
>+ * version 2.1 of the License, or (at your option) any later version.
>+ *
>+ * FFmpeg is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>+ * Lesser General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU Lesser General Public
>+ * License along with FFmpeg; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>+ */
>+
>+#include "libavutil/aarch64/asm.S"
>+#define MAX_PB_SIZE 64
>+
>+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)

>+1:      ld1            {v0.s}[0], [x1], x2
[MC] I don't have an M1, so I am not sure how this instruction performs there.
However, according to the A57 documentation, LD1 has latency=8, throughput=1, while LD1R has latency=5, throughput=1.
Moreover, I guess all of the interpolation functions work on an even number of rows, so we could unroll the loop a little.
Further, we could insert the SUB between the LD and ST to avoid a pipeline stall, and use CBNZ to avoid affecting the flags register.


>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.d}[0], [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2 - 8)
>+1:      ld1            {v0.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.d}[0], [x0], #8
>+        st1            {v4.s}[2], [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2 - 16)
>+1:      ld1            {v0.8b, v1.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        st1            {v4.8h}, [x0], #16
>+        ushll           v5.8h, v1.8b, #6
>+        st1            {v5.d}[0], [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b, v1.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll           v5.8h, v1.8b, #6
>+        st1            {v4.8h, v5.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll           v5.8h, v1.8b, #6
>+        ushll           v6.8h, v2.8b, #6
>+        st1            {v4.8h, v5.8h, v6.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b, v1.8b, v2.8b, v3.8b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll           v5.8h, v1.8b, #6
>+        ushll           v6.8h, v2.8b, #6
>+        ushll           v7.8h, v3.8b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels48_8_neon, export=1
>+        mov             x7, #(MAX_PB_SIZE)
>+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll2          v5.8h, v0.16b, #6
>+        ushll           v6.8h, v1.8b, #6
>+        ushll2          v7.8h, v1.16b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
>+        ushll           v4.8h, v2.8b, #6
>+        ushll2          v5.8h, v2.16b, #6
>+        st1            {v4.8h, v5.8h}, [x0], x7
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
>+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x1], x2
>+        ushll           v4.8h, v0.8b, #6
>+        ushll2          v5.8h, v0.16b, #6
>+        ushll           v6.8h, v1.8b, #6
>+        ushll2          v7.8h, v1.16b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE)
>+        ushll           v4.8h, v2.8b, #6
>+        ushll2          v5.8h, v2.16b, #6
>+        ushll           v6.8h, v3.8b, #6
>+        ushll2          v7.8h, v3.16b, #6
>+        st1            {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #(MAX_PB_SIZE)
>+        subs            x3, x3, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.s}[0], [x2], x3 // src
>+        ushll           v16.8h, v0.8b, #6
>+        ld1            {v20.4h}, [x4], x10 // src2
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqrshrun        v0.8b,  v16.8h, #7
>+        st1            {v0.s}[0], [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
>+        sub             x1, x1, #4
>+1:      ld1            {v0.8b}, [x2], x3
>+        ushll           v16.8h, v0.8b, #6
>+        ld1            {v20.4h}, [x4], #8
>+        ld1            {v20.s}[2], [x4], x10
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqrshrun        v0.8b,  v16.8h, #7
>+        st1            {v0.s}[0], [x0], #4
>+        st1            {v0.h}[2], [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b}, [x2], x3     // src
>+        ushll           v16.8h, v0.8b, #6
>+        ld1            {v20.8h}, [x4], x10   // src2
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqrshrun        v0.8b,  v16.8h, #7
>+        st1            {v0.8b}, [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2 - 16)
>+        sub             x1, x1, #8
>+1:      ld1            {v0.16b}, [x2], x3
>+        ushll           v16.8h, v0.8b, #6
>+        ushll2          v17.8h, v0.16b, #6
>+        ld1            {v20.8h}, [x4], #16
>+        ld1            {v21.4h}, [x4], x10
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqrshrun        v0.8b,  v16.8h, #7
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        st1            {v0.8b}, [x0], #8
>+        st1            {v0.s}[2], [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.16b}, [x2], x3            // src
>+        ushll           v16.8h, v0.8b, #6
>+        ushll2          v17.8h, v0.16b, #6
>+        ld1            {v20.8h, v21.8h}, [x4], x10   // src2
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqrshrun        v0.8b,  v16.8h, #7
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        st1            {v0.16b}, [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.8b, v1.8b, v2.8b}, [x2], x3  // src
>+        ushll           v16.8h, v0.8b, #6
>+        ushll           v17.8h, v1.8b, #6
>+        ushll           v18.8h, v2.8b, #6
>+        ld1            {v20.8h, v21.8h, v22.8h}, [x4], x10   // src2
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqadd           v18.8h, v18.8h, v22.8h
>+        sqrshrun        v0.8b, v16.8h, #7
>+        sqrshrun        v1.8b, v17.8h, #7
>+        sqrshrun        v2.8b, v18.8h, #7
>+        st1            {v0.8b, v1.8b, v2.8b}, [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels32_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v0.16b, v1.16b}, [x2], x3            // src
>+        ushll           v16.8h, v0.8b, #6
>+        ushll2          v17.8h, v0.16b, #6
>+        ushll           v18.8h, v1.8b, #6
>+        ushll2          v19.8h, v1.16b, #6
>+        ld1            {v20.8h, v21.8h, v22.8h, v23.8h}, [x4], x10   // src2
>+        sqadd           v16.8h, v16.8h, v20.8h
>+        sqadd           v17.8h, v17.8h, v21.8h
>+        sqadd           v18.8h, v18.8h, v22.8h
>+        sqadd           v19.8h, v19.8h, v23.8h
>+        sqrshrun        v0.8b,  v16.8h, #7
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        sqrshrun        v1.8b,  v18.8h, #7
>+        sqrshrun2       v1.16b, v19.8h, #7
>+        st1            {v0.16b, v1.16b}, [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels48_8_neon, export=1
>+        mov             x10, #(MAX_PB_SIZE)
>+1:      ld1            {v0.16b, v1.16b, v2.16b}, [x2], x3            // src
>+        ushll           v16.8h, v0.8b, #6
>+        ushll2          v17.8h, v0.16b, #6
>+        ushll           v18.8h, v1.8b, #6
>+        ushll2          v19.8h, v1.16b, #6
>+        ushll           v20.8h, v2.8b, #6
>+        ushll2          v21.8h, v2.16b, #6
>+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2
>+        sqadd           v16.8h, v16.8h, v24.8h
>+        sqadd           v17.8h, v17.8h, v25.8h
>+        sqadd           v18.8h, v18.8h, v26.8h
>+        sqadd           v19.8h, v19.8h, v27.8h
>+        ld1            {v24.8h, v25.8h}, [x4], x10
>+        sqadd           v20.8h, v20.8h, v24.8h
>+        sqadd           v21.8h, v21.8h, v25.8h
>+        sqrshrun        v0.8b, v16.8h, #7
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        sqrshrun        v1.8b, v18.8h, #7
>+        sqrshrun2       v1.16b, v19.8h, #7
>+        sqrshrun        v2.8b, v20.8h, #7
>+        sqrshrun2       v2.16b, v21.8h, #7
>+        st1            {v0.16b, v1.16b, v2.16b}, [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1
>+1:      ld1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3            // src
>+        ushll           v16.8h, v0.8b, #6
>+        ushll2          v17.8h, v0.16b, #6
>+        ushll           v18.8h, v1.8b, #6
>+        ushll2          v19.8h, v1.16b, #6
>+        ushll           v20.8h, v2.8b, #6
>+        ushll2          v21.8h, v2.16b, #6
>+        ushll           v22.8h, v3.8b, #6
>+        ushll2          v23.8h, v3.16b, #6
>+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)        // src2
>+        sqadd           v16.8h, v16.8h, v24.8h
>+        sqadd           v17.8h, v17.8h, v25.8h
>+        sqadd           v18.8h, v18.8h, v26.8h
>+        sqadd           v19.8h, v19.8h, v27.8h
>+        ld1            {v24.8h, v25.8h, v26.8h, v27.8h}, [x4], #(MAX_PB_SIZE)
>+        sqadd           v20.8h, v20.8h, v24.8h
>+        sqadd           v21.8h, v21.8h, v25.8h
>+        sqadd           v22.8h, v22.8h, v26.8h
>+        sqadd           v23.8h, v23.8h, v27.8h
>+        sqrshrun        v0.8b, v16.8h, #7
>+        sqrshrun2       v0.16b, v17.8h, #7
>+        sqrshrun        v1.8b, v18.8h, #7
>+        sqrshrun2       v1.16b, v19.8h, #7
>+        sqrshrun        v2.8b, v20.8h, #7
>+        sqrshrun2       v2.16b, v21.8h, #7
>+        sqrshrun        v3.8b, v22.8h, #7
>+        sqrshrun2       v3.16b, v23.8h, #7
>+        st1            {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
>+        subs            x5, x5, #1
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+.Lepel_filters:
>+        .byte  0,  0,  0,  0
>+        .byte -2, 58, 10, -2
>+        .byte -4, 54, 16, -2
>+        .byte -6, 46, 28, -4
>+        .byte -4, 36, 36, -4
>+        .byte -4, 28, 46, -6
>+        .byte -2, 16, 54, -4
>+        .byte -2, 10, 58, -2
>+
>+.macro load_epel_filterb freg, xreg
>+        adr             \xreg, .Lepel_filters
>+        add             \xreg, \xreg, \freg, lsl #2
>+        ld4r           {v0.16b, v1.16b, v2.16b, v3.16b}, [\xreg] // filter
>+        neg             v0.16b, v0.16b

>+        neg             v3.16b, v3.16b
[MC] Why not store abs(x) in the constant table? Then these two NEG instructions would not be needed.


>+.endm
>+
>+.macro calc_epelb dst, src1, src2, src3, src4
>+        umlsl           \dst\().8h, \src1\().8b, v0.8b
>+        umlal           \dst\().8h, \src2\().8b, v1.8b
>+        umlal           \dst\().8h, \src3\().8b, v2.8b
>+        umlsl           \dst\().8h, \src4\().8b, v3.8b
>+.endm
>+
>+.macro calc_epelb2 dst, src1, src2, src3, src4
>+        umlsl2          \dst\().8h, \src1\().16b, v0.16b
>+        umlal2          \dst\().8h, \src2\().16b, v1.16b
>+        umlal2          \dst\().8h, \src3\().16b, v2.16b
>+        umlsl2          \dst\().8h, \src4\().16b, v3.16b
>+.endm
>+
>+.macro load_epel_filterh freg, xreg
>+        adr             \xreg, .Lepel_filters
>+        add             \xreg, \xreg, \freg, lsl #2
>+        ld1            {v0.8b}, [\xreg]
>+        sxtl            v0.8h, v0.8b
>+.endm
>+
>+.macro calc_epelh dst, src1, src2, src3, src4
>+        smull           \dst\().4s, \src1\().4h, v0.h[0]
>+        smlal           \dst\().4s, \src2\().4h, v0.h[1]
>+        smlal           \dst\().4s, \src3\().4h, v0.h[2]
>+        smlal           \dst\().4s, \src4\().4h, v0.h[3]
>+        sqshrn          \dst\().4h, \dst\().4s, #6
>+.endm
>+
>+.macro calc_epelh2 dst, tmp, src1, src2, src3, src4
>+        smull2          \tmp\().4s, \src1\().8h, v0.h[0]
>+        smlal2          \tmp\().4s, \src2\().8h, v0.h[1]
>+        smlal2          \tmp\().4s, \src3\().8h, v0.h[2]
>+        smlal2          \tmp\().4s, \src4\().8h, v0.h[3]
>+        sqshrn2         \dst\().8h, \tmp\().4s, #6
>+.endm
>+
>+function ff_hevc_put_hevc_epel_h4_8_neon, export=1
>+        load_epel_filterb x4, x5
>+        sub             x1, x1, #1
>+        mov             x10, #(MAX_PB_SIZE * 2)
>+1:      ld1            {v4.8b}, [x1], x2
>+        ushr            v5.2d, v4.2d, #8
>+        ushr            v6.2d, v5.2d, #8
>+        ushr            v7.2d, v6.2d, #8
>+        movi            v16.8h, #0
>+        calc_epelb      v16, v4, v5, v6, v7
>+        st1            {v16.4h}, [x0], x10
>+        subs            x3, x3, #1   // height
>+        b.ne            1b
>+        ret
>+endfunc
>+
>+function ff_hevc_put_hevc_epel_h6_8_neon, export=1
>+        load_epel_filterb x4, x5
>+        sub             x1,  x1, #1
>+        sub             x2,  x2, #8
>+        mov             x10, #(MAX_PB_SIZE * 2 - 8)
>+1:      ld1            {v24.8b},  [x1], #8
>+        ushr            v26.2d, v24.2d, #8
>+        ushr            v27.2d, v26.2d, #8

>+        ushr            v28.2d, v27.2d, #8
[MC] This dependency chain will stall the pipeline; how about using EXT, or loading the shifted data directly with LD1?


>+        movi            v16.8h,   #0
>+        ld1            {v28.b}[5], [x1], x2
>+        calc_epelb      v16, v24, v26, v27, v28
>+        st1            {v16.4h},   [x0], #8
>+        st1            {v16.s}[2], [x0], x10
>+        subs            x3, x3,   #1   // height
>+        b.ne            1b
>+        ret
>+endfunc
>+

...


>-- 
>2.30.1 (Apple Git-130)
>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel at ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".


More information about the ffmpeg-devel mailing list