[FFmpeg-cvslog] [ffmpeg] branch master updated. 6f9e8a599d checkasm/swscale: fix whitespace issues

Tue Aug 12 12:05:41 EEST 2025

The branch, master has been updated
       via  6f9e8a599dd94911cbc6713b53ae9bf01701c35c (commit)
       via  ca2a88c1b3f31417cda689bdb9b2ae2c9f607ca6 (commit)
       via  49477972b7175284663c9ef4124345c71dc9c7a1 (commit)
      from  5929d46f7bd8a19b40dbb266161a2146beed3afb (commit)


- Log -----------------------------------------------------------------
commit 6f9e8a599dd94911cbc6713b53ae9bf01701c35c
Author:     Dash Santosh <santdas36 at gmail.com>
AuthorDate: Mon Aug 11 10:13:19 2025 +0530
Commit:     Martin Storsjö <martin at martin.st>
CommitDate: Tue Aug 12 09:05:00 2025 +0000

    checkasm/swscale: fix whitespace issues

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 0306f02695..52e3ebf75c 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -159,7 +159,7 @@ static void check_yuv2yuv1(int accurate)
                             (int) dst0[fail_offset],
                             (int) dst1[fail_offset]);
                 }
-                if(dstW == LARGEST_INPUT_SIZE)
+                if (dstW == LARGEST_INPUT_SIZE)
                     bench_new(src_pixels, dst1, dstW, dither, offset);
             }
         }
@@ -266,7 +266,7 @@ static void check_yuv2yuvX(int accurate, int bit_depth, int dst_pix_format)
                             show_differences_16(dst0, dst1, LARGEST_INPUT_SIZE);
                         }
                     }
-                    if(dstW == LARGEST_INPUT_SIZE)
+                    if (dstW == LARGEST_INPUT_SIZE)
                         bench_new(filter, filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi);
 
                 }

commit ca2a88c1b3f31417cda689bdb9b2ae2c9f607ca6
Author:     Dash Santosh <santdas36 at gmail.com>
AuthorDate: Mon Aug 11 10:10:53 2025 +0530
Commit:     Martin Storsjö <martin at martin.st>
CommitDate: Tue Aug 12 09:05:00 2025 +0000

    swscale/output: Implement yuv2nv12cx neon assembly
    
    yuv2nv12cX_2_512_accurate_c:                          3540.1 ( 1.00x)
    yuv2nv12cX_2_512_accurate_neon:                        408.0 ( 8.68x)
    yuv2nv12cX_2_512_approximate_c:                       3521.4 ( 1.00x)
    yuv2nv12cX_2_512_approximate_neon:                     409.2 ( 8.61x)
    yuv2nv12cX_4_512_accurate_c:                          4740.0 ( 1.00x)
    yuv2nv12cX_4_512_accurate_neon:                        604.4 ( 7.84x)
    yuv2nv12cX_4_512_approximate_c:                       4681.9 ( 1.00x)
    yuv2nv12cX_4_512_approximate_neon:                     603.3 ( 7.76x)
    yuv2nv12cX_8_512_accurate_c:                          7273.1 ( 1.00x)
    yuv2nv12cX_8_512_accurate_neon:                       1012.2 ( 7.19x)
    yuv2nv12cX_8_512_approximate_c:                       7223.0 ( 1.00x)
    yuv2nv12cX_8_512_approximate_neon:                    1015.8 ( 7.11x)
    yuv2nv12cX_16_512_accurate_c:                        13762.0 ( 1.00x)
    yuv2nv12cX_16_512_accurate_neon:                      1761.4 ( 7.81x)
    yuv2nv12cX_16_512_approximate_c:                     13884.0 ( 1.00x)
    yuv2nv12cX_16_512_approximate_neon:                   1766.8 ( 7.86x)
    
    Benchmarked on:
    Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Oryon(TM) CPU
    3417 Mhz, 12 Core(s), 12 Logical Processor(s)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 4945633856..a650d72f54 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -402,3 +402,230 @@ function ff_yuv2plane1_8_neon, export=1
         b.gt            2b                              // loop until width consumed
         ret
 endfunc
+
+function ff_yuv2nv12cX_neon_asm, export=1
+// w0 - int: nonzero stores U then V for each pair, zero stores V then U
+// x1 - uint8_t *chrDither
+// x2 - int16_t *chrFilter
+// w3 - int chrFilterSize
+// x4 - int16_t **chrUSrc
+// x5 - int16_t **chrVSrc
+// x6 - uint8_t *dest
+// w7 - int chrDstW
+
+        stp             x19, x20, [sp, #-32]!           // x19-x22 are used by the scalar loop
+        stp             x21, x22, [sp, #16]
+
+        ld1             {v0.8b}, [x1]                   // chrDither[0..7]
+        ext             v1.8b, v0.8b, v0.8b, #3         // Rotate for V: (i+3)&7
+
+        uxtl            v0.8h, v0.8b
+        uxtl            v1.8h, v1.8b
+
+        ushll           v2.4s, v0.4h, #12               // U dither low
+        ushll2          v3.4s, v0.8h, #12               // U dither high
+        ushll           v4.4s, v1.4h, #12               // V dither low
+        ushll2          v5.4s, v1.8h, #12               // V dither high
+
+        mov             x8, #0                          // i = 0
+1:
+        cmp             w7, #16
+        blt             5f
+
+        mov             v16.16b, v2.16b                 // U acc low
+        mov             v17.16b, v3.16b                 // U acc high
+        mov             v18.16b, v4.16b                 // V acc low
+        mov             v19.16b, v5.16b                 // V acc high
+
+        mov             v20.16b, v2.16b                 // U acc low (pixels 8-15)
+        mov             v21.16b, v3.16b                 // U acc high (pixels 8-15)
+        mov             v22.16b, v4.16b                 // V acc low (pixels 8-15)
+        mov             v23.16b, v5.16b                 // V acc high (pixels 8-15)
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+2:
+        ldr             h6, [x10], #2                   // Load filter coefficient
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+        add             x15, x13, #16
+        add             x16, x14, #16
+
+        ld1             {v24.8h}, [x13]                 // U samples 0-7
+        ld1             {v25.8h}, [x14]                 // V samples 0-7
+
+        ld1             {v26.8h}, [x15]                 // U samples 8-15
+        ld1             {v27.8h}, [x16]                 // V samples 8-15
+        subs            w9, w9, #1
+
+        smlal           v16.4s, v24.4h, v6.h[0]
+        smlal2          v17.4s, v24.8h, v6.h[0]
+        smlal           v18.4s, v25.4h, v6.h[0]
+        smlal2          v19.4s, v25.8h, v6.h[0]
+
+        smlal           v20.4s, v26.4h, v6.h[0]
+        smlal2          v21.4s, v26.8h, v6.h[0]
+        smlal           v22.4s, v27.4h, v6.h[0]
+        smlal2          v23.4s, v27.8h, v6.h[0]
+
+        b.gt            2b
+
+        sqshrun         v24.4h, v16.4s, #16             // >>16, saturate to u16 (pixels 0-7)
+        sqshrun2        v24.8h, v17.4s, #16
+        sqshrun         v25.4h, v18.4s, #16
+        sqshrun2        v25.8h, v19.4s, #16
+
+        sqshrun         v26.4h, v20.4s, #16             // >>16, saturate to u16 (pixels 8-15)
+        sqshrun2        v26.8h, v21.4s, #16
+        sqshrun         v27.4h, v22.4s, #16
+        sqshrun2        v27.8h, v23.4s, #16
+
+        cbz             w0, 3f                          // w0 == 0: V-first store order
+
+        uqshrn          v28.8b, v24.8h, #3              // U: >>3 more (19 total), saturate to u8
+        uqshrn2         v28.16b, v26.8h, #3
+        uqshrn          v29.8b, v25.8h, #3              // Storing V
+        uqshrn2         v29.16b, v27.8h, #3
+
+        st2             {v28.16b, v29.16b}, [x6], #32
+        b               4f
+3:
+        uqshrn          v28.8b, v25.8h, #3              // Storing V
+        uqshrn2         v28.16b, v27.8h, #3
+        uqshrn          v29.8b, v24.8h, #3              // Storing U
+        uqshrn2         v29.16b, v26.8h, #3
+
+        st2             {v28.16b, v29.16b}, [x6], #32
+4:
+        subs            w7, w7, #16
+        add             x8, x8, #16
+        b.gt            1b
+
+5:
+        cmp             w7, #8
+        blt             10f
+6:
+        mov             v16.16b, v2.16b                 // U acc low
+        mov             v17.16b, v3.16b                 // U acc high
+        mov             v18.16b, v4.16b                 // V acc low
+        mov             v19.16b, v5.16b                 // V acc high
+
+        mov             w9, w3                          // chrFilterSize counter
+        mov             x10, x2                         // chrFilter pointer
+        mov             x11, x4                         // chrUSrc base
+        mov             x12, x5                         // chrVSrc base
+
+7:
+        ldr             h6, [x10], #2                   // Load filter coefficient
+
+        ldr             x13, [x11], #8                  // chrUSrc[j]
+        ldr             x14, [x12], #8                  // chrVSrc[j]
+        add             x13, x13, x8, lsl #1            // &chrUSrc[j][i]
+        add             x14, x14, x8, lsl #1            // &chrVSrc[j][i]
+
+        ld1             {v20.8h}, [x13]                 // U samples
+        ld1             {v21.8h}, [x14]                 // V samples
+        subs            w9, w9, #1
+
+        smlal           v16.4s, v20.4h, v6.h[0]
+        smlal2          v17.4s, v20.8h, v6.h[0]
+        smlal           v18.4s, v21.4h, v6.h[0]
+        smlal2          v19.4s, v21.8h, v6.h[0]
+
+        b.gt            7b
+
+        sqshrun         v26.4h, v16.4s, #16             // >>16, saturate to u16
+        sqshrun2        v26.8h, v17.4s, #16
+        sqshrun         v27.4h, v18.4s, #16
+        sqshrun2        v27.8h, v19.4s, #16
+
+        cbz             w0, 8f
+        uqshrn          v28.8b, v26.8h, #3              // Storing U
+        uqshrn          v29.8b, v27.8h, #3              // Storing V
+        st2             {v28.8b, v29.8b}, [x6], #16
+        b               9f
+8:
+        uqshrn          v28.8b, v27.8h, #3              // Storing V
+        uqshrn          v29.8b, v26.8h, #3              // Storing U
+        st2             {v28.8b, v29.8b}, [x6], #16
+9:
+        subs            w7, w7, #8
+        add             x8, x8, #8
+
+10:
+        cbz             w7, 15f                         // Nothing left; otherwise scalar loop
+
+11:
+        and             x15, x8, #7
+        ldrb            w9, [x1, x15]
+        sxtw            x9, w9
+        lsl             x9, x9, #12                     // u = chrDither[i & 7] << 12;
+
+        add             x15, x8, #3
+        and             x15, x15, #7
+        ldrb            w10, [x1, x15]
+        sxtw            x10, w10
+        lsl             x10, x10, #12                   // v = chrDither[(i + 3) & 7] << 12;
+
+        mov             w11, w3                         // chrFilterSize counter
+        mov             x12, x2                         // chrFilter pointer
+        mov             x13, x4                         // chrUSrc base
+        mov             x14, x5                         // chrVSrc base
+
+12:
+        ldrsh           x16, [x12], #2
+
+        ldr             x17, [x13], #8                  // chrUSrc[j]
+        ldr             x19, [x14], #8                  // chrVSrc[j]
+        add             x17, x17, x8, lsl #1            // &chrUSrc[j][i]
+        add             x19, x19, x8, lsl #1            // &chrVSrc[j][i]
+
+        ldrsh           x20, [x17]
+        ldrsh           x21, [x19]
+
+        madd            x9, x16, x20, x9
+        madd            x10, x16, x21, x10
+
+        subs            w11, w11, #1
+        b.gt            12b
+
+        asr             x9, x9, #19                     // >>19, same total shift as the vector paths
+        asr             x10, x10, #19
+
+        cmp             x9, #0                          // Clamp negative results to 0
+        csel            x9, x9, xzr, ge
+        cmp             x10, #0
+        csel            x10, x10, xzr, ge
+
+        mov             x22, #1                         // x22 = (1 << 8) - 1 = 255
+        lsl             x22, x22, #8
+        sub             x22, x22, #1
+
+        cmp             x9, x22                         // Clamp results above 255
+        csel            x9, x22, x9, gt
+        cmp             x10, x22
+        csel            x10, x22, x10, gt
+
+        cbz             w0, 13f
+        strb            w9, [x6], #1                    // Storing U
+        strb            w10, [x6], #1                   // Storing V
+        b               14f
+13:
+        strb            w10, [x6], #1                   // Storing V
+        strb            w9, [x6], #1                    // Storing U
+
+14:
+        subs            w7, w7, #1
+        add             x8, x8, #1
+        b.gt            11b
+15:
+        ldp             x21, x22, [sp, #16]             // Restore x19-x22
+        ldp             x19, x20, [sp], #32
+        ret
+endfunc
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6fd4cc7265..55fff03a5a 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -191,6 +191,25 @@ void ff_yuv2plane1_8_neon(
         const uint8_t *dither,
         int offset);
 
+void ff_yuv2nv12cX_neon_asm(int isSwapped, const uint8_t *chrDither,
+                            const int16_t *chrFilter, int chrFilterSize,
+                            const int16_t **chrUSrc, const int16_t **chrVSrc,
+                            uint8_t *dest, int chrDstW);
+
+/* yuv2nv12cX entry point: picks the U/V store order and runs the NEON kernel. */
+static void ff_yuv2nv12cX_neon(enum AVPixelFormat dstFormat, const uint8_t *chrDither,
+                               const int16_t *chrFilter, int chrFilterSize,
+                               const int16_t **chrUSrc, const int16_t **chrVSrc,
+                               uint8_t *dest, int chrDstW)
+{
+    /* The kernel's first argument is 1 when U is stored before V, which is
+     * the case for formats whose chroma is not swapped, and 0 otherwise. */
+    const int u_first = isSwappedChroma(dstFormat) ? 0 : 1;
+
+    ff_yuv2nv12cX_neon_asm(u_first, chrDither, chrFilter, chrFilterSize,
+                           chrUSrc, chrVSrc, dest, chrDstW);
+}
+
 #define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt) do {              \
     if (c->srcBpc == 8) {                                               \
         if(c->dstBpc <= 14) {                                           \
@@ -300,6 +319,8 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         ASSIGN_VSCALE_FUNC(c->yuv2plane1, neon);
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
+            if (isSemiPlanarYUV(dstFormat) && !isDataInHighBits(dstFormat))
+                c->yuv2nv12cX = ff_yuv2nv12cX_neon;
         }
 
         if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) && !isDataInHighBits(dstFormat)) {

commit 49477972b7175284663c9ef4124345c71dc9c7a1
Author:     Logaprakash Ramajayam <logaprakash.ramajayam at multicorewareinc.com>
AuthorDate: Tue Jul 1 23:48:36 2025 -0700
Commit:     Martin Storsjö <martin at martin.st>
CommitDate: Tue Aug 12 09:05:00 2025 +0000

    swscale/aarch64/output: Implement neon assembly for yuv2planeX_10_c_template()
    
    yuv2yuvX_8_2_0_512_accurate_c:                        2213.4 ( 1.00x)
    yuv2yuvX_8_2_0_512_accurate_neon:                      147.5 (15.01x)
    yuv2yuvX_8_2_0_512_approximate_c:                     2203.9 ( 1.00x)
    yuv2yuvX_8_2_0_512_approximate_neon:                   154.1 (14.30x)
    yuv2yuvX_8_2_16_512_accurate_c:                       2147.2 ( 1.00x)
    yuv2yuvX_8_2_16_512_accurate_neon:                     150.8 (14.24x)
    yuv2yuvX_8_2_16_512_approximate_c:                    2149.7 ( 1.00x)
    yuv2yuvX_8_2_16_512_approximate_neon:                  146.8 (14.64x)
    yuv2yuvX_8_2_32_512_accurate_c:                       2078.9 ( 1.00x)
    yuv2yuvX_8_2_32_512_accurate_neon:                     139.0 (14.95x)
    yuv2yuvX_8_2_32_512_approximate_c:                    2083.7 ( 1.00x)
    yuv2yuvX_8_2_32_512_approximate_neon:                  140.5 (14.84x)
    yuv2yuvX_8_2_48_512_accurate_c:                       2010.7 ( 1.00x)
    yuv2yuvX_8_2_48_512_accurate_neon:                     138.2 (14.55x)
    yuv2yuvX_8_2_48_512_approximate_c:                    2012.6 ( 1.00x)
    yuv2yuvX_8_2_48_512_approximate_neon:                  141.2 (14.26x)
    yuv2yuvX_10LE_16_0_512_accurate_c:                    7874.1 ( 1.00x)
    yuv2yuvX_10LE_16_0_512_accurate_neon:                  831.6 ( 9.47x)
    yuv2yuvX_10LE_16_0_512_approximate_c:                 7918.1 ( 1.00x)
    yuv2yuvX_10LE_16_0_512_approximate_neon:               836.1 ( 9.47x)
    yuv2yuvX_10LE_16_16_512_accurate_c:                   7630.9 ( 1.00x)
    yuv2yuvX_10LE_16_16_512_accurate_neon:                 804.5 ( 9.49x)
    yuv2yuvX_10LE_16_16_512_approximate_c:                7724.7 ( 1.00x)
    yuv2yuvX_10LE_16_16_512_approximate_neon:              808.6 ( 9.55x)
    yuv2yuvX_10LE_16_32_512_accurate_c:                   7436.4 ( 1.00x)
    yuv2yuvX_10LE_16_32_512_accurate_neon:                 780.4 ( 9.53x)
    yuv2yuvX_10LE_16_32_512_approximate_c:                7366.7 ( 1.00x)
    yuv2yuvX_10LE_16_32_512_approximate_neon:              780.5 ( 9.44x)
    yuv2yuvX_10LE_16_48_512_accurate_c:                   7099.9 ( 1.00x)
    yuv2yuvX_10LE_16_48_512_accurate_neon:                 761.0 ( 9.33x)
    yuv2yuvX_10LE_16_48_512_approximate_c:                7097.6 ( 1.00x)
    yuv2yuvX_10LE_16_48_512_approximate_neon:              754.6 ( 9.41x)
    
    Benchmarked on:
    Snapdragon(R) X Elite - X1E80100 - Qualcomm(R) Oryon(TM) CPU
    3417 Mhz, 12 Core(s), 12 Logical Processor(s)

diff --git a/libswscale/aarch64/output.S b/libswscale/aarch64/output.S
index 190c438870..4945633856 100644
--- a/libswscale/aarch64/output.S
+++ b/libswscale/aarch64/output.S
@@ -20,6 +20,182 @@
 
 #include "libavutil/aarch64/asm.S"
 
+function ff_yuv2planeX_10_neon, export=1
+// x0 = filter (int16_t*)
+// w1 = filterSize (the vector loops consume two taps per iteration)
+// x2 = src (int16_t**)
+// x3 = dest (uint16_t*)
+// w4 = dstW
+// w5 = big_endian
+// w6 = output_bits
+
+        mov             w8, #27
+        sub             w8, w8, w6                      // shift = 11 + 16 - output_bits
+
+        sub             w9, w8, #1
+        mov             w10, #1
+        lsl             w9, w10, w9                     // val = 1 << (shift - 1)
+
+        dup             v1.4s, w9
+        dup             v2.4s, w9                       // Create vectors with val
+
+        neg             w16, w8
+        dup             v20.4s, w16                     // Create (-shift) vector for right shift
+
+        mov             w10, #1
+        lsl             w10, w10, w6
+        sub             w10, w10, #1                    // (1U << output_bits) - 1
+        dup             v21.4s, w10                     // Create Clip vector for upper bound
+        dup             v27.8h, w10
+
+        mov             x7, #0                          // i = 0
+
+1:
+        cmp             w4, #16                         // Process 16-pixels if available
+        blt             4f
+
+        mov             v3.16b, v1.16b
+        mov             v4.16b, v2.16b
+        mov             v5.16b, v1.16b
+        mov             v6.16b, v2.16b
+
+        mov             w11, w1                         // tmpfilterSize = filterSize
+        mov             x12, x2                         // srcp = src
+        mov             x13, x0                         // filterp = filter
+
+2:                                                      // Filter loop
+        ldp             x14, x15, [x12], #16            // get 2 pointers: src[j] and src[j+1]
+        ldr             s7, [x13], #4                   // load filter coefficients
+        add             x14, x14, x7, lsl #1
+        add             x15, x15, x7, lsl #1
+        ld1             {v16.8h, v17.8h}, [x14]
+        ld1             {v18.8h, v19.8h}, [x15]
+
+        subs            w11, w11, #2                    // tmpfilterSize -= 2
+
+        smlal           v3.4s,  v16.4h, v7.h[0]         // Multiply-accumulate
+        smlal2          v4.4s,  v16.8h, v7.h[0]
+        smlal           v5.4s,  v17.4h, v7.h[0]
+        smlal2          v6.4s,  v17.8h, v7.h[0]
+
+        smlal           v3.4s,  v18.4h, v7.h[1]
+        smlal2          v4.4s,  v18.8h, v7.h[1]
+        smlal           v5.4s,  v19.4h, v7.h[1]
+        smlal2          v6.4s,  v19.8h, v7.h[1]
+
+        b.gt            2b                              // continue filter loop
+
+        sshl            v3.4s,  v3.4s, v20.4s           // Shift results
+        sshl            v4.4s,  v4.4s, v20.4s
+        sshl            v5.4s,  v5.4s, v20.4s
+        sshl            v6.4s,  v6.4s, v20.4s
+
+        sqxtun          v23.4h, v3.4s                   // Narrow and clamp to 0
+        sqxtun2         v23.8h, v4.4s
+        sqxtun          v24.4h, v5.4s
+        sqxtun2         v24.8h, v6.4s
+
+        umin            v23.8h, v23.8h, v27.8h
+        umin            v24.8h, v24.8h, v27.8h
+
+        cbz             w5, 3f                          // Check if big endian
+        rev16           v23.16b, v23.16b
+        rev16           v24.16b, v24.16b                // Swap bytes within each 16-bit lane
+3:
+        st1             {v23.8h, v24.8h}, [x3], #32
+
+        subs            w4, w4, #16                     // dstW = dstW - 16
+        add             x7, x7, #16                     // i = i + 16
+        b               1b                              // Continue loop
+
+4:
+        cmp             w4, #8                          // Process 8-pixels if available
+        blt             8f
+5:
+        mov             v3.16b, v1.16b
+        mov             v4.16b, v2.16b
+
+        mov             w11, w1                         // tmpfilterSize = filterSize
+        mov             x12, x2                         // srcp = src
+        mov             x13, x0                         // filterp = filter
+
+6:                                                      // Filter loop
+        ldp             x14, x15, [x12], #16
+        ldr             s7, [x13], #4
+        add             x14, x14, x7, lsl #1
+        add             x15, x15, x7, lsl #1
+        ld1             {v5.8h}, [x14]
+        ld1             {v6.8h}, [x15]
+
+        subs            w11, w11, #2                    // tmpfilterSize -= 2
+
+        smlal           v3.4s, v5.4h, v7.h[0]           // Multiply-accumulate
+        smlal2          v4.4s, v5.8h, v7.h[0]
+        smlal           v3.4s, v6.4h, v7.h[1]
+        smlal2          v4.4s, v6.8h, v7.h[1]
+
+        b.gt            6b                              // loop until filterSize consumed
+
+        sshl            v3.4s, v3.4s, v20.4s            // Shift results
+        sshl            v4.4s, v4.4s, v20.4s
+
+        sqxtun          v25.4h, v3.4s                   // Narrow and clamp to 0
+        sqxtun2         v25.8h, v4.4s
+
+        umin            v25.8h, v25.8h, v27.8h
+
+        cbz             w5, 7f                          // Check if big endian
+        rev16           v25.16b, v25.16b
+
+7:
+        st1             {v25.8h}, [x3], #16             // Store 8 pixels
+
+        subs            w4, w4, #8                      // dstW = dstW - 8
+        add             x7, x7, #8                      // i = i + 8
+8:
+        cbz             w4, 12f                         // Scalar loop for remaining pixels
+9:
+        mov             w11, w1                         // tmpfilterSize = filterSize
+        mov             x12, x2                         // srcp = src
+        mov             x13, x0                         // filterp = filter
+        sxtw            x9, w9
+        mov             x17, x9
+
+10:                                                     // Filter loop
+        ldr             x14, [x12], #8                  // Load src pointer
+        ldrsh           w15, [x13], #2                  // Load filter coefficient
+        add             x14, x14, x7, lsl #1            // Add pixel offset
+        ldrsh           w16, [x14]                      // src is int16_t: load sign-extended
+
+        sxtw            x16, w16
+        sxtw            x15, w15
+        madd            x17, x16, x15, x17
+
+        subs            w11, w11, #1                    // tmpfilterSize -= 1
+        b.gt            10b                             // loop until filterSize consumed
+
+        sxtw            x8, w8
+        asr             x17, x17, x8
+        cmp             x17, #0
+        csel            x17, x17, xzr, ge               // Clamp to 0 if negative
+
+        sxtw            x10, w10
+        cmp             x17, x10
+        csel            x17, x10, x17, gt               // Clamp to max if greater than max
+
+        cbz             w5, 11f                         // Check if big endian
+        rev16           x17, x17                        // Swap bytes for big endian
+11:
+        strh            w17, [x3], #2
+
+        subs            w4, w4, #1                      // dstW = dstW - 1
+        add             x7, x7, #1                      // i = i + 1
+        b.gt            9b                              // Loop if more pixels
+
+12:
+        ret
+endfunc
+
 function ff_yuv2planeX_8_neon, export=1
 // x0 - const int16_t *filter,
 // x1 - int filterSize,
diff --git a/libswscale/aarch64/swscale.c b/libswscale/aarch64/swscale.c
index 6e5a721c1f..6fd4cc7265 100644
--- a/libswscale/aarch64/swscale.c
+++ b/libswscale/aarch64/swscale.c
@@ -158,6 +158,29 @@ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
 
 ALL_SCALE_FUNCS(neon);
 
+void ff_yuv2planeX_10_neon(const int16_t *filter, int filterSize,
+                           const int16_t **src, uint16_t *dest, int dstW,
+                           int big_endian, int output_bits);
+
+#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) /* defines yuv2planeX_*_neon() */  \
+static void yuv2planeX_ ## bits ## BE_LE ## _neon(const int16_t *filter, int filterSize,        \
+                                                  const int16_t **src, uint8_t *dest, int dstW, \
+                                                  const uint8_t *dither, int offset)            \
+{ /* dither and offset are not forwarded to the kernel */                                       \
+    ff_yuv2planeX_## template_size ## _neon(filter,                                             \
+                                            filterSize, (const typeX_t **) src,                 \
+                                            (uint16_t *) dest, dstW, is_be, bits);              \
+}
+
+yuv2NBPS( 9, BE, 1, 10, int16_t)
+yuv2NBPS( 9, LE, 0, 10, int16_t)
+yuv2NBPS(10, BE, 1, 10, int16_t)
+yuv2NBPS(10, LE, 0, 10, int16_t)
+yuv2NBPS(12, BE, 1, 10, int16_t)
+yuv2NBPS(12, LE, 0, 10, int16_t)
+yuv2NBPS(14, BE, 1, 10, int16_t)
+yuv2NBPS(14, LE, 0, 10, int16_t)
+
 void ff_yuv2planeX_8_neon(const int16_t *filter, int filterSize,
                           const int16_t **src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset);
@@ -268,6 +291,8 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
 {
     int cpu_flags = av_get_cpu_flags();
+    enum AVPixelFormat dstFormat = c->opts.dst_format;
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
 
     if (have_neon(cpu_flags)) {
         ASSIGN_SCALE_FUNC(c->hyScale, c->hLumFilterSize, neon);
@@ -276,6 +301,19 @@ av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
         if (c->dstBpc == 8) {
             c->yuv2planeX = ff_yuv2planeX_8_neon;
         }
+
+        if (isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) && !isDataInHighBits(dstFormat)) {
+            if (desc->comp[0].depth == 9) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_neon  : yuv2planeX_9LE_neon;
+            } else if (desc->comp[0].depth == 10) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_neon  : yuv2planeX_10LE_neon;
+            } else if (desc->comp[0].depth == 12) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_neon  : yuv2planeX_12LE_neon;
+            } else if (desc->comp[0].depth == 14) {
+                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_neon  : yuv2planeX_14LE_neon;
+            } else
+                av_assert0(0);
+        }
         switch (c->opts.src_format) {
         case AV_PIX_FMT_ABGR:
             c->lumToYV12 = ff_abgr32ToY_neon;
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 051b2bb4bf..0306f02695 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -52,50 +52,59 @@ static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
     }
 }
 
-static int cmp_off_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy)
-{
-    for (size_t i = 0; i < n; i++) {
-        if (abs(ref[i] - test[i]) > accuracy)
-            return 1;
-    }
-    return 0;
+#define CMP_FUNC(bits) /* cmp_off_by_n_<bits>(): 1 if any |ref - test| > accuracy */        \
+static int cmp_off_by_n_##bits(const uint##bits##_t *ref, const uint##bits##_t *test,       \
+                               size_t n, int accuracy)                                      \
+{                                                                                           \
+    for (size_t i = 0; i < n; i++) {                                                        \
+        if (abs((int)ref[i] - (int)test[i]) > accuracy)                                     \
+            return 1;                                                                       \
+    }                                                                                       \
+    return 0;                                                                               \
 }
 
-static void print_data(uint8_t *p, size_t len, size_t offset)
-{
-    size_t i = 0;
-    for (; i < len; i++) {
-        if (i % 8 == 0) {
-            printf("0x%04zx: ", i+offset);
-        }
-        printf("0x%02x ", (uint32_t) p[i]);
-        if (i % 8 == 7) {
-            printf("\n");
-        }
-    }
-    if (i % 8 != 0) {
-        printf("\n");
-    }
+CMP_FUNC(8)
+CMP_FUNC(16)
+
+#define SHOW_DIFF_FUNC(bits) /* print_data_<bits>() + show_differences_<bits>() */          \
+static void print_data_##bits(const uint##bits##_t *p, size_t len, size_t offset)           \
+{                                                                                           \
+    size_t i = 0;                                                                           \
+    for (; i < len; i++) {                                                                  \
+        if (i % 8 == 0) {                                                                   \
+            printf("0x%04zx: ", i+offset);                                                  \
+        }                                                                                   \
+        printf("0x%02x ", (uint32_t) p[i]);                                                 \
+        if (i % 8 == 7) {                                                                   \
+            printf("\n");                                                                   \
+        }                                                                                   \
+    }                                                                                       \
+    if (i % 8 != 0) {                                                                       \
+        printf("\n");                                                                       \
+    }                                                                                       \
+}                                                                                           \
+static size_t show_differences_##bits(const uint##bits##_t *a, const uint##bits##_t *b,     \
+                                      size_t len)                                           \
+{                                                                                           \
+    for (size_t i = 0; i < len; i++) {                                                      \
+        if (a[i] != b[i]) {                                                                 \
+            size_t offset_of_mismatch = i;                                                  \
+            size_t offset;                                                                  \
+            if (i >= 8) i-=8; /* start the dump one row of 8 earlier */                     \
+            offset = i & (~7);                                                              \
+            printf("test a:\n");                                                            \
+            print_data_##bits(&a[offset], 32, offset); /* NOTE: 32 is not clamped to len */ \
+            printf("\ntest b:\n");                                                          \
+            print_data_##bits(&b[offset], 32, offset);                                      \
+            printf("\n");                                                                   \
+            return offset_of_mismatch;                                                      \
+        }                                                                                   \
+    }                                                                                       \
+    return len;                                                                             \
 }
 
-static size_t show_differences(uint8_t *a, uint8_t *b, size_t len)
-{
-    for (size_t i = 0; i < len; i++) {
-        if (a[i] != b[i]) {
-            size_t offset_of_mismatch = i;
-            size_t offset;
-            if (i >= 8) i-=8;
-            offset = i & (~7);
-            printf("test a:\n");
-            print_data(&a[offset], 32, offset);
-            printf("\ntest b:\n");
-            print_data(&b[offset], 32, offset);
-            printf("\n");
-            return offset_of_mismatch;
-        }
-    }
-    return len;
-}
+SHOW_DIFF_FUNC(8)
+SHOW_DIFF_FUNC(16)
 
 static void check_yuv2yuv1(int accurate)
 {
@@ -140,10 +149,10 @@ static void check_yuv2yuv1(int accurate)
 
                 call_ref(src_pixels, dst0, dstW, dither, offset);
                 call_new(src_pixels, dst1, dstW, dither, offset);
-                if (cmp_off_by_n(dst0, dst1, dstW * sizeof(dst0[0]), accurate ? 0 : 2)) {
+                if (cmp_off_by_n_8(dst0, dst1, dstW * sizeof(dst0[0]), accurate ? 0 : 2)) {
                     fail();
                     printf("failed: yuv2yuv1_%d_%di_%s\n", offset, dstW, accurate_str);
-                    fail_offset = show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
+                    fail_offset = show_differences_8(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
                     printf("failing values: src: 0x%04x dither: 0x%02x dst-c: %02x dst-asm: %02x\n",
                             (int) src_pixels[fail_offset],
                             (int) dither[(fail_offset + fail_offset) & 7],
@@ -158,7 +167,7 @@ static void check_yuv2yuv1(int accurate)
     sws_freeContext(sws);
 }
 
-static void check_yuv2yuvX(int accurate)
+static void check_yuv2yuvX(int accurate, int bit_depth, int dst_pix_format)
 {
     SwsContext *sws;
     SwsInternal *c;
@@ -179,8 +188,8 @@ static void check_yuv2yuvX(int accurate)
     const int16_t **src;
     LOCAL_ALIGNED_16(int16_t, src_pixels, [LARGEST_FILTER * LARGEST_INPUT_SIZE]);
     LOCAL_ALIGNED_16(int16_t, filter_coeff, [LARGEST_FILTER]);
-    LOCAL_ALIGNED_16(uint8_t, dst0, [LARGEST_INPUT_SIZE]);
-    LOCAL_ALIGNED_16(uint8_t, dst1, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_16(uint16_t, dst0, [LARGEST_INPUT_SIZE]);
+    LOCAL_ALIGNED_16(uint16_t, dst1, [LARGEST_INPUT_SIZE]);
     LOCAL_ALIGNED_16(uint8_t, dither, [LARGEST_INPUT_SIZE]);
     union VFilterData{
         const int16_t *src;
@@ -190,12 +199,14 @@ static void check_yuv2yuvX(int accurate)
     memset(dither, d_val, LARGEST_INPUT_SIZE);
     randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE * sizeof(int16_t));
     sws = sws_alloc_context();
+    sws->dst_format = dst_pix_format;
     if (accurate)
         sws->flags |= SWS_ACCURATE_RND;
     if (sws_init_context(sws, NULL, NULL) < 0)
         fail();
 
     c = sws_internal(sws);
+    c->dstBpc = bit_depth;
     ff_sws_init_scale(c);
     for(isi = 0; isi < FF_ARRAY_ELEMS(input_sizes); ++isi){
         dstW = input_sizes[isi];
@@ -227,24 +238,36 @@ static void check_yuv2yuvX(int accurate)
                     for(j = 0; j < 4; ++j)
                         vFilterData[i].coeff[j + 4] = filter_coeff[i];
                 }
-                if (check_func(c->yuv2planeX, "yuv2yuvX_%d_%d_%d_%s", filter_sizes[fsi], osi, dstW, accurate_str)){
+                if (check_func(c->yuv2planeX, "yuv2yuvX_%d%s_%d_%d_%d_%s", bit_depth, (bit_depth == 8) ? "" : (isBE(dst_pix_format) ? "BE" : "LE"), filter_sizes[fsi], osi, dstW, accurate_str)) {
                     // use vFilterData for the mmx function
                     const int16_t *filter = c->use_mmx_vfilter ? (const int16_t*)vFilterData : &filter_coeff[0];
                     memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
                     memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 
-                    // We can't use call_ref here, because we don't know if use_mmx_vfilter was set for that
-                    // function or not, so we can't pass it the parameters correctly.
-                    yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
-
-                    call_new(filter, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
-                    if (cmp_off_by_n(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]), accurate ? 0 : 2)) {
-                        fail();
-                        printf("failed: yuv2yuvX_%d_%d_%d_%s\n", filter_sizes[fsi], osi, dstW, accurate_str);
-                        show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
+                    if (c->dstBpc == 8) {
+                        // We can't use call_ref here, because we don't know if use_mmx_vfilter was set for that
+                        // function or not, so we can't pass it the parameters correctly.
+
+                        yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, (uint8_t*)dst0, dstW - osi, dither, osi);
+                        call_new(filter, filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi);
+
+                        if (cmp_off_by_n_8((uint8_t*)dst0, (uint8_t*)dst1, LARGEST_INPUT_SIZE, accurate ? 0 : 2)) {
+                            fail();
+                            printf("failed: yuv2yuvX_%d_%d_%d_%d_%s\n", bit_depth, filter_sizes[fsi], osi, dstW, accurate_str);
+                            show_differences_8((uint8_t*)dst0, (uint8_t*)dst1, LARGEST_INPUT_SIZE);
+                        }
+                    } else {
+                        call_ref(&filter_coeff[0], filter_sizes[fsi], src, (uint8_t*)dst0, dstW - osi, dither, osi);
+                        call_new(&filter_coeff[0], filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi);
+
+                        if (cmp_off_by_n_16(dst0, dst1, LARGEST_INPUT_SIZE, accurate ? 0 : 2)) {
+                            fail();
+                            printf("failed: yuv2yuvX_%d%s_%d_%d_%d_%s\n", bit_depth, isBE(dst_pix_format) ? "BE" : "LE", filter_sizes[fsi], osi, dstW, accurate_str);
+                            show_differences_16(dst0, dst1, LARGEST_INPUT_SIZE);
+                        }
                     }
                     if(dstW == LARGEST_INPUT_SIZE)
-                        bench_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                        bench_new(filter, filter_sizes[fsi], src, (uint8_t*)dst1, dstW - osi, dither, osi);
 
                 }
                 av_freep(&src);
@@ -311,10 +334,10 @@ static void check_yuv2nv12cX(int accurate)
                 call_ref(sws->dst_format, dither, &filter_coeff[0], filter_size, srcU, srcV, dst0, dstW);
                 call_new(sws->dst_format, dither, &filter_coeff[0], filter_size, srcU, srcV, dst1, dstW);
 
-                if (cmp_off_by_n(dst0, dst1, dstW * 2 * sizeof(dst0[0]), accurate ? 0 : 2)) {
+                if (cmp_off_by_n_8(dst0, dst1, dstW * 2 * sizeof(dst0[0]), accurate ? 0 : 2)) {
                     fail();
                     printf("failed: yuv2nv12wX_%d_%d_%s\n", filter_size, dstW, accurate_str);
-                    show_differences(dst0, dst1, dstW * 2 * sizeof(dst0[0]));
+                    show_differences_8(dst0, dst1, dstW * 2 * sizeof(dst0[0]));
                 }
                 if (dstW == LARGEST_INPUT_SIZE)
                     bench_new(sws->dst_format, dither, &filter_coeff[0], filter_size, srcU, srcV, dst1, dstW);
@@ -441,9 +464,33 @@ void checkasm_check_sw_scale(void)
     check_yuv2yuv1(0);
     check_yuv2yuv1(1);
     report("yuv2yuv1");
-    check_yuv2yuvX(0);
-    check_yuv2yuvX(1);
-    report("yuv2yuvX");
+    check_yuv2yuvX(0, 8, AV_PIX_FMT_YUV420P);
+    check_yuv2yuvX(1, 8, AV_PIX_FMT_YUV420P);
+    report("yuv2yuvX_8");
+    check_yuv2yuvX(0, 9, AV_PIX_FMT_YUV420P9LE);
+    check_yuv2yuvX(1, 9, AV_PIX_FMT_YUV420P9LE);
+    report("yuv2yuvX_9LE");
+    check_yuv2yuvX(0, 9, AV_PIX_FMT_YUV420P9BE);
+    check_yuv2yuvX(1, 9, AV_PIX_FMT_YUV420P9BE);
+    report("yuv2yuvX_9BE");
+    check_yuv2yuvX(0, 10, AV_PIX_FMT_YUV420P10LE);
+    check_yuv2yuvX(1, 10, AV_PIX_FMT_YUV420P10LE);
+    report("yuv2yuvX_10LE");
+    check_yuv2yuvX(0, 10, AV_PIX_FMT_YUV420P10BE);
+    check_yuv2yuvX(1, 10, AV_PIX_FMT_YUV420P10BE);
+    report("yuv2yuvX_10BE");
+    check_yuv2yuvX(0, 12, AV_PIX_FMT_YUV420P12LE);
+    check_yuv2yuvX(1, 12, AV_PIX_FMT_YUV420P12LE);
+    report("yuv2yuvX_12LE");
+    check_yuv2yuvX(0, 12, AV_PIX_FMT_YUV420P12BE);
+    check_yuv2yuvX(1, 12, AV_PIX_FMT_YUV420P12BE);
+    report("yuv2yuvX_12BE");
+    check_yuv2yuvX(0, 14, AV_PIX_FMT_YUV420P14LE);
+    check_yuv2yuvX(1, 14, AV_PIX_FMT_YUV420P14LE);
+    report("yuv2yuvX_14LE");
+    check_yuv2yuvX(0, 14, AV_PIX_FMT_YUV420P14BE);
+    check_yuv2yuvX(1, 14, AV_PIX_FMT_YUV420P14BE);
+    report("yuv2yuvX_14BE");
     check_yuv2nv12cX(0);
     check_yuv2nv12cX(1);
     report("yuv2nv12cX");

-----------------------------------------------------------------------

Summary of changes:
 libswscale/aarch64/output.S  | 403 +++++++++++++++++++++++++++++++++++++++++++
 libswscale/aarch64/swscale.c |  59 +++++++
 tests/checkasm/sw_scale.c    | 173 ++++++++++++-------
 3 files changed, 572 insertions(+), 63 deletions(-)


hooks/post-receive
--