[FFmpeg-devel] [PATCH 6/6] avcodec/vc1: Introduce fast path for unescaping bitstream buffer
Ben Avison
bavison at riscosopen.org
Thu Mar 17 20:58:19 EET 2022
Populate the new fast path with implementations suitable for 32-bit and 64-bit Arm.
Signed-off-by: Ben Avison <bavison at riscosopen.org>
---
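Notes:

The word-at-a-time test used by the C wrappers below relies on the VC-1
escape layout: an emulation-prevention 0x03 byte always appears as
0x00 0x00 0x03 followed by a byte in the range 0x00-0x03. A minimal
sketch of the predicate, under the same little-endian assumption the
wrappers make (starts_escape is an illustrative name, not code from
this patch):

    #include <stdint.h>
    #include <string.h>

    /* Loading s[0..3] little-endian gives s[0] | s[1]<<8 | s[2]<<16 | s[3]<<24.
     * Masking off bits 24-25 (~0x03000000) and comparing with 0x00030000
     * therefore tests s[0]==0x00, s[1]==0x00, s[2]==0x03 and s[3]<=0x03
     * in a single operation. */
    static int starts_escape(const uint8_t *s)
    {
        uint32_t v;
        memcpy(&v, s, sizeof v); /* unaligned-safe load */
        return (v & ~0x03000000u) == 0x00030000u;
    }

The wrappers apply this same test to align dst before calling the
assembly helper, to re-scan whatever tail the helper declines, and to
step over each escape found (copy the two zero bytes, drop the 0x03,
resume at the byte after it).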
libavcodec/aarch64/vc1dsp_init_aarch64.c | 60 ++++++++
libavcodec/aarch64/vc1dsp_neon.S | 176 +++++++++++++++++++++++
libavcodec/arm/vc1dsp_init_neon.c | 60 ++++++++
libavcodec/arm/vc1dsp_neon.S | 118 +++++++++++++++
libavcodec/vc1dec.c | 20 +--
libavcodec/vc1dsp.c | 2 +
libavcodec/vc1dsp.h | 3 +
7 files changed, 429 insertions(+), 10 deletions(-)
diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
index b672b2aa99..2fc2d5d1d3 100644
--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
@@ -51,6 +51,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+ /* Dealing with starting and stopping, and removing escape bytes, are
+ * comparatively less time-sensitive, so are more clearly expressed using
+ * a C wrapper around the assembly inner loop. Note that we assume a
+ * little-endian machine that supports unaligned loads. */
+ int dsize = 0;
+ while (size >= 4)
+ {
+ int found = 0;
+ while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+ {
+ found = (*(const uint32_t *)src &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ if (!found)
+ {
+ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+ dst += skip;
+ src += skip;
+ size -= skip;
+ dsize += skip;
+ while (!found && size >= 4)
+ {
+ found = (*(const uint32_t *)src &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ }
+ if (found)
+ {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ ++src;
+ size -= 3;
+ dsize += 2;
+ }
+ }
+ while (size > 0)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ return dsize;
+}
+
av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
@@ -76,5 +134,7 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
}
}
diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
index c3ca3eae1e..8bdeffab44 100644
--- a/libavcodec/aarch64/vc1dsp_neon.S
+++ b/libavcodec/aarch64/vc1dsp_neon.S
@@ -1374,3 +1374,179 @@ function ff_vc1_h_loop_filter16_neon, export=1
st2 {v2.b, v3.b}[7], [x6]
4: ret
endfunc
+
+// Copy at most the specified number of bytes from source to destination buffer,
+// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
+// On entry:
+// x0 -> source buffer
+// w1 = max number of bytes to copy
+// x2 -> destination buffer, optimally 8-byte aligned
+// On exit:
+// w0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+ // Offset by 80 to screen out cases that are too short for us to handle,
+ // and also make it easy to test for loop termination, or to determine
+ // whether we need an odd number of half-iterations of the loop.
+ subs w1, w1, #80
+ b.mi 90f
+
+ // Set up useful constants
+ movi v20.4s, #3, lsl #24
+ movi v21.4s, #3, lsl #16
+
+ tst w1, #32
+ b.ne 1f
+
+ ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48
+ ext v25.16b, v0.16b, v1.16b, #1
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #3
+ ext v29.16b, v1.16b, v2.16b, #1
+ ext v30.16b, v1.16b, v2.16b, #2
+ ext v31.16b, v1.16b, v2.16b, #3
+ bic v24.16b, v0.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v1.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ add w1, w1, #32
+ b 3f
+
+1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48
+ ext v25.16b, v3.16b, v4.16b, #1
+ ext v26.16b, v3.16b, v4.16b, #2
+ ext v27.16b, v3.16b, v4.16b, #3
+ ext v29.16b, v4.16b, v5.16b, #1
+ ext v30.16b, v4.16b, v5.16b, #2
+ ext v31.16b, v4.16b, v5.16b, #3
+ bic v24.16b, v3.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v4.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ // Drop through...
+2: mov v0.16b, v5.16b
+ ld1 {v1.16b, v2.16b}, [x0], #32
+ cmeq v28.4s, v28.4s, #0
+ cmeq v29.4s, v29.4s, #0
+ cmeq v30.4s, v30.4s, #0
+ cmeq v31.4s, v31.4s, #0
+ orr v24.16b, v24.16b, v25.16b
+ orr v26.16b, v26.16b, v27.16b
+ orr v28.16b, v28.16b, v29.16b
+ orr v30.16b, v30.16b, v31.16b
+ ext v25.16b, v0.16b, v1.16b, #1
+ orr v22.16b, v24.16b, v26.16b
+ ext v26.16b, v0.16b, v1.16b, #2
+ ext v27.16b, v0.16b, v1.16b, #3
+ ext v29.16b, v1.16b, v2.16b, #1
+ orr v23.16b, v28.16b, v30.16b
+ ext v30.16b, v1.16b, v2.16b, #2
+ ext v31.16b, v1.16b, v2.16b, #3
+ bic v24.16b, v0.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ orr v22.16b, v22.16b, v23.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v1.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ addv s22, v22.4s
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ mov w3, v22.s[0]
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ cbnz w3, 90f
+ st1 {v3.16b, v4.16b}, [x2], #32
+3: mov v3.16b, v2.16b
+ ld1 {v4.16b, v5.16b}, [x0], #32
+ cmeq v28.4s, v28.4s, #0
+ cmeq v29.4s, v29.4s, #0
+ cmeq v30.4s, v30.4s, #0
+ cmeq v31.4s, v31.4s, #0
+ orr v24.16b, v24.16b, v25.16b
+ orr v26.16b, v26.16b, v27.16b
+ orr v28.16b, v28.16b, v29.16b
+ orr v30.16b, v30.16b, v31.16b
+ ext v25.16b, v3.16b, v4.16b, #1
+ orr v22.16b, v24.16b, v26.16b
+ ext v26.16b, v3.16b, v4.16b, #2
+ ext v27.16b, v3.16b, v4.16b, #3
+ ext v29.16b, v4.16b, v5.16b, #1
+ orr v23.16b, v28.16b, v30.16b
+ ext v30.16b, v4.16b, v5.16b, #2
+ ext v31.16b, v4.16b, v5.16b, #3
+ bic v24.16b, v3.16b, v20.16b
+ bic v25.16b, v25.16b, v20.16b
+ bic v26.16b, v26.16b, v20.16b
+ orr v22.16b, v22.16b, v23.16b
+ bic v27.16b, v27.16b, v20.16b
+ bic v28.16b, v4.16b, v20.16b
+ bic v29.16b, v29.16b, v20.16b
+ bic v30.16b, v30.16b, v20.16b
+ bic v31.16b, v31.16b, v20.16b
+ addv s22, v22.4s
+ eor v24.16b, v24.16b, v21.16b
+ eor v25.16b, v25.16b, v21.16b
+ eor v26.16b, v26.16b, v21.16b
+ eor v27.16b, v27.16b, v21.16b
+ eor v28.16b, v28.16b, v21.16b
+ mov w3, v22.s[0]
+ eor v29.16b, v29.16b, v21.16b
+ eor v30.16b, v30.16b, v21.16b
+ eor v31.16b, v31.16b, v21.16b
+ cmeq v24.4s, v24.4s, #0
+ cmeq v25.4s, v25.4s, #0
+ cmeq v26.4s, v26.4s, #0
+ cmeq v27.4s, v27.4s, #0
+ cbnz w3, 91f
+ st1 {v0.16b, v1.16b}, [x2], #32
+ subs w1, w1, #64
+ b.pl 2b
+
+90: add w0, w1, #80
+ ret
+
+91: sub w1, w1, #32
+ b 90b
+endfunc
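
The vector sequence above uses ext to form the four byte-offset views of
each 16-byte register, then bic/eor/cmeq for the masked compare, so every
byte position in a 32-byte block is tested as a potential escape start.
A scalar model of one block's worth of tests (block_has_escape is an
illustrative name, not code from this patch):

    #include <stdint.h>
    #include <string.h>

    /* Needs s[0..34] readable: testing offset 31 loads bytes 31..34.
     * This lookahead is why the helper loads 48 bytes up front and
     * rejects buffers shorter than 80 bytes before entering the loop. */
    static int block_has_escape(const uint8_t *s)
    {
        for (int i = 0; i < 32; i++) {
            uint32_t v;
            memcpy(&v, s + i, sizeof v);
            if ((v & ~0x03000000u) == 0x00030000u)
                return 1;
        }
        return 0;
    }

If any offset matches, the addv reduction and cbnz divert to the exit
paths at labels 90/91 before that block is stored, leaving the escape
for the C wrapper's byte-wise loop to deal with.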
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index f5f5c702d7..3aefbcaf6d 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -84,6 +84,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
int h, int x, int y);
+int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
+
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+ /* Dealing with starting and stopping, and removing escape bytes, are
+ * comparatively less time-sensitive, so are more clearly expressed using
+ * a C wrapper around the assembly inner loop. Note that we assume a
+ * little-endian machine that supports unaligned loads. */
+ int dsize = 0;
+ while (size >= 4)
+ {
+ int found = 0;
+ while (!found && (((uintptr_t) dst) & 7) && size >= 4)
+ {
+ found = (*(const uint32_t *)src &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ if (!found)
+ {
+ int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
+ dst += skip;
+ src += skip;
+ size -= skip;
+ dsize += skip;
+ while (!found && size >= 4)
+ {
+ found = (*(const uint32_t *)src &~ 0x03000000) == 0x00030000;
+ if (!found)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ }
+ }
+ if (found)
+ {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ ++src;
+ size -= 3;
+ dsize += 2;
+ }
+ }
+ while (size > 0)
+ {
+ *dst++ = *src++;
+ --size;
+ ++dsize;
+ }
+ return dsize;
+}
+
#define FN_ASSIGN(X, Y) \
dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
@@ -130,4 +188,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
+
+ dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
}
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index 4ef083102b..9d7333cf12 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -1804,3 +1804,121 @@ function ff_vc1_h_loop_filter16_neon, export=1
4: vpop {d8-d15}
pop {r4-r6,pc}
endfunc
+
+@ Copy at most the specified number of bytes from source to destination buffer,
+@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
+@ On entry:
+@ r0 -> source buffer
+@ r1 = max number of bytes to copy
+@ r2 -> destination buffer, optimally 8-byte aligned
+@ On exit:
+@ r0 = number of bytes not copied
+function ff_vc1_unescape_buffer_helper_neon, export=1
+ @ Offset by 48 to screen out cases that are too short for us to handle,
+ @ and also make it easy to test for loop termination, or to determine
+ @ whether we need an odd number of half-iterations of the loop.
+ subs r1, r1, #48
+ bmi 90f
+
+ @ Set up useful constants
+ vmov.i32 q0, #0x3000000
+ vmov.i32 q1, #0x30000
+
+ tst r1, #16
+ bne 1f
+
+ vld1.8 {q8, q9}, [r0]!
+ vbic q12, q8, q0
+ vext.8 q13, q8, q9, #1
+ vext.8 q14, q8, q9, #2
+ vext.8 q15, q8, q9, #3
+ veor q12, q12, q1
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ add r1, r1, #16
+ b 3f
+
+1: vld1.8 {q10, q11}, [r0]!
+ vbic q12, q10, q0
+ vext.8 q13, q10, q11, #1
+ vext.8 q14, q10, q11, #2
+ vext.8 q15, q10, q11, #3
+ veor q12, q12, q1
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ @ Drop through...
+2: vmov q8, q11
+ vld1.8 {q9}, [r0]!
+ vorr q13, q12, q13
+ vorr q15, q14, q15
+ vbic q12, q8, q0
+ vorr q3, q13, q15
+ vext.8 q13, q8, q9, #1
+ vext.8 q14, q8, q9, #2
+ vext.8 q15, q8, q9, #3
+ veor q12, q12, q1
+ vorr d6, d6, d7
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ vmov r3, r12, d6
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ orrs r3, r3, r12
+ bne 90f
+ vst1.64 {q10}, [r2]!
+3: vmov q10, q9
+ vld1.8 {q11}, [r0]!
+ vorr q13, q12, q13
+ vorr q15, q14, q15
+ vbic q12, q10, q0
+ vorr q3, q13, q15
+ vext.8 q13, q10, q11, #1
+ vext.8 q14, q10, q11, #2
+ vext.8 q15, q10, q11, #3
+ veor q12, q12, q1
+ vorr d6, d6, d7
+ vbic q13, q13, q0
+ vbic q14, q14, q0
+ vbic q15, q15, q0
+ vceq.i32 q12, q12, #0
+ vmov r3, r12, d6
+ veor q13, q13, q1
+ veor q14, q14, q1
+ veor q15, q15, q1
+ vceq.i32 q13, q13, #0
+ vceq.i32 q14, q14, #0
+ vceq.i32 q15, q15, #0
+ orrs r3, r3, r12
+ bne 91f
+ vst1.64 {q8}, [r2]!
+ subs r1, r1, #32
+ bpl 2b
+
+90: add r0, r1, #48
+ bx lr
+
+91: sub r1, r1, #16
+ b 90b
+endfunc
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 1c92b9d401..6a30b5b664 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -490,7 +490,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
size = next - start - 4;
if (size <= 0)
continue;
- buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
+ buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
init_get_bits(&gb, buf2, buf2_size * 8);
switch (AV_RB32(start)) {
case VC1_CODE_SEQHDR:
@@ -680,7 +680,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
case VC1_CODE_FRAME:
if (avctx->hwaccel)
buf_start = start;
- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
break;
case VC1_CODE_FIELD: {
int buf_size3;
@@ -697,8 +697,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
ret = AVERROR(ENOMEM);
goto err;
}
- buf_size3 = vc1_unescape_buffer(start + 4, size,
- slices[n_slices].buf);
+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+ slices[n_slices].buf);
init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
buf_size3 << 3);
slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
@@ -709,7 +709,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
break;
}
case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
- buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
init_get_bits(&s->gb, buf2, buf_size2 * 8);
ff_vc1_decode_entry_point(avctx, v, &s->gb);
break;
@@ -726,8 +726,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
ret = AVERROR(ENOMEM);
goto err;
}
- buf_size3 = vc1_unescape_buffer(start + 4, size,
- slices[n_slices].buf);
+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
+ slices[n_slices].buf);
init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
buf_size3 << 3);
slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
@@ -761,7 +761,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
ret = AVERROR(ENOMEM);
goto err;
}
- buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+ buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
buf_size3 << 3);
slices[n_slices].mby_start = s->mb_height + 1 >> 1;
@@ -770,9 +770,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
n_slices1 = n_slices - 1;
n_slices++;
}
- buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
} else {
- buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
+ buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
}
init_get_bits(&s->gb, buf2, buf_size2*8);
} else{
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index a29b91bf3d..11d493f002 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -34,6 +34,7 @@
#include "rnd_avg.h"
#include "vc1dsp.h"
#include "startcode.h"
+#include "vc1_common.h"
/* Apply overlap transform to horizontal edge */
static void vc1_v_overlap_c(uint8_t *src, int stride)
@@ -1030,6 +1031,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
#endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
+ dsp->vc1_unescape_buffer = vc1_unescape_buffer;
if (ARCH_AARCH64)
ff_vc1dsp_init_aarch64(dsp);
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index c6443acb20..8be1198071 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
* one or more further zero bytes and a one byte.
*/
int (*startcode_find_candidate)(const uint8_t *buf, int size);
+
+ /* Copy a buffer, removing startcode emulation escape bytes as we go */
+ int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
} VC1DSPContext;
void ff_vc1dsp_init(VC1DSPContext* c);
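
For a quick cross-check outside checkasm, a throwaway harness along the
following lines exercises the new entry point against a naive byte-wise
reference. This is a sketch only: it assumes a build inside the FFmpeg
tree linked against static libavcodec, and ref_unescape is a hypothetical
helper written for illustration, not code from this patch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "libavcodec/vc1dsp.h"

    /* Naive reference: drop the 0x03 from every 00 00 03 0x sequence,
     * mirroring the scan order of the C wrappers above. */
    static int ref_unescape(const uint8_t *src, int size, uint8_t *dst)
    {
        int d = 0, i = 0;
        while (i + 3 < size) {
            if (src[i] == 0 && src[i+1] == 0 && src[i+2] == 3 && src[i+3] <= 3) {
                dst[d++] = src[i];
                dst[d++] = src[i+1];
                i += 3; /* skip the emulation-prevention byte */
            } else {
                dst[d++] = src[i++];
            }
        }
        while (i < size) /* trailing bytes are copied verbatim */
            dst[d++] = src[i++];
        return d;
    }

    int main(void)
    {
        VC1DSPContext dsp;
        static uint8_t src[4096], ref[4096], out[4096];
        ff_vc1dsp_init(&dsp);
        srand(42);
        for (int t = 0; t < 10000; t++) {
            int size = rand() % 4096;
            for (int i = 0; i < size; i++) /* bias towards 0x00-0x03 bytes */
                src[i] = rand() % 4 ? rand() % 4 : rand() & 0xff;
            int n = ref_unescape(src, size, ref);
            if (dsp.vc1_unescape_buffer(src, size, out) != n ||
                memcmp(ref, out, n)) {
                printf("mismatch: trial %d, size %d\n", t, size);
                return 1;
            }
        }
        printf("OK\n");
        return 0;
    }

The bias towards small byte values matters: uniformly random data almost
never contains an escape sequence, so it would leave the found paths in
the wrappers untested.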
--
2.25.1