[FFmpeg-devel] [PATCH 2/3] avcodec: [loongarch] Optimize idctdsp with LASX.
Hao Chen
chenhao at loongson.cn
Fri Dec 24 11:49:21 EET 2021
./ffmpeg -i 8_mpeg4_1080p_24fps_12Mbps.avi -f rawvideo -y /dev/null -an
before: 433fps
after:  552fps
Change-Id: Ic233aeeb3a3b7414db294a7cb699ddbf4ca2e790
---
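Note for reviewers: the LASX put_pixels_clamped/add_pixels_clamped paths are
meant to match the existing scalar C behaviour, i.e. clamp each 16-bit
coefficient of the 8x8 block to 0..255 and store it row by row with the
destination stride. Below is a minimal, self-contained scalar sketch of that
behaviour for reference only; clip_u8 and put_pixels_clamped_ref are
illustrative local names, not FFmpeg API.

#include <stddef.h>
#include <stdint.h>

/* clamp an int to the 0..255 range of a pixel */
static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
}

/* scalar reference: store a clamped 8x8 block of coefficients */
static void put_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                   ptrdiff_t stride)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            pixels[j] = clip_u8(block[j]);
        block  += 8;       /* next row of coefficients */
        pixels += stride;  /* next row of the destination */
    }
}

The LASX version processes four rows per 256-bit register instead of one
pixel at a time, which is where the speedup above comes from.
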
libavcodec/idctdsp.c                          |   2 +
libavcodec/idctdsp.h                          |   2 +
libavcodec/loongarch/Makefile                 |   3 +
libavcodec/loongarch/idctdsp_init_loongarch.c |  45 +++
libavcodec/loongarch/idctdsp_lasx.c           | 124 ++++++++
libavcodec/loongarch/idctdsp_loongarch.h      |  41 +++
libavcodec/loongarch/simple_idct_lasx.c       | 297 ++++++++++++++++++
7 files changed, 514 insertions(+)
create mode 100644 libavcodec/loongarch/idctdsp_init_loongarch.c
create mode 100644 libavcodec/loongarch/idctdsp_lasx.c
create mode 100644 libavcodec/loongarch/idctdsp_loongarch.h
create mode 100644 libavcodec/loongarch/simple_idct_lasx.c
diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index 846ed0b0f8..71bd03c606 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -315,6 +315,8 @@ av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx)
ff_idctdsp_init_x86(c, avctx, high_bit_depth);
if (ARCH_MIPS)
ff_idctdsp_init_mips(c, avctx, high_bit_depth);
+ if (ARCH_LOONGARCH)
+ ff_idctdsp_init_loongarch(c, avctx, high_bit_depth);
ff_init_scantable_permutation(c->idct_permutation,
c->perm_type);
diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h
index ca21a31a02..014488aec3 100644
--- a/libavcodec/idctdsp.h
+++ b/libavcodec/idctdsp.h
@@ -118,5 +118,7 @@ void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
void ff_idctdsp_init_mips(IDCTDSPContext *c, AVCodecContext *avctx,
unsigned high_bit_depth);
+void ff_idctdsp_init_loongarch(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth);
#endif /* AVCODEC_IDCTDSP_H */
diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07a401d883..c4d71e801b 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -6,6 +6,7 @@ OBJS-$(CONFIG_VP8_DECODER) += loongarch/vp8dsp_init_loongarch.o
OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9dsp_init_loongarch.o
OBJS-$(CONFIG_VC1DSP) += loongarch/vc1dsp_init_loongarch.o
OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_init_loongarch.o
+OBJS-$(CONFIG_IDCTDSP) += loongarch/idctdsp_init_loongarch.o
LASX-OBJS-$(CONFIG_H264CHROMA) += loongarch/h264chroma_lasx.o
LASX-OBJS-$(CONFIG_H264QPEL) += loongarch/h264qpel_lasx.o
LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
@@ -14,6 +15,8 @@ LASX-OBJS-$(CONFIG_H264DSP) += loongarch/h264dsp_lasx.o \
LASX-OBJS-$(CONFIG_H264PRED) += loongarch/h264_intrapred_lasx.o
LASX-OBJS-$(CONFIG_VC1_DECODER) += loongarch/vc1dsp_lasx.o
LASX-OBJS-$(CONFIG_HPELDSP) += loongarch/hpeldsp_lasx.o
+LASX-OBJS-$(CONFIG_IDCTDSP) += loongarch/simple_idct_lasx.o \
+ loongarch/idctdsp_lasx.o
LSX-OBJS-$(CONFIG_VP8_DECODER) += loongarch/vp8_mc_lsx.o \
loongarch/vp8_lpf_lsx.o
LSX-OBJS-$(CONFIG_VP9_DECODER) += loongarch/vp9_mc_lsx.o \
diff --git a/libavcodec/loongarch/idctdsp_init_loongarch.c b/libavcodec/loongarch/idctdsp_init_loongarch.c
new file mode 100644
index 0000000000..9d1d21cc18
--- /dev/null
+++ b/libavcodec/loongarch/idctdsp_init_loongarch.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/cpu.h"
+#include "idctdsp_loongarch.h"
+#include "libavcodec/xvididct.h"
+
+av_cold void ff_idctdsp_init_loongarch(IDCTDSPContext *c, AVCodecContext *avctx,
+ unsigned high_bit_depth)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_lasx(cpu_flags)) {
+ if ((avctx->lowres != 1) && (avctx->lowres != 2) && (avctx->lowres != 3) &&
+ (avctx->bits_per_raw_sample != 10) &&
+ (avctx->bits_per_raw_sample != 12) &&
+ (avctx->idct_algo == FF_IDCT_AUTO)) {
+ c->idct_put = ff_simple_idct_put_lasx;
+ c->idct_add = ff_simple_idct_add_lasx;
+ c->idct = ff_simple_idct_lasx;
+ c->perm_type = FF_IDCT_PERM_NONE;
+ }
+ c->put_pixels_clamped = ff_put_pixels_clamped_lasx;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_lasx;
+ c->add_pixels_clamped = ff_add_pixels_clamped_lasx;
+ }
+}
diff --git a/libavcodec/loongarch/idctdsp_lasx.c b/libavcodec/loongarch/idctdsp_lasx.c
new file mode 100644
index 0000000000..1cfab0e028
--- /dev/null
+++ b/libavcodec/loongarch/idctdsp_lasx.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "idctdsp_loongarch.h"
+#include "libavutil/loongarch/loongson_intrinsics.h"
+
+void ff_put_pixels_clamped_lasx(const int16_t *block,
+ uint8_t *av_restrict pixels,
+ ptrdiff_t stride)
+{
+ __m256i b0, b1, b2, b3;
+ __m256i temp0, temp1;
+ ptrdiff_t stride_2x = stride << 1;
+ ptrdiff_t stride_4x = stride << 2;
+ ptrdiff_t stride_3x = stride_2x + stride;
+
+ DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
+ b0, b1, b2, b3);
+ DUP4_ARG1(__lasx_xvclip255_h, b0, b1, b2, b3, b0, b1, b2, b3);
+ DUP2_ARG2(__lasx_xvpickev_b, b1, b0, b3, b2, temp0, temp1);
+ __lasx_xvstelm_d(temp0, pixels, 0, 0);
+ __lasx_xvstelm_d(temp0, pixels + stride, 0, 2);
+ __lasx_xvstelm_d(temp0, pixels + stride_2x, 0, 1);
+ __lasx_xvstelm_d(temp0, pixels + stride_3x, 0, 3);
+ pixels += stride_4x;
+ __lasx_xvstelm_d(temp1, pixels, 0, 0);
+ __lasx_xvstelm_d(temp1, pixels + stride, 0, 2);
+ __lasx_xvstelm_d(temp1, pixels + stride_2x, 0, 1);
+ __lasx_xvstelm_d(temp1, pixels + stride_3x, 0, 3);
+}
+
+void ff_put_signed_pixels_clamped_lasx(const int16_t *block,
+ uint8_t *av_restrict pixels,
+ ptrdiff_t stride)
+{
+ __m256i b0, b1, b2, b3;
+ __m256i temp0, temp1;
+ __m256i const_128 = {0x0080008000800080, 0x0080008000800080,
+ 0x0080008000800080, 0x0080008000800080};
+ ptrdiff_t stride_2x = stride << 1;
+ ptrdiff_t stride_4x = stride << 2;
+ ptrdiff_t stride_3x = stride_2x + stride;
+
+ DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
+ b0, b1, b2, b3);
+ DUP4_ARG2(__lasx_xvadd_h, b0, const_128, b1, const_128, b2, const_128,
+ b3, const_128, b0, b1, b2, b3);
+ DUP4_ARG1(__lasx_xvclip255_h, b0, b1, b2, b3, b0, b1, b2, b3);
+ DUP2_ARG2(__lasx_xvpickev_b, b1, b0, b3, b2, temp0, temp1);
+ __lasx_xvstelm_d(temp0, pixels, 0, 0);
+ __lasx_xvstelm_d(temp0, pixels + stride, 0, 2);
+ __lasx_xvstelm_d(temp0, pixels + stride_2x, 0, 1);
+ __lasx_xvstelm_d(temp0, pixels + stride_3x, 0, 3);
+ pixels += stride_4x;
+ __lasx_xvstelm_d(temp1, pixels, 0, 0);
+ __lasx_xvstelm_d(temp1, pixels + stride, 0, 2);
+ __lasx_xvstelm_d(temp1, pixels + stride_2x, 0, 1);
+ __lasx_xvstelm_d(temp1, pixels + stride_3x, 0, 3);
+}
+
+void ff_add_pixels_clamped_lasx(const int16_t *block,
+ uint8_t *av_restrict pixels,
+ ptrdiff_t stride)
+{
+ __m256i b0, b1, b2, b3;
+ __m256i p0, p1, p2, p3, p4, p5, p6, p7;
+ __m256i temp0, temp1, temp2, temp3;
+ uint8_t *pix = pixels;
+ ptrdiff_t stride_2x = stride << 1;
+ ptrdiff_t stride_4x = stride << 2;
+ ptrdiff_t stride_3x = stride_2x + stride;
+
+ DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96,
+ b0, b1, b2, b3);
+ p0 = __lasx_xvldrepl_d(pix, 0);
+ pix += stride;
+ p1 = __lasx_xvldrepl_d(pix, 0);
+ pix += stride;
+ p2 = __lasx_xvldrepl_d(pix, 0);
+ pix += stride;
+ p3 = __lasx_xvldrepl_d(pix, 0);
+ pix += stride;
+ p4 = __lasx_xvldrepl_d(pix, 0);
+ pix += stride;
+ p5 = __lasx_xvldrepl_d(pix, 0);
+ pix += stride;
+ p6 = __lasx_xvldrepl_d(pix, 0);
+ pix += stride;
+ p7 = __lasx_xvldrepl_d(pix, 0);
+ DUP4_ARG3(__lasx_xvpermi_q, p1, p0, 0x20, p3, p2, 0x20, p5, p4, 0x20,
+ p7, p6, 0x20, temp0, temp1, temp2, temp3);
+ DUP4_ARG2(__lasx_xvaddw_h_h_bu, b0, temp0, b1, temp1, b2, temp2, b3, temp3,
+ temp0, temp1, temp2, temp3);
+ DUP4_ARG1(__lasx_xvclip255_h, temp0, temp1, temp2, temp3,
+ temp0, temp1, temp2, temp3);
+ DUP2_ARG2(__lasx_xvpickev_b, temp1, temp0, temp3, temp2, temp0, temp1);
+ __lasx_xvstelm_d(temp0, pixels, 0, 0);
+ __lasx_xvstelm_d(temp0, pixels + stride, 0, 2);
+ __lasx_xvstelm_d(temp0, pixels + stride_2x, 0, 1);
+ __lasx_xvstelm_d(temp0, pixels + stride_3x, 0, 3);
+ pixels += stride_4x;
+ __lasx_xvstelm_d(temp1, pixels, 0, 0);
+ __lasx_xvstelm_d(temp1, pixels + stride, 0, 2);
+ __lasx_xvstelm_d(temp1, pixels + stride_2x, 0, 1);
+ __lasx_xvstelm_d(temp1, pixels + stride_3x, 0, 3);
+}
diff --git a/libavcodec/loongarch/idctdsp_loongarch.h b/libavcodec/loongarch/idctdsp_loongarch.h
new file mode 100644
index 0000000000..cae8e7af58
--- /dev/null
+++ b/libavcodec/loongarch/idctdsp_loongarch.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_IDCTDSP_LOONGARCH_H
+#define AVCODEC_LOONGARCH_IDCTDSP_LOONGARCH_H
+
+#include <stdint.h>
+#include "libavcodec/mpegvideo.h"
+
+void ff_simple_idct_lasx(int16_t *block);
+void ff_simple_idct_put_lasx(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
+void ff_simple_idct_add_lasx(uint8_t *dest, ptrdiff_t stride_dst, int16_t *block);
+void ff_put_pixels_clamped_lasx(const int16_t *block,
+ uint8_t *av_restrict pixels,
+ ptrdiff_t line_size);
+void ff_put_signed_pixels_clamped_lasx(const int16_t *block,
+ uint8_t *av_restrict pixels,
+ ptrdiff_t line_size);
+void ff_add_pixels_clamped_lasx(const int16_t *block,
+ uint8_t *av_restrict pixels,
+ ptrdiff_t line_size);
+
+#endif /* AVCODEC_LOONGARCH_IDCTDSP_LOONGARCH_H */
diff --git a/libavcodec/loongarch/simple_idct_lasx.c b/libavcodec/loongarch/simple_idct_lasx.c
new file mode 100644
index 0000000000..a0d936b666
--- /dev/null
+++ b/libavcodec/loongarch/simple_idct_lasx.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Hao Chen <chenhao at loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/loongarch/loongson_intrinsics.h"
+#include "idctdsp_loongarch.h"
+
+#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
+{ \
+ __m256i temp_0, temp_1, temp_2, temp_3; \
+ __m256i temp_4, temp_5, temp_6, temp_7; \
+ DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1,\
+ 0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3); \
+ DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6);\
+ DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7);\
+ DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2); \
+ DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3); \
+}
+
+#define LASX_IDCTROWCONDDC \
+ const_val = 16383 * ((1 << 19) / 16383); \
+ const_val1 = __lasx_xvreplgr2vr_w(const_val); \
+ DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96, \
+ in0, in1, in2, in3); \
+ LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \
+ a0 = __lasx_xvpermi_d(in0, 0xD8); \
+ a0 = __lasx_vext2xv_w_h(a0); \
+ temp = __lasx_xvslli_w(a0, 3); \
+ a1 = __lasx_xvpermi_d(in0, 0x8D); \
+ a1 = __lasx_vext2xv_w_h(a1); \
+ a2 = __lasx_xvpermi_d(in1, 0xD8); \
+ a2 = __lasx_vext2xv_w_h(a2); \
+ a3 = __lasx_xvpermi_d(in1, 0x8D); \
+ a3 = __lasx_vext2xv_w_h(a3); \
+ b0 = __lasx_xvpermi_d(in2, 0xD8); \
+ b0 = __lasx_vext2xv_w_h(b0); \
+ b1 = __lasx_xvpermi_d(in2, 0x8D); \
+ b1 = __lasx_vext2xv_w_h(b1); \
+ b2 = __lasx_xvpermi_d(in3, 0xD8); \
+ b2 = __lasx_vext2xv_w_h(b2); \
+ b3 = __lasx_xvpermi_d(in3, 0x8D); \
+ b3 = __lasx_vext2xv_w_h(b3); \
+ select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3; \
+ select_vec = __lasx_xvslti_wu(select_vec, 1); \
+ \
+ DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5, \
+ w2, w3, w4, w5); \
+ DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7); \
+ w1 = __lasx_xvrepl128vei_h(w1, 1); \
+ \
+ /* part of FUNC6(idctRowCondDC) */ \
+ temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4); \
+ DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \
+ a0 = __lasx_xvadd_w(temp0, temp1); \
+ a1 = __lasx_xvadd_w(temp0, temp2); \
+ a2 = __lasx_xvsub_w(temp0, temp2); \
+ a3 = __lasx_xvsub_w(temp0, temp1); \
+ \
+ DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \
+ b0 = __lasx_xvdp2_w_h(temp0, temp1); \
+ temp1 = __lasx_xvneg_h(w7); \
+ temp2 = __lasx_xvilvl_h(temp1, w3); \
+ b1 = __lasx_xvdp2_w_h(temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w1); \
+ temp2 = __lasx_xvilvl_h(temp1, w5); \
+ b2 = __lasx_xvdp2_w_h(temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w5); \
+ temp2 = __lasx_xvilvl_h(temp1, w7); \
+ b3 = __lasx_xvdp2_w_h(temp0, temp2); \
+ \
+ /* if (AV_RN64A(row + 4)) */ \
+ DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \
+ a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \
+ temp1 = __lasx_xvilvl_h(w2, w4); \
+ a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \
+ temp1 = __lasx_xvneg_h(w4); \
+ temp2 = __lasx_xvilvl_h(w2, temp1); \
+ a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w6); \
+ temp2 = __lasx_xvilvl_h(temp1, w4); \
+ a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \
+ \
+ DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \
+ b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \
+ DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \
+ b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \
+ b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w1); \
+ temp2 = __lasx_xvilvl_h(temp1, w3); \
+ b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \
+ \
+ DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \
+ temp0, temp1, temp2, temp3); \
+ DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3, \
+ a0, a1, a2, a3); \
+ DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11, \
+ temp0, temp1, temp2, temp3); \
+ DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3);\
+ DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp, \
+ select_vec, temp2, temp, select_vec, temp3, temp, select_vec, \
+ in0, in1, in2, in3); \
+ DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp, \
+ select_vec, a2, temp, select_vec, a3, temp, select_vec, \
+ a0, a1, a2, a3); \
+ DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1, \
+ in0, in1, in2, in3); \
+ DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, \
+ in0, in1, in2, in3); \
+
+#define LASX_IDCTCOLS \
+ /* part of FUNC6(idctSparseCol) */ \
+ LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \
+ temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4); \
+ DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \
+ a0 = __lasx_xvadd_w(temp0, temp1); \
+ a1 = __lasx_xvadd_w(temp0, temp2); \
+ a2 = __lasx_xvsub_w(temp0, temp2); \
+ a3 = __lasx_xvsub_w(temp0, temp1); \
+ \
+ DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \
+ b0 = __lasx_xvdp2_w_h(temp0, temp1); \
+ temp1 = __lasx_xvneg_h(w7); \
+ temp2 = __lasx_xvilvl_h(temp1, w3); \
+ b1 = __lasx_xvdp2_w_h(temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w1); \
+ temp2 = __lasx_xvilvl_h(temp1, w5); \
+ b2 = __lasx_xvdp2_w_h(temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w5); \
+ temp2 = __lasx_xvilvl_h(temp1, w7); \
+ b3 = __lasx_xvdp2_w_h(temp0, temp2); \
+ \
+ /* if (AV_RN64A(row + 4)) */ \
+ DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \
+ a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \
+ temp1 = __lasx_xvilvl_h(w2, w4); \
+ a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \
+ temp1 = __lasx_xvneg_h(w4); \
+ temp2 = __lasx_xvilvl_h(w2, temp1); \
+ a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w6); \
+ temp2 = __lasx_xvilvl_h(temp1, w4); \
+ a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \
+ \
+ DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \
+ b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \
+ DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \
+ b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \
+ b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \
+ temp1 = __lasx_xvneg_h(w1); \
+ temp2 = __lasx_xvilvl_h(temp1, w3); \
+ b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \
+ \
+ DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \
+ temp0, temp1, temp2, temp3); \
+ DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0, \
+ a3, a2, a1, a0); \
+ DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3, \
+ 20, a0, a1, 20, in0, in1, in2, in3); \
+
+void ff_simple_idct_lasx(int16_t *block)
+{
+ int32_t const_val = 1 << 10;
+ __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
+ 0x4B42539F58C50000, 0x11A822A332493FFF};
+ __m256i in0, in1, in2, in3;
+ __m256i w2, w3, w4, w5, w6, w7;
+ __m256i a0, a1, a2, a3;
+ __m256i b0, b1, b2, b3;
+ __m256i temp0, temp1, temp2, temp3;
+ __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
+ __m256i const_val1, select_vec, temp;
+
+ LASX_IDCTROWCONDDC
+ LASX_IDCTCOLS
+ DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
+ in0, in1, in2, in3);
+ __lasx_xvst(in0, block, 0);
+ __lasx_xvst(in1, block, 32);
+ __lasx_xvst(in2, block, 64);
+ __lasx_xvst(in3, block, 96);
+}
+
+void ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride,
+ int16_t *block)
+{
+ int32_t const_val = 1 << 10;
+ ptrdiff_t dst_stride_2x = dst_stride << 1;
+ ptrdiff_t dst_stride_4x = dst_stride << 2;
+ ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
+ __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
+ 0x4B42539F58C50000, 0x11A822A332493FFF};
+ __m256i in0, in1, in2, in3;
+ __m256i w2, w3, w4, w5, w6, w7;
+ __m256i a0, a1, a2, a3;
+ __m256i b0, b1, b2, b3;
+ __m256i temp0, temp1, temp2, temp3;
+ __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
+ __m256i const_val1, select_vec, temp;
+
+ LASX_IDCTROWCONDDC
+ LASX_IDCTCOLS
+ DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
+ in0, in1, in2, in3);
+ DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
+ DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
+ __lasx_xvstelm_d(in0, dst, 0, 0);
+ __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
+ __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
+ __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lasx_xvstelm_d(in1, dst, 0, 0);
+ __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
+ __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
+ __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
+}
+
+void ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride,
+ int16_t *block)
+{
+ int32_t const_val = 1 << 10;
+ uint8_t *dst1 = dst;
+ ptrdiff_t dst_stride_2x = dst_stride << 1;
+ ptrdiff_t dst_stride_4x = dst_stride << 2;
+ ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
+
+ __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
+ 0x4B42539F58C50000, 0x11A822A332493FFF};
+ __m256i sh = {0x0003000200010000, 0x000B000A00090008,
+ 0x0007000600050004, 0x000F000E000D000C};
+ __m256i in0, in1, in2, in3;
+ __m256i w2, w3, w4, w5, w6, w7;
+ __m256i a0, a1, a2, a3;
+ __m256i b0, b1, b2, b3;
+ __m256i temp0, temp1, temp2, temp3;
+ __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
+ __m256i const_val1, select_vec, temp;
+
+ LASX_IDCTROWCONDDC
+ LASX_IDCTCOLS
+ a0 = __lasx_xvldrepl_d(dst1, 0);
+ a0 = __lasx_vext2xv_hu_bu(a0);
+ dst1 += dst_stride;
+ a1 = __lasx_xvldrepl_d(dst1, 0);
+ a1 = __lasx_vext2xv_hu_bu(a1);
+ dst1 += dst_stride;
+ a2 = __lasx_xvldrepl_d(dst1, 0);
+ a2 = __lasx_vext2xv_hu_bu(a2);
+ dst1 += dst_stride;
+ a3 = __lasx_xvldrepl_d(dst1, 0);
+ a3 = __lasx_vext2xv_hu_bu(a3);
+ dst1 += dst_stride;
+ b0 = __lasx_xvldrepl_d(dst1, 0);
+ b0 = __lasx_vext2xv_hu_bu(b0);
+ dst1 += dst_stride;
+ b1 = __lasx_xvldrepl_d(dst1, 0);
+ b1 = __lasx_vext2xv_hu_bu(b1);
+ dst1 += dst_stride;
+ b2 = __lasx_xvldrepl_d(dst1, 0);
+ b2 = __lasx_vext2xv_hu_bu(b2);
+ dst1 += dst_stride;
+ b3 = __lasx_xvldrepl_d(dst1, 0);
+ b3 = __lasx_vext2xv_hu_bu(b3);
+ DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2,
+ temp0, temp1, temp2, temp3);
+ DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
+ in0, in1, in2, in3);
+ DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
+ in0, in1, in2, in3);
+ DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
+ DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
+ __lasx_xvstelm_d(in0, dst, 0, 0);
+ __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
+ __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
+ __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
+ dst += dst_stride_4x;
+ __lasx_xvstelm_d(in1, dst, 0, 0);
+ __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
+ __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
+ __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
+}
--
2.20.1