[FFmpeg-devel] [PATCH 1/2] Optimization of AC3 floating point decoder for MIPS
Nedeljko Babic
nbabic at mips.com
Tue Jun 26 13:13:20 CEST 2012
The MIPS FFT implementation works iteratively instead of calling
functions recursively for smaller FFT sizes.
Some DSP and format conversion utility functions are also optimized.
Signed-off-by: Nedeljko Babic <nbabic at mips.com>
---
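Notes (not part of the commit message): the iterative FFT walks a table of
block offsets (fft_offsets_lut) pass by pass instead of descending
recursively into smaller transforms. A minimal C sketch of the dispatch,
where do_fft4(), do_fft8() and combine() are placeholders standing in for
the inline-assembly bodies in fft_mips.c below:

    static void fft_iterative(FFTComplex *z, int nbits)
    {
        int n, k, n4 = 4;
        int num_transforms = (0x2aab >> (16 - nbits)) | 1;

        for (n = 0; n < num_transforms; n++)      /* size-4 base blocks */
            do_fft4(z + (fft_offsets_lut[n] << 2));

        if ((1 << nbits) < 8)
            return;

        num_transforms = (num_transforms >> 1) | 1;
        for (n = 0; n < num_transforms; n++)      /* size-8 blocks */
            do_fft8(z + (fft_offsets_lut[n] << 3));

        for (k = 4; k <= nbits; k++) {            /* sizes 16 .. 1<<nbits */
            num_transforms = (num_transforms >> 1) | 1;
            for (n = 0; n < num_transforms; n++)
                combine(z + (fft_offsets_lut[n] << k), n4);
            n4 <<= 1;
        }
    }

Each pass touches every block exactly once, so no per-size helper calls
are made and the butterflies stay in FPU registers.
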
libavcodec/dsputil.c | 1 +
libavcodec/dsputil.h | 1 +
libavcodec/fft.c | 1 +
libavcodec/fft.h | 1 +
libavcodec/fmtconvert.c | 1 +
libavcodec/fmtconvert.h | 1 +
libavcodec/mips/Makefile | 4 +
libavcodec/mips/dsputil_mips.c | 168 +++++++++
libavcodec/mips/fft_mips.c | 691 +++++++++++++++++++++++++++++++++++++
libavcodec/mips/fft_table.h | 104 ++++++
libavcodec/mips/fmtconvert_mips.c | 336 ++++++++++++++++++
11 files changed, 1309 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/mips/dsputil_mips.c
create mode 100644 libavcodec/mips/fft_mips.c
create mode 100644 libavcodec/mips/fft_table.h
create mode 100644 libavcodec/mips/fmtconvert_mips.c
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 442b900..b7d928f 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3161,6 +3161,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
if (HAVE_MMI) ff_dsputil_init_mmi (c, avctx);
if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
+ if (HAVE_MIPSFPU) ff_dsputil_init_mips (c, avctx);
for (i = 0; i < 4; i++) {
for (j = 0; j < 16; j++) {
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index e1aefe1..b41af59 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -633,6 +633,7 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_mips(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_dwt(DSPContext *c);
void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
diff --git a/libavcodec/fft.c b/libavcodec/fft.c
index 6b93a5c..e5bdcbd 100644
--- a/libavcodec/fft.c
+++ b/libavcodec/fft.c
@@ -162,6 +162,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
if (HAVE_ALTIVEC) ff_fft_init_altivec(s);
if (HAVE_MMX) ff_fft_init_mmx(s);
if (CONFIG_MDCT) s->mdct_calcw = s->mdct_calc;
+ if (HAVE_MIPSFPU) ff_fft_init_mips(s);
#else
if (CONFIG_MDCT) s->mdct_calcw = ff_mdct_calcw_c;
if (ARCH_ARM) ff_fft_fixed_init_arm(s);
diff --git a/libavcodec/fft.h b/libavcodec/fft.h
index 0e19e94..15e5a12 100644
--- a/libavcodec/fft.h
+++ b/libavcodec/fft.h
@@ -137,6 +137,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse);
void ff_fft_init_altivec(FFTContext *s);
void ff_fft_init_mmx(FFTContext *s);
void ff_fft_init_arm(FFTContext *s);
+void ff_fft_init_mips(FFTContext *s);
#else
void ff_fft_fixed_init_arm(FFTContext *s);
#endif
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index c03117c..e47c205 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -85,6 +85,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx);
+ if (HAVE_MIPSFPU) ff_fmt_convert_init_mips(c);
}
/* ffdshow custom code */
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index a9fbb31..ab2caa2 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -92,6 +92,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_altivec(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx);
+void ff_fmt_convert_init_mips(FmtConvertContext *c);
/* ffdshow custom code */
void float_interleave(float *dst, const float **src, long len, int channels);
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 24a95b5..4dfb2e3 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -11,3 +11,7 @@ MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER) += mips/acelp_filters_mips.o \
mips/amrwbdec_mips.o \
mips/celp_math_mips.o \
mips/acelp_vectors_mips.o
+
+MIPSFPU-OBJS-$(CONFIG_FFT) += mips/fft_mips.o
+MIPSFPU-OBJS += mips/dsputil_mips.o
+MIPSFPU-OBJS-$(CONFIG_AC3_DECODER) += mips/fmtconvert_mips.o
diff --git a/libavcodec/mips/dsputil_mips.c b/libavcodec/mips/dsputil_mips.c
new file mode 100644
index 0000000..600e256
--- /dev/null
+++ b/libavcodec/mips/dsputil_mips.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Zoran Lukic (zoranl at mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/dsputil.h"
+
+#if HAVE_INLINE_ASM
+static void vector_fmul_window_mips(float *dst, const float *src0,
+ const float *src1, const float *win, int len) {
+ int i, j;
+ /*
+ * variables used in inline assembler
+ */
+ float * dst_i, * dst_j, * dst_i2, * dst_j2;
+ float temp, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+ dst += len;
+ win += len;
+ src0 += len;
+
+ for (i = -len, j = len - 1; i < 0; i += 8, j -= 8) {
+
+ dst_i = dst + i;
+ dst_j = dst + j;
+
+ dst_i2 = dst + i + 4;
+ dst_j2 = dst + j - 4;
+
+ __asm__ __volatile__ (
+ "mul.s %[temp], %[s1], %[wi] \n\t"
+ "mul.s %[temp1], %[s1], %[wj] \n\t"
+ "mul.s %[temp2], %[s11], %[wi1] \n\t"
+ "mul.s %[temp3], %[s11], %[wj1] \n\t"
+
+ "msub.s %[temp], %[temp], %[s0], %[wj] \n\t"
+ "madd.s %[temp1], %[temp1], %[s0], %[wi] \n\t"
+ "msub.s %[temp2], %[temp2], %[s01], %[wj1] \n\t"
+ "madd.s %[temp3], %[temp3], %[s01], %[wi1] \n\t"
+
+ "swc1 %[temp], 0(%[dst_i]) \n\t" /* dst[i] = s0*wj - s1*wi; */
+ "swc1 %[temp1], 0(%[dst_j]) \n\t" /* dst[j] = s0*wi + s1*wj; */
+ "swc1 %[temp2], 4(%[dst_i]) \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
+ "swc1 %[temp3], -4(%[dst_j]) \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
+
+ "mul.s %[temp4], %[s12], %[wi2] \n\t"
+ "mul.s %[temp5], %[s12], %[wj2] \n\t"
+ "mul.s %[temp6], %[s13], %[wi3] \n\t"
+ "mul.s %[temp7], %[s13], %[wj3] \n\t"
+
+ "msub.s %[temp4], %[temp4], %[s02], %[wj2] \n\t"
+ "madd.s %[temp5], %[temp5], %[s02], %[wi2] \n\t"
+ "msub.s %[temp6], %[temp6], %[s03], %[wj3] \n\t"
+ "madd.s %[temp7], %[temp7], %[s03], %[wi3] \n\t"
+
+ "swc1 %[temp4], 8(%[dst_i]) \n\t" /* dst[i+2] = s02*wj2 - s12*wi2; */
+ "swc1 %[temp5], -8(%[dst_j]) \n\t" /* dst[j-2] = s02*wi2 + s12*wj2; */
+ "swc1 %[temp6], 12(%[dst_i]) \n\t" /* dst[i+2] = s03*wj3 - s13*wi3; */
+ "swc1 %[temp7], -12(%[dst_j]) \n\t" /* dst[j-3] = s03*wi3 + s13*wj3; */
+ : [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+ [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+ [temp6]"=&f"(temp6), [temp7]"=&f"(temp7)
+ : [dst_j]"r"(dst_j), [dst_i]"r" (dst_i),
+ [s0] "f"(src0[i]), [wj] "f"(win[j]), [s1] "f"(src1[j]),
+ [wi] "f"(win[i]), [s01]"f"(src0[i + 1]),[wj1]"f"(win[j - 1]),
+ [s11]"f"(src1[j - 1]), [wi1]"f"(win[i + 1]), [s02]"f"(src0[i + 2]),
+ [wj2]"f"(win[j - 2]), [s12]"f"(src1[j - 2]),[wi2]"f"(win[i + 2]),
+ [s03]"f"(src0[i + 3]), [wj3]"f"(win[j - 3]), [s13]"f"(src1[j - 3]),
+ [wi3]"f"(win[i + 3])
+ : "memory"
+ );
+
+ __asm__ __volatile__ (
+ "mul.s %[temp], %[s1], %[wi] \n\t"
+ "mul.s %[temp1], %[s1], %[wj] \n\t"
+ "mul.s %[temp2], %[s11], %[wi1] \n\t"
+ "mul.s %[temp3], %[s11], %[wj1] \n\t"
+
+ "msub.s %[temp], %[temp], %[s0], %[wj] \n\t"
+ "madd.s %[temp1], %[temp1], %[s0], %[wi] \n\t"
+ "msub.s %[temp2], %[temp2], %[s01], %[wj1] \n\t"
+ "madd.s %[temp3], %[temp3], %[s01], %[wi1] \n\t"
+
+ "swc1 %[temp], 0(%[dst_i2]) \n\t" /* dst[i] = s0*wj - s1*wi; */
+ "swc1 %[temp1], 0(%[dst_j2]) \n\t" /* dst[j] = s0*wi + s1*wj; */
+ "swc1 %[temp2], 4(%[dst_i2]) \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
+ "swc1 %[temp3], -4(%[dst_j2]) \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
+
+ "mul.s %[temp4], %[s12], %[wi2] \n\t"
+ "mul.s %[temp5], %[s12], %[wj2] \n\t"
+ "mul.s %[temp6], %[s13], %[wi3] \n\t"
+ "mul.s %[temp7], %[s13], %[wj3] \n\t"
+
+ "msub.s %[temp4], %[temp4], %[s02], %[wj2] \n\t"
+ "madd.s %[temp5], %[temp5], %[s02], %[wi2] \n\t"
+ "msub.s %[temp6], %[temp6], %[s03], %[wj3] \n\t"
+ "madd.s %[temp7], %[temp7], %[s03], %[wi3] \n\t"
+
+ "swc1 %[temp4], 8(%[dst_i2]) \n\t" /* dst[i+2] = s02*wj2 - s12*wi2; */
+ "swc1 %[temp5], -8(%[dst_j2]) \n\t" /* dst[j-2] = s02*wi2 + s12*wj2; */
+ "swc1 %[temp6], 12(%[dst_i2]) \n\t" /* dst[i+2] = s03*wj3 - s13*wi3; */
+ "swc1 %[temp7], -12(%[dst_j2]) \n\t" /* dst[j-3] = s03*wi3 + s13*wj3; */
+ : [temp]"=&f"(temp),
+ [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+ [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+ [temp7] "=&f" (temp7)
+ : [dst_j2]"r"(dst_j2), [dst_i2]"r"(dst_i2),
+ [s0] "f"(src0[i + 4]), [wj] "f"(win[j - 4]), [s1] "f"(src1[j - 4]),
+ [wi] "f"(win[i + 4]), [s01]"f"(src0[i + 5]),[wj1]"f"(win[j - 5]),
+ [s11]"f"(src1[j - 5]), [wi1]"f"(win[i + 5]), [s02]"f"(src0[i + 6]),
+ [wj2]"f"(win[j - 6]), [s12]"f"(src1[j - 6]),[wi2]"f"(win[i + 6]),
+ [s03]"f"(src0[i + 7]), [wj3]"f"(win[j - 7]), [s13]"f"(src1[j - 7]),
+ [wi3]"f"(win[i + 7])
+ : "memory"
+ );
+ }
+}
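+
+/*
+ * For reference, the generic C version of this routine computes, per
+ * iteration (with dst, win and src0 already advanced by len):
+ *
+ *     s0 = src0[i]; s1 = src1[j]; wi = win[i]; wj = win[j];
+ *     dst[i] = s0 * wj - s1 * wi;
+ *     dst[j] = s0 * wi + s1 * wj;
+ *
+ * The two assembly blocks above unroll this 8x and interleave loads,
+ * multiply-adds and stores to hide FPU latency.
+ */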
+#endif /* HAVE_INLINE_ASM */
+
+av_cold void ff_dsputil_init_mips( DSPContext* c, AVCodecContext *avctx )
+{
+#if HAVE_INLINE_ASM
+ c->vector_fmul_window = vector_fmul_window_mips;
+#endif
+}
+
diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
new file mode 100644
index 0000000..1ca806b
--- /dev/null
+++ b/libavcodec/mips/fft_mips.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Stanislav Ocovaj (socovaj at mips.com)
+ * Author: Zoran Lukic (zoranl at mips.com)
+ *
+ * Optimized MDCT/IMDCT and FFT transforms
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/fft.h"
+#include "fft_table.h"
+
+/**
+ * FFT transform
+ */
+
+#if HAVE_INLINE_ASM
+static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) {
+
+ int nbits, i, n, num_transforms, offset, step;
+ int n4, n2, n34;
+ FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ FFTComplex *tmpz;
+ float w_re, w_im;
+ float *w_re_ptr, *w_im_ptr;
+ const int fft_size = (1 << s->nbits);
+ int s_n = s->nbits;
+ int tem1, tem2;
+ float pom, pom1, pom2, pom3;
+ float temp, temp1, temp3, temp4;
+ FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
+ FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
+
+ /*
+ * num_transforms = (0x2aab >> (16 - s->nbits)) | 1;
+ */
+ __asm__ __volatile__ (
+ "li %[tem1], 16 \n\t"
+ "sub %[s_n], %[tem1], %[s_n] \n\t"
+ "li %[tem2], 10923 \n\t"
+ "srav %[tem2], %[tem2], %[s_n] \n\t"
+ "ori %[num_t],%[tem2], 1 \n\t"
+ : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n),
+ [tem1]"=&r"(tem1), [tem2]"=&r"(tem2)
+ );
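+
+ /*
+ * Worked example (illustrative): for s->nbits = 4 (a 16-point FFT)
+ * this gives (0x2aab >> 12) | 1 = 3, i.e. three size-4 base
+ * transforms at element offsets 0, 8 and 12 (fft_offsets_lut[0..2]
+ * = {0, 2, 3}, each shifted left by 2 below).
+ */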
+
+
+ for (n=0; n<num_transforms; n++)
+ {
+ offset = fft_offsets_lut[n] << 2;
+ tmpz = z + offset;
+
+ tmp1 = tmpz[0].re + tmpz[1].re;
+ tmp5 = tmpz[2].re + tmpz[3].re;
+ tmp2 = tmpz[0].im + tmpz[1].im;
+ tmp6 = tmpz[2].im + tmpz[3].im;
+ tmp3 = tmpz[0].re - tmpz[1].re;
+ tmp8 = tmpz[2].im - tmpz[3].im;
+ tmp4 = tmpz[0].im - tmpz[1].im;
+ tmp7 = tmpz[2].re - tmpz[3].re;
+
+ tmpz[0].re = tmp1 + tmp5;
+ tmpz[2].re = tmp1 - tmp5;
+ tmpz[0].im = tmp2 + tmp6;
+ tmpz[2].im = tmp2 - tmp6;
+ tmpz[1].re = tmp3 + tmp8;
+ tmpz[3].re = tmp3 - tmp8;
+ tmpz[1].im = tmp4 - tmp7;
+ tmpz[3].im = tmp4 + tmp7;
+
+}
+
+ if (fft_size < 8)
+ return;
+
+ num_transforms = (num_transforms >> 1) | 1;
+ for (n=0; n<num_transforms; n++)
+ {
+ offset = fft_offsets_lut[n] << 3;
+ tmpz = z + offset;
+
+ __asm__ __volatile__ (
+ "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
+ "lwc1 %[pom], 40(%[tmpz]) \n\t"
+ "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
+ "lwc1 %[pom1], 56(%[tmpz]) \n\t"
+ "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
+ "lwc1 %[pom2], 44(%[tmpz]) \n\t"
+ "lwc1 %[pom3], 60(%[tmpz]) \n\t"
+ "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
+ "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
+ "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
+ "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
+ "lwc1 %[pom], 40(%[tmpz]) \n\t"
+ "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
+ "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
+ "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
+ "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
+ "lwc1 %[pom1], 44(%[tmpz]) \n\t"
+ "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
+ "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
+ "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
+ "lwc1 %[pom2], 56(%[tmpz]) \n\t"
+ "lwc1 %[pom3], 60(%[tmpz]) \n\t"
+ "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
+ "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
+ "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
+ "lwc1 %[pom], 0(%[tmpz]) \n\t"
+ "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
+ "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
+ "lwc1 %[pom2], 4(%[tmpz]) \n\t"
+ "sub.s %[pom1], %[pom], %[tmp5] \n\t"
+ "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
+ "add.s %[pom3], %[pom], %[tmp5] \n\t"
+ "sub.s %[pom], %[pom2], %[tmp6] \n\t"
+ "add.s %[pom2], %[pom2], %[tmp6] \n\t"
+ "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
+ "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
+ "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
+ "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
+ "lwc1 %[pom1], 16(%[tmpz]) \n\t"
+ "lwc1 %[pom3], 20(%[tmpz]) \n\t"
+ "li.s %[pom], 0.7071067812 \n\t" // float pom = 0.7071067812f;
+ "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
+ "sub.s %[temp], %[pom1], %[tmp8] \n\t"
+ "add.s %[pom2], %[pom3], %[tmp7] \n\t"
+ "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
+ "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
+ "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
+ "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
+ "add.s %[pom1], %[pom1], %[tmp8] \n\t"
+ "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
+ "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
+ "mul.s %[tmp5], %[pom], %[temp1] \n\t" // tmp5 = pom * (tmp1 + tmp2);
+ "mul.s %[tmp7], %[pom], %[temp3] \n\t" // tmp7 = pom * (tmp3 - tmp4);
+ "mul.s %[tmp6], %[pom], %[temp4] \n\t" // tmp6 = pom * (tmp2 - tmp1);
+ "mul.s %[tmp8], %[pom], %[tmp3] \n\t" // tmp8 = pom * (tmp3 + tmp4);
+ "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
+ "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
+ "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
+ "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
+ "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
+ "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
+ "lwc1 %[temp], 8(%[tmpz]) \n\t"
+ "lwc1 %[temp1],12(%[tmpz]) \n\t"
+ "lwc1 %[pom], 24(%[tmpz]) \n\t"
+ "lwc1 %[pom2], 28(%[tmpz]) \n\t"
+ "sub.s %[temp4],%[temp], %[tmp1] \n\t"
+ "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
+ "add.s %[temp], %[temp], %[tmp1] \n\t"
+ "add.s %[temp1],%[temp1], %[tmp2] \n\t"
+ "sub.s %[pom1], %[pom], %[tmp4] \n\t"
+ "add.s %[pom3], %[pom2], %[tmp3] \n\t"
+ "add.s %[pom], %[pom], %[tmp4] \n\t"
+ "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
+ "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
+ "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
+ "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
+ "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
+ "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
+ "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
+ "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
+ "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
+ : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
+ [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7),
+ [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
+ [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
+ : [tmpz]"r"(tmpz)
+ : "memory"
+ );
+ }
+
+ step = 1 << (MAX_LOG2_NFFT - 4);
+ n4 = 4;
+ for (nbits=4; nbits<=s->nbits; nbits++)
+ {
+ /*
+ * num_transforms = (num_transforms >> 1) | 1;
+ */
+ __asm__ __volatile__ (
+ "sra %[num_t], %[num_t], 1 \n\t"
+ "ori %[num_t], %[num_t], 1 \n\t"
+
+ : [num_t] "+r" (num_transforms)
+ );
+ n2 = 2 * n4;
+ n34 = 3 * n4;
+
+ for (n=0; n<num_transforms; n++)
+ {
+ offset = fft_offsets_lut[n] << nbits;
+ tmpz = z + offset;
+
+ tmpz_n2 = tmpz + n2;
+ tmpz_n4 = tmpz + n4;
+ tmpz_n34 = tmpz + n34;
+
+ __asm__ __volatile__ (
+ "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
+ "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
+ "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
+ "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
+ "lwc1 %[temp1],0(%[tmpz]) \n\t"
+ "lwc1 %[temp3],4(%[tmpz]) \n\t"
+ "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
+ "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
+ "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
+ "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
+ "sub.s %[temp], %[temp1], %[tmp5] \n\t"
+ "add.s %[temp1],%[temp1], %[tmp5] \n\t"
+ "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
+ "add.s %[temp3],%[temp3], %[tmp6] \n\t"
+ "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
+ "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
+ "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
+ "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
+ "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
+ "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
+ "sub.s %[pom], %[pom1], %[tmp2] \n\t"
+ "add.s %[pom1], %[pom1], %[tmp2] \n\t"
+ "add.s %[temp1],%[temp], %[tmp1] \n\t"
+ "sub.s %[temp], %[temp], %[tmp1] \n\t"
+ "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
+ "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
+ "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
+ "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
+ : [tmp5]"=&f"(tmp5),
+ [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
+ [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6), [pom3]"=&f"(pom3),
+ [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
+ : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
+ : "memory"
+ );
+
+ w_re_ptr = (float*)(ff_cos_4096 + step);
+ w_im_ptr = (float*)(ff_cos_4096 + MAX_FFT_SIZE/4 - step);
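+ /*
+ * ff_cos_4096[i] = cos(2*pi*i/4096); because
+ * cos(2*pi*(1024 - i)/4096) = sin(2*pi*i/4096), walking the same
+ * table backwards from index MAX_FFT_SIZE/4 yields the sine
+ * (imaginary) parts of the twiddle factors.
+ */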
+
+ for (i=1; i<n4; i++)
+ {
+ w_re = w_re_ptr[0];
+ w_im = w_im_ptr[0];
+ tmpz_n2_i = tmpz_n2 + i;
+ tmpz_n4_i = tmpz_n4 + i;
+ tmpz_n34_i= tmpz_n34 + i;
+ tmpz_i = tmpz + i;
+
+ __asm__ __volatile__ (
+ "lwc1 %[temp], 0(%[tmpz_n2_i]) \n\t"
+ "lwc1 %[temp1], 4(%[tmpz_n2_i]) \n\t"
+ "lwc1 %[pom], 0(%[tmpz_n34_i]) \n\t"
+ "lwc1 %[pom1], 4(%[tmpz_n34_i]) \n\t"
+ "mul.s %[temp3], %[w_im], %[temp] \n\t"
+ "mul.s %[temp4], %[w_im], %[temp1] \n\t"
+ "mul.s %[pom2], %[w_im], %[pom1] \n\t"
+ "mul.s %[pom3], %[w_im], %[pom] \n\t"
+ "msub.s %[tmp2], %[temp3], %[w_re], %[temp1] \n\t" // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
+ "madd.s %[tmp1], %[temp4], %[w_re], %[temp] \n\t" // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
+ "msub.s %[tmp3], %[pom2], %[w_re], %[pom] \n\t" // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
+ "madd.s %[tmp4], %[pom3], %[w_re], %[pom1] \n\t" // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
+ "lwc1 %[temp], 0(%[tmpz_i]) \n\t"
+ "lwc1 %[pom], 4(%[tmpz_i]) \n\t"
+ "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
+ "sub.s %[tmp1], %[tmp1], %[tmp3] \n\t" // tmp1 = tmp1 - tmp3;
+ "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
+ "sub.s %[tmp2], %[tmp2], %[tmp4] \n\t" // tmp2 = tmp2 - tmp4;
+ "sub.s %[temp1], %[temp], %[tmp5] \n\t"
+ "add.s %[temp], %[temp], %[tmp5] \n\t"
+ "sub.s %[pom1], %[pom], %[tmp6] \n\t"
+ "add.s %[pom], %[pom], %[tmp6] \n\t"
+ "lwc1 %[temp3], 0(%[tmpz_n4_i]) \n\t"
+ "lwc1 %[pom2], 4(%[tmpz_n4_i]) \n\t"
+ "swc1 %[temp1], 0(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].re = tmpz[ i].re - tmp5;
+ "swc1 %[temp], 0(%[tmpz_i]) \n\t" // tmpz[ i].re = tmpz[ i].re + tmp5;
+ "swc1 %[pom1], 4(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].im = tmpz[ i].im - tmp6;
+ "swc1 %[pom] , 4(%[tmpz_i]) \n\t" // tmpz[ i].im = tmpz[ i].im + tmp6;
+ "sub.s %[temp4], %[temp3], %[tmp2] \n\t"
+ "add.s %[pom3], %[pom2], %[tmp1] \n\t"
+ "add.s %[temp3], %[temp3], %[tmp2] \n\t"
+ "sub.s %[pom2], %[pom2], %[tmp1] \n\t"
+ "swc1 %[temp4], 0(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
+ "swc1 %[pom3], 4(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
+ "swc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
+ "swc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
+ : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
+ [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
+ [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+ [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
+ : [w_re]"f"(w_re), [w_im]"f"(w_im),
+ [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
+ [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
+ : "memory"
+ );
+ w_re_ptr += step;
+ w_im_ptr -= step;
+ }
+ }
+ step >>= 1;
+ n4 <<= 1;
+ }
+}
+
+/**
+ * MDCT/IMDCT transforms.
+ */
+
+static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ int k, n8, n4, n2, n, j;
+ const uint16_t *revtab = s->revtab;
+ const FFTSample *tcos = s->tcos;
+ const FFTSample *tsin = s->tsin;
+ const FFTSample *in1, *in2, *in3, *in4;
+ FFTComplex *z = (FFTComplex *)output;
+
+ int j1;
+ const float *tcos1, *tsin1, *tcos2, *tsin2;
+ float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+ FFTComplex *z1, *z2;
+
+ n = 1 << s->mdct_bits;
+ n2 = n >> 1;
+ n4 = n >> 2;
+ n8 = n >> 3;
+
+ /* pre rotation */
+ in1 = input;
+ in2 = input + n2 - 1;
+ in3 = input + 2;
+ in4 = input + n2 - 3;
+
+ tcos1 = tcos;
+ tsin1 = tsin;
+
+ /* n4 = 64 or 128 */
+ for(k = 0; k < n4; k += 2) {
+ j = revtab[k ];
+ j1 = revtab[k + 1];
+
+ __asm__ __volatile__ (
+ "lwc1 %[temp1], 0(%[in2]) \t\n"
+ "lwc1 %[temp2], 0(%[tcos1]) \t\n"
+ "lwc1 %[temp3], 0(%[tsin1]) \t\n"
+ "lwc1 %[temp4], 0(%[in1]) \t\n"
+ "lwc1 %[temp5], 0(%[in4]) \t\n"
+ "mul.s %[temp9], %[temp1], %[temp2] \t\n"
+ "mul.s %[temp10], %[temp1], %[temp3] \t\n"
+ "lwc1 %[temp6], 4(%[tcos1]) \t\n"
+ "lwc1 %[temp7], 4(%[tsin1]) \t\n"
+ "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
+ "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
+ "mul.s %[temp11], %[temp5], %[temp6] \t\n"
+ "mul.s %[temp12], %[temp5], %[temp7] \t\n"
+ "lwc1 %[temp8], 0(%[in3]) \t\n"
+ "addiu %[tcos1], %[tcos1], 8 \t\n"
+ "addiu %[tsin1], %[tsin1], 8 \t\n"
+ "addiu %[in1], %[in1], 16 \t\n"
+ "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
+ "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
+ "addiu %[in2], %[in2], -16 \t\n"
+ "addiu %[in3], %[in3], 16 \t\n"
+ "addiu %[in4], %[in4], -16 \t\n"
+
+ : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+ [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+ [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+ [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+ [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+ [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+ [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
+ [in1]"+r"(in1), [in2]"+r"(in2),
+ [in3]"+r"(in3), [in4]"+r"(in4)
+ );
+
+ z[j ].re = temp9;
+ z[j ].im = temp10;
+ z[j1].re = temp11;
+ z[j1].im = temp12;
+ }
+
+ s->fft_calc(s, z);
+
+ /* post rotation + reordering */
+ /* n8 = 32 or 64 */
+ for(k = 0; k < n8; k += 2) {
+ tcos1 = &tcos[n8 - k - 2];
+ tsin1 = &tsin[n8 - k - 2];
+ tcos2 = &tcos[n8 + k];
+ tsin2 = &tsin[n8 + k];
+ z1 = &z[n8 - k - 2];
+ z2 = &z[n8 + k ];
+
+ __asm__ __volatile__ (
+ "lwc1 %[temp1], 12(%[z1]) \t\n"
+ "lwc1 %[temp2], 4(%[tsin1]) \t\n"
+ "lwc1 %[temp3], 4(%[tcos1]) \t\n"
+ "lwc1 %[temp4], 8(%[z1]) \t\n"
+ "lwc1 %[temp5], 4(%[z1]) \t\n"
+ "mul.s %[temp9], %[temp1], %[temp2] \t\n"
+ "mul.s %[temp10], %[temp1], %[temp3] \t\n"
+ "lwc1 %[temp6], 0(%[tsin1]) \t\n"
+ "lwc1 %[temp7], 0(%[tcos1]) \t\n"
+ "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
+ "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
+ "mul.s %[temp11], %[temp5], %[temp6] \t\n"
+ "mul.s %[temp12], %[temp5], %[temp7] \t\n"
+ "lwc1 %[temp8], 0(%[z1]) \t\n"
+ "lwc1 %[temp1], 4(%[z2]) \t\n"
+ "lwc1 %[temp2], 0(%[tsin2]) \t\n"
+ "lwc1 %[temp3], 0(%[tcos2]) \t\n"
+ "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
+ "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
+ "mul.s %[temp13], %[temp1], %[temp2] \t\n"
+ "mul.s %[temp14], %[temp1], %[temp3] \t\n"
+ "lwc1 %[temp4], 0(%[z2]) \t\n"
+ "lwc1 %[temp5], 12(%[z2]) \t\n"
+ "lwc1 %[temp6], 4(%[tsin2]) \t\n"
+ "lwc1 %[temp7], 4(%[tcos2]) \t\n"
+ "nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n"
+ "madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n"
+ "mul.s %[temp15], %[temp5], %[temp6] \t\n"
+ "mul.s %[temp16], %[temp5], %[temp7] \t\n"
+ "lwc1 %[temp8], 8(%[z2]) \t\n"
+ "nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n"
+ "madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n"
+ : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+ [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+ [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+ [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+ [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+ [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+ [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+ [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
+ : [z1]"r"(z1), [z2]"r"(z2),
+ [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
+ [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
+ );
+
+ z1[1].re = temp9;
+ z1[1].im = temp14;
+ z2[0].re = temp13;
+ z2[0].im = temp10;
+
+ z1[0].re = temp11;
+ z1[0].im = temp16;
+ z2[1].re = temp15;
+ z2[1].im = temp12;
+ }
+}
+#else
+static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ int k, n8, n4, n2, n, j,j2;
+ const uint16_t *revtab = s->revtab;
+ const FFTSample *tcos = s->tcos;
+ const FFTSample *tsin = s->tsin;
+ const FFTSample *in1, *in2;
+ const FFTSample *in3, *in4;
+ FFTSample temp1, temp2, temp3, temp4;
+ FFTSample temp5, temp6, temp7, temp8;
+
+ FFTSample temp11, temp12, temp13, temp14;
+ FFTSample temp15, temp16, temp17, temp18;
+
+ FFTComplex *z = (FFTComplex *)output;
+
+ n = 1 << s->mdct_bits;
+ n2 = n >> 1;
+ n4 = n >> 2;
+ n8 = n >> 3;
+
+ /* pre rotation */
+ in1 = input;
+ in2 = input + n2 - 1;
+ in3 = input + 2;
+ in4 = input + n2 - 3;
+
+ for(k = 0; k < n4; k+=2) {
+ j=revtab[k];
+ j2=revtab[k+1];
+
+ temp1=*in2 * tcos[k];
+ temp2=*in1 * tsin[k];
+ temp3=*in2 * tsin[k];
+ temp4=*in1 * tcos[k];
+
+ temp5=*in4 * tcos[k+1];
+ temp6=*in3 * tsin[k+1];
+ temp7=*in4 * tsin[k+1];
+ temp8=*in3 * tcos[k+1];
+
+ z[j].re=temp1-temp2;
+ z[j].im=temp3+temp4;
+
+ z[j2].re=temp5-temp6;
+ z[j2].im=temp7+temp8;
+
+ in1 += 4;
+ in3 += 4;
+ in2 -= 4;
+ in4 -= 4;
+ }
+ s->fft_calc(s, z);
+
+ /* post rotation + reordering */
+ for(k = 0; k < n8; k+=2) {
+ temp1 = z[n8 - k - 1].im * tsin[n8 - k - 1];
+ temp2 = z[n8 - k - 1].re * tcos[n8 - k - 1];
+ temp3 = z[n8 - k - 1].im * tcos[n8 - k - 1];
+ temp4 = z[n8 - k - 1].re * tsin[n8 - k - 1];
+
+ temp5 = z[n8 + k].im * tsin[n8 + k];
+ temp6 = z[n8 + k].re * tcos[n8 + k];
+ temp7 = z[n8 + k].im * tcos[n8 + k];
+ temp8 = z[n8 + k].re * tsin[n8 + k];
+
+ temp11 = z[n8 - k - 2].im * tsin[n8 - k - 2];
+ temp12 = z[n8 - k - 2].re * tcos[n8 - k - 2];
+ temp13 = z[n8 - k - 2].im * tcos[n8 - k - 2];
+ temp14 = z[n8 - k - 2].re * tsin[n8 - k - 2];
+ temp15 = z[n8 + k + 1].im * tsin[n8 + k + 1];
+ temp16 = z[n8 + k + 1].re * tcos[n8 + k + 1];
+ temp17 = z[n8 + k + 1].im * tcos[n8 + k + 1];
+ temp18 = z[n8 + k + 1].re * tsin[n8 + k + 1];
+
+ z[n8 - k - 1].re = temp1 - temp2;
+ z[n8 - k - 1].im = temp7 + temp8;
+ z[n8 + k].re = temp5 - temp6;
+ z[n8 + k].im = temp3 + temp4;
+
+ z[n8 - k - 2].re = temp11 - temp12;
+ z[n8 - k - 2].im = temp17 + temp18;
+ z[n8 + k + 1].re = temp15 - temp16;
+ z[n8 + k + 1].im = temp13 + temp14;
+ }
+}
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Compute inverse MDCT of size N = 2^nbits
+ * @param output N samples
+ * @param input N/2 samples
+ */
+static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+ int k;
+ int n = 1 << s->mdct_bits;
+ int n2 = n >> 1;
+ int n4 = n >> 2;
+
+ ff_imdct_half_mips(s, output+n4, input);
+
+ for(k = 0; k < n4; k+=4) {
+ output[k] = -output[n2-k-1];
+ output[k+1] = -output[n2-k-2];
+ output[k+2] = -output[n2-k-3];
+ output[k+3] = -output[n2-k-4];
+
+ output[n-k-1] = output[n2+k];
+ output[n-k-2] = output[n2+k+1];
+ output[n-k-3] = output[n2+k+2];
+ output[n-k-4] = output[n2+k+3];
+ }
+}
+
+/**
+ * Compute MDCT of size N = 2^nbits
+ * @param input N samples
+ * @param out N/2 samples
+ */
+static void ff_mdct_calc_mips(FFTContext *s, FFTSample *out, const FFTSample *input)
+{
+ int i, j, j2, n, n8, n4, n2, n3;
+ FFTSample re, im;
+ FFTSample re2, im2;
+
+ const uint16_t *revtab = s->revtab;
+ const FFTSample *tcos = s->tcos;
+ const FFTSample *tsin = s->tsin;
+ FFTComplex *x = (FFTComplex *)out;
+ FFTSample temp1, temp2, temp3, temp4;
+
+ FFTSample temp5, temp6, temp7, temp8;
+
+ n = 1 << s->mdct_bits;
+ n2 = n >> 1;
+ n4 = n >> 2;
+ n8 = n >> 3;
+ n3 = 3 * n4;
+
+ /* pre rotation */
+ for(i=0;i<n8;i++) {
+ re = (-input[2*i+n3] - input[n3-1-2*i]);
+ im = (-input[n4+2*i] + input[n4-1-2*i]);
+
+ re2 = (input[2*i] - input[n2-1-2*i]);
+ im2 = (-input[n2+2*i] - input[ n-1-2*i]);
+
+ j = revtab[i];
+ j2 = revtab[n8 + i];
+
+ temp1 = re * tcos[i];
+ temp2 = im * tsin[i];
+ temp3 = re * tsin[i];
+ temp4 = im * tcos[i];
+
+ temp5 = re2 * tcos[n8 + i];
+ temp6 = im2 * tsin[n8 + i];
+ temp7 = re2 * tsin[n8 + i];
+ temp8 = im2 * tcos[n8 + i];
+
+ x[j].re = -(temp1 + temp2);
+ x[j].im = temp3 - temp4;
+
+ x[j2].re = -(temp5 + temp6);
+ x[j2].im = temp7 - temp8;
+ }
+
+ s->fft_calc(s, x);
+
+ /* post rotation */
+ for(i=0;i<n8;i++) {
+ temp1 = x[n8-i-1].re * tcos[n8-i-1];
+ temp2 = x[n8-i-1].im * tsin[n8-i-1];
+ temp3 =x[n8+i].re * tsin[n8+i];
+ temp4 =x[n8+i].im * tcos[n8+i];
+
+ temp5 = x[n8+i].re * tcos[n8+i];
+ temp6 = x[n8+i].im * tsin[n8+i];
+ temp7 = x[n8-i-1].re * tsin[n8-i-1];
+ temp8 = x[n8-i-1].im * tcos[n8-i-1];
+
+ x[n8-i-1].re = -(temp2+temp1);
+ x[n8-i-1].im = temp4-temp3;
+ x[n8+i].re = -(temp5+temp6);
+ x[n8+i].im = temp8-temp7 ;
+ }
+}
+
+av_cold void ff_fft_init_mips(FFTContext *s)
+{
+#if HAVE_INLINE_ASM
+ s->fft_calc = ff_fft_calc_mips;
+#endif
+#if CONFIG_MDCT
+ s->imdct_calc = ff_imdct_calc_mips;
+ s->imdct_half = ff_imdct_half_mips;
+ s->mdct_calc = ff_mdct_calc_mips;
+#endif
+}
diff --git a/libavcodec/mips/fft_table.h b/libavcodec/mips/fft_table.h
new file mode 100644
index 0000000..2888f21
--- /dev/null
+++ b/libavcodec/mips/fft_table.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Stanislav Ocovaj (socovaj at mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and LUT table for MIPS FFT
+ */
+#ifndef AVCODEC_MIPS_FFT_TABLE_H
+#define AVCODEC_MIPS_FFT_TABLE_H
+
+#include "libavcodec/fft.h"
+
+enum _fftConsts{
+ MIN_LOG2_NFFT = 5, //!< Specifies minimum allowed fft size
+ MAX_LOG2_NFFT = 12 //!< Specifies maximum allowed fft size
+};
+
+#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT)
+#define MIN_FFT_SIZE (1 << MIN_LOG2_NFFT)
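+
+/* i.e. supported transform sizes range from 32 (1 << 5) to 4096 (1 << 12) points */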
+
+short fft_offsets_lut[] =
+{
+ 0, 2, 3, 4, 6, 8, 10, 11, 12, 14, 15, 16, 18, 19, 20, 22, 24, 26, 27, 28,
+ 30, 32, 34, 35, 36, 38, 40, 42, 43, 44, 46, 47, 48, 50, 51, 52, 54, 56, 58, 59,
+ 60, 62, 63, 64, 66, 67, 68, 70, 72, 74, 75, 76, 78, 79, 80, 82, 83, 84, 86, 88,
+ 90, 91, 92, 94, 96, 98, 99, 100, 102, 104, 106, 107, 108, 110, 111, 112, 114, 115, 116, 118,
+ 120, 122, 123, 124, 126, 128, 130, 131, 132, 134, 136, 138, 139, 140, 142, 143, 144, 146, 147, 148,
+ 150, 152, 154, 155, 156, 158, 160, 162, 163, 164, 166, 168, 170, 171, 172, 174, 175, 176, 178, 179,
+ 180, 182, 184, 186, 187, 188, 190, 191, 192, 194, 195, 196, 198, 200, 202, 203, 204, 206, 207, 208,
+ 210, 211, 212, 214, 216, 218, 219, 220, 222, 224, 226, 227, 228, 230, 232, 234, 235, 236, 238, 239,
+ 240, 242, 243, 244, 246, 248, 250, 251, 252, 254, 255, 256, 258, 259, 260, 262, 264, 266, 267, 268,
+ 270, 271, 272, 274, 275, 276, 278, 280, 282, 283, 284, 286, 288, 290, 291, 292, 294, 296, 298, 299,
+ 300, 302, 303, 304, 306, 307, 308, 310, 312, 314, 315, 316, 318, 319, 320, 322, 323, 324, 326, 328,
+ 330, 331, 332, 334, 335, 336, 338, 339, 340, 342, 344, 346, 347, 348, 350, 352, 354, 355, 356, 358,
+ 360, 362, 363, 364, 366, 367, 368, 370, 371, 372, 374, 376, 378, 379, 380, 382, 384, 386, 387, 388,
+ 390, 392, 394, 395, 396, 398, 399, 400, 402, 403, 404, 406, 408, 410, 411, 412, 414, 416, 418, 419,
+ 420, 422, 424, 426, 427, 428, 430, 431, 432, 434, 435, 436, 438, 440, 442, 443, 444, 446, 447, 448,
+ 450, 451, 452, 454, 456, 458, 459, 460, 462, 463, 464, 466, 467, 468, 470, 472, 474, 475, 476, 478,
+ 480, 482, 483, 484, 486, 488, 490, 491, 492, 494, 495, 496, 498, 499, 500, 502, 504, 506, 507, 508,
+ 510, 512, 514, 515, 516, 518, 520, 522, 523, 524, 526, 527, 528, 530, 531, 532, 534, 536, 538, 539,
+ 540, 542, 544, 546, 547, 548, 550, 552, 554, 555, 556, 558, 559, 560, 562, 563, 564, 566, 568, 570,
+ 571, 572, 574, 575, 576, 578, 579, 580, 582, 584, 586, 587, 588, 590, 591, 592, 594, 595, 596, 598,
+ 600, 602, 603, 604, 606, 608, 610, 611, 612, 614, 616, 618, 619, 620, 622, 623, 624, 626, 627, 628,
+ 630, 632, 634, 635, 636, 638, 640, 642, 643, 644, 646, 648, 650, 651, 652, 654, 655, 656, 658, 659,
+ 660, 662, 664, 666, 667, 668, 670, 672, 674, 675, 676, 678, 680, 682, 683, 684, 686, 687, 688, 690,
+ 691, 692, 694, 696, 698, 699, 700, 702, 703, 704, 706, 707, 708, 710, 712, 714, 715, 716, 718, 719,
+ 720, 722, 723, 724, 726, 728, 730, 731, 732, 734, 736, 738, 739, 740, 742, 744, 746, 747, 748, 750,
+ 751, 752, 754, 755, 756, 758, 760, 762, 763, 764, 766, 767, 768, 770, 771, 772, 774, 776, 778, 779,
+ 780, 782, 783, 784, 786, 787, 788, 790, 792, 794, 795, 796, 798, 800, 802, 803, 804, 806, 808, 810,
+ 811, 812, 814, 815, 816, 818, 819, 820, 822, 824, 826, 827, 828, 830, 831, 832, 834, 835, 836, 838,
+ 840, 842, 843, 844, 846, 847, 848, 850, 851, 852, 854, 856, 858, 859, 860, 862, 864, 866, 867, 868,
+ 870, 872, 874, 875, 876, 878, 879, 880, 882, 883, 884, 886, 888, 890, 891, 892, 894, 896, 898, 899,
+ 900, 902, 904, 906, 907, 908, 910, 911, 912, 914, 915, 916, 918, 920, 922, 923, 924, 926, 928, 930,
+ 931, 932, 934, 936, 938, 939, 940, 942, 943, 944, 946, 947, 948, 950, 952, 954, 955, 956, 958, 959,
+ 960, 962, 963, 964, 966, 968, 970, 971, 972, 974, 975, 976, 978, 979, 980, 982, 984, 986, 987, 988,
+ 990, 992, 994, 995, 996, 998,1000,1002,1003,1004,1006,1007,1008,1010,1011,1012,1014,1016,1018,1019,
+ 1020,1022,1023
+};
+
+#endif /* AVCODEC_MIPS_FFT_TABLE_H */
diff --git a/libavcodec/mips/fmtconvert_mips.c b/libavcodec/mips/fmtconvert_mips.c
new file mode 100644
index 0000000..9e45ba1
--- /dev/null
+++ b/libavcodec/mips/fmtconvert_mips.c
@@ -0,0 +1,336 @@
+/*
+ * Format Conversion Utils for MIPS
+ *
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author: Zoran Lukic (zoranl at mips.com)
+ * Author: Nedeljko Babic (nbabic at mips.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+#if HAVE_INLINE_ASM
+static void float_to_int16_mips(int16_t *dst, const float *src, long len) {
+ const float *src_end = src + len;
+ int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
+ float src0, src1, src2, src3, src4, src5, src6, src7;
+
+ /*
+ * the loop is unrolled 8 times in assembly to achieve better performance
+ */
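+ /*
+ * Per sample this is the usual scalar pattern (sketch):
+ *     int v = (int)src[i];        // cvt.w.s (current rounding mode)
+ *     dst[i] = av_clip_int16(v);  // shll_s.w saturates on the left
+ *                                 // shift by 16, srl shifts back
+ */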
+ __asm__ __volatile__(
+ "beq %[len], $zero, fti16_end%= \n\t"
+ "fti16_lp%=: \n\t"
+ "lwc1 %[src0], 0(%[src]) \n\t"
+ "lwc1 %[src1], 4(%[src]) \n\t"
+ "lwc1 %[src2], 8(%[src]) \n\t"
+ "lwc1 %[src3], 12(%[src]) \n\t"
+ "cvt.w.s %[src0], %[src0] \n\t"
+ "cvt.w.s %[src1], %[src1] \n\t"
+ "cvt.w.s %[src2], %[src2] \n\t"
+ "cvt.w.s %[src3], %[src3] \n\t"
+ "mfc1 %[ret0], %[src0] \n\t"
+ "mfc1 %[ret1], %[src1] \n\t"
+ "mfc1 %[ret2], %[src2] \n\t"
+ "mfc1 %[ret3], %[src3] \n\t"
+ "lwc1 %[src4], 16(%[src]) \n\t"
+ "lwc1 %[src5], 20(%[src]) \n\t"
+ "lwc1 %[src6], 24(%[src]) \n\t"
+ "lwc1 %[src7], 28(%[src]) \n\t"
+ "cvt.w.s %[src4], %[src4] \n\t"
+ "cvt.w.s %[src5], %[src5] \n\t"
+ "cvt.w.s %[src6], %[src6] \n\t"
+ "cvt.w.s %[src7], %[src7] \n\t"
+ "addiu %[src], 32 \n\t"
+ "shll_s.w %[ret0], %[ret0], 16 \n\t"
+ "shll_s.w %[ret1], %[ret1], 16 \n\t"
+ "shll_s.w %[ret2], %[ret2], 16 \n\t"
+ "shll_s.w %[ret3], %[ret3], 16 \n\t"
+ "srl %[ret0], %[ret0], 16 \n\t"
+ "srl %[ret1], %[ret1], 16 \n\t"
+ "srl %[ret2], %[ret2], 16 \n\t"
+ "srl %[ret3], %[ret3], 16 \n\t"
+ "sh %[ret0], 0(%[dst]) \n\t"
+ "sh %[ret1], 2(%[dst]) \n\t"
+ "sh %[ret2], 4(%[dst]) \n\t"
+ "sh %[ret3], 6(%[dst]) \n\t"
+ "mfc1 %[ret4], %[src4] \n\t"
+ "mfc1 %[ret5], %[src5] \n\t"
+ "mfc1 %[ret6], %[src6] \n\t"
+ "mfc1 %[ret7], %[src7] \n\t"
+ "shll_s.w %[ret4], %[ret4], 16 \n\t"
+ "shll_s.w %[ret5], %[ret5], 16 \n\t"
+ "shll_s.w %[ret6], %[ret6], 16 \n\t"
+ "shll_s.w %[ret7], %[ret7], 16 \n\t"
+ "srl %[ret4], %[ret4], 16 \n\t"
+ "srl %[ret5], %[ret5], 16 \n\t"
+ "srl %[ret6], %[ret6], 16 \n\t"
+ "srl %[ret7], %[ret7], 16 \n\t"
+ "sh %[ret4], 8(%[dst]) \n\t"
+ "sh %[ret5], 10(%[dst]) \n\t"
+ "sh %[ret6], 12(%[dst]) \n\t"
+ "sh %[ret7], 14(%[dst]) \n\t"
+ "addiu %[dst], 16 \n\t"
+ "bne %[src], %[src_end], fti16_lp%= \n\t"
+ "fti16_end%=: \n\t"
+ : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
+ [ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
+ [src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
+ [src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
+ [src]"+r"(src), [dst]"+r"(dst)
+ : [src_end]"r"(src_end), [len]"r"(len)
+ : "memory"
+ );
+}
+
+static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
+ float mul, int len) {
+ /*
+ * variables used in inline assembler
+ */
+ float temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15;
+
+ int rpom1, rpom2, rpom11, rpom21, rpom12, rpom22, rpom13, rpom23;
+ const int *src_end = src + len;
+ /*
+ * the loop is unrolled 8 times in assembly to achieve better performance
+ */
+ __asm__ __volatile__ (
+ "i32tf_lp%=: \n\t"
+ "lw %[rpom11], 0(%[src]) \n\t"
+ "lw %[rpom21], 4(%[src]) \n\t"
+ "lw %[rpom1], 8(%[src]) \n\t"
+ "lw %[rpom2], 12(%[src]) \n\t"
+ "mtc1 %[rpom11], %[temp1] \n\t"
+ "mtc1 %[rpom21], %[temp3] \n\t"
+ "mtc1 %[rpom1], %[temp5] \n\t"
+ "mtc1 %[rpom2], %[temp7] \n\t"
+
+ "lw %[rpom13], 16(%[src]) \n\t"
+ "lw %[rpom23], 20(%[src]) \n\t"
+ "lw %[rpom12], 24(%[src]) \n\t"
+ "lw %[rpom22], 28(%[src]) \n\t"
+ "mtc1 %[rpom13], %[temp9] \n\t"
+ "mtc1 %[rpom23], %[temp11] \n\t"
+ "mtc1 %[rpom12], %[temp13] \n\t"
+ "mtc1 %[rpom22], %[temp15] \n\t"
+
+ "addiu %[src], 32 \n\t"
+ "cvt.s.w %[temp1], %[temp1] \n\t"
+ "cvt.s.w %[temp3], %[temp3] \n\t"
+ "cvt.s.w %[temp5], %[temp5] \n\t"
+ "cvt.s.w %[temp7], %[temp7] \n\t"
+
+ "cvt.s.w %[temp9], %[temp9] \n\t"
+ "cvt.s.w %[temp11], %[temp11] \n\t"
+ "cvt.s.w %[temp13], %[temp13] \n\t"
+ "cvt.s.w %[temp15], %[temp15] \n\t"
+
+ "mul.s %[temp1], %[temp1], %[mul] \n\t"
+ "mul.s %[temp3], %[temp3], %[mul] \n\t"
+ "mul.s %[temp5], %[temp5], %[mul] \n\t"
+ "mul.s %[temp7], %[temp7], %[mul] \n\t"
+
+ "mul.s %[temp9], %[temp9], %[mul] \n\t"
+ "mul.s %[temp11], %[temp11], %[mul] \n\t"
+ "mul.s %[temp13], %[temp13], %[mul] \n\t"
+ "mul.s %[temp15], %[temp15], %[mul] \n\t"
+
+ "swc1 %[temp1], 0(%[dst]) \n\t" /*dst[i] = src[i] * mul; */
+ "swc1 %[temp3], 4(%[dst]) \n\t" /*dst[i+1] = src[i+1] * mul;*/
+ "swc1 %[temp5], 8(%[dst]) \n\t" /*dst[i+2] = src[i+2] * mul;*/
+ "swc1 %[temp7], 12(%[dst]) \n\t" /*dst[i+3] = src[i+3] * mul;*/
+
+ "swc1 %[temp9], 16(%[dst]) \n\t" /*dst[i+4] = src[i+4] * mul;*/
+ "swc1 %[temp11], 20(%[dst]) \n\t" /*dst[i+5] = src[i+5] * mul;*/
+ "swc1 %[temp13], 24(%[dst]) \n\t" /*dst[i+6] = src[i+6] * mul;*/
+ "swc1 %[temp15], 28(%[dst]) \n\t" /*dst[i+7] = src[i+7] * mul;*/
+ "addiu %[dst], 32 \n\t"
+ "bne %[src], %[src_end], i32tf_lp%= \n\t"
+ : [temp1]"=&f"(temp1), [temp11]"=&f"(temp11),
+ [temp13]"=&f"(temp13), [temp15]"=&f"(temp15),
+ [temp3]"=&f"(temp3), [temp5]"=&f"(temp5),
+ [temp7]"=&f"(temp7), [temp9]"=&f"(temp9),
+ [rpom1]"=&r"(rpom1), [rpom2]"=&r"(rpom2),
+ [rpom11]"=&r"(rpom11), [rpom21]"=&r"(rpom21),
+ [rpom12]"=&r"(rpom12), [rpom22]"=&r"(rpom22),
+ [rpom13]"=&r"(rpom13), [rpom23]"=&r"(rpom23),
+ [dst]"+r"(dst), [src]"+r"(src)
+ : [mul]"f"(mul), [src_end]"r"(src_end)
+ : "memory"
+ );
+}
+
+static void float_to_int16_interleave_mips(int16_t *dst, const float **src, long len,
+ int channels)
+{
+ int c, ch2 = channels <<1;
+ int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
+ float src0, src1, src2, src3, src4, src5, src6, src7;
+ int16_t *dst_ptr0, *dst_ptr1, *dst_ptr2, *dst_ptr3;
+ int16_t *dst_ptr4, *dst_ptr5, *dst_ptr6, *dst_ptr7;
+ const float *src_ptr, *src_ptr2, *src_end;
+
+ if (channels == 2) {
+ src_ptr = &src[0][0];
+ src_ptr2 = &src[1][0];
+ src_end = src_ptr + len;
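+
+ /*
+ * Two-channel fast path (sketch of the loop below): per iteration
+ * dst[0] = clip16(src[0][i]); dst[1] = clip16(src[1][i]); dst += 2;
+ */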
+
+ __asm__ __volatile__ (
+ "fti16i2_lp%=: \n\t"
+ "lwc1 %[src0], 0(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 0(%[src_ptr2]) \n\t"
+ "addiu %[src_ptr], 4 \n\t"
+ "cvt.w.s $f9, %[src0] \n\t"
+ "cvt.w.s $f10, %[src1] \n\t"
+ "mfc1 %[ret0], $f9 \n\t"
+ "mfc1 %[ret1], $f10 \n\t"
+ "shll_s.w %[ret0], %[ret0], 16 \n\t"
+ "shll_s.w %[ret1], %[ret1], 16 \n\t"
+ "addiu %[src_ptr2], 4 \n\t"
+ "srl %[ret0], %[ret0], 16 \n\t"
+ "srl %[ret1], %[ret1], 16 \n\t"
+ "sh %[ret0], 0(%[dst]) \n\t"
+ "sh %[ret1], 2(%[dst]) \n\t"
+ "addiu %[dst], 4 \n\t"
+ "bne %[src_ptr], %[src_end], fti16i2_lp%= \n\t"
+ : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1),
+ [src0]"=&f"(src0), [src1]"=&f"(src1),
+ [src_ptr]"+r"(src_ptr), [src_ptr2]"+r"(src_ptr2),
+ [dst]"+r"(dst)
+ : [src_end]"r"(src_end)
+ : "memory"
+ );
+ } else {
+ for (c = 0; c < channels; c++)
+ {
+ src_ptr = &src[c][0];
+ dst_ptr0 = &dst[c];
+ src_end = src_ptr + len;
+ /*
+ * the loop is unrolled 8 times in assembly to achieve better performance
+ */
+ __asm__ __volatile__(
+ "fti16i_lp%=: \n\t"
+ "lwc1 %[src0], 0(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 4(%[src_ptr]) \n\t"
+ "lwc1 %[src2], 8(%[src_ptr]) \n\t"
+ "lwc1 %[src3], 12(%[src_ptr]) \n\t"
+ "cvt.w.s %[src0], %[src0] \n\t"
+ "cvt.w.s %[src1], %[src1] \n\t"
+ "cvt.w.s %[src2], %[src2] \n\t"
+ "cvt.w.s %[src3], %[src3] \n\t"
+ "mfc1 %[ret0], %[src0] \n\t"
+ "mfc1 %[ret1], %[src1] \n\t"
+ "mfc1 %[ret2], %[src2] \n\t"
+ "mfc1 %[ret3], %[src3] \n\t"
+ "lwc1 %[src4], 16(%[src_ptr]) \n\t"
+ "lwc1 %[src5], 20(%[src_ptr]) \n\t"
+ "lwc1 %[src6], 24(%[src_ptr]) \n\t"
+ "lwc1 %[src7], 28(%[src_ptr]) \n\t"
+ "addu %[dst_ptr1], %[dst_ptr0], %[ch2] \n\t"
+ "addu %[dst_ptr2], %[dst_ptr1], %[ch2] \n\t"
+ "addu %[dst_ptr3], %[dst_ptr2], %[ch2] \n\t"
+ "addu %[dst_ptr4], %[dst_ptr3], %[ch2] \n\t"
+ "addu %[dst_ptr5], %[dst_ptr4], %[ch2] \n\t"
+ "addu %[dst_ptr6], %[dst_ptr5], %[ch2] \n\t"
+ "addu %[dst_ptr7], %[dst_ptr6], %[ch2] \n\t"
+ "addiu %[src_ptr], 32 \n\t"
+ "cvt.w.s %[src4], %[src4] \n\t"
+ "cvt.w.s %[src5], %[src5] \n\t"
+ "cvt.w.s %[src6], %[src6] \n\t"
+ "cvt.w.s %[src7], %[src7] \n\t"
+ "shll_s.w %[ret0], %[ret0], 16 \n\t"
+ "shll_s.w %[ret1], %[ret1], 16 \n\t"
+ "shll_s.w %[ret2], %[ret2], 16 \n\t"
+ "shll_s.w %[ret3], %[ret3], 16 \n\t"
+ "srl %[ret0], %[ret0], 16 \n\t"
+ "srl %[ret1], %[ret1], 16 \n\t"
+ "srl %[ret2], %[ret2], 16 \n\t"
+ "srl %[ret3], %[ret3], 16 \n\t"
+ "sh %[ret0], 0(%[dst_ptr0]) \n\t"
+ "sh %[ret1], 0(%[dst_ptr1]) \n\t"
+ "sh %[ret2], 0(%[dst_ptr2]) \n\t"
+ "sh %[ret3], 0(%[dst_ptr3]) \n\t"
+ "mfc1 %[ret4], %[src4] \n\t"
+ "mfc1 %[ret5], %[src5] \n\t"
+ "mfc1 %[ret6], %[src6] \n\t"
+ "mfc1 %[ret7], %[src7] \n\t"
+ "shll_s.w %[ret4], %[ret4], 16 \n\t"
+ "shll_s.w %[ret5], %[ret5], 16 \n\t"
+ "shll_s.w %[ret6], %[ret6], 16 \n\t"
+ "shll_s.w %[ret7], %[ret7], 16 \n\t"
+ "srl %[ret4], %[ret4], 16 \n\t"
+ "srl %[ret5], %[ret5], 16 \n\t"
+ "srl %[ret6], %[ret6], 16 \n\t"
+ "srl %[ret7], %[ret7], 16 \n\t"
+ "sh %[ret4], 0(%[dst_ptr4]) \n\t"
+ "sh %[ret5], 0(%[dst_ptr5]) \n\t"
+ "sh %[ret6], 0(%[dst_ptr6]) \n\t"
+ "sh %[ret7], 0(%[dst_ptr7]) \n\t"
+ "addu %[dst_ptr0], %[dst_ptr7], %[ch2] \n\t"
+ "bne %[src_ptr], %[src_end], fti16i_lp%= \n\t"
+ : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
+ [ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
+ [src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
+ [src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
+ [dst_ptr1]"=&r"(dst_ptr1), [dst_ptr2]"=&r"(dst_ptr2), [dst_ptr3]"=&r"(dst_ptr3),
+ [dst_ptr4]"=&r"(dst_ptr4), [dst_ptr5]"=&r"(dst_ptr5), [dst_ptr6]"=&r"(dst_ptr6),
+ [dst_ptr7]"=&r"(dst_ptr7), [dst_ptr0]"+r"(dst_ptr0), [src_ptr]"+r"(src_ptr)
+ : [ch2]"r"(ch2), [src_end]"r"(src_end)
+ : "memory"
+ );
+ }
+ }
+}
+#endif
+
+av_cold void ff_fmt_convert_init_mips(FmtConvertContext *c) {
+#if HAVE_INLINE_ASM
+ c->float_to_int16_interleave = float_to_int16_interleave_mips;
+ c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips;
+ c->float_to_int16 = float_to_int16_mips;
+#endif
+}
--
1.7.3.4