[FFmpeg-devel] [RFC/PATCH] More flexible variants of float_to_int16, WMA optimization, Vorbis
Loren Merritt
lorenm
Wed Jul 16 02:32:34 CEST 2008
On Wed, 16 Jul 2008, Siarhei Siamashka wrote:
>
> Well, merging the loops that are run after iFFT and combining them with
> windowing code can probably provide interesting results. At least it should
> eliminate a lot of intermediate load and store operations. Maybe having the
> iFFT output processed in a single loop could allow reading the old saved data
> and replacing it with new saved data at the same time, at least in simple
> cases where the previous and current blocks have the same size.
sure, I'll try it.
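Something along these lines for the equal-block-size case -- untested sketch just
to show the shape, function and variable names are invented (z/win/n4/n8 as in
mdct.c, prev = the old saved data, bias = fadd_bias):

    static void imdct_window_save(float *ret, float *saved, const float *prev,
                                  const FFTComplex *z, const float *win,
                                  float bias, int n4)
    {
        int n8 = n4 >> 1, k;
        for (k = 0; k < n8; k++) {
            // the values ff_imdct_half would write to the left half of its output
            float l1 =  z[n8+k  ].im;            // out_left[n4-1-2k]
            float l0 = -z[n8-1-k].re;            // out_left[n4-2-2k]
            int   j1 = n4-1-2*k;
            int   j0 = n4-2-2*k;
            // same arithmetic as vector_fmul_window(ret, prev, out_left, win, bias, n4)
            ret[n4-1-j1] = prev[n4-1-j1]*win[n4+j1]   - l1*win[n4-1-j1] + bias;
            ret[n4+j1]   = prev[n4-1-j1]*win[n4-1-j1] + l1*win[n4+j1]   + bias;
            ret[n4-1-j0] = prev[n4-1-j0]*win[n4+j0]   - l0*win[n4-1-j0] + bias;
            ret[n4+j0]   = prev[n4-1-j0]*win[n4-1-j0] + l0*win[n4+j0]   + bias;
            // what the removed memcpy used to save for the next frame (right half)
            saved[2*k  ] = -z[n8+k  ].re;
            saved[2*k+1] =  z[n8-1-k].im;
        }
    }

(n4-1-j1 is just 2k and n4-1-j0 is 2k+1, so the loads/stores pair up into
contiguous 2-element groups and should still vectorize.)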
>> See patch (which won't apply to svn, since it depends on other patches I
>> haven't committed yet, but the strategy should be clear).
>
> Hmm, did you forget to attach this patch?
oops
> But could you also benchmark the SSE version of float_to_int16_interleave from
> my original submission on the cores where SSE2 was winning? It is quite a bit
> faster than the code currently in SVN in my tests:
> FLOAT_TO_INT16_INTERLEAVE(sse,
> "1: \n"
> "cvtps2pi (%2,%0), %%mm0 \n"
> "cvtps2pi 8(%2,%0), %%mm2 \n"
> "cvtps2pi (%3,%0), %%mm1 \n"
> "cvtps2pi 8(%3,%0), %%mm3 \n"
> "add $16, %0 \n"
> "packssdw %%mm1, %%mm0 \n"
> "packssdw %%mm3, %%mm2 \n"
> "pshufw $0xD8, %%mm0, %%mm0 \n"
> "pshufw $0xD8, %%mm2, %%mm2 \n"
> "movq %%mm0, -16(%1,%0) \n"
> "movq %%mm2, -8(%1,%0) \n"
> "js 1b \n"
> "emms \n"
k8:
1139 float_to_int16_interleave_siarhei
1161 float_to_int16_interleave_sse
1304 float_to_int16_interleave_sse2
conroe:
978 float_to_int16_interleave_siarhei
1030 float_to_int16_interleave_sse
1071 float_to_int16_interleave_sse2
penryn:
997 float_to_int16_interleave_siarhei
1062 float_to_int16_interleave_sse
782 float_to_int16_interleave_sse2
prescott-celeron:
3846 float_to_int16_interleave_siarhei
3500 float_to_int16_interleave_sse
2219 float_to_int16_interleave_sse2
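(For reference, ignoring the add_bias/exp_bias trickery the C code in dsputil
relies on, what these interleave routines compute for the 2-channel case is
roughly the following -- hypothetical reference code, not what's in SVN:)

    #include <math.h>
    #include <stdint.h>

    static void float_to_int16_interleave2_ref(int16_t *dst, const float *left,
                                               const float *right, long len)
    {
        long i;
        for (i = 0; i < len; i++) {
            // round with the current rounding mode (cvtps2pi), saturate to
            // int16 (packssdw) and interleave L/R
            long l = lrintf(left[i]);
            long r = lrintf(right[i]);
            dst[2*i  ] = l > 32767 ? 32767 : l < -32768 ? -32768 : l;
            dst[2*i+1] = r > 32767 ? 32767 : r < -32768 ? -32768 : r;
        }
    }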
--Loren Merritt
-------------- next part --------------
Index: vorbis_dec.c
===================================================================
--- vorbis_dec.c (revision 14251)
+++ vorbis_dec.c (working copy)
@@ -152,7 +152,7 @@
uint_fast8_t previous_window;
float *channel_residues;
float *channel_floors;
- float *saved;
+ float **saved;
uint_fast32_t add_bias; // for float->int conversion
uint_fast32_t exp_bias;
} vorbis_context;
@@ -177,6 +177,8 @@
av_freep(&vc->channel_residues);
av_freep(&vc->channel_floors);
+ for(i=0; i<=vc->audio_channels; i++)
+ av_freep(vc->saved+i);
av_freep(&vc->saved);
av_freep(&vc->residues);
@@ -844,6 +846,7 @@
static int vorbis_parse_id_hdr(vorbis_context *vc){
GetBitContext *gb=&vc->gb;
uint_fast8_t bl0, bl1;
+ int i;
if ((get_bits(gb, 8)!='v') || (get_bits(gb, 8)!='o') ||
(get_bits(gb, 8)!='r') || (get_bits(gb, 8)!='b') ||
@@ -893,7 +896,9 @@
vc->channel_residues= av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float));
vc->channel_floors = av_malloc((vc->blocksize[1]/2)*vc->audio_channels * sizeof(float));
- vc->saved = av_mallocz((vc->blocksize[1]/4)*vc->audio_channels * sizeof(float));
+ vc->saved = av_malloc((vc->audio_channels+1) * sizeof(float*)); // +1 scratch buffer for the FFSWAP below
+ for(i=0; i<=vc->audio_channels; i++)
+ vc->saved[i] = av_mallocz((vc->blocksize[1]/4) * sizeof(float));
vc->previous_window=0;
ff_mdct_init(&vc->mdct[0], bl0, 1);
@@ -1522,12 +1527,13 @@
uint_fast16_t bs1=vc->blocksize[1];
float *residue=vc->channel_residues+res_chan[j]*blocksize/2;
float *floor=vc->channel_floors+j*blocksize/2;
- float *saved=vc->saved+j*bs1/4;
+ float *saved=vc->saved[j];
float *ret=vc->channel_residues+j*retlen;
float *buf=floor;
const float *win=vc->win[blockflag&previous_window];
- vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, floor, residue);
+ FFSWAP(float*, vc->saved[j], vc->saved[vc->audio_channels]);
+ vc->mdct[0].fft.imdct_half(&vc->mdct[blockflag], buf, vc->saved[j], floor, residue);
if(blockflag == previous_window) {
vc->dsp.vector_fmul_window(ret, saved, buf, win, fadd_bias, blocksize/4);
@@ -1538,7 +1544,6 @@
copy_normalize(ret, saved, (bs1-bs0)/4, vc->exp_bias, fadd_bias);
vc->dsp.vector_fmul_window(ret+(bs1-bs0)/4, saved+(bs1-bs0)/4, buf, win, fadd_bias, bs0/4);
}
- memcpy(saved, buf+blocksize/4, blocksize/4*sizeof(float));
}
vc->previous_window = blockflag;
Index: dsputil.h
===================================================================
--- dsputil.h (revision 14207)
+++ dsputil.h (working copy)
@@ -641,7 +641,7 @@
void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
void (*imdct_calc)(struct MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp);
- void (*imdct_half)(struct MDCTContext *s, FFTSample *output,
+ void (*imdct_half)(struct MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
const FFTSample *input, FFTSample *tmp);
} FFTContext;
@@ -688,15 +688,15 @@
int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
void ff_imdct_calc(MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half(MDCTContext *s, FFTSample *output,
+void ff_imdct_half(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
const FFTSample *input, FFTSample *tmp);
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
const FFTSample *input, FFTSample *tmp);
void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
const FFTSample *input, FFTSample *tmp);
-void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
const FFTSample *input, FFTSample *tmp);
void ff_mdct_calc(MDCTContext *s, FFTSample *out,
const FFTSample *input, FFTSample *tmp);
Index: mdct.c
===================================================================
--- mdct.c (revision 14207)
+++ mdct.c (working copy)
@@ -166,11 +166,12 @@
/**
* Compute the middle half of the inverse MDCT of size N = 2^nbits,
* thus excluding the parts that can be derived by symmetry
- * @param output N/2 samples
+ * @param out_left N/4 samples
+ * @param out_right N/4 samples
* @param input N/2 samples
* @param tmp N/2 samples
*/
-void ff_imdct_half(MDCTContext *s, FFTSample *output,
+void ff_imdct_half(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
const FFTSample *input, FFTSample *tmp)
{
int k, n8, n4, n;
@@ -181,11 +182,12 @@
imdct_c(s, input, tmp);
+ out_left += n4-1;
for(k = 0; k < n8; k++) {
- output[n4-1-2*k] = z[n8+k].im;
- output[n4-1-2*k-1] = -z[n8-k-1].re;
- output[n4 + 2*k] = -z[n8+k].re;
- output[n4 + 2*k+1] = z[n8-k-1].im;
+ out_left[-2*k] = z[n8+k].im;
+ out_left[-2*k-1] = -z[n8-k-1].re;
+ out_right[2*k] = -z[n8+k].re;
+ out_right[2*k+1] = z[n8-k-1].im;
}
}
Index: i386/fft_sse.c
===================================================================
--- i386/fft_sse.c (revision 14207)
+++ i386/fft_sse.c (working copy)
@@ -313,7 +313,7 @@
);
}
-void ff_imdct_half_sse(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_sse(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
const FFTSample *input, FFTSample *tmp)
{
x86_reg j, k;
@@ -331,8 +331,8 @@
asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1));
asm volatile(
"1: \n\t"
- "movaps (%3,%1), %%xmm0 \n\t"
- "movaps (%3,%0), %%xmm1 \n\t"
+ "movaps (%4,%1), %%xmm0 \n\t"
+ "movaps (%4,%0), %%xmm1 \n\t"
"xorps %%xmm7, %%xmm0 \n\t"
"movaps %%xmm0, %%xmm2 \n\t"
"shufps $141,%%xmm1, %%xmm0 \n\t"
@@ -340,13 +340,13 @@
"shufps $54, %%xmm0, %%xmm0 \n\t"
"shufps $156,%%xmm2, %%xmm2 \n\t"
"xorps %%xmm7, %%xmm0 \n\t"
- "movaps %%xmm2, (%2,%1) \n\t"
+ "movaps %%xmm2, (%3,%1) \n\t"
"movaps %%xmm0, (%2,%0) \n\t"
"sub $16, %1 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
:"+r"(j), "+r"(k)
- :"r"(output+n4), "r"(z+n8)
+ :"r"(out_left+n4), "r"(out_right), "r"(z+n8)
:"memory"
);
}
Index: i386/fft_3dn2.c
===================================================================
--- i386/fft_3dn2.c (revision 14207)
+++ i386/fft_3dn2.c (working copy)
@@ -224,7 +224,7 @@
asm volatile("femms");
}
-void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *out_left, FFTSample *out_right,
const FFTSample *input, FFTSample *tmp)
{
x86_reg j, k;
@@ -242,20 +242,20 @@
asm volatile("movd %0, %%mm7" ::"r"(1<<31));
asm volatile(
"1: \n\t"
- "movq (%3,%1), %%mm0 \n\t" // z[n8+k]
- "pswapd (%3,%0), %%mm1 \n\t" // z[n8-1-k]
+ "movq (%4,%1), %%mm0 \n\t" // z[n8+k]
+ "pswapd (%4,%0), %%mm1 \n\t" // z[n8-1-k]
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm1, %%mm0 \n\t"
"punpckhdq %%mm2, %%mm1 \n\t"
"pxor %%mm7, %%mm0 \n\t"
"pxor %%mm7, %%mm1 \n\t"
- "movq %%mm0, (%2,%1) \n\t" // output[n4+2*k] = { -z[n8+k].re, z[n8-1-k].im }
+ "movq %%mm0, (%3,%1) \n\t" // output[n4+2*k] = { -z[n8+k].re, z[n8-1-k].im }
"movq %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
"sub $8, %1 \n\t"
"add $8, %0 \n\t"
"jl 1b \n\t"
:"+r"(j), "+r"(k)
- :"r"(output+n4), "r"(z+n8)
+ :"r"(out_left+n4), "r"(out_right), "r"(z+n8)
:"memory"
);
asm volatile("femms");