[FFmpeg-devel] [RFC/PATCH] More flexible variant of float_to_int16, WMA optimization, Vorbis

Tue Jul 15 16:58:23 CEST 2008

On Tue, 15 Jul 2008, Michael Niedermayer wrote:

> May I suggest an array of src pointers instead of stride?
> The reason is, if we want to use this function in the future in a generic
> converter, stride will not be enough because we will likely have an
> array of source pointers from the user.
> Besides, it would allow reordering channels.

Done. (The WMA part breaks with the C version, since it doesn't use the [384,386] bias yet.)

> It also might be worth looking at mplayer/liba52/resample_mmx.c; maybe some
> of that code could be reused, especially as we do not have an MMX
> float_to_int16. Besides, the trick used there could be tried with SSE2.

I'm not very interested in optimizing for pentium2 / k6-1. I'm not sure I 
could, anyway; that's so far removed from anything I can benchmark on.

--Loren Merritt
-------------- next part --------------
Index: dsputil.c
===================================================================

--- dsputil.c	(revision 14207)
+++ dsputil.c	(working copy)
@@ -3962,17 +3962,17 @@
         dst[i] = float_to_int16_one(src+i);
 }
 
-void ff_float_to_int16_interleave_c(int16_t *dst, const float *src, long len, int channels){
+void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
     int i,j,c;
     if(channels==2){
         for(i=0; i<len; i++){
-            dst[2*i]   = float_to_int16_one(src+i);
-            dst[2*i+1] = float_to_int16_one(src+i+len);
+            dst[2*i]   = float_to_int16_one(src[0]+i);
+            dst[2*i+1] = float_to_int16_one(src[1]+i);
         }
     }else{
-        for(c=0; c<channels; c++, src+=len)
+        for(c=0; c<channels; c++)
             for(i=0, j=c; i<len; i++, j+=channels)
-                dst[j] = float_to_int16_one(src+i);
+                dst[j] = float_to_int16_one(src[c]+i);
     }
 }
 
Index: dsputil.h
===================================================================
--- dsputil.h	(revision 14207)
+++ dsputil.h	(working copy)
@@ -372,7 +372,7 @@
     /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
      * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
     void (*float_to_int16)(int16_t *dst, const float *src, long len);
-    void (*float_to_int16_interleave)(int16_t *dst, const float *src, long len, int channels);
+    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
 
     /* (I)DCT */
     void (*fdct)(DCTELEM *block/* align 16*/);
Index: i386/dsputil_mmx.c
===================================================================
--- i386/dsputil_mmx.c	(revision 14236)
+++ i386/dsputil_mmx.c	(working copy)
@@ -2156,32 +2156,32 @@
 
 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
-static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float *src, long len, int channels){\
-    DECLARE_ALIGNED_16(int16_t, tmp[len*channels]);\
+static av_noinline void float_to_int16_interleave2_##cpu(int16_t *dst, const float **src, long len, int channels){\
+    DECLARE_ALIGNED_16(int16_t, tmp[len]);\
     int i,j,c;\
-    float_to_int16_##cpu(tmp, src, len*channels);\
     for(c=0; c<channels; c++){\
-        int16_t *ptmp = tmp+c*len;\
+        float_to_int16_##cpu(tmp, src[c], len);\
         for(i=0, j=c; i<len; i++, j+=channels)\
-            dst[j] = ptmp[i];\
+            dst[j] = tmp[i];\
     }\
 }\
 \
-static void float_to_int16_interleave_##cpu(int16_t *dst, const float *src, long len, int channels){\
+static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
     if(channels==1)\
-        float_to_int16_##cpu(dst, src, len);\
+        float_to_int16_##cpu(dst, src[0], len);\
     else if(channels>2)\
         float_to_int16_interleave2_##cpu(dst, src, len, channels);\
     else{\
-        float *src1;\
+        const float *src0 = src[0];\
+        const float *src1 = src[1];\
         asm volatile(\
             "shl $2, %0 \n"\
             "add %0, %1 \n"\
             "add %0, %2 \n"\
-            "lea (%2,%0), %3 \n"\
+            "add %0, %3 \n"\
             "neg %0 \n"\
             body\
-            :"+r"(len), "+r"(dst), "+r"(src), "=r"(src1)\
+            :"+r"(len), "+r"(dst), "+r"(src0), "+r"(src1)\
         );\
     }\
 }
Index: vorbis_dec.c
===================================================================
--- vorbis_dec.c	(revision 14207)
+++ vorbis_dec.c	(working copy)
@@ -1551,6 +1551,8 @@
 {
     vorbis_context *vc = avccontext->priv_data ;
     GetBitContext *gb = &(vc->gb);
+    const float *channel_ptrs[vc->audio_channels];
+    int i;
 
     int_fast16_t len;
 
@@ -1577,7 +1579,9 @@
 
     AV_DEBUG("parsed %d bytes %d bits, returned %d samples (*ch*bits) \n", get_bits_count(gb)/8, get_bits_count(gb)%8, len);
 
-    vc->dsp.float_to_int16_interleave(data, vc->ret, len, vc->audio_channels);
+    for(i=0; i<vc->audio_channels; i++)
+        channel_ptrs[i] = vc->ret+i*len;
+    vc->dsp.float_to_int16_interleave(data, channel_ptrs, len, vc->audio_channels);
     *data_size=len*2*vc->audio_channels;
 
     return buf_size ;
Index: wmadec.c
===================================================================
--- wmadec.c	(revision 14207)
+++ wmadec.c	(working copy)
@@ -715,9 +715,8 @@
 /* decode a frame of frame_len samples */
 static int wma_decode_frame(WMACodecContext *s, int16_t *samples)
 {
-    int ret, i, n, ch, incr;
-    int16_t *ptr;
-    float *iptr;
+    int ret, ch;
+    const float *channel_ptrs[s->nb_channels];
 
 #ifdef TRACE
     tprintf(s->avctx, "***decode_frame: %d size=%d\n", s->frame_count++, s->frame_len);
@@ -734,19 +733,13 @@
             break;
     }
 
-    /* convert frame to integer */
-    n = s->frame_len;
-    incr = s->nb_channels;
-    for(ch = 0; ch < s->nb_channels; ch++) {
-        ptr = samples + ch;
-        iptr = s->frame_out[ch];
+    for(ch=0; ch<s->nb_channels; ch++)
+        channel_ptrs[ch] = s->frame_out[ch];
+    s->dsp.float_to_int16_interleave(samples, channel_ptrs, s->frame_len, s->nb_channels);
 
-        for(i=0;i<n;i++) {
-            *ptr = av_clip_int16(lrintf(*iptr++));
-            ptr += incr;
-        }
+    for(ch = 0; ch < s->nb_channels; ch++) {
         /* prepare for next block */
-        memmove(&s->frame_out[ch][0], &s->frame_out[ch][s->frame_len],
+        memcpy(&s->frame_out[ch][0], &s->frame_out[ch][s->frame_len],
                 s->frame_len * sizeof(float));
     }