[FFmpeg-devel] [PATCH] SSE-optimized vector_clipf()

Sun Aug 9 20:06:17 CEST 2009

This version is faster than x87 for me, though of course slower than
scalar (non-SIMD) SSE:

/**
 * Clip one value, given as a raw 32-bit pattern, using unsigned integer
 * compares only.
 *
 * The comparisons below order the patterns the way IEEE 754 binary32
 * floats are ordered, provided the lower bound is negative and the upper
 * bound is positive (the only case the visible caller uses):
 *  - a negative bound has the sign bit set, so every pattern without the
 *    sign bit compares below it, and among patterns with the sign bit a
 *    larger unsigned value means a more negative number;
 *  - flipping the sign bit of both sides turns the upper-bound test into
 *    the mirror image of the lower-bound test.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of the (negative) lower bound
 * @param maxi     bit pattern of the (positive) upper bound
 * @param maxisign maxi with its sign bit flipped, precomputed by the caller
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    /* Sign set and magnitude beyond the lower bound: clamp to it. */
    if (a > mini)
        a = mini;
    /* Build the sign mask in unsigned arithmetic: 1<<31 on a signed int
     * shifts into the sign bit, which is undefined behaviour. */
    if ((a ^ ((uint32_t)1 << 31)) > maxisign)
        a = maxi;
    return a;
}

/**
 * Clip a single float through the integer-compare helper clipf_c_one().
 *
 * The float <-> bit-pattern conversion goes through a union so that no
 * object is accessed through a pointer of an incompatible type.
 * Assumes float is a 32-bit type with the same size as uint32_t.
 */
static inline float vector_clipf_c_elem(float x, uint32_t mini,
                                        uint32_t maxi, uint32_t maxisign)
{
    union { float f; uint32_t u; } v;

    v.f = x;
    v.u = clipf_c_one(v.u, mini, maxi, maxisign);
    return v.f;
}

/**
 * Clip every element of dst, in place, to the range [min, max].
 *
 * When min < 0 < max the elements are clipped with unsigned integer
 * compares on their bit patterns (see clipf_c_one()); on that path a NaN
 * element is replaced by min or max according to its sign bit. For any
 * other combination of bounds a plain floating-point clamp is used, which
 * leaves NaN elements unchanged.
 *
 * @param dst array of len floats, modified in place
 * @param min lower bound; expected to be <= max
 * @param max upper bound
 * @param len number of elements; any value is accepted, values <= 0 make
 *            the call a no-op
 */
static void vector_clipf_c(float *dst, float min, float max, int len) {
    int i;

    if (min < 0 && max > 0) {
        union { float f; uint32_t u; } lo, hi;
        uint32_t mini, maxi, maxisign;

        lo.f = min;
        hi.f = max;
        mini = lo.u;
        maxi = hi.u;
        /* unsigned shift: 1<<31 on a signed int is undefined behaviour */
        maxisign = maxi ^ ((uint32_t)1 << 31);

        /* Main loop, unrolled by 8; runs only while 8 full elements remain
         * so it never touches memory past dst[len - 1]. */
        for (i = 0; len - i >= 8; i += 8) {
            dst[i + 0] = vector_clipf_c_elem(dst[i + 0], mini, maxi, maxisign);
            dst[i + 1] = vector_clipf_c_elem(dst[i + 1], mini, maxi, maxisign);
            dst[i + 2] = vector_clipf_c_elem(dst[i + 2], mini, maxi, maxisign);
            dst[i + 3] = vector_clipf_c_elem(dst[i + 3], mini, maxi, maxisign);
            dst[i + 4] = vector_clipf_c_elem(dst[i + 4], mini, maxi, maxisign);
            dst[i + 5] = vector_clipf_c_elem(dst[i + 5], mini, maxi, maxisign);
            dst[i + 6] = vector_clipf_c_elem(dst[i + 6], mini, maxi, maxisign);
            dst[i + 7] = vector_clipf_c_elem(dst[i + 7], mini, maxi, maxisign);
        }
        /* Tail: the remaining len % 8 elements. */
        for (; i < len; i++)
            dst[i] = vector_clipf_c_elem(dst[i], mini, maxi, maxisign);
    } else {
        /* Generic path: the integer trick relies on min < 0 < max, so any
         * other combination of signs is clamped with float compares. */
        for (i = 0; i < len; i++) {
            float v = dst[i];

            if (v < min)
                v = min;
            if (v > max)
                v = max;
            dst[i] = v;
        }
    }
}

--Loren Merritt