[FFmpeg-devel] [PATCH] WMA Voice decoder
Ronald S. Bultje
rsbultje
Wed Feb 10 20:03:39 CET 2010
Hi,
On Wed, Feb 10, 2010 at 1:30 PM, Reimar Döffinger
<Reimar.Doeffinger at gmx.de> wrote:
> On Tue, Feb 09, 2010 at 03:01:18PM -0500, Ronald S. Bultje wrote:
>> static void dequant_lsps(double *restrict lsps, int num,
>>                          const uint16_t *values,
>>                          const uint16_t *sizes,
>>                          int n_stages, const uint8_t *table,
>>                          const double *restrict mul_q,
>>                          const double *restrict base_q)
>
> I think so, yes. Only way to really know is looking at the generated assembler
> code though...
So the concern was whether it'd reload base_q[n]/mul_q[n] within the
inner loop in dequant_lsps(). Disassembly (gdb):
0x0040fc00 <dequant_lsps+0>: push %ebp
0x0040fc01 <dequant_lsps+1>: push %edi
0x0040fc02 <dequant_lsps+2>: push %esi
0x0040fc03 <dequant_lsps+3>: push %ebx
0x0040fc04 <dequant_lsps+4>: sub $0x2c,%esp
0x0040fc07 <dequant_lsps+7>: mov %eax,%ebx
0x0040fc09 <dequant_lsps+9>: mov %edx,%edi
0x0040fc0b <dequant_lsps+11>: mov %ecx,0x1c(%esp)
0x0040fc0f <dequant_lsps+15>: mov 0x48(%esp),%ebp
0x0040fc13 <__inline_memset_chk+0>: lea 0x0(,%edx,8),%eax
0x0040fc1a <__inline_memset_chk+7>: mov %eax,0x8(%esp)
0x0040fc1e <__inline_memset_chk+11>: movl $0x0,0x4(%esp)
0x0040fc26 <__inline_memset_chk+19>: mov %ebx,(%esp)
0x0040fc29 <__inline_memset_chk+22>: call 0x4eaacc <dyld_stub_memset>
outer loop start:
0x0040fc2e <dequant_lsps+46>: mov 0x44(%esp),%eax
0x0040fc32 <dequant_lsps+50>: test %eax,%eax
0x0040fc34 <dequant_lsps+52>: jle 0x40fc95 <dequant_lsps+149>
0x0040fc36 <dequant_lsps+54>: mov $0x1,%ecx
0x0040fc3b <dequant_lsps+59>: mov 0x1c(%esp),%edx
0x0040fc3f <dequant_lsps+63>: movzwl -0x2(%edx,%ecx,2),%eax
0x0040fc44 <dequant_lsps+68>: imul %edi,%eax
0x0040fc47 <dequant_lsps+71>: lea (%eax,%ebp,1),%esi
inner loop start:
0x0040fc4a <dequant_lsps+74>: test %edi,%edi
0x0040fc4c <dequant_lsps+76>: jle 0x40fc7b <dequant_lsps+123>
0x0040fc4e <dequant_lsps+78>: xor %edx,%edx
0x0040fc50 <dequant_lsps+80>: movzbl (%edx,%esi,1),%eax
0x0040fc54 <dequant_lsps+84>: cvtsi2sd %eax,%xmm0
0x0040fc58 <dequant_lsps+88>: mov 0x4c(%esp),%eax <<
0x0040fc5c <dequant_lsps+92>: mulsd -0x8(%eax,%ecx,8),%xmm0 <<
0x0040fc62 <dequant_lsps+98>: mov 0x50(%esp),%eax <<
0x0040fc66 <dequant_lsps+102>: addsd -0x8(%eax,%ecx,8),%xmm0 <<
0x0040fc6c <dequant_lsps+108>: addsd (%ebx,%edx,8),%xmm0
0x0040fc71 <dequant_lsps+113>: movsd %xmm0,(%ebx,%edx,8)
0x0040fc76 <dequant_lsps+118>: inc %edx
0x0040fc77 <dequant_lsps+119>: cmp %edi,%edx
0x0040fc79 <dequant_lsps+121>: jne 0x40fc50 <dequant_lsps+80>
inner loop end
0x0040fc7b <dequant_lsps+123>: mov 0x40(%esp),%edx
0x0040fc7f <dequant_lsps+127>: movzwl -0x2(%edx,%ecx,2),%eax
0x0040fc84 <dequant_lsps+132>: cmp 0x44(%esp),%ecx
0x0040fc88 <dequant_lsps+136>: je 0x40fc95 <dequant_lsps+149>
0x0040fc8a <dequant_lsps+138>: movzwl %ax,%eax
0x0040fc8d <dequant_lsps+141>: imul %edi,%eax
0x0040fc90 <dequant_lsps+144>: add %eax,%ebp
0x0040fc92 <dequant_lsps+146>: inc %ecx
0x0040fc93 <dequant_lsps+147>: jmp 0x40fc3b <dequant_lsps+59>
0x0040fc95 <dequant_lsps+149>: add $0x2c,%esp
0x0040fc98 <dequant_lsps+152>: pop %ebx
0x0040fc99 <dequant_lsps+153>: pop %esi
0x0040fc9a <dequant_lsps+154>: pop %edi
0x0040fc9b <dequant_lsps+155>: pop %ebp
0x0040fc9c <dequant_lsps+156>: ret
So gcc is essentially not taking advantage of restrict (ecx tracks the
outer-loop index n during the inner loop, and mul_q[n]/base_q[n] are
read from memory directly on each use, see "<<"). Not sure how to fix
gcc's stupidity here. Removing restrict and manually loading the
values into temporary variables:
for (n = 0; n < n_stages; n++) {
const uint8_t *t_off = &table[values[n] * num];
double base = base_q[n], mul = mul_q[n];
for (m = 0; m < num; m++)
lsps[m] += base + mul * t_off[m];
table += sizes[n] * num;
}
gives:
outer loop start:
0x0040fc0e <dequant_lsps+46>: mov 0x44(%esp),%eax
0x0040fc12 <dequant_lsps+50>: test %eax,%eax
0x0040fc14 <dequant_lsps+52>: jle 0x40fc89 <dequant_lsps+169>
0x0040fc16 <dequant_lsps+54>: mov $0x1,%edi
0x0040fc1b <dequant_lsps+59>: mov 0x1c(%esp),%edx
0x0040fc1f <dequant_lsps+63>: movzwl -0x2(%edx,%edi,2),%eax
0x0040fc24 <dequant_lsps+68>: imul %esi,%eax
0x0040fc27 <dequant_lsps+71>: lea (%eax,%ebp,1),%ecx
0x0040fc2a <dequant_lsps+74>: mov 0x50(%esp),%eax
0x0040fc2e <dequant_lsps+78>: movsd -0x8(%eax,%edi,8),%xmm2
0x0040fc34 <dequant_lsps+84>: mov 0x4c(%esp),%edx
0x0040fc38 <dequant_lsps+88>: movsd -0x8(%edx,%edi,8),%xmm1
inner loop start:
0x0040fc3e <dequant_lsps+94>: test %esi,%esi
0x0040fc40 <dequant_lsps+96>: jle 0x40fc6f <dequant_lsps+143>
0x0040fc42 <dequant_lsps+98>: xor %edx,%edx
0x0040fc44 <dequant_lsps+100>: nopw 0x0(%eax,%eax,1)
0x0040fc4a <dequant_lsps+106>: nopw 0x0(%eax,%eax,1)
0x0040fc50 <dequant_lsps+112>: movzbl (%edx,%ecx,1),%eax
0x0040fc54 <dequant_lsps+116>: cvtsi2sd %eax,%xmm0
0x0040fc58 <dequant_lsps+120>: mulsd %xmm1,%xmm0 <<
0x0040fc5c <dequant_lsps+124>: addsd %xmm2,%xmm0 <<
0x0040fc60 <dequant_lsps+128>: addsd (%ebx,%edx,8),%xmm0
0x0040fc65 <dequant_lsps+133>: movsd %xmm0,(%ebx,%edx,8)
0x0040fc6a <dequant_lsps+138>: inc %edx
0x0040fc6b <dequant_lsps+139>: cmp %esi,%edx
0x0040fc6d <dequant_lsps+141>: jne 0x40fc50 <dequant_lsps+112>
inner loop end
0x0040fc6f <dequant_lsps+143>: mov 0x40(%esp),%edx
0x0040fc73 <dequant_lsps+147>: movzwl -0x2(%edx,%edi,2),%eax
0x0040fc78 <dequant_lsps+152>: cmp 0x44(%esp),%edi
0x0040fc7c <dequant_lsps+156>: je 0x40fc89 <dequant_lsps+169>
0x0040fc7e <dequant_lsps+158>: movzwl %ax,%eax
0x0040fc81 <dequant_lsps+161>: imul %esi,%eax
0x0040fc84 <dequant_lsps+164>: add %eax,%ebp
0x0040fc86 <dequant_lsps+166>: inc %edi
0x0040fc87 <dequant_lsps+167>: jmp 0x40fc1b <dequant_lsps+59>
So 6 (instead of 8) instructions in the inner loop, and I have no idea
what those nops are doing above there (alignment?). Shall I use this
version then?
Ronald
More information about the ffmpeg-devel
mailing list