[FFmpeg-devel] [PATCH] WMA Voice decoder

Wed Feb 10 20:03:39 CET 2010

Hi,

On Wed, Feb 10, 2010 at 1:30 PM, Reimar Döffinger
<Reimar.Doeffinger at gmx.de> wrote:
> On Tue, Feb 09, 2010 at 03:01:18PM -0500, Ronald S. Bultje wrote:
>> static void dequant_lsps(double *restrict lsps, int num,
>>                          const uint16_t *values,
>>                          const uint16_t *sizes,
>>                          int n_stages, const uint8_t *table,
>>                          const double *restrict mul_q,
>>                          const double *restrict base_q)
>
> I think so, yes. Only way to really know is looking at the generated assembler
> code though...

So the concern was whether it'd reload base_q[n]/mul_q[n] within the
inner loop in dequant_lsps(). Disassembly (gdb):

0x0040fc00 <dequant_lsps+0>:	push   %ebp
0x0040fc01 <dequant_lsps+1>:	push   %edi
0x0040fc02 <dequant_lsps+2>:	push   %esi
0x0040fc03 <dequant_lsps+3>:	push   %ebx
0x0040fc04 <dequant_lsps+4>:	sub    $0x2c,%esp
0x0040fc07 <dequant_lsps+7>:	mov    %eax,%ebx
0x0040fc09 <dequant_lsps+9>:	mov    %edx,%edi
0x0040fc0b <dequant_lsps+11>:	mov    %ecx,0x1c(%esp)
0x0040fc0f <dequant_lsps+15>:	mov    0x48(%esp),%ebp
0x0040fc13 <__inline_memset_chk+0>:	lea    0x0(,%edx,8),%eax
0x0040fc1a <__inline_memset_chk+7>:	mov    %eax,0x8(%esp)
0x0040fc1e <__inline_memset_chk+11>:	movl   $0x0,0x4(%esp)
0x0040fc26 <__inline_memset_chk+19>:	mov    %ebx,(%esp)
0x0040fc29 <__inline_memset_chk+22>:	call   0x4eaacc <dyld_stub_memset>

outer loop start:

0x0040fc2e <dequant_lsps+46>:	mov    0x44(%esp),%eax
0x0040fc32 <dequant_lsps+50>:	test   %eax,%eax
0x0040fc34 <dequant_lsps+52>:	jle    0x40fc95 <dequant_lsps+149>
0x0040fc36 <dequant_lsps+54>:	mov    $0x1,%ecx
0x0040fc3b <dequant_lsps+59>:	mov    0x1c(%esp),%edx
0x0040fc3f <dequant_lsps+63>:	movzwl -0x2(%edx,%ecx,2),%eax
0x0040fc44 <dequant_lsps+68>:	imul   %edi,%eax
0x0040fc47 <dequant_lsps+71>:	lea    (%eax,%ebp,1),%esi

inner loop start:

0x0040fc4a <dequant_lsps+74>:	test   %edi,%edi
0x0040fc4c <dequant_lsps+76>:	jle    0x40fc7b <dequant_lsps+123>
0x0040fc4e <dequant_lsps+78>:	xor    %edx,%edx

0x0040fc50 <dequant_lsps+80>:	movzbl (%edx,%esi,1),%eax
0x0040fc54 <dequant_lsps+84>:	cvtsi2sd %eax,%xmm0
0x0040fc58 <dequant_lsps+88>:	mov    0x4c(%esp),%eax <<
0x0040fc5c <dequant_lsps+92>:	mulsd  -0x8(%eax,%ecx,8),%xmm0 <<
0x0040fc62 <dequant_lsps+98>:	mov    0x50(%esp),%eax <<
0x0040fc66 <dequant_lsps+102>:	addsd  -0x8(%eax,%ecx,8),%xmm0 <<
0x0040fc6c <dequant_lsps+108>:	addsd  (%ebx,%edx,8),%xmm0
0x0040fc71 <dequant_lsps+113>:	movsd  %xmm0,(%ebx,%edx,8)

0x0040fc76 <dequant_lsps+118>:	inc    %edx
0x0040fc77 <dequant_lsps+119>:	cmp    %edi,%edx
0x0040fc79 <dequant_lsps+121>:	jne    0x40fc50 <dequant_lsps+80>

inner loop end

0x0040fc7b <dequant_lsps+123>:	mov    0x40(%esp),%edx
0x0040fc7f <dequant_lsps+127>:	movzwl -0x2(%edx,%ecx,2),%eax
0x0040fc84 <dequant_lsps+132>:	cmp    0x44(%esp),%ecx
0x0040fc88 <dequant_lsps+136>:	je     0x40fc95 <dequant_lsps+149>
0x0040fc8a <dequant_lsps+138>:	movzwl %ax,%eax
0x0040fc8d <dequant_lsps+141>:	imul   %edi,%eax
0x0040fc90 <dequant_lsps+144>:	add    %eax,%ebp
0x0040fc92 <dequant_lsps+146>:	inc    %ecx
0x0040fc93 <dequant_lsps+147>:	jmp    0x40fc3b <dequant_lsps+59>
0x0040fc95 <dequant_lsps+149>:	add    $0x2c,%esp
0x0040fc98 <dequant_lsps+152>:	pop    %ebx
0x0040fc99 <dequant_lsps+153>:	pop    %esi
0x0040fc9a <dequant_lsps+154>:	pop    %edi
0x0040fc9b <dequant_lsps+155>:	pop    %ebp
0x0040fc9c <dequant_lsps+156>:	ret

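So gcc is essentially not taking advantage of restrict here: ecx holds
n + 1 (hence the -0x8 displacement), and mul_q[n]/base_q[n] are re-read
from memory on every iteration of the inner loop, directly at the point
of use (see the lines marked "<<"). Not sure how to fix gcc's stupidity
here. Removing restrict and manually loading the values into temporary
variables before the inner loop: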

    for (n = 0; n < n_stages; n++) {
        const uint8_t *t_off = &table[values[n] * num];
        double base = base_q[n], mul = mul_q[n];

        for (m = 0; m < num; m++)
            lsps[m] += base + mul * t_off[m];

        table += sizes[n] * num;
    }

gives:

outer loop start:

0x0040fc0e <dequant_lsps+46>:	mov    0x44(%esp),%eax
0x0040fc12 <dequant_lsps+50>:	test   %eax,%eax
0x0040fc14 <dequant_lsps+52>:	jle    0x40fc89 <dequant_lsps+169>

0x0040fc16 <dequant_lsps+54>:	mov    $0x1,%edi
0x0040fc1b <dequant_lsps+59>:	mov    0x1c(%esp),%edx
0x0040fc1f <dequant_lsps+63>:	movzwl -0x2(%edx,%edi,2),%eax
0x0040fc24 <dequant_lsps+68>:	imul   %esi,%eax
0x0040fc27 <dequant_lsps+71>:	lea    (%eax,%ebp,1),%ecx
0x0040fc2a <dequant_lsps+74>:	mov    0x50(%esp),%eax
0x0040fc2e <dequant_lsps+78>:	movsd  -0x8(%eax,%edi,8),%xmm2
0x0040fc34 <dequant_lsps+84>:	mov    0x4c(%esp),%edx
0x0040fc38 <dequant_lsps+88>:	movsd  -0x8(%edx,%edi,8),%xmm1

inner loop start:

0x0040fc3e <dequant_lsps+94>:	test   %esi,%esi
0x0040fc40 <dequant_lsps+96>:	jle    0x40fc6f <dequant_lsps+143>
0x0040fc42 <dequant_lsps+98>:	xor    %edx,%edx
0x0040fc44 <dequant_lsps+100>:	nopw   0x0(%eax,%eax,1)
0x0040fc4a <dequant_lsps+106>:	nopw   0x0(%eax,%eax,1)

0x0040fc50 <dequant_lsps+112>:	movzbl (%edx,%ecx,1),%eax
0x0040fc54 <dequant_lsps+116>:	cvtsi2sd %eax,%xmm0
0x0040fc58 <dequant_lsps+120>:	mulsd  %xmm1,%xmm0 <<
0x0040fc5c <dequant_lsps+124>:	addsd  %xmm2,%xmm0 <<
0x0040fc60 <dequant_lsps+128>:	addsd  (%ebx,%edx,8),%xmm0
0x0040fc65 <dequant_lsps+133>:	movsd  %xmm0,(%ebx,%edx,8)

0x0040fc6a <dequant_lsps+138>:	inc    %edx
0x0040fc6b <dequant_lsps+139>:	cmp    %esi,%edx
0x0040fc6d <dequant_lsps+141>:	jne    0x40fc50 <dequant_lsps+112>

inner loop end

0x0040fc6f <dequant_lsps+143>:	mov    0x40(%esp),%edx
0x0040fc73 <dequant_lsps+147>:	movzwl -0x2(%edx,%edi,2),%eax
0x0040fc78 <dequant_lsps+152>:	cmp    0x44(%esp),%edi
0x0040fc7c <dequant_lsps+156>:	je     0x40fc89 <dequant_lsps+169>
0x0040fc7e <dequant_lsps+158>:	movzwl %ax,%eax
0x0040fc81 <dequant_lsps+161>:	imul   %esi,%eax
0x0040fc84 <dequant_lsps+164>:	add    %eax,%ebp
0x0040fc86 <dequant_lsps+166>:	inc    %edi
0x0040fc87 <dequant_lsps+167>:	jmp    0x40fc1b <dequant_lsps+59>

So 6 (instead of 8) instructions in the inner loop, and I have no idea
what those nops are doing above there (alignment?). Shall I use this
version then?

Ronald