[Ffmpeg-devel] int vs. float profiler, take 2
Gabriel Gerhardsson
gabrielg
Sat May 21 10:59:30 CEST 2005
On Fri, 2005-05-20 at 17:05 -0600, Mike Melanson wrote:
> Hi,
> Since the first version of my little profiler generated a reasonable
> amount of activity, attached is a slightly improved version. This one
> does the following:
>
> * runs all 4 of the functions n times as a cache warmup (n=1000 in the
> code); this actually does help with cycle count consistency
> * fetches an overhead cycle count as a baseline
> * C code can fetch iteration count
>
> The ASM code has ITERATIONS set to 1 right now. I would be interested to
> know the results from varying CPUs using 1, 10, and 100 iterations.
>
> Results from my VIA P3-class CPU:
>
> warming up with 1000 cycles...
> integer_adder(), 1 adds, 27 cycles used (overhead = 26)
> float_adder(), 1 adds, 27 cycles used (overhead = 26)
> integer_mult(), 1 mults, 34 cycles used (overhead = 28)
> float_mult(), 1 mults, 27 cycles used (overhead = 26)
>
> warming up with 1000 cycles...
> integer_adder(), 10 adds, 36 cycles used (overhead = 26)
> float_adder(), 10 adds, 36 cycles used (overhead = 26)
> integer_mult(), 10 mults, 115 cycles used (overhead = 28)
> float_mult(), 10 mults, 36 cycles used (overhead = 26)
>
> warming up with 1000 cycles...
> integer_adder(), 100 adds, 156 cycles used (overhead = 26)
> float_adder(), 100 adds, 576 cycles used (overhead = 26)
> integer_mult(), 100 mults, 925 cycles used (overhead = 28)
> float_mult(), 100 mults, 578 cycles used (overhead = 26)
>
> The benchmark still suffers from the result dependency problem. But the
> same problem comes up in the multimedia decoding algorithms, right? If
> the multiplications were parallelizable, wouldn't we be using
> SSE/SSE2/AltiVec instructions to parallelize them?
>
> Thanks...
Hello
When timing only a few instructions like this, it's important to serialize
the rdtsc instructions (the attached code does so with cpuid). Otherwise, on
any modern out-of-order processor, rdtsc may execute before or after the
instructions it is supposed to bracket. Please try the attached math.asm.
Only the timing code has changed, not the timed instructions. Please use
this timing code in your next version of this benchmark.
/Gabriel
-------------- next part --------------
; Entry points exported for the C side of the profiler. The ":function"
; suffix is NASM's ELF extension that marks each symbol as a function.
global get_iterations:function
global integer_adder:function
global float_adder:function
global integer_mult:function
global float_mult:function
; Number of back-to-back operations each timed routine emits. It is expanded
; at assembly time by the "times ITERATIONS ..." lines and reported to the
; caller by get_iterations. Change it and reassemble to profile other counts.
%define ITERATIONS 1
; int get_iterations(void)
;
; Reports the assemble-time ITERATIONS constant, i.e. how many operations
; each of the timed routines in this file performs per call.
; Out: eax = ITERATIONS
get_iterations:
        mov     eax, ITERATIONS
        ret
; int integer_adder(
;         unsigned int *cycle_count,
;         unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent "add reg, 5" instructions between two
; CPUID-serialized RDTSC reads.
;
; ABI:  32-bit cdecl, arguments addressed through ebp
; In:   [ebp+8]  = cycle_count          (receives cycles for the timed adds)
;       [ebp+12] = overhead_cycle_count (receives cycles for the empty probe)
; Out:  eax = accumulated sum (5 * ITERATIONS)
;
; The accumulator is kept in esi, not ecx: CPUID overwrites eax, ebx, ecx and
; edx, so a value placed in ecx before the first timestamp would be destroyed
; by the serializing CPUID that precedes the timed region.
integer_adder:
        push    ebp                     ; set up context and save registers
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx
        push    esi                     ; callee-saved; used as the accumulator

        ; --- overhead probe: same sequence as the real run, minus the adds ---
        xor     eax, eax
        cpuid                           ; serialize before reading the TSC
        rdtsc
        push    eax                     ; start timestamp (low 32 bits)
        xor     eax, eax
        cpuid
        push    esi                     ; stands in for the "save result" push
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; end timestamp
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ecx                     ; discard the stand-in value
        pop     ebx                     ; ebx = start
        sub     eax, ebx
        mov     ebx, [ebp+12]
        mov     [ebx], eax              ; *overhead_cycle_count = end - start

        xor     esi, esi                ; accumulator = 0 (survives CPUID)

        ; --- first timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; save start timestamp
        xor     eax, eax
        cpuid

        times ITERATIONS add esi, 5     ; perform n additions
        push    esi                     ; save result

        ; --- second timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ecx                     ; ecx = sum
        pop     ebx                     ; ebx = start
        sub     eax, ebx                ; calculate the cycles elapsed
        mov     ebx, [ebp+8]            ; load the address of the cycle count parameter
        mov     [ebx], eax              ; save the cycle count

        mov     eax, ecx                ; return the sum through eax

        pop     esi                     ; restore the CPU state
        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        ret
; double float_adder(unsigned int *cycle_count,
;                    unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent x87 "fadd" instructions between two
; CPUID-serialized RDTSC reads.
;
; ABI:  32-bit cdecl, arguments addressed through ebp
; In:   [ebp+8]  = cycle_count          (receives cycles for the timed adds)
;       [ebp+12] = overhead_cycle_count (receives cycles for the empty probe)
; Out:  ST0 = accumulated sum (5.0 * ITERATIONS)
;
; The routine loads three values onto the x87 stack. The two operands are
; popped before returning so that only the result remains in ST0; leaving
; them behind would leak two x87 slots per call and overflow the 8-entry
; register stack after a few calls.
float_adder:
        push    ebp                     ; set up context and save registers
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx

        ; --- overhead probe: two serialized timestamps, nothing in between ---
        xor     eax, eax
        cpuid                           ; serialize before reading the TSC
        rdtsc
        push    eax                     ; start timestamp (low 32 bits)
        xor     eax, eax
        cpuid
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; end timestamp
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ebx                     ; ebx = start
        sub     eax, ebx
        mov     ebx, [ebp+12]
        mov     [ebx], eax              ; *overhead_cycle_count = end - start

        ; --- operand setup: ST0 = 0.0 (sum), ST1 = 5.0 (addend), ST2 = 1.0 ---
        fld1                            ; push 1 on the stack
        fld1                            ; push 1 on the stack
        times 4 fadd ST1                ; turn 1 into 5
        fldz                            ; push zero on the stack

        ; --- first timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; save start timestamp
        xor     eax, eax
        cpuid

        times ITERATIONS fadd ST1       ; perform n float adds

        ; --- second timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ebx                     ; ebx = start
        sub     eax, ebx                ; calculate the cycles elapsed
        mov     ebx, [ebp+8]            ; load the address of the cycle count parameter
        mov     [ebx], eax              ; save the cycle count

        ; Drop the two operands: each fstp copies ST0 over ST1 and pops, so
        ; the sum stays on top while one entry beneath it is removed.
        fstp    ST1
        fstp    ST1

        pop     edx                     ; restore the CPU state
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        ret
; int integer_mult(unsigned int *cycle_count,
;                  unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent one-operand "mul" instructions between two
; CPUID-serialized RDTSC reads.
;
; ABI:  32-bit cdecl, arguments addressed through ebp
; In:   [ebp+8]  = cycle_count          (receives cycles for the timed mults)
;       [ebp+12] = overhead_cycle_count (receives cycles for the empty probe)
; Out:  eax = low 32 bits of 5 ** ITERATIONS
;
; The multiplier is kept in esi, not ecx: CPUID overwrites eax, ebx, ecx and
; edx, so a multiplier placed in ecx before the first timestamp would be
; destroyed by the serializing CPUID that precedes the timed region.
integer_mult:
        push    ebp                     ; set up context and save registers
        mov     ebp, esp
        push    ebx
        push    ecx
        push    edx
        push    esi                     ; callee-saved; used as the multiplier

        ; --- overhead probe: same sequence as the real run, minus the mults ---
        xor     eax, eax
        cpuid                           ; serialize before reading the TSC
        rdtsc
        push    eax                     ; start timestamp (low 32 bits)
        xor     eax, eax
        cpuid
        push    esi                     ; stands in for the "save result" push
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; end timestamp
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ecx                     ; discard the stand-in value
        pop     ebx                     ; ebx = start
        sub     eax, ebx
        mov     ebx, [ebp+12]
        mov     [ebx], eax              ; *overhead_cycle_count = end - start

        mov     esi, 5                  ; esi holds the multiplier (survives CPUID)

        ; --- first timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; save start timestamp
        xor     eax, eax
        cpuid

        mov     eax, 1                  ; set up the base for multiplication
        times ITERATIONS mul esi        ; perform n int mults (edx:eax = eax * esi)
        push    eax                     ; save result

        ; --- second timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ecx                     ; ecx = product
        pop     ebx                     ; ebx = start
        sub     eax, ebx                ; calculate the cycles elapsed
        mov     ebx, [ebp+8]            ; load the address of the cycle count parameter
        mov     [ebx], eax              ; save the cycle count

        mov     eax, ecx                ; return the product through eax

        pop     esi                     ; restore the CPU state
        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp
        ret
; double float_mult(unsigned int *cycle_count,
;                   unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent x87 "fmul" instructions between two
; CPUID-serialized RDTSC reads.
;
; ABI:  32-bit cdecl, arguments addressed through ebp
; In:   [ebp+8]  = cycle_count          (receives cycles for the timed mults)
;       [ebp+12] = overhead_cycle_count (receives cycles for the empty probe)
; Out:  ST0 = product (5.0 ** ITERATIONS)
;
; The routine loads three values onto the x87 stack. The two operands are
; popped before returning so that only the result remains in ST0; leaving
; them behind would leak two x87 slots per call and overflow the 8-entry
; register stack after a few calls.
float_mult:
        push    ebp                     ; set up context and save registers
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx
        push    edx

        ; --- overhead probe: two serialized timestamps, nothing in between ---
        xor     eax, eax
        cpuid                           ; serialize before reading the TSC
        rdtsc
        push    eax                     ; start timestamp (low 32 bits)
        xor     eax, eax
        cpuid
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; end timestamp
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ebx                     ; ebx = start
        sub     eax, ebx
        mov     ebx, [ebp+12]
        mov     [ebx], eax              ; *overhead_cycle_count = end - start

        ; --- operand setup: ST0 = 1.0 (base), ST1 = 5.0 (multiplier), ST2 = 1.0 ---
        fld1                            ; push 1 on the stack
        fld1                            ; push 1 on the stack
        times 4 fadd ST1                ; turn 1 into 5 (use as the multiplier)
        fld1                            ; push 1 again (base for multiplication)

        ; --- first timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax                     ; save start timestamp
        xor     eax, eax
        cpuid

        times ITERATIONS fmul ST1       ; perform n float ops (ST0 *= ST1)

        ; --- second timestamp ---
        xor     eax, eax
        cpuid
        rdtsc
        push    eax
        xor     eax, eax
        cpuid
        pop     eax                     ; eax = end
        pop     ebx                     ; ebx = start
        sub     eax, ebx                ; calculate the cycles elapsed
        mov     ebx, [ebp+8]            ; load the address of the cycle count parameter
        mov     [ebx], eax              ; save the cycle count

        ; Drop the two operands: each fstp copies ST0 over ST1 and pops, so
        ; the product stays on top while one entry beneath it is removed.
        fstp    ST1
        fstp    ST1

        pop     edx                     ; restore the CPU state
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        ret
More information about the ffmpeg-devel
mailing list