[sdiy] Re: 1970's again? Now DSP assembly

Kenneth Elhardt elhardt at worldnet.att.net
Mon Jan 31 10:39:16 CET 2005


Martin Fay writes:
>>That C doesn't look like it plays to the x86 architecture to me, but a few
missing details probably hold the key.<<

It's C.  It's not supposed to play to any architecture (nothing plays to x86
crap architecture).  It's a simple for loop and it accumulates 16 bit
samples into a floating point accumulator.  It can't get much simpler than
that.  But as I pointed out, even when I tried to speed it up, no luck.  The
Microsoft compiler doesn't even allow register variables (since it doesn't
have enough of them).  The x86 can't do auto inc of address registers.
Microsoft likes to make calls to do simple things rather than put the code
right where it should go.  For example, to take the sin of a number in asm
all you do is load the number into a floating point register and then do an
fsin instruction.  That's it.  But Microsoft generates 40 instructions to do
the same thing, including calls to other functions.  You'll notice they
call off into an __ftol (float to long) routine below, who knows how
long that is, rather than doing a single fistp instruction.  It's awful.  I
hate waste.

>>What types are fir[] and k?<<

The fir is float.  k is just integer for indexing the array.

>>Would you mind posting the asm?<<

Without the proper tabs, it looks like crap, but here is what the compiler
generates vs my asm code below it (equal to my code from the label "floop1"
downward).  The place where most of the time is spent is in that inner loop
that starts just below the C code line 1088.  As you can see it's a
convoluted mess of 19 instructions (17 of them executed on every pass),
versus a nice clean 7 instructions for my inner loop that starts at the
label "floop2".  In my asm code, almost all
variables are held in registers, and I don't wastefully keep dumping them
out and updating variables in memory during the loop.  The asm code would be
much smaller on a DSP with zero overhead loops, auto incrementing of
registers, and fmac instructions; you could probably get the inner loop down
to about two instructions.

Microsoft Generated
---------------------

1085:     for(i=0;i<length;i++) {
0040502C   mov         dword ptr [i],0
00405033   jmp         FIRFilter(0x0040503e)+0D6h
00405035   mov         edx,dword ptr [i]
00405038   add         edx,1
0040503B   mov         dword ptr [i],edx
0040503E   mov         eax,dword ptr [i]
00405041   cmp         eax,dword ptr [length]
00405044   jae         FIRFilter(0x004050c0)+158h
1086:         s=srcstep++;
00405046   mov         ecx,dword ptr [srcstep]
00405049   mov         dword ptr [s],ecx
0040504C   mov         edx,dword ptr [srcstep]
0040504F   add         edx,2
00405052   mov         dword ptr [srcstep],edx
1087:         ftemp=0;
00405055   mov         dword ptr [ftemp],0
1088:         for(k=0;k<=510;k++) ftemp+=(fir[k]*(float)*s++);
0040505C   mov         dword ptr [k],0
00405063   jmp         FIRFilter(0x0040506e)+106h
00405065   mov         eax,dword ptr [k]
00405068   add         eax,1
0040506B   mov         dword ptr [k],eax
0040506E   cmp         dword ptr [k],1FEh
00405075   jg          FIRFilter(0x004050a4)+13Ch
00405077   mov         ecx,dword ptr [s]
0040507A   movsx       edx,word ptr [ecx]
0040507D   mov         dword ptr [ebp-824h],edx
00405083   fild        dword ptr [ebp-824h]
00405089   mov         eax,dword ptr [k]
0040508C   fmul        dword ptr fir[eax*4]
00405093   fadd        dword ptr [ftemp]
00405096   fstp        dword ptr [ftemp]
00405099   mov         ecx,dword ptr [s]
0040509C   add         ecx,2
0040509F   mov         dword ptr [s],ecx
004050A2   jmp         FIRFilter(0x00405065)+0FDh
1091:         *d++=(short)ftemp;
004050A4   fld         dword ptr [ftemp]
004050A7   call        __ftol(0x0040833c)
004050AC   mov         edx,dword ptr [d]
004050AF   mov         word ptr [edx],ax
004050B2   mov         eax,dword ptr [d]
004050B5   add         eax,2
004050B8   mov         dword ptr [d],eax
1092:     }
004050BB   jmp         FIRFilter(0x00405035)+0CDh
1093: }
004050C0   mov         esp,ebp
004050C2   pop         ebp
004050C3   ret


Ken's Code
------------

FilterLoopFast(float *fptr, unsigned long firsize, short *sptr, short *dptr,
unsigned long count) {
__asm {
    push esi
    push edi
    mov  esi,sptr
    mov  edi,dptr
    mov  ecx,count
    mov  eax,firsize
    dec  eax     ;length--
    add  eax,eax    ;length<<1 (sample rewind value)
floop1:  fldz        ;accum=0.0
    mov  ebx,fptr
    mov  edx,firsize
    //
floop2:  fild  short ptr [esi] ;get (float) sample
    fmul  dword ptr [ebx] ;sample * FIR coefficient
    faddp  st(1),st(0)   ;accumulate
    add  esi,2     ;sptr++
    add  ebx,4     ;fptr++
    dec  edx     ;fir--
    jnz  floop2
    //
    fistp  short ptr [edi] ;store (short) filtered sample
    sub  esi,eax    ;back up *sptr to point to next sample
    add  edi,2
    dec  ecx     ;length--
    jnz  floop1
    //
    pop edi
    pop esi
} // endasm
}




More information about the Synth-diy mailing list