This pre-alpha release uses 16 instructions per sample, plus the startup overhead.
Code:
// ===========================================================================
// fastbiquadAsm -- XMOS XS1 (xCORE) assembly: streaming second-order IIR
// (biquad) filter.  Reads samples from one resource (port/channel), filters
// them, and writes results to another.  The loop is unrolled 2x so the
// X/Y history registers swap roles each half-iteration instead of being
// copied (~16 instructions per output sample).
//
// C-equivalent signature (inferred from register use -- confirm with caller):
//   void fastbiquadAsm(resource in, resource out, const int coeffs[5]);
// In:  r0 = input resource id  (copied to r10; sampled via `in rX, res[r10]`)
//      r1 = output resource id (stashed at sp[0]; reloaded for each `out`)
//      r2 = coefficient words: [0]=b0 [1]=b1 [2]=b2 [3]=a1 [4]=a2
//           (per the original comments the B coefficients are pre-scaled by
//            1/2 and the A coefficients hold negated/halved feedback terms --
//            TODO confirm the exact fixed-point convention with the caller)
// Out: never returns -- the sample loop is infinite (`bu LOOP` below).
// Saves/restores callee-saved r4..r10 per the XS1 ABI; r0-r3, r11 clobbered.
//
// NOTE(review): `entsp` with NWORDS > 0 also stores LR at sp[0]; the
// `stw r1, sp[0]` below overwrites that saved LR.  This is harmless only
// because `allDone`/`retsp` is unreachable -- verify this is intentional.
// NOTE(review): r8/r9 (the Y-history registers) are zeroed before LOOP and
// read by the feedback maccs terms, but never written with the computed
// output inside LOOP -- the recursive part of the filter therefore always
// sees zero history.  Looks like a pre-alpha bug; confirm design intent.
// NOTE(review): only sp[0]..sp[7] are used; NWORDS=10 leaves two spare
// words -- presumably headroom, verify.
// ===========================================================================
#define NWORDS 10
// .cc_top biquadAsm, biquadAsm.func
.globl fastbiquadAsm
.globl fastbiquadAsm.nstackwords
.linkset fastbiquadAsm.nstackwords,NWORDS
fastbiquadAsm:
entsp NWORDS                  // extend stack; also saves LR at sp[0] (see note)
stw r1, sp[0]                 // stash output resource id (overwrites saved LR)
stw r4, sp[1]                 // save callee-saved r4..r10 (XS1 ABI)
stw r5, sp[2]
stw r6, sp[3]
stw r7, sp[4]
stw r8, sp[5]
stw r9, sp[6]
stw r10, sp[7]
add r10,r0,0                  // r10 = input resource id (frees r0 for b0 below)
// load coefs from struct.  b2 is loaded LAST because its destination (r2)
// is also the coefficient base pointer -- reordering these breaks the load.
#define b0 r0
ldw b0,r2[0]
#define b1 r1
ldw b1,r2[1]
#define a1 r4
ldw a1,r2[3]
#define a2 r5
ldw a2,r2[4]
#define b2 r2
ldw b2,r2[2]                  // clobbers base pointer -- must stay the final load
ldc r6, 0                     // zero history: r6/r7 = X[n-2]/X[n-1]
ldc r7, 0
ldc r8, 0                     // zero history: r8/r9 = Y[n-2]/Y[n-1]
ldc r9, 0
.align 4                      // align loop head so fetch inserts no FNOPs
LOOP:
// First half of the 2x-unrolled loop.  Register roles here:
//   r6: X[n-2]  r7: X[n-1]  r8: Y[n-2]  r9: Y[n-1]
ldc r3, 0                     // zero 64-bit accumulator hi:lo = r3:r11
ldc r11, 0
maccs r3, r11, a2, r8 // acc += a2 * Y[n-2]   (a2 pre-negated/scaled -- see header)
maccs r3, r11, b2, r6 // acc += b2 * X[n-2]; r6 is now free for reuse
maccs r3, r11, b1, r7 // acc += b1 * X[n-1]
shl r6,r9,1 // r6 = 2*Y[n-1] (reuses the freed X[n-2] register)
maccs r3, r11, a1, r6 // acc += a1 * 2*Y[n-1]  (a1 pre-halved, so net a1*Y[n-1])
in r6, res[r10]               // r6 = new input sample X[n]
ashr r6, r6, 7                // scale input by 2^-7 for accumulator headroom
maccs r3, r11, b0, r6 // acc += b0 * X[n]
shr r11, r11, 24              // combine: r3 = low 32 bits of (r3:r11 >> 24)
shl r3, r3, 8
or r3, r11, r3                // ... r11 free again
ldw r11,sp[0]                 // reload output resource id (stashed in prologue)
out res[r11],r3               // emit output = (B0*X0+B1*X1+B2*X2+A-terms) >> 24
// Second half: identical computation with history roles swapped, so no
// register-to-register history shuffling is needed.  Roles here:
//   r6: X[n-1]  r7: X[n-2]  r8: Y[n-1]  r9: Y[n-2]
.align 4
ldc r3, 0                     // zero 64-bit accumulator hi:lo = r3:r11
ldc r11, 0
maccs r3, r11, a2, r9 // acc += a2 * Y[n-2]
maccs r3, r11, b2, r7 // acc += b2 * X[n-2]; r7 is now free for reuse
maccs r3, r11, b1, r6 // acc += b1 * X[n-1]
shl r7, r8,1 // r7 = 2*Y[n-1] (reuses the freed X[n-2] register)
maccs r3, r11, a1, r7 // acc += a1 * 2*Y[n-1]  (a1 pre-halved, so net a1*Y[n-1])
in r7, res[r10]               // r7 = new input sample X[n]
ashr r7, r7, 7                // scale input by 2^-7 for accumulator headroom
maccs r3, r11, b0, r7 // acc += b0 * X[n]
shr r11, r11, 24              // combine: r3 = low 32 bits of (r3:r11 >> 24)
shl r3, r3, 8
or r3, r11, r3                // ... r11 free again
ldw r11,sp[0]                 // reload output resource id
out res[r11],r3               // emit output sample
// After both halves the roles have swapped back:
//   r6: X[n]  r7: X[n-1]  r8: Y[n]  r9: Y[n-1]   (but see r8/r9 note above)
bu LOOP                       // unconditional -- loop forever
allDone: // UNREACHABLE epilogue: restore callee-saved registers.
ldw r4, sp[1]
ldw r5, sp[2]
ldw r6, sp[3]
ldw r7, sp[4]
ldw r8, sp[5]
ldw r9, sp[6]
ldw r10, sp[7]
retsp NWORDS                  // would reload LR from sp[0] -- clobbered by r1 (see note)
// .cc_bottom fastbiquadAsm.func
(No FNOPs are generated in LOOP, but there may be unnecessarily many shifts inside the loop.)