Code: Select all
#include <print.h>
#include <xclib.h>
// Returns a + (b << 1)
//
// Thin wrapper that emits one `lda16` instruction through inline assembly.
// In f() it is called as lda16f(bit, acc): the accumulated value is shifted
// left by one position and the new 0/1 comparison result is added in.
//
// a: value that is added unshifted.
// b: value that is shifted left by one before being added.
static inline unsigned lda16f(unsigned a, unsigned b)
{
unsigned val;
// Operands: %0 = val (register output), %1 = a, %2 = b (register inputs).
asm("lda16 %0, %1[%2]" : "=r"(val) : "r"(a), "r"(b));
return val;
}
// Shift the next byte of `word` up into the top byte position, compare the
// word against `limit`, and append the 0/1 outcome to `bits` via lda16f.
#define F_PUSH_NEXT_BYTE(word, limit, bits) \
  do { \
    (word) <<= 8; \
    (bits) = lda16f((word) > (limit), (bits)); \
  } while (0)

// Load a fresh word from `src` and append the comparison results for all four
// of its bytes, most significant byte first.
#define F_PUSH_WORD(word, limit, bits, src) \
  do { \
    (word) = (src); \
    (bits) = lda16f((word) > (limit), (bits)); \
    F_PUSH_NEXT_BYTE(word, limit, bits); \
    F_PUSH_NEXT_BYTE(word, limit, bits); \
    F_PUSH_NEXT_BYTE(word, limit, bits); \
  } while (0)

// Compares each of the 32 bytes held in a[0..7] against `threshold` and packs
// the 32 one-bit results into a single word.
//
// threshold: byte value to compare against; it is shifted into the top byte,
//            so this assumes it fits in 8 bits (0..255) - TODO confirm.
// a:         eight words, i.e. 32 bytes, to test.
//
// Words are visited from a[7] down to a[0], and within each word from the most
// significant byte to the least significant. Before the final bitrev() the
// first comparison sits in bit 31 and the last in bit 0; bitrev() (from
// xclib.h, presumably a 32-bit bit reversal) is then applied to the result.
//
// The code is deliberately straight-line: the macros above expand to the same
// statement sequence as a fully hand-unrolled version.
unsigned f(unsigned threshold, unsigned a[8])
{
  // Threshold in the top byte, all ones below: `word > limit` is then true
  // exactly when the top byte of `word` is greater than the threshold.
  unsigned limit = (threshold << 24) | 0x00ffffff;

  // The very first comparison seeds the accumulator directly.
  unsigned word = a[7];
  unsigned bits = word > limit;
  F_PUSH_NEXT_BYTE(word, limit, bits);
  F_PUSH_NEXT_BYTE(word, limit, bits);
  F_PUSH_NEXT_BYTE(word, limit, bits);

  F_PUSH_WORD(word, limit, bits, a[6]);
  F_PUSH_WORD(word, limit, bits, a[5]);
  F_PUSH_WORD(word, limit, bits, a[4]);
  F_PUSH_WORD(word, limit, bits, a[3]);
  F_PUSH_WORD(word, limit, bits, a[2]);
  F_PUSH_WORD(word, limit, bits, a[1]);
  F_PUSH_WORD(word, limit, bits, a[0]);

  return bitrev(bits);
}

#undef F_PUSH_WORD
#undef F_PUSH_NEXT_BYTE
// Demo driver: runs f() over 32 sample bytes with a threshold of 100 and
// prints the packed result with printhexln().
int main()
{
  // 32 sample bytes, laid out as four rows of eight.
  unsigned char samples[32] = {
    10, 20, 121, 200, 82, 122, 7, 230,
    164, 150, 9, 190, 187, 255, 0, 66,
    101, 0, 200, 1, 89, 10, 20, 30,
    99, 254, 3, 100, 206, 32, 64, 99
  };

  // (samples, unsigned[]) views the byte array as an array of words.
  // NOTE(review): assumes the byte array is suitably aligned for word
  // access - confirm.
  printhexln(f(100, (samples, unsigned[])));
  return 0;
}
When built with -O2 -fschedule, the code takes 400 processor clock cycles, assuming no more than 4 threads are running. This is 1000 ns if the processor is running at 400 MHz and 800 ns at 500 MHz.