//printf("DEBUG: channel %d, bo %d, off %d\n", channel, bo, 16 - bo1);
__asm__ volatile(
ASMALIGN(4)
".L03:\n\t"
"0:\n\t"
"movq (%1),%%mm0\n\t"
"movq 64(%1),%%mm4\n\t"
"pmaddwd (%2),%%mm0\n\t"
"add $8,%3\n\t"
"decl %0\n\t"
"jnz .L03\n\t"
"jnz 0b\n\t"
"movl $7,%0\n\t"
".L04:\n\t"
"1:\n\t"
"add $128,%1\n\t"
"jnz .L04\n\t"
"jnz 1b\n\t"