Lines 615-645
Link Here
|
615 |
} |
615 |
} |
616 |
|
616 |
|
617 |
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
617 |
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
|
|
618 |
void *dst_reg = dst, *src_reg = src; |
619 |
|
618 |
asm volatile( //FIXME could save 1 instruction if done as 8x4 ... |
620 |
asm volatile( //FIXME could save 1 instruction if done as 8x4 ... |
619 |
"movd %4, %%mm0 \n\t" |
621 |
"movd (%1), %%mm0 \n\t" |
620 |
"movd %5, %%mm1 \n\t" |
622 |
"movd (%1,%5), %%mm1 \n\t" |
621 |
"movd %6, %%mm2 \n\t" |
623 |
"lea (%1, %5, 2), %1 \n\t" |
622 |
"movd %7, %%mm3 \n\t" |
624 |
"movd (%1), %%mm2 \n\t" |
|
|
625 |
"movd (%1,%5), %%mm3 \n\t" |
623 |
"punpcklbw %%mm1, %%mm0 \n\t" |
626 |
"punpcklbw %%mm1, %%mm0 \n\t" |
624 |
"punpcklbw %%mm3, %%mm2 \n\t" |
627 |
"punpcklbw %%mm3, %%mm2 \n\t" |
625 |
"movq %%mm0, %%mm1 \n\t" |
628 |
"movq %%mm0, %%mm1 \n\t" |
626 |
"punpcklwd %%mm2, %%mm0 \n\t" |
629 |
"punpcklwd %%mm2, %%mm0 \n\t" |
627 |
"punpckhwd %%mm2, %%mm1 \n\t" |
630 |
"punpckhwd %%mm2, %%mm1 \n\t" |
628 |
"movd %%mm0, %0 \n\t" |
631 |
"movd %%mm0, (%0) \n\t" |
629 |
"punpckhdq %%mm0, %%mm0 \n\t" |
632 |
"punpckhdq %%mm0, %%mm0 \n\t" |
630 |
"movd %%mm0, %1 \n\t" |
633 |
"movd %%mm0, (%0,%4) \n\t" |
631 |
"movd %%mm1, %2 \n\t" |
634 |
"lea (%0, %4, 2), %0 \n\t" |
|
|
635 |
"movd %%mm1, (%0) \n\t" |
632 |
"punpckhdq %%mm1, %%mm1 \n\t" |
636 |
"punpckhdq %%mm1, %%mm1 \n\t" |
633 |
"movd %%mm1, %3 \n\t" |
637 |
"movd %%mm1, (%0,%4) \n\t" |
634 |
|
638 |
: "=&r" (dst_reg), |
635 |
: "=m" (*(uint32_t*)(dst + 0*dst_stride)), |
639 |
"=&r" (src_reg) |
636 |
"=m" (*(uint32_t*)(dst + 1*dst_stride)), |
640 |
: "0" (dst_reg), |
637 |
"=m" (*(uint32_t*)(dst + 2*dst_stride)), |
641 |
"1" (src_reg), |
638 |
"=m" (*(uint32_t*)(dst + 3*dst_stride)) |
642 |
"r" (dst_stride), |
639 |
: "m" (*(uint32_t*)(src + 0*src_stride)), |
643 |
"r" (src_stride) |
640 |
"m" (*(uint32_t*)(src + 1*src_stride)), |
|
|
641 |
"m" (*(uint32_t*)(src + 2*src_stride)), |
642 |
"m" (*(uint32_t*)(src + 3*src_stride)) |
643 |
); |
644 |
); |
644 |
} |
645 |
} |
645 |
|
646 |
|