Lines 616-645
Link Here
|
616 |
|
616 |
|
617 |
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
617 |
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
618 |
asm volatile( //FIXME could save 1 instruction if done as 8x4 ... |
618 |
asm volatile( //FIXME could save 1 instruction if done as 8x4 ... |
619 |
"movd %4, %%mm0 \n\t" |
619 |
"push %2 \n\t" |
620 |
"movd %5, %%mm1 \n\t" |
620 |
"push %0 \n\t" |
621 |
"movd %6, %%mm2 \n\t" |
621 |
"movd (%2), %%mm0 \n\t" |
622 |
"movd %7, %%mm3 \n\t" |
622 |
"movd (%2,%3), %%mm1 \n\t" |
|
|
623 |
"lea (%2, %3, 2), %2 \n\t" |
624 |
"movd (%2), %%mm2 \n\t" |
625 |
"movd (%2,%3), %%mm3 \n\t" |
623 |
"punpcklbw %%mm1, %%mm0 \n\t" |
626 |
"punpcklbw %%mm1, %%mm0 \n\t" |
624 |
"punpcklbw %%mm3, %%mm2 \n\t" |
627 |
"punpcklbw %%mm3, %%mm2 \n\t" |
625 |
"movq %%mm0, %%mm1 \n\t" |
628 |
"movq %%mm0, %%mm1 \n\t" |
626 |
"punpcklwd %%mm2, %%mm0 \n\t" |
629 |
"punpcklwd %%mm2, %%mm0 \n\t" |
627 |
"punpckhwd %%mm2, %%mm1 \n\t" |
630 |
"punpckhwd %%mm2, %%mm1 \n\t" |
628 |
"movd %%mm0, %0 \n\t" |
631 |
"movd %%mm0, (%0) \n\t" |
629 |
"punpckhdq %%mm0, %%mm0 \n\t" |
632 |
"punpckhdq %%mm0, %%mm0 \n\t" |
630 |
"movd %%mm0, %1 \n\t" |
633 |
"movd %%mm0, (%0,%1) \n\t" |
631 |
"movd %%mm1, %2 \n\t" |
634 |
"lea (%0, %1, 2), %0 \n\t" |
|
|
635 |
"movd %%mm1, (%0) \n\t" |
632 |
"punpckhdq %%mm1, %%mm1 \n\t" |
636 |
"punpckhdq %%mm1, %%mm1 \n\t" |
633 |
"movd %%mm1, %3 \n\t" |
637 |
"movd %%mm1, (%0,%1) \n\t" |
634 |
|
638 |
"popl %0 \n\t" |
635 |
: "=m" (*(uint32_t*)(dst + 0*dst_stride)), |
639 |
"popl %2 \n\t" |
636 |
"=m" (*(uint32_t*)(dst + 1*dst_stride)), |
640 |
:: "r" (dst), |
637 |
"=m" (*(uint32_t*)(dst + 2*dst_stride)), |
641 |
"r" (dst_stride), |
638 |
"=m" (*(uint32_t*)(dst + 3*dst_stride)) |
642 |
"r" (src), |
639 |
: "m" (*(uint32_t*)(src + 0*src_stride)), |
643 |
"r" (src_stride) |
640 |
"m" (*(uint32_t*)(src + 1*src_stride)), |
|
|
641 |
"m" (*(uint32_t*)(src + 2*src_stride)), |
642 |
"m" (*(uint32_t*)(src + 3*src_stride)) |
643 |
); |
644 |
); |
644 |
} |
645 |
} |
645 |
|
646 |
|