Lines 613-648
Link Here
|
613 |
"+m" (*(uint64_t*)(src - 1*stride)), |
613 |
"+m" (*(uint64_t*)(src - 1*stride)), |
614 |
"+m" (*(uint64_t*)(src + 0*stride)), |
614 |
"+m" (*(uint64_t*)(src + 0*stride)), |
615 |
"+m" (*(uint64_t*)(src + 1*stride)) |
615 |
"+m" (*(uint64_t*)(src + 1*stride)) |
616 |
: "g" (2*strength), "m"(ff_pb_FC) |
616 |
: "g" (2*(long)strength), "m"(ff_pb_FC) |
617 |
); |
617 |
); |
618 |
} |
618 |
} |
619 |
|
619 |
|
620 |
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
620 |
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ |
|
|
621 |
long dummy; |
621 |
asm volatile( //FIXME could save 1 instruction if done as 8x4 ... |
622 |
asm volatile( //FIXME could save 1 instruction if done as 8x4 ... |
622 |
"movd %4, %%mm0 \n\t" |
623 |
"movd (%3), %%mm0 \n\t" |
623 |
"movd %5, %%mm1 \n\t" |
624 |
"movd (%3, %4), %%mm1 \n\t" |
624 |
"movd %6, %%mm2 \n\t" |
625 |
"movd (%3, %4, 2), %%mm2 \n\t" |
625 |
"movd %7, %%mm3 \n\t" |
626 |
"lea (%4, %4, 2), %0 \n\t" |
|
|
627 |
"movd (%3, %0), %%mm3 \n\t" |
626 |
"punpcklbw %%mm1, %%mm0 \n\t" |
628 |
"punpcklbw %%mm1, %%mm0 \n\t" |
627 |
"punpcklbw %%mm3, %%mm2 \n\t" |
629 |
"punpcklbw %%mm3, %%mm2 \n\t" |
628 |
"movq %%mm0, %%mm1 \n\t" |
630 |
"movq %%mm0, %%mm1 \n\t" |
629 |
"punpcklwd %%mm2, %%mm0 \n\t" |
631 |
"punpcklwd %%mm2, %%mm0 \n\t" |
630 |
"punpckhwd %%mm2, %%mm1 \n\t" |
632 |
"punpckhwd %%mm2, %%mm1 \n\t" |
631 |
"movd %%mm0, %0 \n\t" |
633 |
"movd %%mm0, (%1) \n\t" |
632 |
"punpckhdq %%mm0, %%mm0 \n\t" |
634 |
"punpckhdq %%mm0, %%mm0 \n\t" |
633 |
"movd %%mm0, %1 \n\t" |
635 |
"movd %%mm0, (%1, %2) \n\t" |
634 |
"movd %%mm1, %2 \n\t" |
636 |
"movd %%mm1, (%1, %2, 2) \n\t" |
635 |
"punpckhdq %%mm1, %%mm1 \n\t" |
637 |
"punpckhdq %%mm1, %%mm1 \n\t" |
636 |
"movd %%mm1, %3 \n\t" |
638 |
"lea (%2, %2, 2), %0 \n\t" |
|
|
639 |
"movd %%mm1, (%1, %0) \n\t" |
637 |
|
640 |
|
638 |
: "=m" (*(uint32_t*)(dst + 0*dst_stride)), |
641 |
: "=&r" (dummy) |
639 |
"=m" (*(uint32_t*)(dst + 1*dst_stride)), |
642 |
: "r" (dst), |
640 |
"=m" (*(uint32_t*)(dst + 2*dst_stride)), |
643 |
"r" ((long)dst_stride), |
641 |
"=m" (*(uint32_t*)(dst + 3*dst_stride)) |
644 |
"r" (src), |
642 |
: "m" (*(uint32_t*)(src + 0*src_stride)), |
645 |
"r" ((long)src_stride) |
643 |
"m" (*(uint32_t*)(src + 1*src_stride)), |
646 |
: "memory" |
644 |
"m" (*(uint32_t*)(src + 2*src_stride)), |
|
|
645 |
"m" (*(uint32_t*)(src + 3*src_stride)) |
646 |
); |
647 |
); |
647 |
} |
648 |
} |
648 |
|
649 |
|
Lines 662-668
Link Here
|
662 |
"+m" (temp[1]), |
663 |
"+m" (temp[1]), |
663 |
"+m" (temp[2]), |
664 |
"+m" (temp[2]), |
664 |
"+m" (temp[3]) |
665 |
"+m" (temp[3]) |
665 |
: "g" (2*strength), "m"(ff_pb_FC) |
666 |
: "g" (2*(long)strength), "m"(ff_pb_FC) |
666 |
); |
667 |
); |
667 |
|
668 |
|
668 |
asm volatile( |
669 |
asm volatile( |
Lines 1727-1733
Link Here
|
1727 |
|
1728 |
|
1728 |
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
1729 |
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ |
1729 |
"paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
1730 |
"paddw " #m4 ", " #m3 " \n\t" /* x1 */\ |
1730 |
"movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ |
1731 |
"movq %5, %%mm4 \n\t" /* 20 */\ |
1731 |
"pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
1732 |
"pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ |
1732 |
"movq "#in7", " #m3 " \n\t" /* d */\ |
1733 |
"movq "#in7", " #m3 " \n\t" /* d */\ |
1733 |
"movq "#in0", %%mm5 \n\t" /* D */\ |
1734 |
"movq "#in0", %%mm5 \n\t" /* D */\ |
Lines 1739-1745
Link Here
|
1739 |
"paddw " #m5 ", %%mm6 \n\t" /* x2 */\ |
1740 |
"paddw " #m5 ", %%mm6 \n\t" /* x2 */\ |
1740 |
"paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ |
1741 |
"paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ |
1741 |
"psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ |
1742 |
"psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ |
1742 |
"pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ |
1743 |
"pmullw %6, %%mm5 \n\t" /* -6x2 + 3x3 */\ |
1743 |
"paddw " #rnd ", %%mm4 \n\t" /* x2 */\ |
1744 |
"paddw " #rnd ", %%mm4 \n\t" /* x2 */\ |
1744 |
"paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ |
1745 |
"paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ |
1745 |
"psraw $5, %%mm5 \n\t"\ |
1746 |
"psraw $5, %%mm5 \n\t"\ |
Lines 1773-1787
Link Here
|
1773 |
"paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
1774 |
"paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
1774 |
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
1775 |
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
1775 |
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
1776 |
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
1776 |
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
1777 |
"pmullw %6, %%mm6 \n\t" /* 3c - 6b */\ |
1777 |
"paddw %%mm4, %%mm0 \n\t" /* a */\ |
1778 |
"paddw %%mm4, %%mm0 \n\t" /* a */\ |
1778 |
"paddw %%mm1, %%mm5 \n\t" /* d */\ |
1779 |
"paddw %%mm1, %%mm5 \n\t" /* d */\ |
1779 |
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
1780 |
"pmullw %5, %%mm0 \n\t" /* 20a */\ |
1780 |
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
1781 |
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
1781 |
"paddw %6, %%mm6 \n\t"\ |
1782 |
"paddw %8, %%mm6 \n\t"\ |
1782 |
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1783 |
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1783 |
"psraw $5, %%mm0 \n\t"\ |
1784 |
"psraw $5, %%mm0 \n\t"\ |
1784 |
"movq %%mm0, %5 \n\t"\ |
1785 |
"movq %%mm0, %7 \n\t"\ |
1785 |
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
1786 |
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
1786 |
\ |
1787 |
\ |
1787 |
"movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ |
1788 |
"movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ |
Lines 1799-1813
Link Here
|
1799 |
"psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ |
1800 |
"psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ |
1800 |
"punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ |
1801 |
"punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ |
1801 |
"punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ |
1802 |
"punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ |
1802 |
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
1803 |
"pmullw %6, %%mm3 \n\t" /* 3c - 6b */\ |
1803 |
"paddw %%mm2, %%mm1 \n\t" /* a */\ |
1804 |
"paddw %%mm2, %%mm1 \n\t" /* a */\ |
1804 |
"paddw %%mm6, %%mm4 \n\t" /* d */\ |
1805 |
"paddw %%mm6, %%mm4 \n\t" /* d */\ |
1805 |
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
1806 |
"pmullw %5, %%mm1 \n\t" /* 20a */\ |
1806 |
"psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ |
1807 |
"psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ |
1807 |
"paddw %6, %%mm1 \n\t"\ |
1808 |
"paddw %8, %%mm1 \n\t"\ |
1808 |
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
1809 |
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ |
1809 |
"psraw $5, %%mm3 \n\t"\ |
1810 |
"psraw $5, %%mm3 \n\t"\ |
1810 |
"movq %5, %%mm1 \n\t"\ |
1811 |
"movq %7, %%mm1 \n\t"\ |
1811 |
"packuswb %%mm3, %%mm1 \n\t"\ |
1812 |
"packuswb %%mm3, %%mm1 \n\t"\ |
1812 |
OP_MMX2(%%mm1, (%1),%%mm4, q)\ |
1813 |
OP_MMX2(%%mm1, (%1),%%mm4, q)\ |
1813 |
/* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ |
1814 |
/* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ |
Lines 1825-1831
Link Here
|
1825 |
"psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ |
1826 |
"psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ |
1826 |
"movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ |
1827 |
"movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ |
1827 |
"psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ |
1828 |
"psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ |
1828 |
"pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ |
1829 |
"pmullw %6, %%mm0 \n\t" /* 3c - 6b */\ |
1829 |
"punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ |
1830 |
"punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ |
1830 |
"paddw %%mm3, %%mm2 \n\t" /* d */\ |
1831 |
"paddw %%mm3, %%mm2 \n\t" /* d */\ |
1831 |
"psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ |
1832 |
"psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ |
Lines 1833-1840
Link Here
|
1833 |
"punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ |
1834 |
"punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ |
1834 |
"punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ |
1835 |
"punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ |
1835 |
"paddw %%mm2, %%mm6 \n\t" /* a */\ |
1836 |
"paddw %%mm2, %%mm6 \n\t" /* a */\ |
1836 |
"pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ |
1837 |
"pmullw %5, %%mm6 \n\t" /* 20a */\ |
1837 |
"paddw %6, %%mm0 \n\t"\ |
1838 |
"paddw %8, %%mm0 \n\t"\ |
1838 |
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1839 |
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1839 |
"psraw $5, %%mm0 \n\t"\ |
1840 |
"psraw $5, %%mm0 \n\t"\ |
1840 |
/* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ |
1841 |
/* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ |
Lines 1848-1857
Link Here
|
1848 |
"paddw %%mm2, %%mm5 \n\t" /* d */\ |
1849 |
"paddw %%mm2, %%mm5 \n\t" /* d */\ |
1849 |
"paddw %%mm6, %%mm6 \n\t" /* 2b */\ |
1850 |
"paddw %%mm6, %%mm6 \n\t" /* 2b */\ |
1850 |
"psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ |
1851 |
"psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ |
1851 |
"pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ |
1852 |
"pmullw %5, %%mm3 \n\t" /* 20a */\ |
1852 |
"pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ |
1853 |
"pmullw %6, %%mm4 \n\t" /* 3c - 6b */\ |
1853 |
"psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
1854 |
"psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ |
1854 |
"paddw %6, %%mm4 \n\t"\ |
1855 |
"paddw %8, %%mm4 \n\t"\ |
1855 |
"paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
1856 |
"paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ |
1856 |
"psraw $5, %%mm4 \n\t"\ |
1857 |
"psraw $5, %%mm4 \n\t"\ |
1857 |
"packuswb %%mm4, %%mm0 \n\t"\ |
1858 |
"packuswb %%mm4, %%mm0 \n\t"\ |
Lines 1862-1868
Link Here
|
1862 |
"decl %2 \n\t"\ |
1863 |
"decl %2 \n\t"\ |
1863 |
" jnz 1b \n\t"\ |
1864 |
" jnz 1b \n\t"\ |
1864 |
: "+a"(src), "+c"(dst), "+m"(h)\ |
1865 |
: "+a"(src), "+c"(dst), "+m"(h)\ |
1865 |
: "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
1866 |
: "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\ |
1866 |
: "memory"\ |
1867 |
: "memory"\ |
1867 |
);\ |
1868 |
);\ |
1868 |
}\ |
1869 |
}\ |
Lines 1940-1951
Link Here
|
1940 |
"paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
1941 |
"paddw %%mm5, %%mm5 \n\t" /* 2b */\ |
1941 |
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
1942 |
"psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ |
1942 |
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
1943 |
"pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ |
1943 |
"pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ |
1944 |
"pmullw %6, %%mm6 \n\t" /* 3c - 6b */\ |
1944 |
"paddw %%mm4, %%mm0 \n\t" /* a */\ |
1945 |
"paddw %%mm4, %%mm0 \n\t" /* a */\ |
1945 |
"paddw %%mm1, %%mm5 \n\t" /* d */\ |
1946 |
"paddw %%mm1, %%mm5 \n\t" /* d */\ |
1946 |
"pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ |
1947 |
"pmullw %5, %%mm0 \n\t" /* 20a */\ |
1947 |
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
1948 |
"psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ |
1948 |
"paddw %6, %%mm6 \n\t"\ |
1949 |
"paddw %8, %%mm6 \n\t"\ |
1949 |
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1950 |
"paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ |
1950 |
"psraw $5, %%mm0 \n\t"\ |
1951 |
"psraw $5, %%mm0 \n\t"\ |
1951 |
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
1952 |
/* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ |
Lines 1961-1970
Link Here
|
1961 |
"paddw %%mm5, %%mm4 \n\t" /* d */\ |
1962 |
"paddw %%mm5, %%mm4 \n\t" /* d */\ |
1962 |
"paddw %%mm2, %%mm2 \n\t" /* 2b */\ |
1963 |
"paddw %%mm2, %%mm2 \n\t" /* 2b */\ |
1963 |
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ |
1964 |
"psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ |
1964 |
"pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ |
1965 |
"pmullw %5, %%mm1 \n\t" /* 20a */\ |
1965 |
"pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ |
1966 |
"pmullw %6, %%mm3 \n\t" /* 3c - 6b */\ |
1966 |
"psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ |
1967 |
"psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ |
1967 |
"paddw %6, %%mm1 \n\t"\ |
1968 |
"paddw %8, %%mm1 \n\t"\ |
1968 |
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
1969 |
"paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ |
1969 |
"psraw $5, %%mm3 \n\t"\ |
1970 |
"psraw $5, %%mm3 \n\t"\ |
1970 |
"packuswb %%mm3, %%mm0 \n\t"\ |
1971 |
"packuswb %%mm3, %%mm0 \n\t"\ |
Lines 1975-1981
Link Here
|
1975 |
"decl %2 \n\t"\ |
1976 |
"decl %2 \n\t"\ |
1976 |
" jnz 1b \n\t"\ |
1977 |
" jnz 1b \n\t"\ |
1977 |
: "+a"(src), "+c"(dst), "+m"(h)\ |
1978 |
: "+a"(src), "+c"(dst), "+m"(h)\ |
1978 |
: "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ |
1979 |
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\ |
1979 |
: "memory"\ |
1980 |
: "memory"\ |
1980 |
);\ |
1981 |
);\ |
1981 |
}\ |
1982 |
}\ |
Lines 2054-2092
Link Here
|
2054 |
"movq 8(%0), %%mm1 \n\t"\ |
2055 |
"movq 8(%0), %%mm1 \n\t"\ |
2055 |
"movq 16(%0), %%mm2 \n\t"\ |
2056 |
"movq 16(%0), %%mm2 \n\t"\ |
2056 |
"movq 24(%0), %%mm3 \n\t"\ |
2057 |
"movq 24(%0), %%mm3 \n\t"\ |
2057 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2058 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2058 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
2059 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
2059 |
"add %4, %1 \n\t"\ |
2060 |
"add %4, %1 \n\t"\ |
2060 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
2061 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
2061 |
\ |
2062 |
\ |
2062 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2063 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2063 |
"add %4, %1 \n\t"\ |
2064 |
"add %4, %1 \n\t"\ |
2064 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
2065 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
2065 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
2066 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ |
2066 |
"add %4, %1 \n\t"\ |
2067 |
"add %4, %1 \n\t"\ |
2067 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
2068 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ |
2068 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
2069 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ |
2069 |
"add %4, %1 \n\t"\ |
2070 |
"add %4, %1 \n\t"\ |
2070 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
2071 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ |
2071 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
2072 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ |
2072 |
"add %4, %1 \n\t"\ |
2073 |
"add %4, %1 \n\t"\ |
2073 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
2074 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ |
2074 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
2075 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ |
2075 |
"add %4, %1 \n\t"\ |
2076 |
"add %4, %1 \n\t"\ |
2076 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
2077 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ |
2077 |
\ |
2078 |
\ |
2078 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
2079 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ |
2079 |
"add %4, %1 \n\t" \ |
2080 |
"add %4, %1 \n\t" \ |
2080 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
2081 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ |
2081 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
2082 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ |
2082 |
\ |
2083 |
\ |
2083 |
"add $136, %0 \n\t"\ |
2084 |
"add $136, %0 \n\t"\ |
2084 |
"add %6, %1 \n\t"\ |
2085 |
"add %8, %1 \n\t"\ |
2085 |
"decl %2 \n\t"\ |
2086 |
"decl %2 \n\t"\ |
2086 |
" jnz 1b \n\t"\ |
2087 |
" jnz 1b \n\t"\ |
2087 |
\ |
2088 |
\ |
2088 |
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2089 |
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2089 |
: "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ |
2090 |
: "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*(long)dstStride)\ |
2090 |
:"memory"\ |
2091 |
:"memory"\ |
2091 |
);\ |
2092 |
);\ |
2092 |
}\ |
2093 |
}\ |
Lines 2126-2152
Link Here
|
2126 |
"movq 8(%0), %%mm1 \n\t"\ |
2127 |
"movq 8(%0), %%mm1 \n\t"\ |
2127 |
"movq 16(%0), %%mm2 \n\t"\ |
2128 |
"movq 16(%0), %%mm2 \n\t"\ |
2128 |
"movq 24(%0), %%mm3 \n\t"\ |
2129 |
"movq 24(%0), %%mm3 \n\t"\ |
2129 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2130 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ |
2130 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
2131 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ |
2131 |
"add %4, %1 \n\t"\ |
2132 |
"add %4, %1 \n\t"\ |
2132 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
2133 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ |
2133 |
\ |
2134 |
\ |
2134 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2135 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ |
2135 |
"add %4, %1 \n\t"\ |
2136 |
"add %4, %1 \n\t"\ |
2136 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
2137 |
QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ |
2137 |
\ |
2138 |
\ |
2138 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
2139 |
QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ |
2139 |
"add %4, %1 \n\t"\ |
2140 |
"add %4, %1 \n\t"\ |
2140 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
2141 |
QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ |
2141 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ |
2142 |
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ |
2142 |
\ |
2143 |
\ |
2143 |
"add $72, %0 \n\t"\ |
2144 |
"add $72, %0 \n\t"\ |
2144 |
"add %6, %1 \n\t"\ |
2145 |
"add %8, %1 \n\t"\ |
2145 |
"decl %2 \n\t"\ |
2146 |
"decl %2 \n\t"\ |
2146 |
" jnz 1b \n\t"\ |
2147 |
" jnz 1b \n\t"\ |
2147 |
\ |
2148 |
\ |
2148 |
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2149 |
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\ |
2149 |
: "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\ |
2150 |
: "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*(long)dstStride)\ |
2150 |
: "memory"\ |
2151 |
: "memory"\ |
2151 |
);\ |
2152 |
);\ |
2152 |
}\ |
2153 |
}\ |