Lines 463-471
Link Here
|
463 |
"1: \n\t" |
463 |
"1: \n\t" |
464 |
|
464 |
|
465 |
"mov %6, %%"REG_a" \n\t" |
465 |
"mov %6, %%"REG_a" \n\t" |
466 |
"mov %4, %%"REG_b" \n\t" |
466 |
"mov %4, %%"REG_S" \n\t" |
467 |
|
467 |
|
468 |
snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6") |
468 |
snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6") |
469 |
snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
469 |
snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
470 |
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
470 |
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
471 |
snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ |
471 |
snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ |
Lines 482-491
Link Here
|
482 |
snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
482 |
snow_vertical_compose_sse2_sub("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
483 |
snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7") |
483 |
snow_vertical_compose_sse2_store(REG_a,"xmm1","xmm3","xmm5","xmm7") |
484 |
"mov %3, %%"REG_c" \n\t" |
484 |
"mov %3, %%"REG_c" \n\t" |
485 |
snow_vertical_compose_sse2_load(REG_b,"xmm0","xmm2","xmm4","xmm6") |
485 |
snow_vertical_compose_sse2_load(REG_S,"xmm0","xmm2","xmm4","xmm6") |
486 |
snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7") |
486 |
snow_vertical_compose_sse2_add(REG_c,"xmm1","xmm3","xmm5","xmm7") |
487 |
snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
487 |
snow_vertical_compose_sse2_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
488 |
snow_vertical_compose_sse2_store(REG_b,"xmm0","xmm2","xmm4","xmm6") |
488 |
snow_vertical_compose_sse2_store(REG_S,"xmm0","xmm2","xmm4","xmm6") |
489 |
"mov %2, %%"REG_a" \n\t" |
489 |
"mov %2, %%"REG_a" \n\t" |
490 |
snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7") |
490 |
snow_vertical_compose_sse2_load(REG_c,"xmm1","xmm3","xmm5","xmm7") |
491 |
snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
491 |
snow_vertical_compose_sse2_add(REG_a,"xmm0","xmm2","xmm4","xmm6") |
Lines 495-507
Link Here
|
495 |
"pcmpeqd %%xmm1, %%xmm1 \n\t" |
495 |
"pcmpeqd %%xmm1, %%xmm1 \n\t" |
496 |
"pslld $31, %%xmm1 \n\t" |
496 |
"pslld $31, %%xmm1 \n\t" |
497 |
"psrld $28, %%xmm1 \n\t" |
497 |
"psrld $28, %%xmm1 \n\t" |
498 |
"mov %1, %%"REG_b" \n\t" |
498 |
"mov %1, %%"REG_S" \n\t" |
499 |
|
499 |
|
500 |
snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") |
500 |
snow_vertical_compose_sse2_r2r_add("xmm1","xmm1","xmm1","xmm1","xmm0","xmm2","xmm4","xmm6") |
501 |
snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6") |
501 |
snow_vertical_compose_sse2_sra("4","xmm0","xmm2","xmm4","xmm6") |
502 |
snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6") |
502 |
snow_vertical_compose_sse2_add(REG_c,"xmm0","xmm2","xmm4","xmm6") |
503 |
snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6") |
503 |
snow_vertical_compose_sse2_store(REG_c,"xmm0","xmm2","xmm4","xmm6") |
504 |
snow_vertical_compose_sse2_add(REG_b,"xmm0","xmm2","xmm4","xmm6") |
504 |
snow_vertical_compose_sse2_add(REG_S,"xmm0","xmm2","xmm4","xmm6") |
505 |
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
505 |
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
506 |
snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ |
506 |
snow_vertical_compose_sse2_sll("1","xmm0","xmm2","xmm4","xmm6")\ |
507 |
snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
507 |
snow_vertical_compose_sse2_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
Lines 515-521
Link Here
|
515 |
:"+d"(i) |
515 |
:"+d"(i) |
516 |
: |
516 |
: |
517 |
"m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
517 |
"m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
518 |
"%"REG_a"","%"REG_b"","%"REG_c""); |
518 |
"%"REG_a"","%"REG_S"","%"REG_c""); |
519 |
} |
519 |
} |
520 |
|
520 |
|
521 |
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ |
521 |
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ |
Lines 570-578
Link Here
|
570 |
"1: \n\t" |
570 |
"1: \n\t" |
571 |
|
571 |
|
572 |
"mov %6, %%"REG_a" \n\t" |
572 |
"mov %6, %%"REG_a" \n\t" |
573 |
"mov %4, %%"REG_b" \n\t" |
573 |
"mov %4, %%"REG_S" \n\t" |
574 |
|
574 |
|
575 |
snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6") |
575 |
snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6") |
576 |
snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
576 |
snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
577 |
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
577 |
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
578 |
snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") |
578 |
snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") |
Lines 589-598
Link Here
|
589 |
snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
589 |
snow_vertical_compose_mmx_sub("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
590 |
snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7") |
590 |
snow_vertical_compose_mmx_store(REG_a,"mm1","mm3","mm5","mm7") |
591 |
"mov %3, %%"REG_c" \n\t" |
591 |
"mov %3, %%"REG_c" \n\t" |
592 |
snow_vertical_compose_mmx_load(REG_b,"mm0","mm2","mm4","mm6") |
592 |
snow_vertical_compose_mmx_load(REG_S,"mm0","mm2","mm4","mm6") |
593 |
snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7") |
593 |
snow_vertical_compose_mmx_add(REG_c,"mm1","mm3","mm5","mm7") |
594 |
snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
594 |
snow_vertical_compose_mmx_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
595 |
snow_vertical_compose_mmx_store(REG_b,"mm0","mm2","mm4","mm6") |
595 |
snow_vertical_compose_mmx_store(REG_S,"mm0","mm2","mm4","mm6") |
596 |
"mov %2, %%"REG_a" \n\t" |
596 |
"mov %2, %%"REG_a" \n\t" |
597 |
snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7") |
597 |
snow_vertical_compose_mmx_load(REG_c,"mm1","mm3","mm5","mm7") |
598 |
snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
598 |
snow_vertical_compose_mmx_add(REG_a,"mm0","mm2","mm4","mm6") |
Lines 602-614
Link Here
|
602 |
"pcmpeqd %%mm1, %%mm1 \n\t" |
602 |
"pcmpeqd %%mm1, %%mm1 \n\t" |
603 |
"pslld $31, %%mm1 \n\t" |
603 |
"pslld $31, %%mm1 \n\t" |
604 |
"psrld $28, %%mm1 \n\t" |
604 |
"psrld $28, %%mm1 \n\t" |
605 |
"mov %1, %%"REG_b" \n\t" |
605 |
"mov %1, %%"REG_S" \n\t" |
606 |
|
606 |
|
607 |
snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") |
607 |
snow_vertical_compose_mmx_r2r_add("mm1","mm1","mm1","mm1","mm0","mm2","mm4","mm6") |
608 |
snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6") |
608 |
snow_vertical_compose_mmx_sra("4","mm0","mm2","mm4","mm6") |
609 |
snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6") |
609 |
snow_vertical_compose_mmx_add(REG_c,"mm0","mm2","mm4","mm6") |
610 |
snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6") |
610 |
snow_vertical_compose_mmx_store(REG_c,"mm0","mm2","mm4","mm6") |
611 |
snow_vertical_compose_mmx_add(REG_b,"mm0","mm2","mm4","mm6") |
611 |
snow_vertical_compose_mmx_add(REG_S,"mm0","mm2","mm4","mm6") |
612 |
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
612 |
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
613 |
snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") |
613 |
snow_vertical_compose_mmx_sll("1","mm0","mm2","mm4","mm6") |
614 |
snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
614 |
snow_vertical_compose_mmx_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
Lines 622-636
Link Here
|
622 |
:"+d"(i) |
622 |
:"+d"(i) |
623 |
: |
623 |
: |
624 |
"m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
624 |
"m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5): |
625 |
"%"REG_a"","%"REG_b"","%"REG_c""); |
625 |
"%"REG_a"","%"REG_S"","%"REG_c""); |
626 |
} |
626 |
} |
627 |
|
627 |
|
628 |
#define snow_inner_add_yblock_sse2_header \ |
628 |
#define snow_inner_add_yblock_sse2_header \ |
629 |
DWTELEM * * dst_array = sb->line + src_y;\ |
629 |
DWTELEM * * dst_array = sb->line + src_y;\ |
|
|
630 |
long pic_reg_b;\ |
630 |
asm volatile(\ |
631 |
asm volatile(\ |
631 |
"mov %6, %%"REG_c" \n\t"\ |
632 |
"mov %%"REG_b", %2 \n\t"\ |
632 |
"mov %5, %%"REG_b" \n\t"\ |
633 |
"mov %7, %%"REG_c" \n\t"\ |
633 |
"mov %3, %%"REG_S" \n\t"\ |
634 |
"mov %6, %%"REG_b" \n\t"\ |
|
|
635 |
"mov %4, %%"REG_S" \n\t"\ |
634 |
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ |
636 |
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ |
635 |
"pcmpeqd %%xmm3, %%xmm3 \n\t"\ |
637 |
"pcmpeqd %%xmm3, %%xmm3 \n\t"\ |
636 |
"pslld $31, %%xmm3 \n\t"\ |
638 |
"pslld $31, %%xmm3 \n\t"\ |
Lines 638-644
Link Here
|
638 |
"1: \n\t"\ |
640 |
"1: \n\t"\ |
639 |
"mov %1, %%"REG_D" \n\t"\ |
641 |
"mov %1, %%"REG_D" \n\t"\ |
640 |
"mov (%%"REG_D"), %%"REG_D" \n\t"\ |
642 |
"mov (%%"REG_D"), %%"REG_D" \n\t"\ |
641 |
"add %2, %%"REG_D" \n\t" |
643 |
"add %3, %%"REG_D" \n\t" |
642 |
|
644 |
|
643 |
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ |
645 |
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ |
644 |
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
646 |
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
Lines 686-695
Link Here
|
686 |
|
688 |
|
687 |
#define snow_inner_add_yblock_sse2_end_common2\ |
689 |
#define snow_inner_add_yblock_sse2_end_common2\ |
688 |
"jnz 1b \n\t"\ |
690 |
"jnz 1b \n\t"\ |
689 |
:"+m"(dst8),"+m"(dst_array)\ |
691 |
"mov %2, %%"REG_b" \n\t"\ |
|
|
692 |
:"+m"(dst8),"+m"(dst_array),"+m"(pic_reg_b)\ |
690 |
:\ |
693 |
:\ |
691 |
"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ |
694 |
"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ |
692 |
"%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
695 |
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
693 |
|
696 |
|
694 |
#define snow_inner_add_yblock_sse2_end_8\ |
697 |
#define snow_inner_add_yblock_sse2_end_8\ |
695 |
"sal $1, %%"REG_c" \n\t"\ |
698 |
"sal $1, %%"REG_c" \n\t"\ |
Lines 727-733
Link Here
|
727 |
|
730 |
|
728 |
"mov %1, %%"REG_D" \n\t" |
731 |
"mov %1, %%"REG_D" \n\t" |
729 |
"mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" |
732 |
"mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" |
730 |
"add %2, %%"REG_D" \n\t" |
733 |
"add %3, %%"REG_D" \n\t" |
731 |
|
734 |
|
732 |
"movdqa (%%"REG_D"), %%xmm4 \n\t" |
735 |
"movdqa (%%"REG_D"), %%xmm4 \n\t" |
733 |
"movdqa %%xmm5, %%xmm6 \n\t" |
736 |
"movdqa %%xmm5, %%xmm6 \n\t" |
Lines 792-801
Link Here
|
792 |
|
795 |
|
793 |
#define snow_inner_add_yblock_mmx_header \ |
796 |
#define snow_inner_add_yblock_mmx_header \ |
794 |
DWTELEM * * dst_array = sb->line + src_y;\ |
797 |
DWTELEM * * dst_array = sb->line + src_y;\ |
|
|
798 |
long pic_reg_b;\ |
795 |
asm volatile(\ |
799 |
asm volatile(\ |
796 |
"mov %6, %%"REG_c" \n\t"\ |
800 |
"mov %%"REG_b", %2 \n\t"\ |
797 |
"mov %5, %%"REG_b" \n\t"\ |
801 |
"mov %7, %%"REG_c" \n\t"\ |
798 |
"mov %3, %%"REG_S" \n\t"\ |
802 |
"mov %6, %%"REG_b" \n\t"\ |
|
|
803 |
"mov %4, %%"REG_S" \n\t"\ |
799 |
"pxor %%mm7, %%mm7 \n\t" /* 0 */\ |
804 |
"pxor %%mm7, %%mm7 \n\t" /* 0 */\ |
800 |
"pcmpeqd %%mm3, %%mm3 \n\t"\ |
805 |
"pcmpeqd %%mm3, %%mm3 \n\t"\ |
801 |
"pslld $31, %%mm3 \n\t"\ |
806 |
"pslld $31, %%mm3 \n\t"\ |
Lines 803-809
Link Here
|
803 |
"1: \n\t"\ |
808 |
"1: \n\t"\ |
804 |
"mov %1, %%"REG_D" \n\t"\ |
809 |
"mov %1, %%"REG_D" \n\t"\ |
805 |
"mov (%%"REG_D"), %%"REG_D" \n\t"\ |
810 |
"mov (%%"REG_D"), %%"REG_D" \n\t"\ |
806 |
"add %2, %%"REG_D" \n\t" |
811 |
"add %3, %%"REG_D" \n\t" |
807 |
|
812 |
|
808 |
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ |
813 |
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ |
809 |
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
814 |
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
Lines 859-868
Link Here
|
859 |
"add %%"REG_c", %0 \n\t"\ |
864 |
"add %%"REG_c", %0 \n\t"\ |
860 |
"dec %%"REG_b" \n\t"\ |
865 |
"dec %%"REG_b" \n\t"\ |
861 |
"jnz 1b \n\t"\ |
866 |
"jnz 1b \n\t"\ |
862 |
:"+m"(dst8),"+m"(dst_array)\ |
867 |
"mov %2,%%"REG_b" \n\t"\ |
|
|
868 |
:"+m"(dst8),"+m"(dst_array),"+m"(pic_reg_b)\ |
863 |
:\ |
869 |
:\ |
864 |
"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ |
870 |
"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ |
865 |
"%"REG_b"","%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
871 |
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
866 |
|
872 |
|
867 |
static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
873 |
static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, |
868 |
int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
874 |
int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |