diff -Naur liboil-0.3.3/liboil/powerpc/recon8x8_altivec.c liboil-0.3.3.fix/liboil/powerpc/recon8x8_altivec.c --- liboil-0.3.3/liboil/powerpc/recon8x8_altivec.c 2005-08-14 12:55:33.000000000 +0100 +++ liboil-0.3.3.fix/liboil/powerpc/recon8x8_altivec.c 2005-10-20 01:56:22.000000000 +0100 @@ -46,131 +46,131 @@ /* r3, r4, r5 */ recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds) { - asm { + asm( //trying cache hints - lis r6,0x0108 - or r6,r6,r5 - dstst r3,r6,0 + "lis r6,0x0108\n" + "or r6,r6,r5\n" + "dstst r3,r6,0\n" - vspltish v1,7 + "vspltish v1,7\n" - vspltish v8,1 - xor r6,r6,r6 + "vspltish v8,1\n\n" + "xor r6,r6,r6\n" - lvx v0,r4,r6 //get 8 shorts - vslh v8,v8,v1 //now have 128 - addi r6,r6,16 + "lvx v0,r4,r6\n" //get 8 shorts + "vslh v8,v8,v1\n" //now have 128 + "addi r6,r6,16\n" - lvx v1,r4,r6 //get 8 shorts - vaddshs v0,v0,v8 //+=128 - addi r6,r6,16 + "lvx v1,r4,r6\n" //get 8 shorts + "vaddshs v0,v0,v8\n" //+=128 + "addi r6,r6,16\n" - lvx v2,r4,r6 //get 8 shorts - vaddshs v1,v1,v8 //+=128 - addi r6,r6,16 - vpkshus v0,v0,v0 //convert to bytes + "lvx v2,r4,r6\n" //get 8 shorts + "vaddshs v1,v1,v8\n" //+=128 + "addi r6,r6,16\n" + "vpkshus v0,v0,v0\n" //convert to bytes - lvx v3,r4,r6 //get 8 shorts - vaddshs v2,v2,v8 //+=128 - addi r6,r6,16 - vpkshus v1,v1,v1 //convert to bytes + "lvx v3,r4,r6\n" //get 8 shorts + "vaddshs v2,v2,v8\n" //+=128 + "addi r6,r6,16\n" + "vpkshus v1,v1,v1\n" //convert to bytes - lvx v4,r4,r6 //get 8 shorts - vaddshs v3,v3,v8 //+=128 - addi r6,r6,16 - vpkshus v2,v2,v2 //convert to bytes + "lvx v4,r4,r6\n" //get 8 shorts + "vaddshs v3,v3,v8\n" //+=128 + "addi r6,r6,16\n" + "vpkshus v2,v2,v2\n" //convert to bytes - lvx v5,r4,r6 //get 8 shorts - vaddshs v4,v4,v8 //+=128 - addi r6,r6,16 - vpkshus v3,v3,v3 //convert to bytes + "lvx v5,r4,r6\n" //get 8 shorts + "vaddshs v4,v4,v8\n" //+=128 + "addi r6,r6,16\n" + "vpkshus v3,v3,v3\n" //convert to bytes - lvx v6,r4,r6 //get 8 shorts - vaddshs v5,v5,v8 //+=128 - addi r6,r6,16 - vpkshus v4,v4,v4 //convert to bytes + "lvx v6,r4,r6\n" //get 8 shorts + "vaddshs v5,v5,v8\n" //+=128 + "addi r6,r6,16\n" + "vpkshus v4,v4,v4\n" //convert to bytes - lvx v7,r4,r6 //get 8 shorts - xor r6,r6,r6 - vaddshs v6,v6,v8 //+=128 - vpkshus v5,v5,v5 //convert to bytes + "lvx v7,r4,r6\n" //get 8 shorts + "xor r6,r6,r6\n" + "vaddshs v6,v6,v8\n" //+=128 + "vpkshus v5,v5,v5\n" //convert to bytes - lvsr v9,r3,r6 //load alignment vector for stores - vaddshs v7,v7,v8 //+=128 - vpkshus v6,v6,v6 //convert to bytes + "lvsr v9,r3,r6\n" //load alignment vector for stores + "vaddshs v7,v7,v8\n" //+=128 + "vpkshus v6,v6,v6\n" //convert to bytes - vpkshus v7,v7,v7 //convert to bytes + "vpkshus v7,v7,v7\n" //convert to bytes - li r7,4 - vperm v0,v0,v0,v9 + "li r7,4\n" + "vperm v0,v0,v0,v9\n" - stvewx v0,r3,r6 - add r6,r6,r5 + "stvewx v0,r3,r6\n" + "add r6,r6,r5\n" - lvsr v9,r3,r6 //load alignment vector for stores + "lvsr v9,r3,r6\n" //load alignment vector for stores - stvewx v0,r3,r7 - add r7,r7,r5 - vperm v1,v1,v1,v9 + "stvewx v0,r3,r7\n" + "add r7,r7,r5\n" + "vperm v1,v1,v1,v9\n" - stvewx v1,r3,r6 - add r6,r6,r5 + "stvewx v1,r3,r6\n" + "add r6,r6,r5\n" - lvsr v9,r3,r6 //load alignment vector for stores + "lvsr v9,r3,r6\n" //load alignment vector for stores - stvewx v1,r3,r7 - add r7,r7,r5 - vperm v2,v2,v2,v9 + "stvewx v1,r3,r7\n" + "add r7,r7,r5\n" + "vperm v2,v2,v2,v9\n" - stvewx v2,r3,r6 - add r6,r6,r5 + "stvewx v2,r3,r6\n" + "add r6,r6,r5\n" - lvsr v9,r3,r6 //load alignment vector for stores + "lvsr v9,r3,r6\n" //load alignment vector for stores - stvewx v2,r3,r7 - add r7,r7,r5 - vperm v3,v3,v3,v9 + "stvewx v2,r3,r7\n" + "add r7,r7,r5\n" + "vperm v3,v3,v3,v9\n" - stvewx v3,r3,r6 - add r6,r6,r5 + "stvewx v3,r3,r6\n" + "add r6,r6,r5\n" - lvsr v9,r3,r6 //load alignment vector for stores + "lvsr v9,r3,r6\n" //load alignment vector for stores - stvewx v3,r3,r7 - add r7,r7,r5 - vperm v4,v4,v4,v9 + "stvewx v3,r3,r7\n" + "add r7,r7,r5\n" + "vperm v4,v4,v4,v9\n" - stvewx v4,r3,r6 - add r6,r6,r5 + "stvewx v4,r3,r6\n" + "add r6,r6,r5\n" - lvsr v9,r3,r6 //load alignment vector for stores + "lvsr v9,r3,r6\n" //load alignment vector for stores - stvewx v4,r3,r7 - add r7,r7,r5 - vperm v5,v5,v5,v9 + "stvewx v4,r3,r7\n" + "add r7,r7,r5\n" + "vperm v5,v5,v5,v9\n" - stvewx v5,r3,r6 - add r6,r6,r5 + "stvewx v5,r3,r6\n" + "add r6,r6,r5\n" - lvsr v9,r3,r6 //load alignment vector for stores + "lvsr v9,r3,r6\n" //load alignment vector for stores - stvewx v5,r3,r7 - add r7,r7,r5 - vperm v6,v6,v6,v9 + "stvewx v5,r3,r7\n" + "add r7,r7,r5\n" + "vperm v6,v6,v6,v9\n" - stvewx v6,r3,r6 - add r6,r6,r5 + "stvewx v6,r3,r6\n" + "add r6,r6,r5\n" - lvsr v9,r3,r6 //load alignment vector for stores + "lvsr v9,r3,r6\n" //load alignment vector for stores - stvewx v6,r3,r7 - add r7,r7,r5 - vperm v7,v7,v7,v9 + "stvewx v6,r3,r7\n" + "add r7,r7,r5\n" + "vperm v7,v7,v7,v9\n" - stvewx v7,r3,r6 + "stvewx v7,r3,r6\n" - stvewx v7,r3,r7 - } + "stvewx v7,r3,r7\n" + ); } OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC); @@ -178,217 +178,216 @@ static void /* r3, r4, r5, r6 */ recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss) { - asm - { + asm( //trying cache hints - lis r7,0x0108 - or r7,r7,r6 - dstst r3,r7,0 + "lis r7,0x0108\n" + "or r7,r7,r6\n" + "dstst r3,r7,0\n" - xor r7,r7,r7 - li r8,16 + "xor r7,r7,r7\n" + "li r8,16\n" - lvsl v8,r4,r7 //load alignment vector for refs - vxor v9,v9,v9 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "vxor v9,v9,v9\n" - lvx v10,r4,r7 //get 8 refs - add r7,r7,r6 + "lvx v10,r4,r7\n" //get 8 refs + "add r7,r7,r6\n" - lvx v0,r4,r8 //need another 16 bytes for misaligned data -- 0 - add r8,r8,r6 + "lvx v0,r4,r8\n" //need another 16 bytes for misaligned data -- 0 + "add r8,r8,r6\n" - lvx v11,r4,r7 //get 8 refs - vperm v10,v10,v0,v8 + "lvx v11,r4,r7\n" //get 8 refs + "vperm v10,v10,v0,v8\n" - lvsl v8,r4,r7 //load alignment vector for refs - add r7,r7,r6 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "add r7,r7,r6\n" - lvx v1,r4,r8 //need another 16 bytes for misaligned data -- 1 - add r8,r8,r6 + "lvx v1,r4,r8\n" //need another 16 bytes for misaligned data -- 1 + "add r8,r8,r6\n" - lvx v12,r4,r7 //get 8 refs - vperm v11,v11,v1,v8 + "lvx v12,r4,r7\n" //get 8 refs + "vperm v11,v11,v1,v8\n" - lvsl v8,r4,r7 //load alignment vector for refs - add r7,r7,r6 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "add r7,r7,r6\n" - lvx v2,r4,r8 //need another 16 bytes for misaligned data -- 2 - add r8,r8,r6 + "lvx v2,r4,r8\n" //need another 16 bytes for misaligned data -- 2 + "add r8,r8,r6\n" - lvx v13,r4,r7 //get 8 refs - vperm v12,v12,v2,v8 + "lvx v13,r4,r7\n" //get 8 refs + "vperm v12,v12,v2,v8\n" - lvsl v8,r4,r7 //load alignment vector for refs - add r7,r7,r6 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "add r7,r7,r6\n" - lvx v3,r4,r8 //need another 16 bytes for misaligned data -- 3 - add r8,r8,r6 + "lvx v3,r4,r8\n" //need another 16 bytes for misaligned data -- 3 + "add r8,r8,r6\n" - lvx v14,r4,r7 //get 8 refs - vperm v13,v13,v3,v8 + "lvx v14,r4,r7\n" //get 8 refs + "vperm v13,v13,v3,v8\n" - lvsl v8,r4,r7 //load alignment vector for refs - add r7,r7,r6 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "add r7,r7,r6\n" - lvx v4,r4,r8 //need another 16 bytes for misaligned data -- 4 - add r8,r8,r6 + "lvx v4,r4,r8\n" //need another 16 bytes for misaligned data -- 4 + "add r8,r8,r6\n" - lvx v15,r4,r7 //get 8 refs - vperm v14,v14,v4,v8 + "lvx v15,r4,r7\n" //get 8 refs + "vperm v14,v14,v4,v8\n" - lvsl v8,r4,r7 //load alignment vector for refs - add r7,r7,r6 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "add r7,r7,r6\n" - lvx v5,r4,r8 //need another 16 bytes for misaligned data -- 5 - add r8,r8,r6 + "lvx v5,r4,r8\n" //need another 16 bytes for misaligned data -- 5 + "add r8,r8,r6\n" - lvx v16,r4,r7 //get 8 refs - vperm v15,v15,v5,v8 + "lvx v16,r4,r7\n" //get 8 refs + "vperm v15,v15,v5,v8\n" - lvsl v8,r4,r7 //load alignment vector for refs - add r7,r7,r6 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "add r7,r7,r6\n" - lvx v6,r4,r8 //need another 16 bytes for misaligned data -- 6 - add r8,r8,r6 + "lvx v6,r4,r8\n" //need another 16 bytes for misaligned data -- 6 + "add r8,r8,r6\n" - lvx v17,r4,r7 //get 8 refs - vperm v16,v16,v6,v8 + "lvx v17,r4,r7\n" //get 8 refs + "vperm v16,v16,v6,v8\n" - lvsl v8,r4,r7 //load alignment vector for refs - xor r7,r7,r7 + "lvsl v8,r4,r7\n" //load alignment vector for refs + "xor r7,r7,r7\n" - lvx v7,r4,r8 //need another 16 bytes for misaligned data -- 7 - add r8,r8,r6 + "lvx v7,r4,r8\n" //need another 16 bytes for misaligned data -- 7 + "add r8,r8,r6\n" - lvx v0,r5,r7 //get 8 shorts - vperm v17,v17,v7,v8 - addi r7,r7,16 + "lvx v0,r5,r7\n" //get 8 shorts + "vperm v17,v17,v7,v8\n" + "addi r7,r7,16\n" - lvx v1,r5,r7 //get 8 shorts - vmrghb v10,v9,v10 //unsigned byte -> unsigned half - addi r7,r7,16 + "lvx v1,r5,r7\n" //get 8 shorts + "vmrghb v10,v9,v10\n" //unsigned byte -> unsigned half + "addi r7,r7,16\n" - lvx v2,r5,r7 //get 8 shorts - vmrghb v11,v9,v11 //unsigned byte -> unsigned half - vaddshs v0,v0,v10 - addi r7,r7,16 + "lvx v2,r5,r7\n" //get 8 shorts + "vmrghb v11,v9,v11\n" //unsigned byte -> unsigned half + "vaddshs v0,v0,v10\n" + "addi r7,r7,16\n" - lvx v3,r5,r7 //get 8 shorts - vmrghb v12,v9,v12 //unsigned byte -> unsigned half - vaddshs v1,v1,v11 - addi r7,r7,16 + "lvx v3,r5,r7\n" //get 8 shorts + "vmrghb v12,v9,v12\n" //unsigned byte -> unsigned half + "vaddshs v1,v1,v11\n" + "addi r7,r7,16\n" - lvx v4,r5,r7 //get 8 shorts - vmrghb v13,v9,v13 //unsigned byte -> unsigned half - vaddshs v2,v2,v12 - addi r7,r7,16 + "lvx v4,r5,r7\n" //get 8 shorts + "vmrghb v13,v9,v13\n" //unsigned byte -> unsigned half + "vaddshs v2,v2,v12\n" + "addi r7,r7,16\n" - lvx v5,r5,r7 //get 8 shorts - vmrghb v14,v9,v14 //unsigned byte -> unsigned half - vaddshs v3,v3,v13 - addi r7,r7,16 + "lvx v5,r5,r7\n" //get 8 shorts + "vmrghb v14,v9,v14\n" //unsigned byte -> unsigned half + "vaddshs v3,v3,v13\n" + "addi r7,r7,16\n" - lvx v6,r5,r7 //get 8 shorts - vmrghb v15,v9,v15 //unsigned byte -> unsigned half - vaddshs v4,v4,v14 - addi r7,r7,16 + "lvx v6,r5,r7\n" //get 8 shorts + "vmrghb v15,v9,v15\n" //unsigned byte -> unsigned half + "vaddshs v4,v4,v14\n" + "addi r7,r7,16\n" - lvx v7,r5,r7 //get 8 shorts - vmrghb v16,v9,v16 //unsigned byte -> unsigned half - vaddshs v5,v5,v15 + "lvx v7,r5,r7\n" //get 8 shorts + "vmrghb v16,v9,v16\n" //unsigned byte -> unsigned half + "vaddshs v5,v5,v15\n" - vmrghb v17,v9,v17 //unsigned byte -> unsigned half - vaddshs v6,v6,v16 + "vmrghb v17,v9,v17\n" //unsigned byte -> unsigned half + "vaddshs v6,v6,v16\n" - vpkshus v0,v0,v0 - vaddshs v7,v7,v17 + "vpkshus v0,v0,v0\n" + "vaddshs v7,v7,v17\n" - vpkshus v1,v1,v1 - xor r7,r7,r7 + "vpkshus v1,v1,v1\n" + "xor r7,r7,r7\n" - vpkshus v2,v2,v2 + "vpkshus v2,v2,v2\n" - vpkshus v3,v3,v3 + "vpkshus v3,v3,v3\n" - vpkshus v4,v4,v4 + "vpkshus v4,v4,v4\n" - vpkshus v5,v5,v5 + "vpkshus v5,v5,v5\n" - vpkshus v6,v6,v6 + "vpkshus v6,v6,v6\n" - lvsr v9,r3,r7 //load alignment vector for stores - vpkshus v7,v7,v7 + "lvsr v9,r3,r7\n" //load alignment vector for stores + "vpkshus v7,v7,v7\n" - li r8,4 - vperm v0,v0,v0,v9 //adjust for writes + "li r8,4\n" + "vperm v0,v0,v0,v9\n" //adjust for writes - stvewx v0,r3,r7 - add r7,r7,r6 + "stvewx v0,r3,r7\n" + "add r7,r7,r6\n" - lvsr v9,r3,r7 //load alignment vector for stores + "lvsr v9,r3,r7\n" //load alignment vector for stores - stvewx v0,r3,r8 - add r8,r8,r6 - vperm v1,v1,v1,v9 + "stvewx v0,r3,r8\n" + "add r8,r8,r6\n" + "vperm v1,v1,v1,v9\n" - stvewx v1,r3,r7 - add r7,r7,r6 + "stvewx v1,r3,r7\n" + "add r7,r7,r6\n" - lvsr v9,r3,r7 //load alignment vector for stores + "lvsr v9,r3,r7\n" //load alignment vector for stores - stvewx v1,r3,r8 - add r8,r8,r6 - vperm v2,v2,v2,v9 + "stvewx v1,r3,r8\n" + "add r8,r8,r6\n" + "vperm v2,v2,v2,v9\n" - stvewx v2,r3,r7 - add r7,r7,r6 + "stvewx v2,r3,r7\n" + "add r7,r7,r6\n" - lvsr v9,r3,r7 //load alignment vector for stores + "lvsr v9,r3,r7\n" //load alignment vector for stores - stvewx v2,r3,r8 - add r8,r8,r6 - vperm v3,v3,v3,v9 + "stvewx v2,r3,r8\n" + "add r8,r8,r6\n" + "vperm v3,v3,v3,v9\n" - stvewx v3,r3,r7 - add r7,r7,r6 + "stvewx v3,r3,r7\n" + "add r7,r7,r6\n" - lvsr v9,r3,r7 //load alignment vector for stores + "lvsr v9,r3,r7\n" //load alignment vector for stores - stvewx v3,r3,r8 - add r8,r8,r6 - vperm v4,v4,v4,v9 + "stvewx v3,r3,r8\n" + "add r8,r8,r6\n" + "vperm v4,v4,v4,v9\n" - stvewx v4,r3,r7 - add r7,r7,r6 + "stvewx v4,r3,r7\n" + "add r7,r7,r6\n" - lvsr v9,r3,r7 //load alignment vector for stores + "lvsr v9,r3,r7\n" //load alignment vector for stores - stvewx v4,r3,r8 - add r8,r8,r6 - vperm v5,v5,v5,v9 + "stvewx v4,r3,r8\n" + "add r8,r8,r6\n" + "vperm v5,v5,v5,v9\n" - stvewx v5,r3,r7 - add r7,r7,r6 + "stvewx v5,r3,r7\n" + "add r7,r7,r6\n" - lvsr v9,r3,r7 //load alignment vector for stores + "lvsr v9,r3,r7\n" //load alignment vector for stores - stvewx v5,r3,r8 - add r8,r8,r6 - vperm v6,v6,v6,v9 + "stvewx v5,r3,r8\n" + "add r8,r8,r6\n" + "vperm v6,v6,v6,v9\n" - stvewx v6,r3,r7 - add r7,r7,r6 + "stvewx v6,r3,r7\n" + "add r7,r7,r6\n" - lvsr v9,r3,r7 //load alignment vector for stores + "lvsr v9,r3,r7\n" //load alignment vector for stores - stvewx v6,r3,r8 - add r8,r8,r6 - vperm v7,v7,v7,v9 + "stvewx v6,r3,r8\n" + "add r8,r8,r6\n" + "vperm v7,v7,v7,v9\n" - stvewx v7,r3,r7 + "stvewx v7,r3,r7\n" - stvewx v7,r3,r8 - } + "stvewx v7,r3,r8\n" + ); } OIL_DEFINE_IMPL_FULL (recon8x8_inter_altivec, recon8x8_inter, OIL_IMPL_FLAG_ALTIVEC); @@ -396,321 +395,320 @@ static void /* r3, r4, r5, r6, r7 */ recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss) { - asm - { + asm( //trying cache hints - lis r8,0x0108 - or r8,r8,r7 - dstst r3,r8,0 + "lis r8,0x0108\n" + "or r8,r8,r7\n" + "dstst r3,r8,0\n" - xor r8,r8,r8 - li r9,16 + "xor r8,r8,r8\n" + "li r9,16\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - vxor v9,v9,v9 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "vxor v9,v9,v9\n" - lvx v10,r4,r8 //get 8 RefPtr1 -- 0 - add r8,r8,r7 + "lvx v10,r4,r8\n" //get 8 RefPtr1 -- 0 + "add r8,r8,r7\n" - lvx v0,r4,r9 //need another 16 bytes for misaligned data -- 0 - add r9,r9,r7 + "lvx v0,r4,r9\n" //need another 16 bytes for misaligned data -- 0 + "add r9,r9,r7\n" - lvx v11,r4,r8 //get 8 RefPtr1 -- 1 - vperm v10,v10,v0,v8 + "lvx v11,r4,r8\n" //get 8 RefPtr1 -- 1 + "vperm v10,v10,v0,v8\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - add r8,r8,r7 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "add r8,r8,r7\n" - lvx v1,r4,r9 //need another 16 bytes for misaligned data -- 1 - vmrghb v10,v9,v10 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v1,r4,r9\n" //need another 16 bytes for misaligned data -- 1 + "vmrghb v10,v9,v10\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" - lvx v12,r4,r8 //get 8 RefPtr1 -- 2 - vperm v11,v11,v1,v8 + "lvx v12,r4,r8\n" //get 8 RefPtr1 -- 2 + "vperm v11,v11,v1,v8\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - add r8,r8,r7 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "add r8,r8,r7\n" - lvx v2,r4,r9 //need another 16 bytes for misaligned data -- 2 - vmrghb v11,v9,v11 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v2,r4,r9\n" //need another 16 bytes for misaligned data -- 2 + "vmrghb v11,v9,v11\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" - lvx v13,r4,r8 //get 8 RefPtr1 -- 3 - vperm v12,v12,v2,v8 + "lvx v13,r4,r8\n" //get 8 RefPtr1 -- 3 + "vperm v12,v12,v2,v8\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - add r8,r8,r7 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "add r8,r8,r7\n" - lvx v3,r4,r9 //need another 16 bytes for misaligned data -- 3 - vmrghb v12,v9,v12 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v3,r4,r9\n" //need another 16 bytes for misaligned data -- 3 + "vmrghb v12,v9,v12\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" - lvx v14,r4,r8 //get 8 RefPtr1 -- 4 - vperm v13,v13,v3,v8 + "lvx v14,r4,r8\n" //get 8 RefPtr1 -- 4 + "vperm v13,v13,v3,v8\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - add r8,r8,r7 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "add r8,r8,r7\n" - lvx v4,r4,r9 //need another 16 bytes for misaligned data -- 4 - vmrghb v13,v9,v13 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v4,r4,r9\n" //need another 16 bytes for misaligned data -- 4 + "vmrghb v13,v9,v13\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" - lvx v15,r4,r8 //get 8 RefPtr1 -- 5 - vperm v14,v14,v4,v8 + "lvx v15,r4,r8\n" //get 8 RefPtr1 -- 5 + "vperm v14,v14,v4,v8\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - add r8,r8,r7 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "add r8,r8,r7\n" - lvx v5,r4,r9 //need another 16 bytes for misaligned data -- 5 - vmrghb v14,v9,v14 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v5,r4,r9\n" //need another 16 bytes for misaligned data -- 5 + "vmrghb v14,v9,v14\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" - lvx v16,r4,r8 //get 8 RefPtr1 -- 6 - vperm v15,v15,v5,v8 + "lvx v16,r4,r8\n" //get 8 RefPtr1 -- 6 + "vperm v15,v15,v5,v8\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - add r8,r8,r7 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "add r8,r8,r7\n" - lvx v6,r4,r9 //need another 16 bytes for misaligned data -- 6 - vmrghb v15,v9,v15 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v6,r4,r9\n" //need another 16 bytes for misaligned data -- 6 + "vmrghb v15,v9,v15\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" - lvx v17,r4,r8 //get 8 RefPtr1 -- 7 - vperm v16,v16,v6,v8 + "lvx v17,r4,r8\n" //get 8 RefPtr1 -- 7 + "vperm v16,v16,v6,v8\n" - lvsl v8,r4,r8 //load alignment vector for RefPtr1 - add r8,r8,r7 + "lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 + "add r8,r8,r7\n" - lvx v7,r4,r9 //need another 16 bytes for misaligned data -- 7 - vmrghb v16,v9,v16 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v7,r4,r9\n" //need another 16 bytes for misaligned data -- 7 + "vmrghb v16,v9,v16\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" //-------- - vperm v17,v17,v7,v8 - xor r8,r8,r8 - li r9,16 + "vperm v17,v17,v7,v8\n" + "xor r8,r8,r8\n" + "li r9,16\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - vmrghb v17,v9,v17 //unsigned byte -> unsigned half + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "vmrghb v17,v9,v17\n" //unsigned byte -> unsigned half - lvx v20,r5,r8 //get 8 RefPtr2 -- 0 - add r8,r8,r7 + "lvx v20,r5,r8\n" //get 8 RefPtr2 -- 0 + "add r8,r8,r7\n" - lvx v0,r5,r9 //need another 16 bytes for misaligned data -- 0 - add r9,r9,r7 + "lvx v0,r5,r9\n" //need another 16 bytes for misaligned data -- 0 + "add r9,r9,r7\n" - lvx v21,r5,r8 //get 8 RefPtr2 -- 1 - vperm v20,v20,v0,v18 + "lvx v21,r5,r8\n" //get 8 RefPtr2 -- 1 + "vperm v20,v20,v0,v18\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - add r8,r8,r7 + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "add r8,r8,r7\n" - lvx v1,r5,r9 //need another 16 bytes for misaligned data -- 1 - vmrghb v20,v9,v20 //unsigned byte -> unsigned half - add r9,r9,r7 + "lvx v1,r5,r9\n" //need another 16 bytes for misaligned data -- 1 + "vmrghb v20,v9,v20\n" //unsigned byte -> unsigned half + "add r9,r9,r7\n" - lvx v22,r5,r8 //get 8 RefPtr2 -- 2 - vperm v21,v21,v1,v18 + "lvx v22,r5,r8\n" //get 8 RefPtr2 -- 2 + "vperm v21,v21,v1,v18\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - add r8,r8,r7 + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "add r8,r8,r7\n" - lvx v2,r5,r9 //need another 16 bytes for misaligned data -- 2 - vmrghb v21,v9,v21 //unsigned byte -> unsigned half - vadduhm v10,v10,v20 - add r9,r9,r7 + "lvx v2,r5,r9\n" //need another 16 bytes for misaligned data -- 2 + "vmrghb v21,v9,v21\n" //unsigned byte -> unsigned half + "vadduhm v10,v10,v20\n" + "add r9,r9,r7\n" - lvx v23,r5,r8 //get 8 RefPtr2 -- 3 - vperm v22,v22,v2,v18 + "lvx v23,r5,r8\n" //get 8 RefPtr2 -- 3 + "vperm v22,v22,v2,v18\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - add r8,r8,r7 + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "add r8,r8,r7\n" - lvx v3,r5,r9 //need another 16 bytes for misaligned data -- 3 - vmrghb v22,v9,v22 //unsigned byte -> unsigned half - vadduhm v11,v11,v21 - add r9,r9,r7 + "lvx v3,r5,r9\n" //need another 16 bytes for misaligned data -- 3 + "vmrghb v22,v9,v22\n" //unsigned byte -> unsigned half + "vadduhm v11,v11,v21\n" + "add r9,r9,r7\n" - lvx v24,r5,r8 //get 8 RefPtr2 -- 4 - vperm v23,v23,v3,v18 + "lvx v24,r5,r8\n" //get 8 RefPtr2 -- 4 + "vperm v23,v23,v3,v18\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - add r8,r8,r7 + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "add r8,r8,r7\n" - lvx v4,r5,r9 //need another 16 bytes for misaligned data -- 4 - vmrghb v23,v9,v23 //unsigned byte -> unsigned half - vadduhm v12,v12,v22 - add r9,r9,r7 + "lvx v4,r5,r9\n" //need another 16 bytes for misaligned data -- 4 + "vmrghb v23,v9,v23\n" //unsigned byte -> unsigned half + "vadduhm v12,v12,v22\n" + "add r9,r9,r7\n" - lvx v25,r5,r8 //get 8 RefPtr2 -- 5 - vperm v24,v24,v4,v18 + "lvx v25,r5,r8\n" //get 8 RefPtr2 -- 5 + "vperm v24,v24,v4,v18\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - add r8,r8,r7 + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "add r8,r8,r7\n" - lvx v5,r5,r9 //need another 16 bytes for misaligned data -- 5 - vmrghb v24,v9,v24 //unsigned byte -> unsigned half - vadduhm v13,v13,v23 - add r9,r9,r7 + "lvx v5,r5,r9\n" //need another 16 bytes for misaligned data -- 5 + "vmrghb v24,v9,v24\n" //unsigned byte -> unsigned half + "vadduhm v13,v13,v23\n" + "add r9,r9,r7\n" - lvx v26,r5,r8 //get 8 RefPtr2 -- 6 - vperm v25,v25,v5,v18 + "lvx v26,r5,r8\n" //get 8 RefPtr2 -- 6 + "vperm v25,v25,v5,v18\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - add r8,r8,r7 + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "add r8,r8,r7\n" - lvx v6,r5,r9 //need another 16 bytes for misaligned data -- 6 - vmrghb v25,v9,v25 //unsigned byte -> unsigned half - vadduhm v14,v14,v24 - add r9,r9,r7 + "lvx v6,r5,r9\n" //need another 16 bytes for misaligned data -- 6 + "vmrghb v25,v9,v25\n" //unsigned byte -> unsigned half + "vadduhm v14,v14,v24\n" + "add r9,r9,r7\n" - lvx v27,r5,r8 //get 8 RefPtr2 -- 7 - vperm v26,v26,v6,v18 + "lvx v27,r5,r8\n" //get 8 RefPtr2 -- 7 + "vperm v26,v26,v6,v18\n" - lvsl v18,r5,r8 //load alignment vector for RefPtr2 - add r8,r8,r7 + "lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 + "add r8,r8,r7\n" - lvx v7,r5,r9 //need another 16 bytes for misaligned data -- 7 - vmrghb v26,v9,v26 //unsigned byte -> unsigned half - vadduhm v15,v15,v25 - add r9,r9,r7 + "lvx v7,r5,r9\n" //need another 16 bytes for misaligned data -- 7 + "vmrghb v26,v9,v26\n" //unsigned byte -> unsigned half + "vadduhm v15,v15,v25\n" + "add r9,r9,r7\n" - vperm v27,v27,v7,v18 - xor r8,r8,r8 + "vperm v27,v27,v7,v18\n" + "xor r8,r8,r8\n" - vmrghb v27,v9,v27 //unsigned byte -> unsigned half - vadduhm v16,v16,v26 + "vmrghb v27,v9,v27\n" //unsigned byte -> unsigned half + "vadduhm v16,v16,v26\n" - vadduhm v17,v17,v27 - vspltish v8,1 + "vadduhm v17,v17,v27\n" + "vspltish v8,1\n" //-------- - lvx v0,r6,r8 //get 8 shorts - vsrh v10,v10,v8 - addi r8,r8,16 - - lvx v1,r6,r8 //get 8 shorts - vsrh v11,v11,v8 - addi r8,r8,16 - - lvx v2,r6,r8 //get 8 shorts - vsrh v12,v12,v8 - addi r8,r8,16 - - lvx v3,r6,r8 //get 8 shorts - vsrh v13,v13,v8 - addi r8,r8,16 - - lvx v4,r6,r8 //get 8 shorts - vsrh v14,v14,v8 - addi r8,r8,16 - - lvx v5,r6,r8 //get 8 shorts - vsrh v15,v15,v8 - addi r8,r8,16 - - lvx v6,r6,r8 //get 8 shorts - vsrh v16,v16,v8 - addi r8,r8,16 - - lvx v7,r6,r8 //get 8 shorts - vsrh v17,v17,v8 - xor r8,r8,r8 + "lvx v0,r6,r8\n" //get 8 shorts + "vsrh v10,v10,v8\n" + "addi r8,r8,16\n" + + "lvx v1,r6,r8\n" //get 8 shorts + "vsrh v11,v11,v8\n" + "addi r8,r8,16\n" + + "lvx v2,r6,r8\n" //get 8 shorts + "vsrh v12,v12,v8\n" + "addi r8,r8,16\n" + + "lvx v3,r6,r8\n" //get 8 shorts + "vsrh v13,v13,v8\n" + "addi r8,r8,16\n" + + "lvx v4,r6,r8\n" //get 8 shorts + "vsrh v14,v14,v8\n" + "addi r8,r8,16\n" + + "lvx v5,r6,r8\n" //get 8 shorts + "vsrh v15,v15,v8\n" + "addi r8,r8,16\n" + + "lvx v6,r6,r8\n" //get 8 shorts + "vsrh v16,v16,v8\n" + "addi r8,r8,16\n" + + "lvx v7,r6,r8\n" //get 8 shorts + "vsrh v17,v17,v8\n" + "xor r8,r8,r8\n" //-------- - lvsr v9,r3,r8 //load alignment vector for stores - vaddshs v0,v0,v10 + "lvsr v9,r3,r8\n" //load alignment vector for stores + "vaddshs v0,v0,v10\n" - vaddshs v1,v1,v11 - vpkshus v0,v0,v0 + "vaddshs v1,v1,v11\n" + "vpkshus v0,v0,v0\n" - vaddshs v2,v2,v12 - vpkshus v1,v1,v1 + "vaddshs v2,v2,v12\n" + "vpkshus v1,v1,v1\n" - vaddshs v3,v3,v13 - vpkshus v2,v2,v2 + "vaddshs v3,v3,v13\n" + "vpkshus v2,v2,v2\n" - vaddshs v4,v4,v14 - vpkshus v3,v3,v3 + "vaddshs v4,v4,v14\n" + "vpkshus v3,v3,v3\n" - vaddshs v5,v5,v15 - vpkshus v4,v4,v4 + "vaddshs v5,v5,v15\n" + "vpkshus v4,v4,v4\n" - vaddshs v6,v6,v16 - vpkshus v5,v5,v5 + "vaddshs v6,v6,v16\n" + "vpkshus v5,v5,v5\n" - vaddshs v7,v7,v17 - vpkshus v6,v6,v6 + "vaddshs v7,v7,v17\n" + "vpkshus v6,v6,v6\n" - vpkshus v7,v7,v7 + "vpkshus v7,v7,v7\n" - li r9,4 - vperm v0,v0,v0,v9 //adjust for writes + "li r9,4\n" + "vperm v0,v0,v0,v9\n" //adjust for writes - stvewx v0,r3,r8 - add r8,r8,r7 + "stvewx v0,r3,r8\n" + "add r8,r8,r7\n" - lvsr v9,r3,r8 //load alignment vector for stores + "lvsr v9,r3,r8\n" //load alignment vector for stores - stvewx v0,r3,r9 - add r9,r9,r7 - vperm v1,v1,v1,v9 + "stvewx v0,r3,r9\n" + "add r9,r9,r7\n" + "vperm v1,v1,v1,v9\n" - stvewx v1,r3,r8 - add r8,r8,r7 + "stvewx v1,r3,r8\n" + "add r8,r8,r7\n" - lvsr v9,r3,r8 //load alignment vector for stores + "lvsr v9,r3,r8\n" //load alignment vector for stores - stvewx v1,r3,r9 - add r9,r9,r7 - vperm v2,v2,v2,v9 + "stvewx v1,r3,r9\n" + "add r9,r9,r7\n" + "vperm v2,v2,v2,v9\n" - stvewx v2,r3,r8 - add r8,r8,r7 + "stvewx v2,r3,r8\n" + "add r8,r8,r7\n" - lvsr v9,r3,r8 //load alignment vector for stores + "lvsr v9,r3,r8\n" //load alignment vector for stores - stvewx v2,r3,r9 - add r9,r9,r7 - vperm v3,v3,v3,v9 + "stvewx v2,r3,r9\n" + "add r9,r9,r7\n" + "vperm v3,v3,v3,v9\n" - stvewx v3,r3,r8 - add r8,r8,r7 + "stvewx v3,r3,r8\n" + "add r8,r8,r7\n" - lvsr v9,r3,r8 //load alignment vector for stores + "lvsr v9,r3,r8\n" //load alignment vector for stores - stvewx v3,r3,r9 - add r9,r9,r7 - vperm v4,v4,v4,v9 + "stvewx v3,r3,r9\n" + "add r9,r9,r7\n" + "vperm v4,v4,v4,v9\n" - stvewx v4,r3,r8 - add r8,r8,r7 + "stvewx v4,r3,r8\n" + "add r8,r8,r7\n" - lvsr v9,r3,r8 //load alignment vector for stores + "lvsr v9,r3,r8\n" //load alignment vector for stores - stvewx v4,r3,r9 - add r9,r9,r7 - vperm v5,v5,v5,v9 + "stvewx v4,r3,r9\n" + "add r9,r9,r7\n" + "vperm v5,v5,v5,v9\n" - stvewx v5,r3,r8 - add r8,r8,r7 + "stvewx v5,r3,r8\n" + "add r8,r8,r7\n" - lvsr v9,r3,r8 //load alignment vector for stores + "lvsr v9,r3,r8\n" //load alignment vector for stores - stvewx v5,r3,r9 - add r9,r9,r7 - vperm v6,v6,v6,v9 + "stvewx v5,r3,r9\n" + "add r9,r9,r7\n" + "vperm v6,v6,v6,v9\n" - stvewx v6,r3,r8 - add r8,r8,r7 + "stvewx v6,r3,r8\n" + "add r8,r8,r7\n" - lvsr v9,r3,r8 //load alignment vector for stores + "lvsr v9,r3,r8\n" //load alignment vector for stores - stvewx v6,r3,r9 - add r9,r9,r7 - vperm v7,v7,v7,v9 + "stvewx v6,r3,r9\n" + "add r9,r9,r7\n" + "vperm v7,v7,v7,v9\n" - stvewx v7,r3,r8 + "stvewx v7,r3,r8\n" - stvewx v7,r3,r9 - } + "stvewx v7,r3,r9\n" + ); } OIL_DEFINE_IMPL_FULL (recon8x8_inter2_altivec, recon8x8_inter2, OIL_IMPL_FLAG_ALTIVEC);