|
|
/* r3, r4, r5 */ | /* r3, r4, r5 */ |
recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds) | recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds) |
{ | { |
asm { |
asm( |
//trying cache hints | //trying cache hints |
lis r6,0x0108 |
"lis r6,0x0108\n" |
or r6,r6,r5 |
"or r6,r6,r5\n" |
dstst r3,r6,0 |
"dstst r3,r6,0\n" |
| |
vspltish v1,7 |
"vspltish v1,7\n" |
| |
vspltish v8,1 |
"vspltish v8,1\n\n" |
xor r6,r6,r6 |
"xor r6,r6,r6\n" |
| |
lvx v0,r4,r6 //get 8 shorts |
"lvx v0,r4,r6\n" //get 8 shorts |
vslh v8,v8,v1 //now have 128 |
"vslh v8,v8,v1\n" //now have 128 |
addi r6,r6,16 |
"addi r6,r6,16\n" |
| |
lvx v1,r4,r6 //get 8 shorts |
"lvx v1,r4,r6\n" //get 8 shorts |
vaddshs v0,v0,v8 //+=128 |
"vaddshs v0,v0,v8\n" //+=128 |
addi r6,r6,16 |
"addi r6,r6,16\n" |
| |
lvx v2,r4,r6 //get 8 shorts |
"lvx v2,r4,r6\n" //get 8 shorts |
vaddshs v1,v1,v8 //+=128 |
"vaddshs v1,v1,v8\n" //+=128 |
addi r6,r6,16 |
"addi r6,r6,16\n" |
vpkshus v0,v0,v0 //convert to bytes |
"vpkshus v0,v0,v0\n" //convert to bytes |
| |
lvx v3,r4,r6 //get 8 shorts |
"lvx v3,r4,r6\n" //get 8 shorts |
vaddshs v2,v2,v8 //+=128 |
"vaddshs v2,v2,v8\n" //+=128 |
addi r6,r6,16 |
"addi r6,r6,16\n" |
vpkshus v1,v1,v1 //convert to bytes |
"vpkshus v1,v1,v1\n" //convert to bytes |
| |
lvx v4,r4,r6 //get 8 shorts |
"lvx v4,r4,r6\n" //get 8 shorts |
vaddshs v3,v3,v8 //+=128 |
"vaddshs v3,v3,v8\n" //+=128 |
addi r6,r6,16 |
"addi r6,r6,16\n" |
vpkshus v2,v2,v2 //convert to bytes |
"vpkshus v2,v2,v2\n" //convert to bytes |
| |
lvx v5,r4,r6 //get 8 shorts |
"lvx v5,r4,r6\n" //get 8 shorts |
vaddshs v4,v4,v8 //+=128 |
"vaddshs v4,v4,v8\n" //+=128 |
addi r6,r6,16 |
"addi r6,r6,16\n" |
vpkshus v3,v3,v3 //convert to bytes |
"vpkshus v3,v3,v3\n" //convert to bytes |
| |
lvx v6,r4,r6 //get 8 shorts |
"lvx v6,r4,r6\n" //get 8 shorts |
vaddshs v5,v5,v8 //+=128 |
"vaddshs v5,v5,v8\n" //+=128 |
addi r6,r6,16 |
"addi r6,r6,16\n" |
vpkshus v4,v4,v4 //convert to bytes |
"vpkshus v4,v4,v4\n" //convert to bytes |
| |
lvx v7,r4,r6 //get 8 shorts |
"lvx v7,r4,r6\n" //get 8 shorts |
xor r6,r6,r6 |
"xor r6,r6,r6\n" |
vaddshs v6,v6,v8 //+=128 |
"vaddshs v6,v6,v8\n" //+=128 |
vpkshus v5,v5,v5 //convert to bytes |
"vpkshus v5,v5,v5\n" //convert to bytes |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
vaddshs v7,v7,v8 //+=128 |
"vaddshs v7,v7,v8\n" //+=128 |
vpkshus v6,v6,v6 //convert to bytes |
"vpkshus v6,v6,v6\n" //convert to bytes |
| |
vpkshus v7,v7,v7 //convert to bytes |
"vpkshus v7,v7,v7\n" //convert to bytes |
| |
li r7,4 |
"li r7,4\n" |
vperm v0,v0,v0,v9 |
"vperm v0,v0,v0,v9\n" |
| |
stvewx v0,r3,r6 |
"stvewx v0,r3,r6\n" |
add r6,r6,r5 |
"add r6,r6,r5\n" |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
| |
stvewx v0,r3,r7 |
"stvewx v0,r3,r7\n" |
add r7,r7,r5 |
"add r7,r7,r5\n" |
vperm v1,v1,v1,v9 |
"vperm v1,v1,v1,v9\n" |
| |
stvewx v1,r3,r6 |
"stvewx v1,r3,r6\n" |
add r6,r6,r5 |
"add r6,r6,r5\n" |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
| |
stvewx v1,r3,r7 |
"stvewx v1,r3,r7\n" |
add r7,r7,r5 |
"add r7,r7,r5\n" |
vperm v2,v2,v2,v9 |
"vperm v2,v2,v2,v9\n" |
| |
stvewx v2,r3,r6 |
"stvewx v2,r3,r6\n" |
add r6,r6,r5 |
"add r6,r6,r5\n" |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
| |
stvewx v2,r3,r7 |
"stvewx v2,r3,r7\n" |
add r7,r7,r5 |
"add r7,r7,r5\n" |
vperm v3,v3,v3,v9 |
"vperm v3,v3,v3,v9\n" |
| |
stvewx v3,r3,r6 |
"stvewx v3,r3,r6\n" |
add r6,r6,r5 |
"add r6,r6,r5\n" |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
| |
stvewx v3,r3,r7 |
"stvewx v3,r3,r7\n" |
add r7,r7,r5 |
"add r7,r7,r5\n" |
vperm v4,v4,v4,v9 |
"vperm v4,v4,v4,v9\n" |
| |
stvewx v4,r3,r6 |
"stvewx v4,r3,r6\n" |
add r6,r6,r5 |
"add r6,r6,r5\n" |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
| |
stvewx v4,r3,r7 |
"stvewx v4,r3,r7\n" |
add r7,r7,r5 |
"add r7,r7,r5\n" |
vperm v5,v5,v5,v9 |
"vperm v5,v5,v5,v9\n" |
| |
stvewx v5,r3,r6 |
"stvewx v5,r3,r6\n" |
add r6,r6,r5 |
"add r6,r6,r5\n" |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
| |
stvewx v5,r3,r7 |
"stvewx v5,r3,r7\n" |
add r7,r7,r5 |
"add r7,r7,r5\n" |
vperm v6,v6,v6,v9 |
"vperm v6,v6,v6,v9\n" |
| |
stvewx v6,r3,r6 |
"stvewx v6,r3,r6\n" |
add r6,r6,r5 |
"add r6,r6,r5\n" |
| |
lvsr v9,r3,r6 //load alignment vector for stores |
"lvsr v9,r3,r6\n" //load alignment vector for stores |
| |
stvewx v6,r3,r7 |
"stvewx v6,r3,r7\n" |
add r7,r7,r5 |
"add r7,r7,r5\n" |
vperm v7,v7,v7,v9 |
"vperm v7,v7,v7,v9\n" |
| |
stvewx v7,r3,r6 |
"stvewx v7,r3,r6\n" |
| |
stvewx v7,r3,r7 |
"stvewx v7,r3,r7\n" |
} |
); |
} | } |
| |
/* Presumably registers the function above as the AltiVec-flagged
 * implementation of recon8x8_intra -- confirm against the macro definition. */
OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC);
|
|
static void /* r3, r4, r5, r6 */ | static void /* r3, r4, r5, r6 */ |
recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss) | recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss) |
{ | { |
asm |
asm( |
{ |
|
//trying cache hints | //trying cache hints |
lis r7,0x0108 |
"lis r7,0x0108\n" |
or r7,r7,r6 |
"or r7,r7,r6\n" |
dstst r3,r7,0 |
"dstst r3,r7,0\n" |
| |
xor r7,r7,r7 |
"xor r7,r7,r7\n" |
li r8,16 |
"li r8,16\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
vxor v9,v9,v9 |
"vxor v9,v9,v9\n" |
| |
lvx v10,r4,r7 //get 8 refs |
"lvx v10,r4,r7\n" //get 8 refs |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvx v0,r4,r8 //need another 16 bytes for misaligned data -- 0 |
"lvx v0,r4,r8\n" //need another 16 bytes for misaligned data -- 0 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v11,r4,r7 //get 8 refs |
"lvx v11,r4,r7\n" //get 8 refs |
vperm v10,v10,v0,v8 |
"vperm v10,v10,v0,v8\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvx v1,r4,r8 //need another 16 bytes for misaligned data -- 1 |
"lvx v1,r4,r8\n" //need another 16 bytes for misaligned data -- 1 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v12,r4,r7 //get 8 refs |
"lvx v12,r4,r7\n" //get 8 refs |
vperm v11,v11,v1,v8 |
"vperm v11,v11,v1,v8\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvx v2,r4,r8 //need another 16 bytes for misaligned data -- 2 |
"lvx v2,r4,r8\n" //need another 16 bytes for misaligned data -- 2 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v13,r4,r7 //get 8 refs |
"lvx v13,r4,r7\n" //get 8 refs |
vperm v12,v12,v2,v8 |
"vperm v12,v12,v2,v8\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvx v3,r4,r8 //need another 16 bytes for misaligned data -- 3 |
"lvx v3,r4,r8\n" //need another 16 bytes for misaligned data -- 3 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v14,r4,r7 //get 8 refs |
"lvx v14,r4,r7\n" //get 8 refs |
vperm v13,v13,v3,v8 |
"vperm v13,v13,v3,v8\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvx v4,r4,r8 //need another 16 bytes for misaligned data -- 4 |
"lvx v4,r4,r8\n" //need another 16 bytes for misaligned data -- 4 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v15,r4,r7 //get 8 refs |
"lvx v15,r4,r7\n" //get 8 refs |
vperm v14,v14,v4,v8 |
"vperm v14,v14,v4,v8\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvx v5,r4,r8 //need another 16 bytes for misaligned data -- 5 |
"lvx v5,r4,r8\n" //need another 16 bytes for misaligned data -- 5 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v16,r4,r7 //get 8 refs |
"lvx v16,r4,r7\n" //get 8 refs |
vperm v15,v15,v5,v8 |
"vperm v15,v15,v5,v8\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvx v6,r4,r8 //need another 16 bytes for misaligned data -- 6 |
"lvx v6,r4,r8\n" //need another 16 bytes for misaligned data -- 6 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v17,r4,r7 //get 8 refs |
"lvx v17,r4,r7\n" //get 8 refs |
vperm v16,v16,v6,v8 |
"vperm v16,v16,v6,v8\n" |
| |
lvsl v8,r4,r7 //load alignment vector for refs |
"lvsl v8,r4,r7\n" //load alignment vector for refs |
xor r7,r7,r7 |
"xor r7,r7,r7\n" |
| |
lvx v7,r4,r8 //need another 16 bytes for misaligned data -- 7 |
"lvx v7,r4,r8\n" //need another 16 bytes for misaligned data -- 7 |
add r8,r8,r6 |
"add r8,r8,r6\n" |
| |
lvx v0,r5,r7 //get 8 shorts |
"lvx v0,r5,r7\n" //get 8 shorts |
vperm v17,v17,v7,v8 |
"vperm v17,v17,v7,v8\n" |
addi r7,r7,16 |
"addi r7,r7,16\n" |
| |
lvx v1,r5,r7 //get 8 shorts |
"lvx v1,r5,r7\n" //get 8 shorts |
vmrghb v10,v9,v10 //unsigned byte -> unsigned half |
"vmrghb v10,v9,v10\n" //unsigned byte -> unsigned half |
addi r7,r7,16 |
"addi r7,r7,16\n" |
| |
lvx v2,r5,r7 //get 8 shorts |
"lvx v2,r5,r7\n" //get 8 shorts |
vmrghb v11,v9,v11 //unsigned byte -> unsigned half |
"vmrghb v11,v9,v11\n" //unsigned byte -> unsigned half |
vaddshs v0,v0,v10 |
"vaddshs v0,v0,v10\n" |
addi r7,r7,16 |
"addi r7,r7,16\n" |
| |
lvx v3,r5,r7 //get 8 shorts |
"lvx v3,r5,r7\n" //get 8 shorts |
vmrghb v12,v9,v12 //unsigned byte -> unsigned half |
"vmrghb v12,v9,v12\n" //unsigned byte -> unsigned half |
vaddshs v1,v1,v11 |
"vaddshs v1,v1,v11\n" |
addi r7,r7,16 |
"addi r7,r7,16\n" |
| |
lvx v4,r5,r7 //get 8 shorts |
"lvx v4,r5,r7\n" //get 8 shorts |
vmrghb v13,v9,v13 //unsigned byte -> unsigned half |
"vmrghb v13,v9,v13\n" //unsigned byte -> unsigned half |
vaddshs v2,v2,v12 |
"vaddshs v2,v2,v12\n" |
addi r7,r7,16 |
"addi r7,r7,16\n" |
| |
lvx v5,r5,r7 //get 8 shorts |
"lvx v5,r5,r7\n" //get 8 shorts |
vmrghb v14,v9,v14 //unsigned byte -> unsigned half |
"vmrghb v14,v9,v14\n" //unsigned byte -> unsigned half |
vaddshs v3,v3,v13 |
"vaddshs v3,v3,v13\n" |
addi r7,r7,16 |
"addi r7,r7,16\n" |
| |
lvx v6,r5,r7 //get 8 shorts |
"lvx v6,r5,r7\n" //get 8 shorts |
vmrghb v15,v9,v15 //unsigned byte -> unsigned half |
"vmrghb v15,v9,v15\n" //unsigned byte -> unsigned half |
vaddshs v4,v4,v14 |
"vaddshs v4,v4,v14\n" |
addi r7,r7,16 |
"addi r7,r7,16\n" |
| |
lvx v7,r5,r7 //get 8 shorts |
"lvx v7,r5,r7\n" //get 8 shorts |
vmrghb v16,v9,v16 //unsigned byte -> unsigned half |
"vmrghb v16,v9,v16\n" //unsigned byte -> unsigned half |
vaddshs v5,v5,v15 |
"vaddshs v5,v5,v15\n" |
| |
vmrghb v17,v9,v17 //unsigned byte -> unsigned half |
"vmrghb v17,v9,v17\n" //unsigned byte -> unsigned half |
vaddshs v6,v6,v16 |
"vaddshs v6,v6,v16\n" |
| |
vpkshus v0,v0,v0 |
"vpkshus v0,v0,v0\n" |
vaddshs v7,v7,v17 |
"vaddshs v7,v7,v17\n" |
| |
vpkshus v1,v1,v1 |
"vpkshus v1,v1,v1\n" |
xor r7,r7,r7 |
"xor r7,r7,r7\n" |
| |
vpkshus v2,v2,v2 |
"vpkshus v2,v2,v2\n" |
| |
vpkshus v3,v3,v3 |
"vpkshus v3,v3,v3\n" |
| |
vpkshus v4,v4,v4 |
"vpkshus v4,v4,v4\n" |
| |
vpkshus v5,v5,v5 |
"vpkshus v5,v5,v5\n" |
| |
vpkshus v6,v6,v6 |
"vpkshus v6,v6,v6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
vpkshus v7,v7,v7 |
"vpkshus v7,v7,v7\n" |
| |
li r8,4 |
"li r8,4\n" |
vperm v0,v0,v0,v9 //adjust for writes |
"vperm v0,v0,v0,v9\n" //adjust for writes |
| |
stvewx v0,r3,r7 |
"stvewx v0,r3,r7\n" |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
| |
stvewx v0,r3,r8 |
"stvewx v0,r3,r8\n" |
add r8,r8,r6 |
"add r8,r8,r6\n" |
vperm v1,v1,v1,v9 |
"vperm v1,v1,v1,v9\n" |
| |
stvewx v1,r3,r7 |
"stvewx v1,r3,r7\n" |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
| |
stvewx v1,r3,r8 |
"stvewx v1,r3,r8\n" |
add r8,r8,r6 |
"add r8,r8,r6\n" |
vperm v2,v2,v2,v9 |
"vperm v2,v2,v2,v9\n" |
| |
stvewx v2,r3,r7 |
"stvewx v2,r3,r7\n" |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
| |
stvewx v2,r3,r8 |
"stvewx v2,r3,r8\n" |
add r8,r8,r6 |
"add r8,r8,r6\n" |
vperm v3,v3,v3,v9 |
"vperm v3,v3,v3,v9\n" |
| |
stvewx v3,r3,r7 |
"stvewx v3,r3,r7\n" |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
| |
stvewx v3,r3,r8 |
"stvewx v3,r3,r8\n" |
add r8,r8,r6 |
"add r8,r8,r6\n" |
vperm v4,v4,v4,v9 |
"vperm v4,v4,v4,v9\n" |
| |
stvewx v4,r3,r7 |
"stvewx v4,r3,r7\n" |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
| |
stvewx v4,r3,r8 |
"stvewx v4,r3,r8\n" |
add r8,r8,r6 |
"add r8,r8,r6\n" |
vperm v5,v5,v5,v9 |
"vperm v5,v5,v5,v9\n" |
| |
stvewx v5,r3,r7 |
"stvewx v5,r3,r7\n" |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
| |
stvewx v5,r3,r8 |
"stvewx v5,r3,r8\n" |
add r8,r8,r6 |
"add r8,r8,r6\n" |
vperm v6,v6,v6,v9 |
"vperm v6,v6,v6,v9\n" |
| |
stvewx v6,r3,r7 |
"stvewx v6,r3,r7\n" |
add r7,r7,r6 |
"add r7,r7,r6\n" |
| |
lvsr v9,r3,r7 //load alignment vector for stores |
"lvsr v9,r3,r7\n" //load alignment vector for stores |
| |
stvewx v6,r3,r8 |
"stvewx v6,r3,r8\n" |
add r8,r8,r6 |
"add r8,r8,r6\n" |
vperm v7,v7,v7,v9 |
"vperm v7,v7,v7,v9\n" |
| |
stvewx v7,r3,r7 |
"stvewx v7,r3,r7\n" |
| |
stvewx v7,r3,r8 |
"stvewx v7,r3,r8\n" |
} |
); |
} | } |
| |
/* Presumably registers the function above as the AltiVec-flagged
 * implementation of recon8x8_inter -- confirm against the macro definition. */
OIL_DEFINE_IMPL_FULL (recon8x8_inter_altivec, recon8x8_inter, OIL_IMPL_FLAG_ALTIVEC);
|
|
static void /* r3, r4, r5, r6, r7 */ | static void /* r3, r4, r5, r6, r7 */ |
recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss) | recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss) |
{ | { |
asm |
asm( |
{ |
|
//trying cache hints | //trying cache hints |
lis r8,0x0108 |
"lis r8,0x0108\n" |
or r8,r8,r7 |
"or r8,r8,r7\n" |
dstst r3,r8,0 |
"dstst r3,r8,0\n" |
| |
xor r8,r8,r8 |
"xor r8,r8,r8\n" |
li r9,16 |
"li r9,16\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
vxor v9,v9,v9 |
"vxor v9,v9,v9\n" |
| |
lvx v10,r4,r8 //get 8 RefPtr1 -- 0 |
"lvx v10,r4,r8\n" //get 8 RefPtr1 -- 0 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v0,r4,r9 //need another 16 bytes for misaligned data -- 0 |
"lvx v0,r4,r9\n" //need another 16 bytes for misaligned data -- 0 |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v11,r4,r8 //get 8 RefPtr1 -- 1 |
"lvx v11,r4,r8\n" //get 8 RefPtr1 -- 1 |
vperm v10,v10,v0,v8 |
"vperm v10,v10,v0,v8\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v1,r4,r9 //need another 16 bytes for misaligned data -- 1 |
"lvx v1,r4,r9\n" //need another 16 bytes for misaligned data -- 1 |
vmrghb v10,v9,v10 //unsigned byte -> unsigned half |
"vmrghb v10,v9,v10\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v12,r4,r8 //get 8 RefPtr1 -- 2 |
"lvx v12,r4,r8\n" //get 8 RefPtr1 -- 2 |
vperm v11,v11,v1,v8 |
"vperm v11,v11,v1,v8\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v2,r4,r9 //need another 16 bytes for misaligned data -- 2 |
"lvx v2,r4,r9\n" //need another 16 bytes for misaligned data -- 2 |
vmrghb v11,v9,v11 //unsigned byte -> unsigned half |
"vmrghb v11,v9,v11\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v13,r4,r8 //get 8 RefPtr1 -- 3 |
"lvx v13,r4,r8\n" //get 8 RefPtr1 -- 3 |
vperm v12,v12,v2,v8 |
"vperm v12,v12,v2,v8\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v3,r4,r9 //need another 16 bytes for misaligned data -- 3 |
"lvx v3,r4,r9\n" //need another 16 bytes for misaligned data -- 3 |
vmrghb v12,v9,v12 //unsigned byte -> unsigned half |
"vmrghb v12,v9,v12\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v14,r4,r8 //get 8 RefPtr1 -- 4 |
"lvx v14,r4,r8\n" //get 8 RefPtr1 -- 4 |
vperm v13,v13,v3,v8 |
"vperm v13,v13,v3,v8\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v4,r4,r9 //need another 16 bytes for misaligned data -- 4 |
"lvx v4,r4,r9\n" //need another 16 bytes for misaligned data -- 4 |
vmrghb v13,v9,v13 //unsigned byte -> unsigned half |
"vmrghb v13,v9,v13\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v15,r4,r8 //get 8 RefPtr1 -- 5 |
"lvx v15,r4,r8\n" //get 8 RefPtr1 -- 5 |
vperm v14,v14,v4,v8 |
"vperm v14,v14,v4,v8\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v5,r4,r9 //need another 16 bytes for misaligned data -- 5 |
"lvx v5,r4,r9\n" //need another 16 bytes for misaligned data -- 5 |
vmrghb v14,v9,v14 //unsigned byte -> unsigned half |
"vmrghb v14,v9,v14\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v16,r4,r8 //get 8 RefPtr1 -- 6 |
"lvx v16,r4,r8\n" //get 8 RefPtr1 -- 6 |
vperm v15,v15,v5,v8 |
"vperm v15,v15,v5,v8\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v6,r4,r9 //need another 16 bytes for misaligned data -- 6 |
"lvx v6,r4,r9\n" //need another 16 bytes for misaligned data -- 6 |
vmrghb v15,v9,v15 //unsigned byte -> unsigned half |
"vmrghb v15,v9,v15\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v17,r4,r8 //get 8 RefPtr1 -- 7 |
"lvx v17,r4,r8\n" //get 8 RefPtr1 -- 7 |
vperm v16,v16,v6,v8 |
"vperm v16,v16,v6,v8\n" |
| |
lvsl v8,r4,r8 //load alignment vector for RefPtr1 |
"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v7,r4,r9 //need another 16 bytes for misaligned data -- 7 |
"lvx v7,r4,r9\n" //need another 16 bytes for misaligned data -- 7 |
vmrghb v16,v9,v16 //unsigned byte -> unsigned half |
"vmrghb v16,v9,v16\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
//-------- | //-------- |
vperm v17,v17,v7,v8 |
"vperm v17,v17,v7,v8\n" |
xor r8,r8,r8 |
"xor r8,r8,r8\n" |
li r9,16 |
"li r9,16\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
vmrghb v17,v9,v17 //unsigned byte -> unsigned half |
"vmrghb v17,v9,v17\n" //unsigned byte -> unsigned half |
| |
lvx v20,r5,r8 //get 8 RefPtr2 -- 0 |
"lvx v20,r5,r8\n" //get 8 RefPtr2 -- 0 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v0,r5,r9 //need another 16 bytes for misaligned data -- 0 |
"lvx v0,r5,r9\n" //need another 16 bytes for misaligned data -- 0 |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v21,r5,r8 //get 8 RefPtr2 -- 1 |
"lvx v21,r5,r8\n" //get 8 RefPtr2 -- 1 |
vperm v20,v20,v0,v18 |
"vperm v20,v20,v0,v18\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v1,r5,r9 //need another 16 bytes for misaligned data -- 1 |
"lvx v1,r5,r9\n" //need another 16 bytes for misaligned data -- 1 |
vmrghb v20,v9,v20 //unsigned byte -> unsigned half |
"vmrghb v20,v9,v20\n" //unsigned byte -> unsigned half |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v22,r5,r8 //get 8 RefPtr2 -- 2 |
"lvx v22,r5,r8\n" //get 8 RefPtr2 -- 2 |
vperm v21,v21,v1,v18 |
"vperm v21,v21,v1,v18\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v2,r5,r9 //need another 16 bytes for misaligned data -- 2 |
"lvx v2,r5,r9\n" //need another 16 bytes for misaligned data -- 2 |
vmrghb v21,v9,v21 //unsigned byte -> unsigned half |
"vmrghb v21,v9,v21\n" //unsigned byte -> unsigned half |
vadduhm v10,v10,v20 |
"vadduhm v10,v10,v20\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v23,r5,r8 //get 8 RefPtr2 -- 3 |
"lvx v23,r5,r8\n" //get 8 RefPtr2 -- 3 |
vperm v22,v22,v2,v18 |
"vperm v22,v22,v2,v18\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v3,r5,r9 //need another 16 bytes for misaligned data -- 3 |
"lvx v3,r5,r9\n" //need another 16 bytes for misaligned data -- 3 |
vmrghb v22,v9,v22 //unsigned byte -> unsigned half |
"vmrghb v22,v9,v22\n" //unsigned byte -> unsigned half |
vadduhm v11,v11,v21 |
"vadduhm v11,v11,v21\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v24,r5,r8 //get 8 RefPtr2 -- 4 |
"lvx v24,r5,r8\n" //get 8 RefPtr2 -- 4 |
vperm v23,v23,v3,v18 |
"vperm v23,v23,v3,v18\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v4,r5,r9 //need another 16 bytes for misaligned data -- 4 |
"lvx v4,r5,r9\n" //need another 16 bytes for misaligned data -- 4 |
vmrghb v23,v9,v23 //unsigned byte -> unsigned half |
"vmrghb v23,v9,v23\n" //unsigned byte -> unsigned half |
vadduhm v12,v12,v22 |
"vadduhm v12,v12,v22\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v25,r5,r8 //get 8 RefPtr2 -- 5 |
"lvx v25,r5,r8\n" //get 8 RefPtr2 -- 5 |
vperm v24,v24,v4,v18 |
"vperm v24,v24,v4,v18\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v5,r5,r9 //need another 16 bytes for misaligned data -- 5 |
"lvx v5,r5,r9\n" //need another 16 bytes for misaligned data -- 5 |
vmrghb v24,v9,v24 //unsigned byte -> unsigned half |
"vmrghb v24,v9,v24\n" //unsigned byte -> unsigned half |
vadduhm v13,v13,v23 |
"vadduhm v13,v13,v23\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v26,r5,r8 //get 8 RefPtr2 -- 6 |
"lvx v26,r5,r8\n" //get 8 RefPtr2 -- 6 |
vperm v25,v25,v5,v18 |
"vperm v25,v25,v5,v18\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v6,r5,r9 //need another 16 bytes for misaligned data -- 6 |
"lvx v6,r5,r9\n" //need another 16 bytes for misaligned data -- 6 |
vmrghb v25,v9,v25 //unsigned byte -> unsigned half |
"vmrghb v25,v9,v25\n" //unsigned byte -> unsigned half |
vadduhm v14,v14,v24 |
"vadduhm v14,v14,v24\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
lvx v27,r5,r8 //get 8 RefPtr2 -- 7 |
"lvx v27,r5,r8\n" //get 8 RefPtr2 -- 7 |
vperm v26,v26,v6,v18 |
"vperm v26,v26,v6,v18\n" |
| |
lvsl v18,r5,r8 //load alignment vector for RefPtr2 |
"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2 |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvx v7,r5,r9 //need another 16 bytes for misaligned data -- 7 |
"lvx v7,r5,r9\n" //need another 16 bytes for misaligned data -- 7 |
vmrghb v26,v9,v26 //unsigned byte -> unsigned half |
"vmrghb v26,v9,v26\n" //unsigned byte -> unsigned half |
vadduhm v15,v15,v25 |
"vadduhm v15,v15,v25\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
| |
vperm v27,v27,v7,v18 |
"vperm v27,v27,v7,v18\n" |
xor r8,r8,r8 |
"xor r8,r8,r8\n" |
| |
vmrghb v27,v9,v27 //unsigned byte -> unsigned half |
"vmrghb v27,v9,v27\n" //unsigned byte -> unsigned half |
vadduhm v16,v16,v26 |
"vadduhm v16,v16,v26\n" |
| |
vadduhm v17,v17,v27 |
"vadduhm v17,v17,v27\n" |
vspltish v8,1 |
"vspltish v8,1\n" |
//-------- | //-------- |
lvx v0,r6,r8 //get 8 shorts |
"lvx v0,r6,r8\n" //get 8 shorts |
vsrh v10,v10,v8 |
"vsrh v10,v10,v8\n" |
addi r8,r8,16 |
"addi r8,r8,16\n" |
|
|
lvx v1,r6,r8 //get 8 shorts |
"lvx v1,r6,r8\n" //get 8 shorts |
vsrh v11,v11,v8 |
"vsrh v11,v11,v8\n" |
addi r8,r8,16 |
"addi r8,r8,16\n" |
|
|
lvx v2,r6,r8 //get 8 shorts |
"lvx v2,r6,r8\n" //get 8 shorts |
vsrh v12,v12,v8 |
"vsrh v12,v12,v8\n" |
addi r8,r8,16 |
"addi r8,r8,16\n" |
|
|
lvx v3,r6,r8 //get 8 shorts |
"lvx v3,r6,r8\n" //get 8 shorts |
vsrh v13,v13,v8 |
"vsrh v13,v13,v8\n" |
addi r8,r8,16 |
"addi r8,r8,16\n" |
|
|
lvx v4,r6,r8 //get 8 shorts |
"lvx v4,r6,r8\n" //get 8 shorts |
vsrh v14,v14,v8 |
"vsrh v14,v14,v8\n" |
addi r8,r8,16 |
"addi r8,r8,16\n" |
|
|
lvx v5,r6,r8 //get 8 shorts |
"lvx v5,r6,r8\n" //get 8 shorts |
vsrh v15,v15,v8 |
"vsrh v15,v15,v8\n" |
addi r8,r8,16 |
"addi r8,r8,16\n" |
|
|
lvx v6,r6,r8 //get 8 shorts |
"lvx v6,r6,r8\n" //get 8 shorts |
vsrh v16,v16,v8 |
"vsrh v16,v16,v8\n" |
addi r8,r8,16 |
"addi r8,r8,16\n" |
|
|
lvx v7,r6,r8 //get 8 shorts |
"lvx v7,r6,r8\n" //get 8 shorts |
vsrh v17,v17,v8 |
"vsrh v17,v17,v8\n" |
xor r8,r8,r8 |
"xor r8,r8,r8\n" |
//-------- | //-------- |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
vaddshs v0,v0,v10 |
"vaddshs v0,v0,v10\n" |
| |
vaddshs v1,v1,v11 |
"vaddshs v1,v1,v11\n" |
vpkshus v0,v0,v0 |
"vpkshus v0,v0,v0\n" |
| |
vaddshs v2,v2,v12 |
"vaddshs v2,v2,v12\n" |
vpkshus v1,v1,v1 |
"vpkshus v1,v1,v1\n" |
| |
vaddshs v3,v3,v13 |
"vaddshs v3,v3,v13\n" |
vpkshus v2,v2,v2 |
"vpkshus v2,v2,v2\n" |
| |
vaddshs v4,v4,v14 |
"vaddshs v4,v4,v14\n" |
vpkshus v3,v3,v3 |
"vpkshus v3,v3,v3\n" |
| |
vaddshs v5,v5,v15 |
"vaddshs v5,v5,v15\n" |
vpkshus v4,v4,v4 |
"vpkshus v4,v4,v4\n" |
| |
vaddshs v6,v6,v16 |
"vaddshs v6,v6,v16\n" |
vpkshus v5,v5,v5 |
"vpkshus v5,v5,v5\n" |
| |
vaddshs v7,v7,v17 |
"vaddshs v7,v7,v17\n" |
vpkshus v6,v6,v6 |
"vpkshus v6,v6,v6\n" |
| |
vpkshus v7,v7,v7 |
"vpkshus v7,v7,v7\n" |
| |
li r9,4 |
"li r9,4\n" |
vperm v0,v0,v0,v9 //adjust for writes |
"vperm v0,v0,v0,v9\n" //adjust for writes |
| |
stvewx v0,r3,r8 |
"stvewx v0,r3,r8\n" |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
| |
stvewx v0,r3,r9 |
"stvewx v0,r3,r9\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
vperm v1,v1,v1,v9 |
"vperm v1,v1,v1,v9\n" |
| |
stvewx v1,r3,r8 |
"stvewx v1,r3,r8\n" |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
| |
stvewx v1,r3,r9 |
"stvewx v1,r3,r9\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
vperm v2,v2,v2,v9 |
"vperm v2,v2,v2,v9\n" |
| |
stvewx v2,r3,r8 |
"stvewx v2,r3,r8\n" |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
| |
stvewx v2,r3,r9 |
"stvewx v2,r3,r9\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
vperm v3,v3,v3,v9 |
"vperm v3,v3,v3,v9\n" |
| |
stvewx v3,r3,r8 |
"stvewx v3,r3,r8\n" |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
| |
stvewx v3,r3,r9 |
"stvewx v3,r3,r9\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
vperm v4,v4,v4,v9 |
"vperm v4,v4,v4,v9\n" |
| |
stvewx v4,r3,r8 |
"stvewx v4,r3,r8\n" |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
| |
stvewx v4,r3,r9 |
"stvewx v4,r3,r9\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
vperm v5,v5,v5,v9 |
"vperm v5,v5,v5,v9\n" |
| |
stvewx v5,r3,r8 |
"stvewx v5,r3,r8\n" |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
| |
stvewx v5,r3,r9 |
"stvewx v5,r3,r9\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
vperm v6,v6,v6,v9 |
"vperm v6,v6,v6,v9\n" |
| |
stvewx v6,r3,r8 |
"stvewx v6,r3,r8\n" |
add r8,r8,r7 |
"add r8,r8,r7\n" |
| |
lvsr v9,r3,r8 //load alignment vector for stores |
"lvsr v9,r3,r8\n" //load alignment vector for stores |
| |
stvewx v6,r3,r9 |
"stvewx v6,r3,r9\n" |
add r9,r9,r7 |
"add r9,r9,r7\n" |
vperm v7,v7,v7,v9 |
"vperm v7,v7,v7,v9\n" |
| |
stvewx v7,r3,r8 |
"stvewx v7,r3,r8\n" |
| |
stvewx v7,r3,r9 |
"stvewx v7,r3,r9\n" |
} |
); |
} | } |
| |
/* Presumably registers the function above as the AltiVec-flagged
 * implementation of recon8x8_inter2 -- confirm against the macro definition. */
OIL_DEFINE_IMPL_FULL (recon8x8_inter2_altivec, recon8x8_inter2, OIL_IMPL_FLAG_ALTIVEC);