View | Details | Raw Unified
Collapse All | Expand All

(-) liboil-0.3.3/liboil/powerpc/recon8x8_altivec.c (-459 / +457 lines)
 Lines 46-176    Link Here 
                    /*       r3,            r4,         r5 */
                    /*       r3,            r4,         r5 */
recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds)
recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds)
{
{
    asm {
    asm(
		//trying cache hints
		//trying cache hints
		lis			r6,0x0108
		"lis			r6,0x0108\n"
		or			r6,r6,r5
		"or			r6,r6,r5\n"
		dstst		r3,r6,0
		"dstst		r3,r6,0\n"
		vspltish	v1,7
		"vspltish	v1,7\n"
		vspltish	v8,1
		"vspltish	v8,1\n\n"
		xor			r6,r6,r6
		"xor			r6,r6,r6\n"
		
		
		lvx			v0,r4,r6				//get 8 shorts					
		"lvx			v0,r4,r6\n"				//get 8 shorts					
		vslh		v8,v8,v1				//now have 128
		"vslh		v8,v8,v1\n"				//now have 128
		addi		r6,r6,16
		"addi		r6,r6,16\n"
		
		
		lvx			v1,r4,r6				//get 8 shorts					
		"lvx			v1,r4,r6\n"				//get 8 shorts					
		vaddshs		v0,v0,v8				//+=128
		"vaddshs		v0,v0,v8\n"				//+=128
		addi		r6,r6,16
		"addi		r6,r6,16\n"
		lvx			v2,r4,r6				//get 8 shorts					
		"lvx			v2,r4,r6\n"				//get 8 shorts					
		vaddshs		v1,v1,v8				//+=128
		"vaddshs		v1,v1,v8\n"				//+=128
		addi		r6,r6,16
		"addi		r6,r6,16\n"
		vpkshus		v0,v0,v0				//convert to bytes
		"vpkshus		v0,v0,v0\n"				//convert to bytes
		lvx			v3,r4,r6				//get 8 shorts					
		"lvx			v3,r4,r6\n"				//get 8 shorts					
		vaddshs		v2,v2,v8				//+=128
		"vaddshs		v2,v2,v8\n"				//+=128
		addi		r6,r6,16
		"addi		r6,r6,16\n"
		vpkshus		v1,v1,v1				//convert to bytes
		"vpkshus		v1,v1,v1\n"				//convert to bytes
		lvx			v4,r4,r6				//get 8 shorts					
		"lvx			v4,r4,r6\n"				//get 8 shorts					
		vaddshs		v3,v3,v8				//+=128
		"vaddshs		v3,v3,v8\n"				//+=128
		addi		r6,r6,16
		"addi		r6,r6,16\n"
		vpkshus		v2,v2,v2				//convert to bytes
		"vpkshus		v2,v2,v2\n"				//convert to bytes
		lvx			v5,r4,r6				//get 8 shorts					
		"lvx			v5,r4,r6\n"				//get 8 shorts					
		vaddshs		v4,v4,v8				//+=128
		"vaddshs		v4,v4,v8\n"				//+=128
		addi		r6,r6,16
		"addi		r6,r6,16\n"
		vpkshus		v3,v3,v3				//convert to bytes
		"vpkshus		v3,v3,v3\n"				//convert to bytes
		lvx			v6,r4,r6				//get 8 shorts					
		"lvx			v6,r4,r6\n"				//get 8 shorts					
		vaddshs		v5,v5,v8				//+=128
		"vaddshs		v5,v5,v8\n"				//+=128
		addi		r6,r6,16
		"addi		r6,r6,16\n"
		vpkshus		v4,v4,v4				//convert to bytes
		"vpkshus		v4,v4,v4\n"				//convert to bytes
		lvx			v7,r4,r6				//get 8 shorts	
		"lvx			v7,r4,r6\n"				//get 8 shorts	
		xor			r6,r6,r6				
		"xor			r6,r6,r6\n"				
		vaddshs		v6,v6,v8				//+=128
		"vaddshs		v6,v6,v8\n"				//+=128
		vpkshus		v5,v5,v5				//convert to bytes
		"vpkshus		v5,v5,v5\n"				//convert to bytes
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		vaddshs		v7,v7,v8				//+=128
		"vaddshs		v7,v7,v8\n"				//+=128
		vpkshus		v6,v6,v6				//convert to bytes
		"vpkshus		v6,v6,v6\n"				//convert to bytes
		vpkshus		v7,v7,v7				//convert to bytes
		"vpkshus		v7,v7,v7\n"				//convert to bytes
		li			r7,4
		"li			r7,4\n"
		vperm		v0,v0,v0,v9
		"vperm		v0,v0,v0,v9\n"
		stvewx		v0,r3,r6
		"stvewx		v0,r3,r6\n"
		add			r6,r6,r5
		"add			r6,r6,r5\n"
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		stvewx		v0,r3,r7
		"stvewx		v0,r3,r7\n"
		add			r7,r7,r5
		"add			r7,r7,r5\n"
		vperm		v1,v1,v1,v9
		"vperm		v1,v1,v1,v9\n"
		stvewx		v1,r3,r6
		"stvewx		v1,r3,r6\n"
		add			r6,r6,r5
		"add			r6,r6,r5\n"
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		stvewx		v1,r3,r7
		"stvewx		v1,r3,r7\n"
		add			r7,r7,r5
		"add			r7,r7,r5\n"
		vperm		v2,v2,v2,v9
		"vperm		v2,v2,v2,v9\n"
		stvewx		v2,r3,r6
		"stvewx		v2,r3,r6\n"
		add			r6,r6,r5
		"add			r6,r6,r5\n"
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		stvewx		v2,r3,r7
		"stvewx		v2,r3,r7\n"
		add			r7,r7,r5
		"add			r7,r7,r5\n"
		vperm		v3,v3,v3,v9
		"vperm		v3,v3,v3,v9\n"
		stvewx		v3,r3,r6
		"stvewx		v3,r3,r6\n"
		add			r6,r6,r5
		"add			r6,r6,r5\n"
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		stvewx		v3,r3,r7
		"stvewx		v3,r3,r7\n"
		add			r7,r7,r5
		"add			r7,r7,r5\n"
		vperm		v4,v4,v4,v9
		"vperm		v4,v4,v4,v9\n"
		stvewx		v4,r3,r6
		"stvewx		v4,r3,r6\n"
		add			r6,r6,r5
		"add			r6,r6,r5\n"
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		stvewx		v4,r3,r7
		"stvewx		v4,r3,r7\n"
		add			r7,r7,r5
		"add			r7,r7,r5\n"
		vperm		v5,v5,v5,v9
		"vperm		v5,v5,v5,v9\n"
		stvewx		v5,r3,r6
		"stvewx		v5,r3,r6\n"
		add			r6,r6,r5
		"add			r6,r6,r5\n"
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		stvewx		v5,r3,r7
		"stvewx		v5,r3,r7\n"
		add			r7,r7,r5
		"add			r7,r7,r5\n"
		vperm		v6,v6,v6,v9
		"vperm		v6,v6,v6,v9\n"
		stvewx		v6,r3,r6
		"stvewx		v6,r3,r6\n"
		add			r6,r6,r5
		"add			r6,r6,r5\n"
		lvsr		v9,r3,r6				//load alignment vector for stores
		"lvsr		v9,r3,r6\n"				//load alignment vector for stores
		stvewx		v6,r3,r7
		"stvewx		v6,r3,r7\n"
		add			r7,r7,r5
		"add			r7,r7,r5\n"
		vperm		v7,v7,v7,v9
		"vperm		v7,v7,v7,v9\n"
		stvewx		v7,r3,r6
		"stvewx		v7,r3,r6\n"
		stvewx		v7,r3,r7
		"stvewx		v7,r3,r7\n"
    }
    );
}
}
OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC);
OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC);
 Lines 178-394    Link Here 
/*
 * recon8x8_inter_altivec:
 * @dest:   destination block, 8 rows of 8 bytes, rows @dss bytes apart
 * @src:    reference block, 8 rows of 8 bytes, rows @dss bytes apart
 * @change: 64 signed 16-bit residuals, read as eight consecutive 16-byte
 *          vectors with lvx
 * @dss:    row stride in bytes, shared by @dest and @src
 *
 * For each of the 8 rows: loads 8 reference bytes (an lvsl/vperm pair
 * handles a misaligned @src), zero-extends them to halfwords, adds the
 * matching residuals with signed saturation (vaddshs), packs to bytes with
 * unsigned saturation (vpkshus) and writes the row to @dest as two 4-byte
 * stvewx stores.
 *
 * NOTE(review): lvx ignores the low four address bits and stvewx the low
 * two, so this presumably expects @change 16-byte aligned and every @dest
 * row at least 4-byte aligned -- confirm against the callers.
 *
 * The instruction stream is hand-scheduled and refers to r3-r6 by name, so
 * the arguments are pinned to those registers with local register variables
 * and handed to the asm as inputs.  Scratch registers and the memory side
 * effect are declared as clobbers so the compiler does not keep live values
 * in them across the statement.
 */
static void          /*      r3,            r4,           r5,         r6 */
recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss)
{
	register uint8_t *dest_r3   __asm__ ("r3") = dest;
	register uint8_t *src_r4    __asm__ ("r4") = src;
	register int16_t *change_r5 __asm__ ("r5") = change;
	register int      dss_r6    __asm__ ("r6") = dss;

	__asm__ __volatile__ (
		/* cache hint: data-stream touch-for-store on dest; control word
		 * is 0x0108 in the upper half OR'ed with the stride */
		"lis      r7,0x0108\n"
		"or       r7,r7,r6\n"
		"dstst    r3,r7,0\n"

		"xor      r7,r7,r7\n"          /* r7 = row offset            */
		"li       r8,16\n"             /* r8 = row offset + 16       */

		"lvsl     v8,r4,r7\n"          /* alignment vector for refs  */
		"vxor     v9,v9,v9\n"          /* v9 = 0, for zero-extension */

		/* ---- load the 8 reference rows into v10..v17 ---- */
		"lvx      v10,r4,r7\n"         /* get 8 refs -- row 0 */
		"add      r7,r7,r6\n"
		"lvx      v0,r4,r8\n"          /* next 16 bytes for misaligned data -- 0 */
		"add      r8,r8,r6\n"
		"lvx      v11,r4,r7\n"         /* get 8 refs -- row 1 */
		"vperm    v10,v10,v0,v8\n"
		"lvsl     v8,r4,r7\n"
		"add      r7,r7,r6\n"
		"lvx      v1,r4,r8\n"          /* misaligned data -- 1 */
		"add      r8,r8,r6\n"
		"lvx      v12,r4,r7\n"         /* get 8 refs -- row 2 */
		"vperm    v11,v11,v1,v8\n"
		"lvsl     v8,r4,r7\n"
		"add      r7,r7,r6\n"
		"lvx      v2,r4,r8\n"          /* misaligned data -- 2 */
		"add      r8,r8,r6\n"
		"lvx      v13,r4,r7\n"         /* get 8 refs -- row 3 */
		"vperm    v12,v12,v2,v8\n"
		"lvsl     v8,r4,r7\n"
		"add      r7,r7,r6\n"
		"lvx      v3,r4,r8\n"          /* misaligned data -- 3 */
		"add      r8,r8,r6\n"
		"lvx      v14,r4,r7\n"         /* get 8 refs -- row 4 */
		"vperm    v13,v13,v3,v8\n"
		"lvsl     v8,r4,r7\n"
		"add      r7,r7,r6\n"
		"lvx      v4,r4,r8\n"          /* misaligned data -- 4 */
		"add      r8,r8,r6\n"
		"lvx      v15,r4,r7\n"         /* get 8 refs -- row 5 */
		"vperm    v14,v14,v4,v8\n"
		"lvsl     v8,r4,r7\n"
		"add      r7,r7,r6\n"
		"lvx      v5,r4,r8\n"          /* misaligned data -- 5 */
		"add      r8,r8,r6\n"
		"lvx      v16,r4,r7\n"         /* get 8 refs -- row 6 */
		"vperm    v15,v15,v5,v8\n"
		"lvsl     v8,r4,r7\n"
		"add      r7,r7,r6\n"
		"lvx      v6,r4,r8\n"          /* misaligned data -- 6 */
		"add      r8,r8,r6\n"
		"lvx      v17,r4,r7\n"         /* get 8 refs -- row 7 */
		"vperm    v16,v16,v6,v8\n"
		"lvsl     v8,r4,r7\n"
		"xor      r7,r7,r7\n"          /* r7 = 0: now indexes change[] */
		"lvx      v7,r4,r8\n"          /* misaligned data -- 7 */
		"add      r8,r8,r6\n"

		/* ---- load residuals into v0..v7, widen refs, add ---- */
		"lvx      v0,r5,r7\n"          /* get 8 shorts */
		"vperm    v17,v17,v7,v8\n"
		"addi     r7,r7,16\n"
		"lvx      v1,r5,r7\n"
		"vmrghb   v10,v9,v10\n"        /* unsigned byte -> unsigned half */
		"addi     r7,r7,16\n"
		"lvx      v2,r5,r7\n"
		"vmrghb   v11,v9,v11\n"
		"vaddshs  v0,v0,v10\n"
		"addi     r7,r7,16\n"
		"lvx      v3,r5,r7\n"
		"vmrghb   v12,v9,v12\n"
		"vaddshs  v1,v1,v11\n"
		"addi     r7,r7,16\n"
		"lvx      v4,r5,r7\n"
		"vmrghb   v13,v9,v13\n"
		"vaddshs  v2,v2,v12\n"
		"addi     r7,r7,16\n"
		"lvx      v5,r5,r7\n"
		"vmrghb   v14,v9,v14\n"
		"vaddshs  v3,v3,v13\n"
		"addi     r7,r7,16\n"
		"lvx      v6,r5,r7\n"
		"vmrghb   v15,v9,v15\n"
		"vaddshs  v4,v4,v14\n"
		"addi     r7,r7,16\n"
		"lvx      v7,r5,r7\n"
		"vmrghb   v16,v9,v16\n"
		"vaddshs  v5,v5,v15\n"

		"vmrghb   v17,v9,v17\n"
		"vaddshs  v6,v6,v16\n"

		/* ---- pack to bytes with unsigned saturation ---- */
		"vpkshus  v0,v0,v0\n"
		"vaddshs  v7,v7,v17\n"

		"vpkshus  v1,v1,v1\n"
		"xor      r7,r7,r7\n"          /* r7 = 0: now indexes dest rows */
		"vpkshus  v2,v2,v2\n"
		"vpkshus  v3,v3,v3\n"
		"vpkshus  v4,v4,v4\n"
		"vpkshus  v5,v5,v5\n"
		"vpkshus  v6,v6,v6\n"
		"lvsr     v9,r3,r7\n"          /* alignment vector for stores */
		"vpkshus  v7,v7,v7\n"
		"li       r8,4\n"              /* r8 = row offset + 4 */

		/* ---- store each row as two 4-byte words ---- */
		"vperm    v0,v0,v0,v9\n"       /* adjust for writes -- row 0 */
		"stvewx   v0,r3,r7\n"
		"add      r7,r7,r6\n"
		"lvsr     v9,r3,r7\n"
		"stvewx   v0,r3,r8\n"
		"add      r8,r8,r6\n"
		"vperm    v1,v1,v1,v9\n"       /* row 1 */
		"stvewx   v1,r3,r7\n"
		"add      r7,r7,r6\n"
		"lvsr     v9,r3,r7\n"
		"stvewx   v1,r3,r8\n"
		"add      r8,r8,r6\n"
		"vperm    v2,v2,v2,v9\n"       /* row 2 */
		"stvewx   v2,r3,r7\n"
		"add      r7,r7,r6\n"
		"lvsr     v9,r3,r7\n"
		"stvewx   v2,r3,r8\n"
		"add      r8,r8,r6\n"
		"vperm    v3,v3,v3,v9\n"       /* row 3 */
		"stvewx   v3,r3,r7\n"
		"add      r7,r7,r6\n"
		"lvsr     v9,r3,r7\n"
		"stvewx   v3,r3,r8\n"
		"add      r8,r8,r6\n"
		"vperm    v4,v4,v4,v9\n"       /* row 4 */
		"stvewx   v4,r3,r7\n"
		"add      r7,r7,r6\n"
		"lvsr     v9,r3,r7\n"
		"stvewx   v4,r3,r8\n"
		"add      r8,r8,r6\n"
		"vperm    v5,v5,v5,v9\n"       /* row 5 */
		"stvewx   v5,r3,r7\n"
		"add      r7,r7,r6\n"
		"lvsr     v9,r3,r7\n"
		"stvewx   v5,r3,r8\n"
		"add      r8,r8,r6\n"
		"vperm    v6,v6,v6,v9\n"       /* row 6 */
		"stvewx   v6,r3,r7\n"
		"add      r7,r7,r6\n"
		"lvsr     v9,r3,r7\n"
		"stvewx   v6,r3,r8\n"
		"add      r8,r8,r6\n"
		"vperm    v7,v7,v7,v9\n"       /* row 7 */
		"stvewx   v7,r3,r7\n"

		"stvewx   v7,r3,r8\n"
		: /* no register outputs: the result is written to memory at dest */
		: "r" (dest_r3), "r" (src_r4), "r" (change_r5), "r" (dss_r6)
		: "r7", "r8",
		  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17",
		  "memory"
	);
}
OIL_DEFINE_IMPL_FULL (recon8x8_inter_altivec, recon8x8_inter, OIL_IMPL_FLAG_ALTIVEC);
 Lines 396-716    Link Here 
/*
 * recon8x8_inter2_altivec:
 * @dest:   destination block, 8 rows of 8 bytes, rows @dsss bytes apart
 * @s1:     first reference block, 8 rows of 8 bytes, rows @dsss bytes apart
 * @s2:     second reference block, same layout as @s1
 * @change: 64 signed 16-bit residuals, read as eight consecutive 16-byte
 *          vectors with lvx
 * @dsss:   row stride in bytes, shared by @dest, @s1 and @s2
 *
 * For each of the 8 rows: loads 8 bytes from each reference (lvsl/vperm
 * handles misaligned sources), zero-extends both to halfwords, sums them
 * (vadduhm) and shifts right by one to form the truncated average, adds the
 * matching residuals with signed saturation (vaddshs), packs to bytes with
 * unsigned saturation (vpkshus) and writes the row to @dest as two 4-byte
 * stvewx stores.
 *
 * NOTE(review): lvx ignores the low four address bits and stvewx the low
 * two, so this presumably expects @change 16-byte aligned and every @dest
 * row at least 4-byte aligned -- confirm against the callers.
 *
 * The instruction stream is hand-scheduled and refers to r3-r7 by name, so
 * the arguments are pinned to those registers with local register variables
 * and handed to the asm as inputs.  The clobber list matters for
 * correctness here: the code overwrites v20-v27, which are callee-saved
 * under the AltiVec ABI, so they must be declared for the compiler to
 * preserve them on behalf of the caller.
 */
static void          /*      r3,             r4,       r5,             r6,         r7 */
recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss)
{
	register uint8_t *dest_r3   __asm__ ("r3") = dest;
	register uint8_t *s1_r4     __asm__ ("r4") = s1;
	register uint8_t *s2_r5     __asm__ ("r5") = s2;
	register int16_t *change_r6 __asm__ ("r6") = change;
	register int      dsss_r7   __asm__ ("r7") = dsss;

	__asm__ __volatile__ (
		/* cache hint: data-stream touch-for-store on dest; control word
		 * is 0x0108 in the upper half OR'ed with the stride */
		"lis      r8,0x0108\n"
		"or       r8,r8,r7\n"
		"dstst    r3,r8,0\n"
		"xor      r8,r8,r8\n"          /* r8 = row offset            */
		"li       r9,16\n"             /* r9 = row offset + 16       */

		"lvsl     v8,r4,r8\n"          /* alignment vector for RefPtr1 */
		"vxor     v9,v9,v9\n"          /* v9 = 0, for zero-extension   */

		/* ---- RefPtr1: 8 rows into v10..v17, widened to halfwords ---- */
		"lvx      v10,r4,r8\n"         /* get 8 RefPtr1 -- 0 */
		"add      r8,r8,r7\n"
		"lvx      v0,r4,r9\n"          /* next 16 bytes for misaligned data -- 0 */
		"add      r9,r9,r7\n"
		"lvx      v11,r4,r8\n"         /* get 8 RefPtr1 -- 1 */
		"vperm    v10,v10,v0,v8\n"
		"lvsl     v8,r4,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v1,r4,r9\n"          /* misaligned data -- 1 */
		"vmrghb   v10,v9,v10\n"        /* unsigned byte -> unsigned half */
		"add      r9,r9,r7\n"
		"lvx      v12,r4,r8\n"         /* get 8 RefPtr1 -- 2 */
		"vperm    v11,v11,v1,v8\n"
		"lvsl     v8,r4,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v2,r4,r9\n"          /* misaligned data -- 2 */
		"vmrghb   v11,v9,v11\n"
		"add      r9,r9,r7\n"
		"lvx      v13,r4,r8\n"         /* get 8 RefPtr1 -- 3 */
		"vperm    v12,v12,v2,v8\n"
		"lvsl     v8,r4,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v3,r4,r9\n"          /* misaligned data -- 3 */
		"vmrghb   v12,v9,v12\n"
		"add      r9,r9,r7\n"
		"lvx      v14,r4,r8\n"         /* get 8 RefPtr1 -- 4 */
		"vperm    v13,v13,v3,v8\n"
		"lvsl     v8,r4,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v4,r4,r9\n"          /* misaligned data -- 4 */
		"vmrghb   v13,v9,v13\n"
		"add      r9,r9,r7\n"
		"lvx      v15,r4,r8\n"         /* get 8 RefPtr1 -- 5 */
		"vperm    v14,v14,v4,v8\n"
		"lvsl     v8,r4,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v5,r4,r9\n"          /* misaligned data -- 5 */
		"vmrghb   v14,v9,v14\n"
		"add      r9,r9,r7\n"
		"lvx      v16,r4,r8\n"         /* get 8 RefPtr1 -- 6 */
		"vperm    v15,v15,v5,v8\n"
		"lvsl     v8,r4,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v6,r4,r9\n"          /* misaligned data -- 6 */
		"vmrghb   v15,v9,v15\n"
		"add      r9,r9,r7\n"
		"lvx      v17,r4,r8\n"         /* get 8 RefPtr1 -- 7 */
		"vperm    v16,v16,v6,v8\n"
		"lvsl     v8,r4,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v7,r4,r9\n"          /* misaligned data -- 7 */
		"vmrghb   v16,v9,v16\n"
		"add      r9,r9,r7\n"

		/* ---- RefPtr2: 8 rows into v20..v27, widened and summed ---- */
		"vperm    v17,v17,v7,v8\n"
		"xor      r8,r8,r8\n"
		"li       r9,16\n"
		"lvsl     v18,r5,r8\n"         /* alignment vector for RefPtr2 */
		"vmrghb   v17,v9,v17\n"

		"lvx      v20,r5,r8\n"         /* get 8 RefPtr2 -- 0 */
		"add      r8,r8,r7\n"
		"lvx      v0,r5,r9\n"          /* misaligned data -- 0 */
		"add      r9,r9,r7\n"
		"lvx      v21,r5,r8\n"         /* get 8 RefPtr2 -- 1 */
		"vperm    v20,v20,v0,v18\n"
		"lvsl     v18,r5,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v1,r5,r9\n"          /* misaligned data -- 1 */
		"vmrghb   v20,v9,v20\n"
		"add      r9,r9,r7\n"
		"lvx      v22,r5,r8\n"         /* get 8 RefPtr2 -- 2 */
		"vperm    v21,v21,v1,v18\n"
		"lvsl     v18,r5,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v2,r5,r9\n"          /* misaligned data -- 2 */
		"vmrghb   v21,v9,v21\n"
		"vadduhm  v10,v10,v20\n"       /* row 0: s1 + s2 */
		"add      r9,r9,r7\n"
		"lvx      v23,r5,r8\n"         /* get 8 RefPtr2 -- 3 */
		"vperm    v22,v22,v2,v18\n"
		"lvsl     v18,r5,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v3,r5,r9\n"          /* misaligned data -- 3 */
		"vmrghb   v22,v9,v22\n"
		"vadduhm  v11,v11,v21\n"
		"add      r9,r9,r7\n"
		"lvx      v24,r5,r8\n"         /* get 8 RefPtr2 -- 4 */
		"vperm    v23,v23,v3,v18\n"
		"lvsl     v18,r5,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v4,r5,r9\n"          /* misaligned data -- 4 */
		"vmrghb   v23,v9,v23\n"
		"vadduhm  v12,v12,v22\n"
		"add      r9,r9,r7\n"
		"lvx      v25,r5,r8\n"         /* get 8 RefPtr2 -- 5 */
		"vperm    v24,v24,v4,v18\n"
		"lvsl     v18,r5,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v5,r5,r9\n"          /* misaligned data -- 5 */
		"vmrghb   v24,v9,v24\n"
		"vadduhm  v13,v13,v23\n"
		"add      r9,r9,r7\n"
		"lvx      v26,r5,r8\n"         /* get 8 RefPtr2 -- 6 */
		"vperm    v25,v25,v5,v18\n"
		"lvsl     v18,r5,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v6,r5,r9\n"          /* misaligned data -- 6 */
		"vmrghb   v25,v9,v25\n"
		"vadduhm  v14,v14,v24\n"
		"add      r9,r9,r7\n"
		"lvx      v27,r5,r8\n"         /* get 8 RefPtr2 -- 7 */
		"vperm    v26,v26,v6,v18\n"
		"lvsl     v18,r5,r8\n"
		"add      r8,r8,r7\n"
		"lvx      v7,r5,r9\n"          /* misaligned data -- 7 */
		"vmrghb   v26,v9,v26\n"
		"vadduhm  v15,v15,v25\n"
		"add      r9,r9,r7\n"
		"vperm    v27,v27,v7,v18\n"
		"xor      r8,r8,r8\n"          /* r8 = 0: now indexes change[] */
		"vmrghb   v27,v9,v27\n"
		"vadduhm  v16,v16,v26\n"
		"vadduhm  v17,v17,v27\n"
		"vspltish v8,1\n"              /* shift count 1 in every halfword */

		/* ---- load residuals into v0..v7, halve the sums ---- */
		"lvx      v0,r6,r8\n"          /* get 8 shorts */
		"vsrh     v10,v10,v8\n"
		"addi     r8,r8,16\n"
		"lvx      v1,r6,r8\n"
		"vsrh     v11,v11,v8\n"
		"addi     r8,r8,16\n"
		"lvx      v2,r6,r8\n"
		"vsrh     v12,v12,v8\n"
		"addi     r8,r8,16\n"
		"lvx      v3,r6,r8\n"
		"vsrh     v13,v13,v8\n"
		"addi     r8,r8,16\n"
		"lvx      v4,r6,r8\n"
		"vsrh     v14,v14,v8\n"
		"addi     r8,r8,16\n"
		"lvx      v5,r6,r8\n"
		"vsrh     v15,v15,v8\n"
		"addi     r8,r8,16\n"
		"lvx      v6,r6,r8\n"
		"vsrh     v16,v16,v8\n"
		"addi     r8,r8,16\n"
		"lvx      v7,r6,r8\n"
		"vsrh     v17,v17,v8\n"
		"xor      r8,r8,r8\n"          /* r8 = 0: now indexes dest rows */

		/* ---- add residuals, pack to bytes with unsigned saturation ---- */
		"lvsr     v9,r3,r8\n"          /* alignment vector for stores */
		"vaddshs  v0,v0,v10\n"
		"vaddshs  v1,v1,v11\n"
		"vpkshus  v0,v0,v0\n"
		"vaddshs  v2,v2,v12\n"
		"vpkshus  v1,v1,v1\n"
		"vaddshs  v3,v3,v13\n"
		"vpkshus  v2,v2,v2\n"
		"vaddshs  v4,v4,v14\n"
		"vpkshus  v3,v3,v3\n"
		"vaddshs  v5,v5,v15\n"
		"vpkshus  v4,v4,v4\n"
		"vaddshs  v6,v6,v16\n"
		"vpkshus  v5,v5,v5\n"
		"vaddshs  v7,v7,v17\n"
		"vpkshus  v6,v6,v6\n"
		"vpkshus  v7,v7,v7\n"
		"li       r9,4\n"              /* r9 = row offset + 4 */

		/* ---- store each row as two 4-byte words ---- */
		"vperm    v0,v0,v0,v9\n"       /* adjust for writes -- row 0 */
		"stvewx   v0,r3,r8\n"
		"add      r8,r8,r7\n"
		"lvsr     v9,r3,r8\n"
		"stvewx   v0,r3,r9\n"
		"add      r9,r9,r7\n"
		"vperm    v1,v1,v1,v9\n"       /* row 1 */
		"stvewx   v1,r3,r8\n"
		"add      r8,r8,r7\n"
		"lvsr     v9,r3,r8\n"
		"stvewx   v1,r3,r9\n"
		"add      r9,r9,r7\n"
		"vperm    v2,v2,v2,v9\n"       /* row 2 */
		"stvewx   v2,r3,r8\n"
		"add      r8,r8,r7\n"
		"lvsr     v9,r3,r8\n"
		"stvewx   v2,r3,r9\n"
		"add      r9,r9,r7\n"
		"vperm    v3,v3,v3,v9\n"       /* row 3 */
		"stvewx   v3,r3,r8\n"
		"add      r8,r8,r7\n"
		"lvsr     v9,r3,r8\n"
		"stvewx   v3,r3,r9\n"
		"add      r9,r9,r7\n"
		"vperm    v4,v4,v4,v9\n"       /* row 4 */
		"stvewx   v4,r3,r8\n"
		"add      r8,r8,r7\n"
		"lvsr     v9,r3,r8\n"
		"stvewx   v4,r3,r9\n"
		"add      r9,r9,r7\n"
		"vperm    v5,v5,v5,v9\n"       /* row 5 */
		"stvewx   v5,r3,r8\n"
		"add      r8,r8,r7\n"
		"lvsr     v9,r3,r8\n"
		"stvewx   v5,r3,r9\n"
		"add      r9,r9,r7\n"
		"vperm    v6,v6,v6,v9\n"       /* row 6 */
		"stvewx   v6,r3,r8\n"
		"add      r8,r8,r7\n"
		"lvsr     v9,r3,r8\n"
		"stvewx   v6,r3,r9\n"
		"add      r9,r9,r7\n"
		"vperm    v7,v7,v7,v9\n"       /* row 7 */
		"stvewx   v7,r3,r8\n"
		"stvewx   v7,r3,r9\n"
		: /* no register outputs: the result is written to memory at dest */
		: "r" (dest_r3), "r" (s1_r4), "r" (s2_r5), "r" (change_r6),
		  "r" (dsss_r7)
		: "r8", "r9",
		  "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
		  "memory"
	);
}
OIL_DEFINE_IMPL_FULL (recon8x8_inter2_altivec, recon8x8_inter2, OIL_IMPL_FLAG_ALTIVEC);