
liboil-0.3.3/liboil/powerpc/recon8x8_altivec.c (-459 / +457 lines)

Lines 46-176
                     /*       r3,            r4,         r5 */
 recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds)
 {
-    asm {
+    asm(
         //trying cache hints
-        lis       r6,0x0108
-        or        r6,r6,r5
-        dstst     r3,r6,0
+        "lis      r6,0x0108\n"
+        "or       r6,r6,r5\n"
+        "dstst    r3,r6,0\n"

-        vspltish  v1,7
+        "vspltish v1,7\n"

-        vspltish  v8,1
-        xor       r6,r6,r6
+        "vspltish v8,1\n\n"
+        "xor      r6,r6,r6\n"

-        lvx       v0,r4,r6        //get 8 shorts
-        vslh      v8,v8,v1        //now have 128
-        addi      r6,r6,16
+        "lvx      v0,r4,r6\n"     //get 8 shorts
+        "vslh     v8,v8,v1\n"     //now have 128
+        "addi     r6,r6,16\n"

-        lvx       v1,r4,r6        //get 8 shorts
-        vaddshs   v0,v0,v8        //+=128
-        addi      r6,r6,16
+        "lvx      v1,r4,r6\n"     //get 8 shorts
+        "vaddshs  v0,v0,v8\n"     //+=128
+        "addi     r6,r6,16\n"

-        lvx       v2,r4,r6        //get 8 shorts
-        vaddshs   v1,v1,v8        //+=128
-        addi      r6,r6,16
-        vpkshus   v0,v0,v0        //convert to bytes
+        "lvx      v2,r4,r6\n"     //get 8 shorts
+        "vaddshs  v1,v1,v8\n"     //+=128
+        "addi     r6,r6,16\n"
+        "vpkshus  v0,v0,v0\n"     //convert to bytes

-        lvx       v3,r4,r6        //get 8 shorts
-        vaddshs   v2,v2,v8        //+=128
-        addi      r6,r6,16
-        vpkshus   v1,v1,v1        //convert to bytes
+        "lvx      v3,r4,r6\n"     //get 8 shorts
+        "vaddshs  v2,v2,v8\n"     //+=128
+        "addi     r6,r6,16\n"
+        "vpkshus  v1,v1,v1\n"     //convert to bytes

-        lvx       v4,r4,r6        //get 8 shorts
-        vaddshs   v3,v3,v8        //+=128
-        addi      r6,r6,16
-        vpkshus   v2,v2,v2        //convert to bytes
+        "lvx      v4,r4,r6\n"     //get 8 shorts
+        "vaddshs  v3,v3,v8\n"     //+=128
+        "addi     r6,r6,16\n"
+        "vpkshus  v2,v2,v2\n"     //convert to bytes

-        lvx       v5,r4,r6        //get 8 shorts
-        vaddshs   v4,v4,v8        //+=128
-        addi      r6,r6,16
-        vpkshus   v3,v3,v3        //convert to bytes
+        "lvx      v5,r4,r6\n"     //get 8 shorts
+        "vaddshs  v4,v4,v8\n"     //+=128
+        "addi     r6,r6,16\n"
+        "vpkshus  v3,v3,v3\n"     //convert to bytes

-        lvx       v6,r4,r6        //get 8 shorts
-        vaddshs   v5,v5,v8        //+=128
-        addi      r6,r6,16
-        vpkshus   v4,v4,v4        //convert to bytes
+        "lvx      v6,r4,r6\n"     //get 8 shorts
+        "vaddshs  v5,v5,v8\n"     //+=128
+        "addi     r6,r6,16\n"
+        "vpkshus  v4,v4,v4\n"     //convert to bytes

-        lvx       v7,r4,r6        //get 8 shorts
-        xor       r6,r6,r6
-        vaddshs   v6,v6,v8        //+=128
-        vpkshus   v5,v5,v5        //convert to bytes
+        "lvx      v7,r4,r6\n"     //get 8 shorts
+        "xor      r6,r6,r6\n"
+        "vaddshs  v6,v6,v8\n"     //+=128
+        "vpkshus  v5,v5,v5\n"     //convert to bytes

-        lvsr      v9,r3,r6        //load alignment vector for stores
-        vaddshs   v7,v7,v8        //+=128
-        vpkshus   v6,v6,v6        //convert to bytes
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores
+        "vaddshs  v7,v7,v8\n"     //+=128
+        "vpkshus  v6,v6,v6\n"     //convert to bytes

-        vpkshus   v7,v7,v7        //convert to bytes
+        "vpkshus  v7,v7,v7\n"     //convert to bytes

-        li        r7,4
-        vperm     v0,v0,v0,v9
+        "li       r7,4\n"
+        "vperm    v0,v0,v0,v9\n"

-        stvewx    v0,r3,r6
-        add       r6,r6,r5
+        "stvewx   v0,r3,r6\n"
+        "add      r6,r6,r5\n"

-        lvsr      v9,r3,r6        //load alignment vector for stores
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores

-        stvewx    v0,r3,r7
-        add       r7,r7,r5
-        vperm     v1,v1,v1,v9
+        "stvewx   v0,r3,r7\n"
+        "add      r7,r7,r5\n"
+        "vperm    v1,v1,v1,v9\n"

-        stvewx    v1,r3,r6
-        add       r6,r6,r5
+        "stvewx   v1,r3,r6\n"
+        "add      r6,r6,r5\n"

-        lvsr      v9,r3,r6        //load alignment vector for stores
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores

-        stvewx    v1,r3,r7
-        add       r7,r7,r5
-        vperm     v2,v2,v2,v9
+        "stvewx   v1,r3,r7\n"
+        "add      r7,r7,r5\n"
+        "vperm    v2,v2,v2,v9\n"

-        stvewx    v2,r3,r6
-        add       r6,r6,r5
+        "stvewx   v2,r3,r6\n"
+        "add      r6,r6,r5\n"

-        lvsr      v9,r3,r6        //load alignment vector for stores
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores

-        stvewx    v2,r3,r7
-        add       r7,r7,r5
-        vperm     v3,v3,v3,v9
+        "stvewx   v2,r3,r7\n"
+        "add      r7,r7,r5\n"
+        "vperm    v3,v3,v3,v9\n"

-        stvewx    v3,r3,r6
-        add       r6,r6,r5
+        "stvewx   v3,r3,r6\n"
+        "add      r6,r6,r5\n"

-        lvsr      v9,r3,r6        //load alignment vector for stores
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores

-        stvewx    v3,r3,r7
-        add       r7,r7,r5
-        vperm     v4,v4,v4,v9
+        "stvewx   v3,r3,r7\n"
+        "add      r7,r7,r5\n"
+        "vperm    v4,v4,v4,v9\n"

-        stvewx    v4,r3,r6
-        add       r6,r6,r5
+        "stvewx   v4,r3,r6\n"
+        "add      r6,r6,r5\n"

-        lvsr      v9,r3,r6        //load alignment vector for stores
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores

-        stvewx    v4,r3,r7
-        add       r7,r7,r5
-        vperm     v5,v5,v5,v9
+        "stvewx   v4,r3,r7\n"
+        "add      r7,r7,r5\n"
+        "vperm    v5,v5,v5,v9\n"

-        stvewx    v5,r3,r6
-        add       r6,r6,r5
+        "stvewx   v5,r3,r6\n"
+        "add      r6,r6,r5\n"

-        lvsr      v9,r3,r6        //load alignment vector for stores
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores

-        stvewx    v5,r3,r7
-        add       r7,r7,r5
-        vperm     v6,v6,v6,v9
+        "stvewx   v5,r3,r7\n"
+        "add      r7,r7,r5\n"
+        "vperm    v6,v6,v6,v9\n"

-        stvewx    v6,r3,r6
-        add       r6,r6,r5
+        "stvewx   v6,r3,r6\n"
+        "add      r6,r6,r5\n"

-        lvsr      v9,r3,r6        //load alignment vector for stores
+        "lvsr     v9,r3,r6\n"     //load alignment vector for stores

-        stvewx    v6,r3,r7
-        add       r7,r7,r5
-        vperm     v7,v7,v7,v9
+        "stvewx   v6,r3,r7\n"
+        "add      r7,r7,r5\n"
+        "vperm    v7,v7,v7,v9\n"

-        stvewx    v7,r3,r6
+        "stvewx   v7,r3,r6\n"

-        stvewx    v7,r3,r7
-    }
+        "stvewx   v7,r3,r7\n"
+    );
 }

 OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC);

Lines 178-394
 static void          /*      r3,            r4,           r5,         r6 */
 recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss)
 {
-    asm
-    {
+    asm(
         //trying cache hints
-        lis       r7,0x0108
-        or        r7,r7,r6
-        dstst     r3,r7,0
+        "lis      r7,0x0108\n"
+        "or       r7,r7,r6\n"
+        "dstst    r3,r7,0\n"

-        xor       r7,r7,r7
-        li        r8,16
+        "xor      r7,r7,r7\n"
+        "li       r8,16\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        vxor      v9,v9,v9
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "vxor     v9,v9,v9\n"

-        lvx       v10,r4,r7       //get 8 refs
-        add       r7,r7,r6
+        "lvx      v10,r4,r7\n"    //get 8 refs
+        "add      r7,r7,r6\n"

-        lvx       v0,r4,r8        //need another 16 bytes for misaligned data -- 0
-        add       r8,r8,r6
+        "lvx      v0,r4,r8\n"     //need another 16 bytes for misaligned data -- 0
+        "add      r8,r8,r6\n"

-        lvx       v11,r4,r7       //get 8 refs
-        vperm     v10,v10,v0,v8
+        "lvx      v11,r4,r7\n"    //get 8 refs
+        "vperm    v10,v10,v0,v8\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        add       r7,r7,r6
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "add      r7,r7,r6\n"

-        lvx       v1,r4,r8        //need another 16 bytes for misaligned data -- 1
-        add       r8,r8,r6
+        "lvx      v1,r4,r8\n"     //need another 16 bytes for misaligned data -- 1
+        "add      r8,r8,r6\n"

-        lvx       v12,r4,r7       //get 8 refs
-        vperm     v11,v11,v1,v8
+        "lvx      v12,r4,r7\n"    //get 8 refs
+        "vperm    v11,v11,v1,v8\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        add       r7,r7,r6
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "add      r7,r7,r6\n"

-        lvx       v2,r4,r8        //need another 16 bytes for misaligned data -- 2
-        add       r8,r8,r6
+        "lvx      v2,r4,r8\n"     //need another 16 bytes for misaligned data -- 2
+        "add      r8,r8,r6\n"

-        lvx       v13,r4,r7       //get 8 refs
-        vperm     v12,v12,v2,v8
+        "lvx      v13,r4,r7\n"    //get 8 refs
+        "vperm    v12,v12,v2,v8\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        add       r7,r7,r6
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "add      r7,r7,r6\n"

-        lvx       v3,r4,r8        //need another 16 bytes for misaligned data -- 3
-        add       r8,r8,r6
+        "lvx      v3,r4,r8\n"     //need another 16 bytes for misaligned data -- 3
+        "add      r8,r8,r6\n"

-        lvx       v14,r4,r7       //get 8 refs
-        vperm     v13,v13,v3,v8
+        "lvx      v14,r4,r7\n"    //get 8 refs
+        "vperm    v13,v13,v3,v8\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        add       r7,r7,r6
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "add      r7,r7,r6\n"

-        lvx       v4,r4,r8        //need another 16 bytes for misaligned data -- 4
-        add       r8,r8,r6
+        "lvx      v4,r4,r8\n"     //need another 16 bytes for misaligned data -- 4
+        "add      r8,r8,r6\n"

-        lvx       v15,r4,r7       //get 8 refs
-        vperm     v14,v14,v4,v8
+        "lvx      v15,r4,r7\n"    //get 8 refs
+        "vperm    v14,v14,v4,v8\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        add       r7,r7,r6
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "add      r7,r7,r6\n"

-        lvx       v5,r4,r8        //need another 16 bytes for misaligned data -- 5
-        add       r8,r8,r6
+        "lvx      v5,r4,r8\n"     //need another 16 bytes for misaligned data -- 5
+        "add      r8,r8,r6\n"

-        lvx       v16,r4,r7       //get 8 refs
-        vperm     v15,v15,v5,v8
+        "lvx      v16,r4,r7\n"    //get 8 refs
+        "vperm    v15,v15,v5,v8\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        add       r7,r7,r6
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "add      r7,r7,r6\n"

-        lvx       v6,r4,r8        //need another 16 bytes for misaligned data -- 6
-        add       r8,r8,r6
+        "lvx      v6,r4,r8\n"     //need another 16 bytes for misaligned data -- 6
+        "add      r8,r8,r6\n"

-        lvx       v17,r4,r7       //get 8 refs
-        vperm     v16,v16,v6,v8
+        "lvx      v17,r4,r7\n"    //get 8 refs
+        "vperm    v16,v16,v6,v8\n"

-        lvsl      v8,r4,r7        //load alignment vector for refs
-        xor       r7,r7,r7
+        "lvsl     v8,r4,r7\n"     //load alignment vector for refs
+        "xor      r7,r7,r7\n"

-        lvx       v7,r4,r8        //need another 16 bytes for misaligned data -- 7
-        add       r8,r8,r6
+        "lvx      v7,r4,r8\n"     //need another 16 bytes for misaligned data -- 7
+        "add      r8,r8,r6\n"

-        lvx       v0,r5,r7        //get 8 shorts
-        vperm     v17,v17,v7,v8
-        addi      r7,r7,16
+        "lvx      v0,r5,r7\n"     //get 8 shorts
+        "vperm    v17,v17,v7,v8\n"
+        "addi     r7,r7,16\n"

-        lvx       v1,r5,r7        //get 8 shorts
-        vmrghb    v10,v9,v10      //unsigned byte -> unsigned half
-        addi      r7,r7,16
+        "lvx      v1,r5,r7\n"     //get 8 shorts
+        "vmrghb   v10,v9,v10\n"   //unsigned byte -> unsigned half
+        "addi     r7,r7,16\n"

-        lvx       v2,r5,r7        //get 8 shorts
-        vmrghb    v11,v9,v11      //unsigned byte -> unsigned half
-        vaddshs   v0,v0,v10
-        addi      r7,r7,16
+        "lvx      v2,r5,r7\n"     //get 8 shorts
+        "vmrghb   v11,v9,v11\n"   //unsigned byte -> unsigned half
+        "vaddshs  v0,v0,v10\n"
+        "addi     r7,r7,16\n"

-        lvx       v3,r5,r7        //get 8 shorts
-        vmrghb    v12,v9,v12      //unsigned byte -> unsigned half
-        vaddshs   v1,v1,v11
-        addi      r7,r7,16
+        "lvx      v3,r5,r7\n"     //get 8 shorts
+        "vmrghb   v12,v9,v12\n"   //unsigned byte -> unsigned half
+        "vaddshs  v1,v1,v11\n"
+        "addi     r7,r7,16\n"

-        lvx       v4,r5,r7        //get 8 shorts
-        vmrghb    v13,v9,v13      //unsigned byte -> unsigned half
-        vaddshs   v2,v2,v12
-        addi      r7,r7,16
+        "lvx      v4,r5,r7\n"     //get 8 shorts
+        "vmrghb   v13,v9,v13\n"   //unsigned byte -> unsigned half
+        "vaddshs  v2,v2,v12\n"
+        "addi     r7,r7,16\n"

-        lvx       v5,r5,r7        //get 8 shorts
-        vmrghb    v14,v9,v14      //unsigned byte -> unsigned half
-        vaddshs   v3,v3,v13
-        addi      r7,r7,16
+        "lvx      v5,r5,r7\n"     //get 8 shorts
+        "vmrghb   v14,v9,v14\n"   //unsigned byte -> unsigned half
+        "vaddshs  v3,v3,v13\n"
+        "addi     r7,r7,16\n"

-        lvx       v6,r5,r7        //get 8 shorts
-        vmrghb    v15,v9,v15      //unsigned byte -> unsigned half
-        vaddshs   v4,v4,v14
-        addi      r7,r7,16
+        "lvx      v6,r5,r7\n"     //get 8 shorts
+        "vmrghb   v15,v9,v15\n"   //unsigned byte -> unsigned half
+        "vaddshs  v4,v4,v14\n"
+        "addi     r7,r7,16\n"

-        lvx       v7,r5,r7        //get 8 shorts
-        vmrghb    v16,v9,v16      //unsigned byte -> unsigned half
-        vaddshs   v5,v5,v15
+        "lvx      v7,r5,r7\n"     //get 8 shorts
+        "vmrghb   v16,v9,v16\n"   //unsigned byte -> unsigned half
+        "vaddshs  v5,v5,v15\n"

-        vmrghb    v17,v9,v17      //unsigned byte -> unsigned half
-        vaddshs   v6,v6,v16
+        "vmrghb   v17,v9,v17\n"   //unsigned byte -> unsigned half
+        "vaddshs  v6,v6,v16\n"

-        vpkshus   v0,v0,v0
-        vaddshs   v7,v7,v17
+        "vpkshus  v0,v0,v0\n"
+        "vaddshs  v7,v7,v17\n"

-        vpkshus   v1,v1,v1
-        xor       r7,r7,r7
+        "vpkshus  v1,v1,v1\n"
+        "xor      r7,r7,r7\n"

-        vpkshus   v2,v2,v2
+        "vpkshus  v2,v2,v2\n"

-        vpkshus   v3,v3,v3
+        "vpkshus  v3,v3,v3\n"

-        vpkshus   v4,v4,v4
+        "vpkshus  v4,v4,v4\n"

-        vpkshus   v5,v5,v5
+        "vpkshus  v5,v5,v5\n"

-        vpkshus   v6,v6,v6
+        "vpkshus  v6,v6,v6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
-        vpkshus   v7,v7,v7
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores
+        "vpkshus  v7,v7,v7\n"

-        li        r8,4
-        vperm     v0,v0,v0,v9     //adjust for writes
+        "li       r8,4\n"
+        "vperm    v0,v0,v0,v9\n"  //adjust for writes

-        stvewx    v0,r3,r7
-        add       r7,r7,r6
+        "stvewx   v0,r3,r7\n"
+        "add      r7,r7,r6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores

-        stvewx    v0,r3,r8
-        add       r8,r8,r6
-        vperm     v1,v1,v1,v9
+        "stvewx   v0,r3,r8\n"
+        "add      r8,r8,r6\n"
+        "vperm    v1,v1,v1,v9\n"

-        stvewx    v1,r3,r7
-        add       r7,r7,r6
+        "stvewx   v1,r3,r7\n"
+        "add      r7,r7,r6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores

-        stvewx    v1,r3,r8
-        add       r8,r8,r6
-        vperm     v2,v2,v2,v9
+        "stvewx   v1,r3,r8\n"
+        "add      r8,r8,r6\n"
+        "vperm    v2,v2,v2,v9\n"

-        stvewx    v2,r3,r7
-        add       r7,r7,r6
+        "stvewx   v2,r3,r7\n"
+        "add      r7,r7,r6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores

-        stvewx    v2,r3,r8
-        add       r8,r8,r6
-        vperm     v3,v3,v3,v9
+        "stvewx   v2,r3,r8\n"
+        "add      r8,r8,r6\n"
+        "vperm    v3,v3,v3,v9\n"

-        stvewx    v3,r3,r7
-        add       r7,r7,r6
+        "stvewx   v3,r3,r7\n"
+        "add      r7,r7,r6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores

-        stvewx    v3,r3,r8
-        add       r8,r8,r6
-        vperm     v4,v4,v4,v9
+        "stvewx   v3,r3,r8\n"
+        "add      r8,r8,r6\n"
+        "vperm    v4,v4,v4,v9\n"

-        stvewx    v4,r3,r7
-        add       r7,r7,r6
+        "stvewx   v4,r3,r7\n"
+        "add      r7,r7,r6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores

-        stvewx    v4,r3,r8
-        add       r8,r8,r6
-        vperm     v5,v5,v5,v9
+        "stvewx   v4,r3,r8\n"
+        "add      r8,r8,r6\n"
+        "vperm    v5,v5,v5,v9\n"

-        stvewx    v5,r3,r7
-        add       r7,r7,r6
+        "stvewx   v5,r3,r7\n"
+        "add      r7,r7,r6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores

-        stvewx    v5,r3,r8
-        add       r8,r8,r6
-        vperm     v6,v6,v6,v9
+        "stvewx   v5,r3,r8\n"
+        "add      r8,r8,r6\n"
+        "vperm    v6,v6,v6,v9\n"

-        stvewx    v6,r3,r7
-        add       r7,r7,r6
+        "stvewx   v6,r3,r7\n"
+        "add      r7,r7,r6\n"

-        lvsr      v9,r3,r7        //load alignment vector for stores
+        "lvsr     v9,r3,r7\n"     //load alignment vector for stores

-        stvewx    v6,r3,r8
-        add       r8,r8,r6
-        vperm     v7,v7,v7,v9
+        "stvewx   v6,r3,r8\n"
+        "add      r8,r8,r6\n"
+        "vperm    v7,v7,v7,v9\n"

-        stvewx    v7,r3,r7
+        "stvewx   v7,r3,r7\n"

-        stvewx    v7,r3,r8
-    }
+        "stvewx   v7,r3,r8\n"
+    );
 }

 OIL_DEFINE_IMPL_FULL (recon8x8_inter_altivec, recon8x8_inter, OIL_IMPL_FLAG_ALTIVEC);

Lines 396-716
 static void          /*      r3,             r4,       r5,             r6,         r7 */
 recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss)
 {
-    asm
-    {
+    asm(
         //trying cache hints
-        lis       r8,0x0108
-        or        r8,r8,r7
-        dstst     r3,r8,0
+        "lis      r8,0x0108\n"
+        "or       r8,r8,r7\n"
+        "dstst    r3,r8,0\n"

-        xor       r8,r8,r8
-        li        r9,16
+        "xor      r8,r8,r8\n"
+        "li       r9,16\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        vxor      v9,v9,v9
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "vxor     v9,v9,v9\n"

-        lvx       v10,r4,r8       //get 8 RefPtr1 -- 0
-        add       r8,r8,r7
+        "lvx      v10,r4,r8\n"    //get 8 RefPtr1 -- 0
+        "add      r8,r8,r7\n"

-        lvx       v0,r4,r9        //need another 16 bytes for misaligned data -- 0
-        add       r9,r9,r7
+        "lvx      v0,r4,r9\n"     //need another 16 bytes for misaligned data -- 0
+        "add      r9,r9,r7\n"

-        lvx       v11,r4,r8       //get 8 RefPtr1 -- 1
-        vperm     v10,v10,v0,v8
+        "lvx      v11,r4,r8\n"    //get 8 RefPtr1 -- 1
+        "vperm    v10,v10,v0,v8\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        add       r8,r8,r7
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "add      r8,r8,r7\n"

-        lvx       v1,r4,r9        //need another 16 bytes for misaligned data -- 1
-        vmrghb    v10,v9,v10      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v1,r4,r9\n"     //need another 16 bytes for misaligned data -- 1
+        "vmrghb   v10,v9,v10\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"

-        lvx       v12,r4,r8       //get 8 RefPtr1 -- 2
-        vperm     v11,v11,v1,v8
+        "lvx      v12,r4,r8\n"    //get 8 RefPtr1 -- 2
+        "vperm    v11,v11,v1,v8\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        add       r8,r8,r7
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "add      r8,r8,r7\n"

-        lvx       v2,r4,r9        //need another 16 bytes for misaligned data -- 2
-        vmrghb    v11,v9,v11      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v2,r4,r9\n"     //need another 16 bytes for misaligned data -- 2
+        "vmrghb   v11,v9,v11\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"

-        lvx       v13,r4,r8       //get 8 RefPtr1 -- 3
-        vperm     v12,v12,v2,v8
+        "lvx      v13,r4,r8\n"    //get 8 RefPtr1 -- 3
+        "vperm    v12,v12,v2,v8\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        add       r8,r8,r7
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "add      r8,r8,r7\n"

-        lvx       v3,r4,r9        //need another 16 bytes for misaligned data -- 3
-        vmrghb    v12,v9,v12      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v3,r4,r9\n"     //need another 16 bytes for misaligned data -- 3
+        "vmrghb   v12,v9,v12\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"

-        lvx       v14,r4,r8       //get 8 RefPtr1 -- 4
-        vperm     v13,v13,v3,v8
+        "lvx      v14,r4,r8\n"    //get 8 RefPtr1 -- 4
+        "vperm    v13,v13,v3,v8\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        add       r8,r8,r7
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "add      r8,r8,r7\n"

-        lvx       v4,r4,r9        //need another 16 bytes for misaligned data -- 4
-        vmrghb    v13,v9,v13      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v4,r4,r9\n"     //need another 16 bytes for misaligned data -- 4
+        "vmrghb   v13,v9,v13\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"

-        lvx       v15,r4,r8       //get 8 RefPtr1 -- 5
-        vperm     v14,v14,v4,v8
+        "lvx      v15,r4,r8\n"    //get 8 RefPtr1 -- 5
+        "vperm    v14,v14,v4,v8\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        add       r8,r8,r7
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "add      r8,r8,r7\n"

-        lvx       v5,r4,r9        //need another 16 bytes for misaligned data -- 5
-        vmrghb    v14,v9,v14      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v5,r4,r9\n"     //need another 16 bytes for misaligned data -- 5
+        "vmrghb   v14,v9,v14\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"

-        lvx       v16,r4,r8       //get 8 RefPtr1 -- 6
-        vperm     v15,v15,v5,v8
+        "lvx      v16,r4,r8\n"    //get 8 RefPtr1 -- 6
+        "vperm    v15,v15,v5,v8\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        add       r8,r8,r7
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "add      r8,r8,r7\n"

-        lvx       v6,r4,r9        //need another 16 bytes for misaligned data -- 6
-        vmrghb    v15,v9,v15      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v6,r4,r9\n"     //need another 16 bytes for misaligned data -- 6
+        "vmrghb   v15,v9,v15\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"

-        lvx       v17,r4,r8       //get 8 RefPtr1 -- 7
-        vperm     v16,v16,v6,v8
+        "lvx      v17,r4,r8\n"    //get 8 RefPtr1 -- 7
+        "vperm    v16,v16,v6,v8\n"

-        lvsl      v8,r4,r8        //load alignment vector for RefPtr1
-        add       r8,r8,r7
+        "lvsl     v8,r4,r8\n"     //load alignment vector for RefPtr1
+        "add      r8,r8,r7\n"

-        lvx       v7,r4,r9        //need another 16 bytes for misaligned data -- 7
-        vmrghb    v16,v9,v16      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v7,r4,r9\n"     //need another 16 bytes for misaligned data -- 7
+        "vmrghb   v16,v9,v16\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"
 //--------
-        vperm     v17,v17,v7,v8
-        xor       r8,r8,r8
-        li        r9,16
+        "vperm    v17,v17,v7,v8\n"
+        "xor      r8,r8,r8\n"
+        "li       r9,16\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        vmrghb    v17,v9,v17      //unsigned byte -> unsigned half
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "vmrghb   v17,v9,v17\n"   //unsigned byte -> unsigned half

-        lvx       v20,r5,r8       //get 8 RefPtr2 -- 0
-        add       r8,r8,r7
+        "lvx      v20,r5,r8\n"    //get 8 RefPtr2 -- 0
+        "add      r8,r8,r7\n"

-        lvx       v0,r5,r9        //need another 16 bytes for misaligned data -- 0
-        add       r9,r9,r7
+        "lvx      v0,r5,r9\n"     //need another 16 bytes for misaligned data -- 0
+        "add      r9,r9,r7\n"

-        lvx       v21,r5,r8       //get 8 RefPtr2 -- 1
-        vperm     v20,v20,v0,v18
+        "lvx      v21,r5,r8\n"    //get 8 RefPtr2 -- 1
+        "vperm    v20,v20,v0,v18\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        add       r8,r8,r7
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "add      r8,r8,r7\n"

-        lvx       v1,r5,r9        //need another 16 bytes for misaligned data -- 1
-        vmrghb    v20,v9,v20      //unsigned byte -> unsigned half
-        add       r9,r9,r7
+        "lvx      v1,r5,r9\n"     //need another 16 bytes for misaligned data -- 1
+        "vmrghb   v20,v9,v20\n"   //unsigned byte -> unsigned half
+        "add      r9,r9,r7\n"

-        lvx       v22,r5,r8       //get 8 RefPtr2 -- 2
-        vperm     v21,v21,v1,v18
+        "lvx      v22,r5,r8\n"    //get 8 RefPtr2 -- 2
+        "vperm    v21,v21,v1,v18\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        add       r8,r8,r7
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "add      r8,r8,r7\n"

-        lvx       v2,r5,r9        //need another 16 bytes for misaligned data -- 2
-        vmrghb    v21,v9,v21      //unsigned byte -> unsigned half
-        vadduhm   v10,v10,v20
-        add       r9,r9,r7
+        "lvx      v2,r5,r9\n"     //need another 16 bytes for misaligned data -- 2
+        "vmrghb   v21,v9,v21\n"   //unsigned byte -> unsigned half
+        "vadduhm  v10,v10,v20\n"
+        "add      r9,r9,r7\n"

-        lvx       v23,r5,r8       //get 8 RefPtr2 -- 3
-        vperm     v22,v22,v2,v18
+        "lvx      v23,r5,r8\n"    //get 8 RefPtr2 -- 3
+        "vperm    v22,v22,v2,v18\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        add       r8,r8,r7
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "add      r8,r8,r7\n"

-        lvx       v3,r5,r9        //need another 16 bytes for misaligned data -- 3
-        vmrghb    v22,v9,v22      //unsigned byte -> unsigned half
-        vadduhm   v11,v11,v21
-        add       r9,r9,r7
+        "lvx      v3,r5,r9\n"     //need another 16 bytes for misaligned data -- 3
+        "vmrghb   v22,v9,v22\n"   //unsigned byte -> unsigned half
+        "vadduhm  v11,v11,v21\n"
+        "add      r9,r9,r7\n"

-        lvx       v24,r5,r8       //get 8 RefPtr2 -- 4
-        vperm     v23,v23,v3,v18
+        "lvx      v24,r5,r8\n"    //get 8 RefPtr2 -- 4
+        "vperm    v23,v23,v3,v18\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        add       r8,r8,r7
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "add      r8,r8,r7\n"

-        lvx       v4,r5,r9        //need another 16 bytes for misaligned data -- 4
-        vmrghb    v23,v9,v23      //unsigned byte -> unsigned half
-        vadduhm   v12,v12,v22
-        add       r9,r9,r7
+        "lvx      v4,r5,r9\n"     //need another 16 bytes for misaligned data -- 4
+        "vmrghb   v23,v9,v23\n"   //unsigned byte -> unsigned half
+        "vadduhm  v12,v12,v22\n"
+        "add      r9,r9,r7\n"

-        lvx       v25,r5,r8       //get 8 RefPtr2 -- 5
-        vperm     v24,v24,v4,v18
+        "lvx      v25,r5,r8\n"    //get 8 RefPtr2 -- 5
+        "vperm    v24,v24,v4,v18\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        add       r8,r8,r7
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "add      r8,r8,r7\n"

-        lvx       v5,r5,r9        //need another 16 bytes for misaligned data -- 5
-        vmrghb    v24,v9,v24      //unsigned byte -> unsigned half
-        vadduhm   v13,v13,v23
-        add       r9,r9,r7
+        "lvx      v5,r5,r9\n"     //need another 16 bytes for misaligned data -- 5
+        "vmrghb   v24,v9,v24\n"   //unsigned byte -> unsigned half
+        "vadduhm  v13,v13,v23\n"
+        "add      r9,r9,r7\n"

-        lvx       v26,r5,r8       //get 8 RefPtr2 -- 6
-        vperm     v25,v25,v5,v18
+        "lvx      v26,r5,r8\n"    //get 8 RefPtr2 -- 6
+        "vperm    v25,v25,v5,v18\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        add       r8,r8,r7
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "add      r8,r8,r7\n"

-        lvx       v6,r5,r9        //need another 16 bytes for misaligned data -- 6
-        vmrghb    v25,v9,v25      //unsigned byte -> unsigned half
-        vadduhm   v14,v14,v24
-        add       r9,r9,r7
+        "lvx      v6,r5,r9\n"     //need another 16 bytes for misaligned data -- 6
+        "vmrghb   v25,v9,v25\n"   //unsigned byte -> unsigned half
+        "vadduhm  v14,v14,v24\n"
+        "add      r9,r9,r7\n"

-        lvx       v27,r5,r8       //get 8 RefPtr2 -- 7
-        vperm     v26,v26,v6,v18
+        "lvx      v27,r5,r8\n"    //get 8 RefPtr2 -- 7
+        "vperm    v26,v26,v6,v18\n"

-        lvsl      v18,r5,r8       //load alignment vector for RefPtr2
-        add       r8,r8,r7
+        "lvsl     v18,r5,r8\n"    //load alignment vector for RefPtr2
+        "add      r8,r8,r7\n"

-        lvx       v7,r5,r9        //need another 16 bytes for misaligned data -- 7
-        vmrghb    v26,v9,v26      //unsigned byte -> unsigned half
-        vadduhm   v15,v15,v25
-        add       r9,r9,r7
+        "lvx      v7,r5,r9\n"     //need another 16 bytes for misaligned data -- 7
+        "vmrghb   v26,v9,v26\n"   //unsigned byte -> unsigned half
+        "vadduhm  v15,v15,v25\n"
+        "add      r9,r9,r7\n"

-        vperm     v27,v27,v7,v18
-        xor       r8,r8,r8
+        "vperm    v27,v27,v7,v18\n"
+        "xor      r8,r8,r8\n"

-        vmrghb    v27,v9,v27      //unsigned byte -> unsigned half
-        vadduhm   v16,v16,v26
+        "vmrghb   v27,v9,v27\n"   //unsigned byte -> unsigned half
+        "vadduhm  v16,v16,v26\n"

-        vadduhm   v17,v17,v27
-        vspltish  v8,1
+        "vadduhm  v17,v17,v27\n"
+        "vspltish v8,1\n"
 //--------
-        lvx       v0,r6,r8        //get 8 shorts
-        vsrh      v10,v10,v8
-        addi      r8,r8,16
+        "lvx      v0,r6,r8\n"     //get 8 shorts
+        "vsrh     v10,v10,v8\n"
+        "addi     r8,r8,16\n"

-        lvx       v1,r6,r8        //get 8 shorts
-        vsrh      v11,v11,v8
-        addi      r8,r8,16
+        "lvx      v1,r6,r8\n"     //get 8 shorts
+        "vsrh     v11,v11,v8\n"
+        "addi     r8,r8,16\n"

-        lvx       v2,r6,r8        //get 8 shorts
-        vsrh      v12,v12,v8
-        addi      r8,r8,16
+        "lvx      v2,r6,r8\n"     //get 8 shorts
+        "vsrh     v12,v12,v8\n"
+        "addi     r8,r8,16\n"

-        lvx       v3,r6,r8        //get 8 shorts
-        vsrh      v13,v13,v8
-        addi      r8,r8,16
+        "lvx      v3,r6,r8\n"     //get 8 shorts
+        "vsrh     v13,v13,v8\n"
+        "addi     r8,r8,16\n"

-        lvx       v4,r6,r8        //get 8 shorts
-        vsrh      v14,v14,v8
-        addi      r8,r8,16
+        "lvx      v4,r6,r8\n"     //get 8 shorts
+        "vsrh     v14,v14,v8\n"
+        "addi     r8,r8,16\n"

-        lvx       v5,r6,r8        //get 8 shorts
-        vsrh      v15,v15,v8
-        addi      r8,r8,16
+        "lvx      v5,r6,r8\n"     //get 8 shorts
+        "vsrh     v15,v15,v8\n"
+        "addi     r8,r8,16\n"

-        lvx       v6,r6,r8        //get 8 shorts
-        vsrh      v16,v16,v8
-        addi      r8,r8,16
+        "lvx      v6,r6,r8\n"     //get 8 shorts
+        "vsrh     v16,v16,v8\n"
+        "addi     r8,r8,16\n"

-        lvx       v7,r6,r8        //get 8 shorts
-        vsrh      v17,v17,v8
-        xor       r8,r8,r8
+        "lvx      v7,r6,r8\n"     //get 8 shorts
+        "vsrh     v17,v17,v8\n"
+        "xor      r8,r8,r8\n"
 //--------
-        lvsr      v9,r3,r8        //load alignment vector for stores
-        vaddshs   v0,v0,v10
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores
+        "vaddshs  v0,v0,v10\n"

-        vaddshs   v1,v1,v11
-        vpkshus   v0,v0,v0
+        "vaddshs  v1,v1,v11\n"
+        "vpkshus  v0,v0,v0\n"

-        vaddshs   v2,v2,v12
-        vpkshus   v1,v1,v1
+        "vaddshs  v2,v2,v12\n"
+        "vpkshus  v1,v1,v1\n"

-        vaddshs   v3,v3,v13
-        vpkshus   v2,v2,v2
+        "vaddshs  v3,v3,v13\n"
+        "vpkshus  v2,v2,v2\n"

-        vaddshs   v4,v4,v14
-        vpkshus   v3,v3,v3
+        "vaddshs  v4,v4,v14\n"
+        "vpkshus  v3,v3,v3\n"

-        vaddshs   v5,v5,v15
-        vpkshus   v4,v4,v4
+        "vaddshs  v5,v5,v15\n"
+        "vpkshus  v4,v4,v4\n"

-        vaddshs   v6,v6,v16
-        vpkshus   v5,v5,v5
+        "vaddshs  v6,v6,v16\n"
+        "vpkshus  v5,v5,v5\n"

-        vaddshs   v7,v7,v17
-        vpkshus   v6,v6,v6
+        "vaddshs  v7,v7,v17\n"
+        "vpkshus  v6,v6,v6\n"

-        vpkshus   v7,v7,v7
+        "vpkshus  v7,v7,v7\n"

-        li        r9,4
-        vperm     v0,v0,v0,v9     //adjust for writes
+        "li       r9,4\n"
+        "vperm    v0,v0,v0,v9\n"  //adjust for writes

-        stvewx    v0,r3,r8
-        add       r8,r8,r7
+        "stvewx   v0,r3,r8\n"
+        "add      r8,r8,r7\n"

-        lvsr      v9,r3,r8        //load alignment vector for stores
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores

-        stvewx    v0,r3,r9
-        add       r9,r9,r7
-        vperm     v1,v1,v1,v9
+        "stvewx   v0,r3,r9\n"
+        "add      r9,r9,r7\n"
+        "vperm    v1,v1,v1,v9\n"

-        stvewx    v1,r3,r8
-        add       r8,r8,r7
+        "stvewx   v1,r3,r8\n"
+        "add      r8,r8,r7\n"

-        lvsr      v9,r3,r8        //load alignment vector for stores
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores

-        stvewx    v1,r3,r9
-        add       r9,r9,r7
-        vperm     v2,v2,v2,v9
+        "stvewx   v1,r3,r9\n"
+        "add      r9,r9,r7\n"
+        "vperm    v2,v2,v2,v9\n"

-        stvewx    v2,r3,r8
-        add       r8,r8,r7
+        "stvewx   v2,r3,r8\n"
+        "add      r8,r8,r7\n"

-        lvsr      v9,r3,r8        //load alignment vector for stores
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores

-        stvewx    v2,r3,r9
-        add       r9,r9,r7
-        vperm     v3,v3,v3,v9
+        "stvewx   v2,r3,r9\n"
+        "add      r9,r9,r7\n"
+        "vperm    v3,v3,v3,v9\n"

-        stvewx    v3,r3,r8
-        add       r8,r8,r7
+        "stvewx   v3,r3,r8\n"
+        "add      r8,r8,r7\n"

-        lvsr      v9,r3,r8        //load alignment vector for stores
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores

-        stvewx    v3,r3,r9
-        add       r9,r9,r7
-        vperm     v4,v4,v4,v9
+        "stvewx   v3,r3,r9\n"
+        "add      r9,r9,r7\n"
+        "vperm    v4,v4,v4,v9\n"

-        stvewx    v4,r3,r8
-        add       r8,r8,r7
+        "stvewx   v4,r3,r8\n"
+        "add      r8,r8,r7\n"

-        lvsr      v9,r3,r8        //load alignment vector for stores
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores

-        stvewx    v4,r3,r9
-        add       r9,r9,r7
-        vperm     v5,v5,v5,v9
+        "stvewx   v4,r3,r9\n"
+        "add      r9,r9,r7\n"
+        "vperm    v5,v5,v5,v9\n"

-        stvewx    v5,r3,r8
-        add       r8,r8,r7
+        "stvewx   v5,r3,r8\n"
+        "add      r8,r8,r7\n"

-        lvsr      v9,r3,r8        //load alignment vector for stores
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores

-        stvewx    v5,r3,r9
-        add       r9,r9,r7
-        vperm     v6,v6,v6,v9
+        "stvewx   v5,r3,r9\n"
+        "add      r9,r9,r7\n"
+        "vperm    v6,v6,v6,v9\n"

-        stvewx    v6,r3,r8
-        add       r8,r8,r7
+        "stvewx   v6,r3,r8\n"
+        "add      r8,r8,r7\n"

-        lvsr      v9,r3,r8        //load alignment vector for stores
+        "lvsr     v9,r3,r8\n"     //load alignment vector for stores

-        stvewx    v6,r3,r9
-        add       r9,r9,r7
-        vperm     v7,v7,v7,v9
+        "stvewx   v6,r3,r9\n"
+        "add      r9,r9,r7\n"
+        "vperm    v7,v7,v7,v9\n"

-        stvewx    v7,r3,r8
+        "stvewx   v7,r3,r8\n"

-        stvewx    v7,r3,r9
-    }
+        "stvewx   v7,r3,r9\n"
+    );
 }

 OIL_DEFINE_IMPL_FULL (recon8x8_inter2_altivec, recon8x8_inter2, OIL_IMPL_FLAG_ALTIVEC);
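
The change in all three hunks is the same mechanical conversion: the old code used CodeWarrior/Metrowerks-style asm { ... } blocks, which GCC does not accept, and the patch rewrites them as GCC-style asm( ... ) statements in which every instruction is a string literal terminated by \n. A minimal sketch of the pattern, with an illustrative function and operands rather than the actual code from the patch:

    /* Old (CodeWarrior syntax, rejected by GCC):
     *
     *     asm {
     *         lis    r6,0x0108
     *         or     r6,r6,r5
     *     }
     *
     * New (GCC syntax, as used throughout this patch): adjacent string
     * literals are concatenated, and the trailing \n keeps one
     * instruction per line for the assembler.
     */
    static void example_converted (void)   /* illustrative only, not part of the patch */
    {
        asm(
            "lis    r6,0x0108\n"
            "or     r6,r6,r5\n"
        );
    }

Note that, like the patch itself, this is GCC "basic" asm with no operand or clobber lists, so the compiler is not told which registers the block modifies.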
