Lines 46-176
 /* r3, r4, r5 */
 recon8x8_intra_altivec (uint8_t *dest, int16_t *change, int ds)
 {
-asm {
+asm(
 //trying cache hints
-lis r6,0x0108
-or r6,r6,r5
-dstst r3,r6,0
+"lis r6,0x0108\n"
+"or r6,r6,r5\n"
+"dstst r3,r6,0\n"

-vspltish v1,7
+"vspltish v1,7\n"

-vspltish v8,1
-xor r6,r6,r6
+"vspltish v8,1\n\n"
+"xor r6,r6,r6\n"

-lvx v0,r4,r6 //get 8 shorts
-vslh v8,v8,v1 //now have 128
-addi r6,r6,16
+"lvx v0,r4,r6\n" //get 8 shorts
+"vslh v8,v8,v1\n" //now have 128
+"addi r6,r6,16\n"

-lvx v1,r4,r6 //get 8 shorts
-vaddshs v0,v0,v8 //+=128
-addi r6,r6,16
+"lvx v1,r4,r6\n" //get 8 shorts
+"vaddshs v0,v0,v8\n" //+=128
+"addi r6,r6,16\n"

-lvx v2,r4,r6 //get 8 shorts
-vaddshs v1,v1,v8 //+=128
-addi r6,r6,16
-vpkshus v0,v0,v0 //convert to bytes
+"lvx v2,r4,r6\n" //get 8 shorts
+"vaddshs v1,v1,v8\n" //+=128
+"addi r6,r6,16\n"
+"vpkshus v0,v0,v0\n" //convert to bytes

-lvx v3,r4,r6 //get 8 shorts
-vaddshs v2,v2,v8 //+=128
-addi r6,r6,16
-vpkshus v1,v1,v1 //convert to bytes
+"lvx v3,r4,r6\n" //get 8 shorts
+"vaddshs v2,v2,v8\n" //+=128
+"addi r6,r6,16\n"
+"vpkshus v1,v1,v1\n" //convert to bytes

-lvx v4,r4,r6 //get 8 shorts
-vaddshs v3,v3,v8 //+=128
-addi r6,r6,16
-vpkshus v2,v2,v2 //convert to bytes
+"lvx v4,r4,r6\n" //get 8 shorts
+"vaddshs v3,v3,v8\n" //+=128
+"addi r6,r6,16\n"
+"vpkshus v2,v2,v2\n" //convert to bytes

-lvx v5,r4,r6 //get 8 shorts
-vaddshs v4,v4,v8 //+=128
-addi r6,r6,16
-vpkshus v3,v3,v3 //convert to bytes
+"lvx v5,r4,r6\n" //get 8 shorts
+"vaddshs v4,v4,v8\n" //+=128
+"addi r6,r6,16\n"
+"vpkshus v3,v3,v3\n" //convert to bytes

-lvx v6,r4,r6 //get 8 shorts
-vaddshs v5,v5,v8 //+=128
-addi r6,r6,16
-vpkshus v4,v4,v4 //convert to bytes
+"lvx v6,r4,r6\n" //get 8 shorts
+"vaddshs v5,v5,v8\n" //+=128
+"addi r6,r6,16\n"
+"vpkshus v4,v4,v4\n" //convert to bytes

-lvx v7,r4,r6 //get 8 shorts
-xor r6,r6,r6
-vaddshs v6,v6,v8 //+=128
-vpkshus v5,v5,v5 //convert to bytes
+"lvx v7,r4,r6\n" //get 8 shorts
+"xor r6,r6,r6\n"
+"vaddshs v6,v6,v8\n" //+=128
+"vpkshus v5,v5,v5\n" //convert to bytes

-lvsr v9,r3,r6 //load alignment vector for stores
-vaddshs v7,v7,v8 //+=128
-vpkshus v6,v6,v6 //convert to bytes
+"lvsr v9,r3,r6\n" //load alignment vector for stores
+"vaddshs v7,v7,v8\n" //+=128
+"vpkshus v6,v6,v6\n" //convert to bytes

-vpkshus v7,v7,v7 //convert to bytes
+"vpkshus v7,v7,v7\n" //convert to bytes

-li r7,4
-vperm v0,v0,v0,v9
+"li r7,4\n"
+"vperm v0,v0,v0,v9\n"

-stvewx v0,r3,r6
-add r6,r6,r5
+"stvewx v0,r3,r6\n"
+"add r6,r6,r5\n"

-lvsr v9,r3,r6 //load alignment vector for stores
+"lvsr v9,r3,r6\n" //load alignment vector for stores

-stvewx v0,r3,r7
-add r7,r7,r5
-vperm v1,v1,v1,v9
+"stvewx v0,r3,r7\n"
+"add r7,r7,r5\n"
+"vperm v1,v1,v1,v9\n"

-stvewx v1,r3,r6
-add r6,r6,r5
+"stvewx v1,r3,r6\n"
+"add r6,r6,r5\n"

-lvsr v9,r3,r6 //load alignment vector for stores
+"lvsr v9,r3,r6\n" //load alignment vector for stores

-stvewx v1,r3,r7
-add r7,r7,r5
-vperm v2,v2,v2,v9
+"stvewx v1,r3,r7\n"
+"add r7,r7,r5\n"
+"vperm v2,v2,v2,v9\n"

-stvewx v2,r3,r6
-add r6,r6,r5
+"stvewx v2,r3,r6\n"
+"add r6,r6,r5\n"

-lvsr v9,r3,r6 //load alignment vector for stores
+"lvsr v9,r3,r6\n" //load alignment vector for stores

-stvewx v2,r3,r7
-add r7,r7,r5
-vperm v3,v3,v3,v9
+"stvewx v2,r3,r7\n"
+"add r7,r7,r5\n"
+"vperm v3,v3,v3,v9\n"

-stvewx v3,r3,r6
-add r6,r6,r5
+"stvewx v3,r3,r6\n"
+"add r6,r6,r5\n"

-lvsr v9,r3,r6 //load alignment vector for stores
+"lvsr v9,r3,r6\n" //load alignment vector for stores

-stvewx v3,r3,r7
-add r7,r7,r5
-vperm v4,v4,v4,v9
+"stvewx v3,r3,r7\n"
+"add r7,r7,r5\n"
+"vperm v4,v4,v4,v9\n"

-stvewx v4,r3,r6
-add r6,r6,r5
+"stvewx v4,r3,r6\n"
+"add r6,r6,r5\n"

-lvsr v9,r3,r6 //load alignment vector for stores
+"lvsr v9,r3,r6\n" //load alignment vector for stores

-stvewx v4,r3,r7
-add r7,r7,r5
-vperm v5,v5,v5,v9
+"stvewx v4,r3,r7\n"
+"add r7,r7,r5\n"
+"vperm v5,v5,v5,v9\n"

-stvewx v5,r3,r6
-add r6,r6,r5
+"stvewx v5,r3,r6\n"
+"add r6,r6,r5\n"

-lvsr v9,r3,r6 //load alignment vector for stores
+"lvsr v9,r3,r6\n" //load alignment vector for stores

-stvewx v5,r3,r7
-add r7,r7,r5
-vperm v6,v6,v6,v9
+"stvewx v5,r3,r7\n"
+"add r7,r7,r5\n"
+"vperm v6,v6,v6,v9\n"

-stvewx v6,r3,r6
-add r6,r6,r5
+"stvewx v6,r3,r6\n"
+"add r6,r6,r5\n"

-lvsr v9,r3,r6 //load alignment vector for stores
+"lvsr v9,r3,r6\n" //load alignment vector for stores

-stvewx v6,r3,r7
-add r7,r7,r5
-vperm v7,v7,v7,v9
+"stvewx v6,r3,r7\n"
+"add r7,r7,r5\n"
+"vperm v7,v7,v7,v9\n"

-stvewx v7,r3,r6
+"stvewx v7,r3,r6\n"

-stvewx v7,r3,r7
-}
+"stvewx v7,r3,r7\n"
+);
 }

 OIL_DEFINE_IMPL_FULL (recon8x8_intra_altivec, recon8x8_intra, OIL_IMPL_FLAG_ALTIVEC);
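Note on the pattern in this hunk: the removed lines use CodeWarrior-style asm { ... } blocks with bare mnemonics, which GCC does not parse; the added lines rewrite each block as GCC basic inline asm, one string literal per instruction with an explicit \n so the assembler sees one instruction per line. A minimal sketch of the rewrite on a made-up two-instruction block (illustrative only, not part of the patch):

    /* Before: CodeWarrior block asm, rejected by GCC. */
    asm {
      lis r6,0x0108
      or r6,r6,r5
    }

    /* After: GCC basic asm, each instruction quoted and newline-terminated. */
    asm(
      "lis r6,0x0108\n"
      "or r6,r6,r5\n"
    );

One caveat worth flagging: basic asm of this form declares no operands or clobbers, so the compiler is not told that r6, r7, and the vector registers are overwritten; that matches the old CodeWarrior code's assumptions, but it relies on the arguments actually living in the ABI registers (r3, r4, r5) that the comments claim.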
Lines 178-394
 static void /* r3, r4, r5, r6 */
 recon8x8_inter_altivec (uint8_t *dest, uint8_t *src, int16_t *change, int dss)
 {
-asm
-{
+asm(
 //trying cache hints
-lis r7,0x0108
-or r7,r7,r6
-dstst r3,r7,0
+"lis r7,0x0108\n"
+"or r7,r7,r6\n"
+"dstst r3,r7,0\n"

-xor r7,r7,r7
-li r8,16
+"xor r7,r7,r7\n"
+"li r8,16\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-vxor v9,v9,v9
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"vxor v9,v9,v9\n"

-lvx v10,r4,r7 //get 8 refs
-add r7,r7,r6
+"lvx v10,r4,r7\n" //get 8 refs
+"add r7,r7,r6\n"

-lvx v0,r4,r8 //need another 16 bytes for misaligned data -- 0
-add r8,r8,r6
+"lvx v0,r4,r8\n" //need another 16 bytes for misaligned data -- 0
+"add r8,r8,r6\n"

-lvx v11,r4,r7 //get 8 refs
-vperm v10,v10,v0,v8
+"lvx v11,r4,r7\n" //get 8 refs
+"vperm v10,v10,v0,v8\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-add r7,r7,r6
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"add r7,r7,r6\n"

-lvx v1,r4,r8 //need another 16 bytes for misaligned data -- 1
-add r8,r8,r6
+"lvx v1,r4,r8\n" //need another 16 bytes for misaligned data -- 1
+"add r8,r8,r6\n"

-lvx v12,r4,r7 //get 8 refs
-vperm v11,v11,v1,v8
+"lvx v12,r4,r7\n" //get 8 refs
+"vperm v11,v11,v1,v8\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-add r7,r7,r6
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"add r7,r7,r6\n"

-lvx v2,r4,r8 //need another 16 bytes for misaligned data -- 2
-add r8,r8,r6
+"lvx v2,r4,r8\n" //need another 16 bytes for misaligned data -- 2
+"add r8,r8,r6\n"

-lvx v13,r4,r7 //get 8 refs
-vperm v12,v12,v2,v8
+"lvx v13,r4,r7\n" //get 8 refs
+"vperm v12,v12,v2,v8\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-add r7,r7,r6
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"add r7,r7,r6\n"

-lvx v3,r4,r8 //need another 16 bytes for misaligned data -- 3
-add r8,r8,r6
+"lvx v3,r4,r8\n" //need another 16 bytes for misaligned data -- 3
+"add r8,r8,r6\n"

-lvx v14,r4,r7 //get 8 refs
-vperm v13,v13,v3,v8
+"lvx v14,r4,r7\n" //get 8 refs
+"vperm v13,v13,v3,v8\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-add r7,r7,r6
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"add r7,r7,r6\n"

-lvx v4,r4,r8 //need another 16 bytes for misaligned data -- 4
-add r8,r8,r6
+"lvx v4,r4,r8\n" //need another 16 bytes for misaligned data -- 4
+"add r8,r8,r6\n"

-lvx v15,r4,r7 //get 8 refs
-vperm v14,v14,v4,v8
+"lvx v15,r4,r7\n" //get 8 refs
+"vperm v14,v14,v4,v8\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-add r7,r7,r6
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"add r7,r7,r6\n"

-lvx v5,r4,r8 //need another 16 bytes for misaligned data -- 5
-add r8,r8,r6
+"lvx v5,r4,r8\n" //need another 16 bytes for misaligned data -- 5
+"add r8,r8,r6\n"

-lvx v16,r4,r7 //get 8 refs
-vperm v15,v15,v5,v8
+"lvx v16,r4,r7\n" //get 8 refs
+"vperm v15,v15,v5,v8\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-add r7,r7,r6
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"add r7,r7,r6\n"

-lvx v6,r4,r8 //need another 16 bytes for misaligned data -- 6
-add r8,r8,r6
+"lvx v6,r4,r8\n" //need another 16 bytes for misaligned data -- 6
+"add r8,r8,r6\n"

-lvx v17,r4,r7 //get 8 refs
-vperm v16,v16,v6,v8
+"lvx v17,r4,r7\n" //get 8 refs
+"vperm v16,v16,v6,v8\n"

-lvsl v8,r4,r7 //load alignment vector for refs
-xor r7,r7,r7
+"lvsl v8,r4,r7\n" //load alignment vector for refs
+"xor r7,r7,r7\n"

-lvx v7,r4,r8 //need another 16 bytes for misaligned data -- 7
-add r8,r8,r6
+"lvx v7,r4,r8\n" //need another 16 bytes for misaligned data -- 7
+"add r8,r8,r6\n"

-lvx v0,r5,r7 //get 8 shorts
-vperm v17,v17,v7,v8
-addi r7,r7,16
+"lvx v0,r5,r7\n" //get 8 shorts
+"vperm v17,v17,v7,v8\n"
+"addi r7,r7,16\n"

-lvx v1,r5,r7 //get 8 shorts
-vmrghb v10,v9,v10 //unsigned byte -> unsigned half
-addi r7,r7,16
+"lvx v1,r5,r7\n" //get 8 shorts
+"vmrghb v10,v9,v10\n" //unsigned byte -> unsigned half
+"addi r7,r7,16\n"

-lvx v2,r5,r7 //get 8 shorts
-vmrghb v11,v9,v11 //unsigned byte -> unsigned half
-vaddshs v0,v0,v10
-addi r7,r7,16
+"lvx v2,r5,r7\n" //get 8 shorts
+"vmrghb v11,v9,v11\n" //unsigned byte -> unsigned half
+"vaddshs v0,v0,v10\n"
+"addi r7,r7,16\n"

-lvx v3,r5,r7 //get 8 shorts
-vmrghb v12,v9,v12 //unsigned byte -> unsigned half
-vaddshs v1,v1,v11
-addi r7,r7,16
+"lvx v3,r5,r7\n" //get 8 shorts
+"vmrghb v12,v9,v12\n" //unsigned byte -> unsigned half
+"vaddshs v1,v1,v11\n"
+"addi r7,r7,16\n"

-lvx v4,r5,r7 //get 8 shorts
-vmrghb v13,v9,v13 //unsigned byte -> unsigned half
-vaddshs v2,v2,v12
-addi r7,r7,16
+"lvx v4,r5,r7\n" //get 8 shorts
+"vmrghb v13,v9,v13\n" //unsigned byte -> unsigned half
+"vaddshs v2,v2,v12\n"
+"addi r7,r7,16\n"

-lvx v5,r5,r7 //get 8 shorts
-vmrghb v14,v9,v14 //unsigned byte -> unsigned half
-vaddshs v3,v3,v13
-addi r7,r7,16
+"lvx v5,r5,r7\n" //get 8 shorts
+"vmrghb v14,v9,v14\n" //unsigned byte -> unsigned half
+"vaddshs v3,v3,v13\n"
+"addi r7,r7,16\n"

-lvx v6,r5,r7 //get 8 shorts
-vmrghb v15,v9,v15 //unsigned byte -> unsigned half
-vaddshs v4,v4,v14
-addi r7,r7,16
+"lvx v6,r5,r7\n" //get 8 shorts
+"vmrghb v15,v9,v15\n" //unsigned byte -> unsigned half
+"vaddshs v4,v4,v14\n"
+"addi r7,r7,16\n"

-lvx v7,r5,r7 //get 8 shorts
-vmrghb v16,v9,v16 //unsigned byte -> unsigned half
-vaddshs v5,v5,v15
+"lvx v7,r5,r7\n" //get 8 shorts
+"vmrghb v16,v9,v16\n" //unsigned byte -> unsigned half
+"vaddshs v5,v5,v15\n"

-vmrghb v17,v9,v17 //unsigned byte -> unsigned half
-vaddshs v6,v6,v16
+"vmrghb v17,v9,v17\n" //unsigned byte -> unsigned half
+"vaddshs v6,v6,v16\n"

-vpkshus v0,v0,v0
-vaddshs v7,v7,v17
+"vpkshus v0,v0,v0\n"
+"vaddshs v7,v7,v17\n"

-vpkshus v1,v1,v1
-xor r7,r7,r7
+"vpkshus v1,v1,v1\n"
+"xor r7,r7,r7\n"

-vpkshus v2,v2,v2
+"vpkshus v2,v2,v2\n"

-vpkshus v3,v3,v3
+"vpkshus v3,v3,v3\n"

-vpkshus v4,v4,v4
+"vpkshus v4,v4,v4\n"

-vpkshus v5,v5,v5
+"vpkshus v5,v5,v5\n"

-vpkshus v6,v6,v6
+"vpkshus v6,v6,v6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
-vpkshus v7,v7,v7
+"lvsr v9,r3,r7\n" //load alignment vector for stores
+"vpkshus v7,v7,v7\n"

-li r8,4
-vperm v0,v0,v0,v9 //adjust for writes
+"li r8,4\n"
+"vperm v0,v0,v0,v9\n" //adjust for writes

-stvewx v0,r3,r7
-add r7,r7,r6
+"stvewx v0,r3,r7\n"
+"add r7,r7,r6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
+"lvsr v9,r3,r7\n" //load alignment vector for stores

-stvewx v0,r3,r8
-add r8,r8,r6
-vperm v1,v1,v1,v9
+"stvewx v0,r3,r8\n"
+"add r8,r8,r6\n"
+"vperm v1,v1,v1,v9\n"

-stvewx v1,r3,r7
-add r7,r7,r6
+"stvewx v1,r3,r7\n"
+"add r7,r7,r6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
+"lvsr v9,r3,r7\n" //load alignment vector for stores

-stvewx v1,r3,r8
-add r8,r8,r6
-vperm v2,v2,v2,v9
+"stvewx v1,r3,r8\n"
+"add r8,r8,r6\n"
+"vperm v2,v2,v2,v9\n"

-stvewx v2,r3,r7
-add r7,r7,r6
+"stvewx v2,r3,r7\n"
+"add r7,r7,r6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
+"lvsr v9,r3,r7\n" //load alignment vector for stores

-stvewx v2,r3,r8
-add r8,r8,r6
-vperm v3,v3,v3,v9
+"stvewx v2,r3,r8\n"
+"add r8,r8,r6\n"
+"vperm v3,v3,v3,v9\n"

-stvewx v3,r3,r7
-add r7,r7,r6
+"stvewx v3,r3,r7\n"
+"add r7,r7,r6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
+"lvsr v9,r3,r7\n" //load alignment vector for stores

-stvewx v3,r3,r8
-add r8,r8,r6
-vperm v4,v4,v4,v9
+"stvewx v3,r3,r8\n"
+"add r8,r8,r6\n"
+"vperm v4,v4,v4,v9\n"

-stvewx v4,r3,r7
-add r7,r7,r6
+"stvewx v4,r3,r7\n"
+"add r7,r7,r6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
+"lvsr v9,r3,r7\n" //load alignment vector for stores

-stvewx v4,r3,r8
-add r8,r8,r6
-vperm v5,v5,v5,v9
+"stvewx v4,r3,r8\n"
+"add r8,r8,r6\n"
+"vperm v5,v5,v5,v9\n"

-stvewx v5,r3,r7
-add r7,r7,r6
+"stvewx v5,r3,r7\n"
+"add r7,r7,r6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
+"lvsr v9,r3,r7\n" //load alignment vector for stores

-stvewx v5,r3,r8
-add r8,r8,r6
-vperm v6,v6,v6,v9
+"stvewx v5,r3,r8\n"
+"add r8,r8,r6\n"
+"vperm v6,v6,v6,v9\n"

-stvewx v6,r3,r7
-add r7,r7,r6
+"stvewx v6,r3,r7\n"
+"add r7,r7,r6\n"

-lvsr v9,r3,r7 //load alignment vector for stores
+"lvsr v9,r3,r7\n" //load alignment vector for stores

-stvewx v6,r3,r8
-add r8,r8,r6
-vperm v7,v7,v7,v9
+"stvewx v6,r3,r8\n"
+"add r8,r8,r6\n"
+"vperm v7,v7,v7,v9\n"

-stvewx v7,r3,r7
+"stvewx v7,r3,r7\n"

-stvewx v7,r3,r8
-}
+"stvewx v7,r3,r8\n"
+);
 }

 OIL_DEFINE_IMPL_FULL (recon8x8_inter_altivec, recon8x8_inter, OIL_IMPL_FLAG_ALTIVEC);
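The load side of this hunk repeats one classic AltiVec idiom per row of the reference block: lvsl derives a permute mask from the source address, two lvx instructions fetch the two aligned 16-byte blocks that straddle the row, and vperm shifts the wanted bytes into register position. Restated with <altivec.h> intrinsics, the idiom might look like the sketch below (illustrative only; the function name is mine, and like the asm above it assumes at least 16 readable bytes past src):

    #include <altivec.h>
    #include <stdint.h>

    /* Misaligned 16-byte load via lvsl + lvx + lvx + vperm. */
    static vector unsigned char
    load_misaligned (const uint8_t *src)
    {
      vector unsigned char perm = vec_lvsl (0, src); /* mask from low address bits */
      vector unsigned char lo = vec_ld (0, src);     /* aligned block holding src */
      vector unsigned char hi = vec_ld (15, src);    /* next aligned block */
      return vec_perm (lo, hi, perm);                /* stitch the two halves */
    }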
Lines 396-716
 static void /* r3, r4, r5, r6, r7 */
 recon8x8_inter2_altivec (uint8_t *dest, uint8_t *s1, uint8_t *s2, int16_t *change, int dsss)
 {
-asm
-{
+asm(
 //trying cache hints
-lis r8,0x0108
-or r8,r8,r7
-dstst r3,r8,0
+"lis r8,0x0108\n"
+"or r8,r8,r7\n"
+"dstst r3,r8,0\n"

-xor r8,r8,r8
-li r9,16
+"xor r8,r8,r8\n"
+"li r9,16\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-vxor v9,v9,v9
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"vxor v9,v9,v9\n"

-lvx v10,r4,r8 //get 8 RefPtr1 -- 0
-add r8,r8,r7
+"lvx v10,r4,r8\n" //get 8 RefPtr1 -- 0
+"add r8,r8,r7\n"

-lvx v0,r4,r9 //need another 16 bytes for misaligned data -- 0
-add r9,r9,r7
+"lvx v0,r4,r9\n" //need another 16 bytes for misaligned data -- 0
+"add r9,r9,r7\n"

-lvx v11,r4,r8 //get 8 RefPtr1 -- 1
-vperm v10,v10,v0,v8
+"lvx v11,r4,r8\n" //get 8 RefPtr1 -- 1
+"vperm v10,v10,v0,v8\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-add r8,r8,r7
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"add r8,r8,r7\n"

-lvx v1,r4,r9 //need another 16 bytes for misaligned data -- 1
-vmrghb v10,v9,v10 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v1,r4,r9\n" //need another 16 bytes for misaligned data -- 1
+"vmrghb v10,v9,v10\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"

-lvx v12,r4,r8 //get 8 RefPtr1 -- 2
-vperm v11,v11,v1,v8
+"lvx v12,r4,r8\n" //get 8 RefPtr1 -- 2
+"vperm v11,v11,v1,v8\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-add r8,r8,r7
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"add r8,r8,r7\n"

-lvx v2,r4,r9 //need another 16 bytes for misaligned data -- 2
-vmrghb v11,v9,v11 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v2,r4,r9\n" //need another 16 bytes for misaligned data -- 2
+"vmrghb v11,v9,v11\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"

-lvx v13,r4,r8 //get 8 RefPtr1 -- 3
-vperm v12,v12,v2,v8
+"lvx v13,r4,r8\n" //get 8 RefPtr1 -- 3
+"vperm v12,v12,v2,v8\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-add r8,r8,r7
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"add r8,r8,r7\n"

-lvx v3,r4,r9 //need another 16 bytes for misaligned data -- 3
-vmrghb v12,v9,v12 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v3,r4,r9\n" //need another 16 bytes for misaligned data -- 3
+"vmrghb v12,v9,v12\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"

-lvx v14,r4,r8 //get 8 RefPtr1 -- 4
-vperm v13,v13,v3,v8
+"lvx v14,r4,r8\n" //get 8 RefPtr1 -- 4
+"vperm v13,v13,v3,v8\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-add r8,r8,r7
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"add r8,r8,r7\n"

-lvx v4,r4,r9 //need another 16 bytes for misaligned data -- 4
-vmrghb v13,v9,v13 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v4,r4,r9\n" //need another 16 bytes for misaligned data -- 4
+"vmrghb v13,v9,v13\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"

-lvx v15,r4,r8 //get 8 RefPtr1 -- 5
-vperm v14,v14,v4,v8
+"lvx v15,r4,r8\n" //get 8 RefPtr1 -- 5
+"vperm v14,v14,v4,v8\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-add r8,r8,r7
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"add r8,r8,r7\n"

-lvx v5,r4,r9 //need another 16 bytes for misaligned data -- 5
-vmrghb v14,v9,v14 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v5,r4,r9\n" //need another 16 bytes for misaligned data -- 5
+"vmrghb v14,v9,v14\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"

-lvx v16,r4,r8 //get 8 RefPtr1 -- 6
-vperm v15,v15,v5,v8
+"lvx v16,r4,r8\n" //get 8 RefPtr1 -- 6
+"vperm v15,v15,v5,v8\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-add r8,r8,r7
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"add r8,r8,r7\n"

-lvx v6,r4,r9 //need another 16 bytes for misaligned data -- 6
-vmrghb v15,v9,v15 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v6,r4,r9\n" //need another 16 bytes for misaligned data -- 6
+"vmrghb v15,v9,v15\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"

-lvx v17,r4,r8 //get 8 RefPtr1 -- 7
-vperm v16,v16,v6,v8
+"lvx v17,r4,r8\n" //get 8 RefPtr1 -- 7
+"vperm v16,v16,v6,v8\n"

-lvsl v8,r4,r8 //load alignment vector for RefPtr1
-add r8,r8,r7
+"lvsl v8,r4,r8\n" //load alignment vector for RefPtr1
+"add r8,r8,r7\n"

-lvx v7,r4,r9 //need another 16 bytes for misaligned data -- 7
-vmrghb v16,v9,v16 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v7,r4,r9\n" //need another 16 bytes for misaligned data -- 7
+"vmrghb v16,v9,v16\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"
 //--------
-vperm v17,v17,v7,v8
-xor r8,r8,r8
-li r9,16
+"vperm v17,v17,v7,v8\n"
+"xor r8,r8,r8\n"
+"li r9,16\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-vmrghb v17,v9,v17 //unsigned byte -> unsigned half
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"vmrghb v17,v9,v17\n" //unsigned byte -> unsigned half

-lvx v20,r5,r8 //get 8 RefPtr2 -- 0
-add r8,r8,r7
+"lvx v20,r5,r8\n" //get 8 RefPtr2 -- 0
+"add r8,r8,r7\n"

-lvx v0,r5,r9 //need another 16 bytes for misaligned data -- 0
-add r9,r9,r7
+"lvx v0,r5,r9\n" //need another 16 bytes for misaligned data -- 0
+"add r9,r9,r7\n"

-lvx v21,r5,r8 //get 8 RefPtr2 -- 1
-vperm v20,v20,v0,v18
+"lvx v21,r5,r8\n" //get 8 RefPtr2 -- 1
+"vperm v20,v20,v0,v18\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-add r8,r8,r7
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"add r8,r8,r7\n"

-lvx v1,r5,r9 //need another 16 bytes for misaligned data -- 1
-vmrghb v20,v9,v20 //unsigned byte -> unsigned half
-add r9,r9,r7
+"lvx v1,r5,r9\n" //need another 16 bytes for misaligned data -- 1
+"vmrghb v20,v9,v20\n" //unsigned byte -> unsigned half
+"add r9,r9,r7\n"

-lvx v22,r5,r8 //get 8 RefPtr2 -- 2
-vperm v21,v21,v1,v18
+"lvx v22,r5,r8\n" //get 8 RefPtr2 -- 2
+"vperm v21,v21,v1,v18\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-add r8,r8,r7
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"add r8,r8,r7\n"

-lvx v2,r5,r9 //need another 16 bytes for misaligned data -- 2
-vmrghb v21,v9,v21 //unsigned byte -> unsigned half
-vadduhm v10,v10,v20
-add r9,r9,r7
+"lvx v2,r5,r9\n" //need another 16 bytes for misaligned data -- 2
+"vmrghb v21,v9,v21\n" //unsigned byte -> unsigned half
+"vadduhm v10,v10,v20\n"
+"add r9,r9,r7\n"

-lvx v23,r5,r8 //get 8 RefPtr2 -- 3
-vperm v22,v22,v2,v18
+"lvx v23,r5,r8\n" //get 8 RefPtr2 -- 3
+"vperm v22,v22,v2,v18\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-add r8,r8,r7
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"add r8,r8,r7\n"

-lvx v3,r5,r9 //need another 16 bytes for misaligned data -- 3
-vmrghb v22,v9,v22 //unsigned byte -> unsigned half
-vadduhm v11,v11,v21
-add r9,r9,r7
+"lvx v3,r5,r9\n" //need another 16 bytes for misaligned data -- 3
+"vmrghb v22,v9,v22\n" //unsigned byte -> unsigned half
+"vadduhm v11,v11,v21\n"
+"add r9,r9,r7\n"

-lvx v24,r5,r8 //get 8 RefPtr2 -- 4
-vperm v23,v23,v3,v18
+"lvx v24,r5,r8\n" //get 8 RefPtr2 -- 4
+"vperm v23,v23,v3,v18\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-add r8,r8,r7
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"add r8,r8,r7\n"

-lvx v4,r5,r9 //need another 16 bytes for misaligned data -- 4
-vmrghb v23,v9,v23 //unsigned byte -> unsigned half
-vadduhm v12,v12,v22
-add r9,r9,r7
+"lvx v4,r5,r9\n" //need another 16 bytes for misaligned data -- 4
+"vmrghb v23,v9,v23\n" //unsigned byte -> unsigned half
+"vadduhm v12,v12,v22\n"
+"add r9,r9,r7\n"

-lvx v25,r5,r8 //get 8 RefPtr2 -- 5
-vperm v24,v24,v4,v18
+"lvx v25,r5,r8\n" //get 8 RefPtr2 -- 5
+"vperm v24,v24,v4,v18\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-add r8,r8,r7
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"add r8,r8,r7\n"

-lvx v5,r5,r9 //need another 16 bytes for misaligned data -- 5
-vmrghb v24,v9,v24 //unsigned byte -> unsigned half
-vadduhm v13,v13,v23
-add r9,r9,r7
+"lvx v5,r5,r9\n" //need another 16 bytes for misaligned data -- 5
+"vmrghb v24,v9,v24\n" //unsigned byte -> unsigned half
+"vadduhm v13,v13,v23\n"
+"add r9,r9,r7\n"

-lvx v26,r5,r8 //get 8 RefPtr2 -- 6
-vperm v25,v25,v5,v18
+"lvx v26,r5,r8\n" //get 8 RefPtr2 -- 6
+"vperm v25,v25,v5,v18\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-add r8,r8,r7
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"add r8,r8,r7\n"

-lvx v6,r5,r9 //need another 16 bytes for misaligned data -- 6
-vmrghb v25,v9,v25 //unsigned byte -> unsigned half
-vadduhm v14,v14,v24
-add r9,r9,r7
+"lvx v6,r5,r9\n" //need another 16 bytes for misaligned data -- 6
+"vmrghb v25,v9,v25\n" //unsigned byte -> unsigned half
+"vadduhm v14,v14,v24\n"
+"add r9,r9,r7\n"

-lvx v27,r5,r8 //get 8 RefPtr2 -- 7
-vperm v26,v26,v6,v18
+"lvx v27,r5,r8\n" //get 8 RefPtr2 -- 7
+"vperm v26,v26,v6,v18\n"

-lvsl v18,r5,r8 //load alignment vector for RefPtr2
-add r8,r8,r7
+"lvsl v18,r5,r8\n" //load alignment vector for RefPtr2
+"add r8,r8,r7\n"

-lvx v7,r5,r9 //need another 16 bytes for misaligned data -- 7
-vmrghb v26,v9,v26 //unsigned byte -> unsigned half
-vadduhm v15,v15,v25
-add r9,r9,r7
+"lvx v7,r5,r9\n" //need another 16 bytes for misaligned data -- 7
+"vmrghb v26,v9,v26\n" //unsigned byte -> unsigned half
+"vadduhm v15,v15,v25\n"
+"add r9,r9,r7\n"

-vperm v27,v27,v7,v18
-xor r8,r8,r8
+"vperm v27,v27,v7,v18\n"
+"xor r8,r8,r8\n"

-vmrghb v27,v9,v27 //unsigned byte -> unsigned half
-vadduhm v16,v16,v26
+"vmrghb v27,v9,v27\n" //unsigned byte -> unsigned half
+"vadduhm v16,v16,v26\n"

-vadduhm v17,v17,v27
-vspltish v8,1
+"vadduhm v17,v17,v27\n"
+"vspltish v8,1\n"
 //--------
-lvx v0,r6,r8 //get 8 shorts
-vsrh v10,v10,v8
-addi r8,r8,16
+"lvx v0,r6,r8\n" //get 8 shorts
+"vsrh v10,v10,v8\n"
+"addi r8,r8,16\n"

-lvx v1,r6,r8 //get 8 shorts
-vsrh v11,v11,v8
-addi r8,r8,16
+"lvx v1,r6,r8\n" //get 8 shorts
+"vsrh v11,v11,v8\n"
+"addi r8,r8,16\n"

-lvx v2,r6,r8 //get 8 shorts
-vsrh v12,v12,v8
-addi r8,r8,16
+"lvx v2,r6,r8\n" //get 8 shorts
+"vsrh v12,v12,v8\n"
+"addi r8,r8,16\n"

-lvx v3,r6,r8 //get 8 shorts
-vsrh v13,v13,v8
-addi r8,r8,16
+"lvx v3,r6,r8\n" //get 8 shorts
+"vsrh v13,v13,v8\n"
+"addi r8,r8,16\n"

-lvx v4,r6,r8 //get 8 shorts
-vsrh v14,v14,v8
-addi r8,r8,16
+"lvx v4,r6,r8\n" //get 8 shorts
+"vsrh v14,v14,v8\n"
+"addi r8,r8,16\n"

-lvx v5,r6,r8 //get 8 shorts
-vsrh v15,v15,v8
-addi r8,r8,16
+"lvx v5,r6,r8\n" //get 8 shorts
+"vsrh v15,v15,v8\n"
+"addi r8,r8,16\n"

-lvx v6,r6,r8 //get 8 shorts
-vsrh v16,v16,v8
-addi r8,r8,16
+"lvx v6,r6,r8\n" //get 8 shorts
+"vsrh v16,v16,v8\n"
+"addi r8,r8,16\n"

-lvx v7,r6,r8 //get 8 shorts
-vsrh v17,v17,v8
-xor r8,r8,r8
+"lvx v7,r6,r8\n" //get 8 shorts
+"vsrh v17,v17,v8\n"
+"xor r8,r8,r8\n"
 //--------
-lvsr v9,r3,r8 //load alignment vector for stores
-vaddshs v0,v0,v10
+"lvsr v9,r3,r8\n" //load alignment vector for stores
+"vaddshs v0,v0,v10\n"

-vaddshs v1,v1,v11
-vpkshus v0,v0,v0
+"vaddshs v1,v1,v11\n"
+"vpkshus v0,v0,v0\n"

-vaddshs v2,v2,v12
-vpkshus v1,v1,v1
+"vaddshs v2,v2,v12\n"
+"vpkshus v1,v1,v1\n"

-vaddshs v3,v3,v13
-vpkshus v2,v2,v2
+"vaddshs v3,v3,v13\n"
+"vpkshus v2,v2,v2\n"

-vaddshs v4,v4,v14
-vpkshus v3,v3,v3
+"vaddshs v4,v4,v14\n"
+"vpkshus v3,v3,v3\n"

-vaddshs v5,v5,v15
-vpkshus v4,v4,v4
+"vaddshs v5,v5,v15\n"
+"vpkshus v4,v4,v4\n"

-vaddshs v6,v6,v16
-vpkshus v5,v5,v5
+"vaddshs v6,v6,v16\n"
+"vpkshus v5,v5,v5\n"

-vaddshs v7,v7,v17
-vpkshus v6,v6,v6
+"vaddshs v7,v7,v17\n"
+"vpkshus v6,v6,v6\n"

-vpkshus v7,v7,v7
+"vpkshus v7,v7,v7\n"

-li r9,4
-vperm v0,v0,v0,v9 //adjust for writes
+"li r9,4\n"
+"vperm v0,v0,v0,v9\n" //adjust for writes

-stvewx v0,r3,r8
-add r8,r8,r7
+"stvewx v0,r3,r8\n"
+"add r8,r8,r7\n"

-lvsr v9,r3,r8 //load alignment vector for stores
+"lvsr v9,r3,r8\n" //load alignment vector for stores

-stvewx v0,r3,r9
-add r9,r9,r7
-vperm v1,v1,v1,v9
+"stvewx v0,r3,r9\n"
+"add r9,r9,r7\n"
+"vperm v1,v1,v1,v9\n"

-stvewx v1,r3,r8
-add r8,r8,r7
+"stvewx v1,r3,r8\n"
+"add r8,r8,r7\n"

-lvsr v9,r3,r8 //load alignment vector for stores
+"lvsr v9,r3,r8\n" //load alignment vector for stores

-stvewx v1,r3,r9
-add r9,r9,r7
-vperm v2,v2,v2,v9
+"stvewx v1,r3,r9\n"
+"add r9,r9,r7\n"
+"vperm v2,v2,v2,v9\n"

-stvewx v2,r3,r8
-add r8,r8,r7
+"stvewx v2,r3,r8\n"
+"add r8,r8,r7\n"

-lvsr v9,r3,r8 //load alignment vector for stores
+"lvsr v9,r3,r8\n" //load alignment vector for stores

-stvewx v2,r3,r9
-add r9,r9,r7
-vperm v3,v3,v3,v9
+"stvewx v2,r3,r9\n"
+"add r9,r9,r7\n"
+"vperm v3,v3,v3,v9\n"

-stvewx v3,r3,r8
-add r8,r8,r7
+"stvewx v3,r3,r8\n"
+"add r8,r8,r7\n"

-lvsr v9,r3,r8 //load alignment vector for stores
+"lvsr v9,r3,r8\n" //load alignment vector for stores

-stvewx v3,r3,r9
-add r9,r9,r7
-vperm v4,v4,v4,v9
+"stvewx v3,r3,r9\n"
+"add r9,r9,r7\n"
+"vperm v4,v4,v4,v9\n"

-stvewx v4,r3,r8
-add r8,r8,r7
+"stvewx v4,r3,r8\n"
+"add r8,r8,r7\n"

-lvsr v9,r3,r8 //load alignment vector for stores
+"lvsr v9,r3,r8\n" //load alignment vector for stores

-stvewx v4,r3,r9
-add r9,r9,r7
-vperm v5,v5,v5,v9
+"stvewx v4,r3,r9\n"
+"add r9,r9,r7\n"
+"vperm v5,v5,v5,v9\n"

-stvewx v5,r3,r8
-add r8,r8,r7
+"stvewx v5,r3,r8\n"
+"add r8,r8,r7\n"

-lvsr v9,r3,r8 //load alignment vector for stores
+"lvsr v9,r3,r8\n" //load alignment vector for stores

-stvewx v5,r3,r9
-add r9,r9,r7
-vperm v6,v6,v6,v9
+"stvewx v5,r3,r9\n"
+"add r9,r9,r7\n"
+"vperm v6,v6,v6,v9\n"

-stvewx v6,r3,r8
-add r8,r8,r7
+"stvewx v6,r3,r8\n"
+"add r8,r8,r7\n"

-lvsr v9,r3,r8 //load alignment vector for stores
+"lvsr v9,r3,r8\n" //load alignment vector for stores

-stvewx v6,r3,r9
-add r9,r9,r7
-vperm v7,v7,v7,v9
+"stvewx v6,r3,r9\n"
+"add r9,r9,r7\n"
+"vperm v7,v7,v7,v9\n"

-stvewx v7,r3,r8
+"stvewx v7,r3,r8\n"

-stvewx v7,r3,r9
-}
+"stvewx v7,r3,r9\n"
+);
 }

 OIL_DEFINE_IMPL_FULL (recon8x8_inter2_altivec, recon8x8_inter2, OIL_IMPL_FLAG_ALTIVEC);
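Arithmetically, this third hunk differs from recon8x8_inter_altivec only in averaging two reference rows before adding the residual: vmrghb against a zero vector widens each row to unsigned halfwords, vadduhm plus vsrh computes the truncating average (r1 + r2) >> 1, vaddshs adds the signed change with saturation, and vpkshus packs back to unsigned bytes. A hedged intrinsic restatement of that per-row arithmetic (function and variable names are illustrative, not from the patch):

    #include <altivec.h>

    /* One row of recon8x8_inter2: average two refs, add residual, pack. */
    static vector unsigned char
    reconstruct_row (vector unsigned char ref1, vector unsigned char ref2,
                     vector signed short change)
    {
      vector unsigned char zero = vec_splat_u8 (0);
      vector unsigned short one = vec_splat_u16 (1);
      /* vmrghb with zero: unsigned byte -> unsigned half */
      vector unsigned short r1 = (vector unsigned short) vec_mergeh (zero, ref1);
      vector unsigned short r2 = (vector unsigned short) vec_mergeh (zero, ref2);
      vector unsigned short avg = vec_sr (vec_add (r1, r2), one); /* (r1+r2)>>1 */
      vector signed short sum = vec_adds ((vector signed short) avg, change);
      return vec_packsu (sum, sum); /* saturate to [0,255], pack to bytes */
    }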