Lines 1-90
-; libFLAC - Free Lossless Audio Codec library
-; Copyright (C) 2004 Josh Coalson
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions
-; are met:
-;
-; - Redistributions of source code must retain the above copyright
-; notice, this list of conditions and the following disclaimer.
-;
-; - Redistributions in binary form must reproduce the above copyright
-; notice, this list of conditions and the following disclaimer in the
-; documentation and/or other materials provided with the distribution.
-;
-; - Neither the name of the Xiph.org Foundation nor the names of its
-; contributors may be used to endorse or promote products derived from
-; this software without specific prior written permission.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
-; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
+/* libFLAC - Free Lossless Audio Codec library
+ * Copyright (C) 2004 Josh Coalson
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the Xiph.org Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 .text
 .align 2
-.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16
-.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
+.globl FLAC__lpc_restore_signal_asm_ppc_altivec_16
+.globl FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8
 
-_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
-; r3: residual[]
-; r4: data_len
-; r5: qlp_coeff[]
-; r6: order
-; r7: lp_quantization
-; r8: data[]
-
-; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
-; these is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
-; bps<=15 for mid-side coding, since that uses an extra bit)
-
-; these should be fast; the inner loop is unrolled (it takes no more than
-; 3*(order%4) instructions, all of which are arithmetic), and all of the
-; coefficients and all relevant history stay in registers, so the outer loop
-; has only one load from memory (the residual)
-
-; I have not yet run this through simg4, so there may be some avoidable stalls,
-; and there may be a somewhat more clever way to do the outer loop
-
-; the branch mechanism may prevent dynamic loading; I still need to examine
-; this issue, and there may be a more elegant method
-
+FLAC__lpc_restore_signal_asm_ppc_altivec_16:
+/* r3: residual[]
+ * r4: data_len
+ * r5: qlp_coeff[]
+ * r6: order
+ * r7: lp_quantization
+ * r8: data[]
+ *
+ * see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
+ * this is a PowerPC/Altivec assembly version which requires bps<=16 (or actual
+ * bps<=15 for mid-side coding, since that uses an extra bit)
+ *
+ * this should be fast; the inner loop is unrolled (it takes no more than
+ * 3*(order%4) instructions, all of which are arithmetic), and all of the
+ * coefficients and all relevant history stay in registers, so the outer loop
+ * has only one load from memory (the residual)
+ *
+ * I have not yet run this through simg4, so there may be some avoidable stalls,
+ * and there may be a somewhat more clever way to do the outer loop
+ *
+ * the branch mechanism may prevent dynamic loading; I still need to examine
+ * this issue, and there may be a more elegant method
+ */
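
For reference, here is a scalar C sketch of what this routine computes, paraphrasing the src/libFLAC/lpc.c:FLAC__lpc_restore_signal() referenced above (names and types are simplified from libFLAC's exact declarations):

    #include <stdint.h>

    /* data[] must be preceded by at least `order` valid warm-up samples,
     * i.e. data[-order..-1] are readable history */
    void lpc_restore_signal_16(const int32_t residual[], unsigned data_len,
                               const int32_t qlp_coeff[], unsigned order,
                               int lp_quantization, int32_t data[])
    {
        for (unsigned i = 0; i < data_len; i++) {
            int32_t sum = 0;
            const int32_t *history = data;  /* walks backward over prior samples */
            for (unsigned j = 0; j < order; j++)
                sum += qlp_coeff[j] * *(--history);
            *data++ = *residual++ + (sum >> lp_quantization);
        }
    }

With bps<=16, every product and the running sum fit in 32 bits, which is what lets the vector code below build the sum out of 16x16->32 vmulosh multiplies.
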
 stmw r31,-4(r1)
 
 addi r9,r1,-28
 li r31,0xf
-andc r9,r9,r31 ; for quadword-aligned stack data
+andc r9,r9,r31 /* for quadword-aligned stack data */
 
-slwi r6,r6,2 ; adjust for word size
+slwi r6,r6,2 /* adjust for word size */
 slwi r4,r4,2
-add r4,r4,r8 ; r4 = data+data_len
+add r4,r4,r8 /* r4 = data+data_len */
 
-mfspr r0,256 ; cache old vrsave
-addis r31,0,hi16(0xfffffc00)
-ori r31,r31,lo16(0xfffffc00)
-mtspr 256,r31 ; declare VRs in vrsave
+mfspr r0,256 /* cache old vrsave */
+addis r31,0,0xfffffc00@h
+ori r31,r31,0xfffffc00@l
+mtspr 256,r31 /* declare VRs in vrsave */
 
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 4,0,L1400
 
-; load coefficients into v0-v7 and initial history into v8-v15
+/* load coefficients into v0-v7 and initial history into v8-v15 */
 li r31,0xf
-and r31,r8,r31 ; r31: data%4
+and r31,r8,r31 /* r31: data%4 */
 li r11,16
-subf r31,r31,r11 ; r31: 4-(data%4)
-slwi r31,r31,3 ; convert to bits for vsro
+subf r31,r31,r11 /* r31: 4-(data%4) */
+slwi r31,r31,3 /* convert to bits for vsro */
 li r10,-4
 stw r31,-4(r9)
 lvewx v0,r10,r9
 vspltisb v18,-1
-vsro v18,v18,v0 ; v18: mask vector
+vsro v18,v18,v0 /* v18: mask vector */
 
 li r31,0x8
 lvsl v0,0,r31
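
(Note on the relocation operators above: Darwin's hi16()/lo16() correspond to ELF's @h/@l. ELF also defines @ha, the high half adjusted for the sign extension that addi and the D-form loads/stores apply to their 16-bit operand; since this code merges the halves with ori, which does not sign-extend, the plain @h form is the correct partner. With @ha, the vrsave constant would assemble as 0xfffffc00@ha = 0, and the ori would then produce 0x0000fc00 rather than 0xfffffc00.)
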
Lines 94-107
 vspltisb v2,0
 vspltisb v3,-1
 vmrglw v2,v2,v3
-vsel v0,v1,v0,v2 ; v0: reversal permutation vector
+vsel v0,v1,v0,v2 /* v0: reversal permutation vector */
 
 add r10,r5,r6
-lvsl v17,0,r5 ; v17: coefficient alignment permutation vector
-vperm v17,v17,v17,v0 ; v17: reversal coefficient alignment permutation vector
+lvsl v17,0,r5 /* v17: coefficient alignment permutation vector */
+vperm v17,v17,v17,v0 /* v17: reversal coefficient alignment permutation vector */
 
 mr r11,r8
-lvsl v16,0,r11 ; v16: history alignment permutation vector
+lvsl v16,0,r11 /* v16: history alignment permutation vector */
 
 lvx v0,0,r5
 addi r5,r5,16
Lines 114-121
 cmplw cr0,r5,r10
 bc 12,0,L1101
 vand v0,v0,v18
-addis r31,0,hi16(L1307)
-ori r31,r31,lo16(L1307)
+addis r31,0,L1307@h
+ori r31,r31,L1307@l
 b L1199
 
 L1101:
Lines 128-135
 cmplw cr0,r5,r10
 bc 12,0,L1102
 vand v1,v1,v18
-addis r31,0,hi16(L1306)
-ori r31,r31,lo16(L1306)
+addis r31,0,L1306@h
+ori r31,r31,L1306@l
 b L1199
 
 L1102:
Lines 142-149
 cmplw cr0,r5,r10
 bc 12,0,L1103
 vand v2,v2,v18
-addis r31,0,hi16(L1305)
-ori r31,r31,lo16(L1305)
+addis r31,0,L1305@h
+ori r31,r31,L1305@l
 b L1199
 
 L1103:
Lines 156-163
 cmplw cr0,r5,r10
 bc 12,0,L1104
 vand v3,v3,v18
-addis r31,0,hi16(L1304)
-ori r31,r31,lo16(L1304)
+addis r31,0,L1304@h
+ori r31,r31,L1304@l
 b L1199
 
 L1104:
Lines 170-177
 cmplw cr0,r5,r10
 bc 12,0,L1105
 vand v4,v4,v18
-addis r31,0,hi16(L1303)
-ori r31,r31,lo16(L1303)
+addis r31,0,L1303@h
+ori r31,r31,L1303@l
 b L1199
 
 L1105:
Lines 184-191
 cmplw cr0,r5,r10
 bc 12,0,L1106
 vand v5,v5,v18
-addis r31,0,hi16(L1302)
-ori r31,r31,lo16(L1302)
+addis r31,0,L1302@h
+ori r31,r31,L1302@l
 b L1199
 
 L1106:
Lines 198-205
 cmplw cr0,r5,r10
 bc 12,0,L1107
 vand v6,v6,v18
-addis r31,0,hi16(L1301)
-ori r31,r31,lo16(L1301)
+addis r31,0,L1301@h
+ori r31,r31,L1301@l
 b L1199
 
 L1107:
Lines 210-239
 lvx v19,0,r11
 vperm v15,v19,v15,v16
 vand v7,v7,v18
-addis r31,0,hi16(L1300)
-ori r31,r31,lo16(L1300)
+addis r31,0,L1300@h
+ori r31,r31,L1300@l
 
 L1199:
 mtctr r31
 
-; set up invariant vectors
-vspltish v16,0 ; v16: zero vector
+/* set up invariant vectors */
+vspltish v16,0 /* v16: zero vector */
 
 li r10,-12
-lvsr v17,r10,r8 ; v17: result shift vector
-lvsl v18,r10,r3 ; v18: residual shift back vector
+lvsr v17,r10,r8 /* v17: result shift vector */
+lvsl v18,r10,r3 /* v18: residual shift back vector */
 
 li r10,-4
 stw r7,-4(r9)
-lvewx v19,r10,r9 ; v19: lp_quantization vector
+lvewx v19,r10,r9 /* v19: lp_quantization vector */
 
 L1200:
-vmulosh v20,v0,v8 ; v20: sum vector
+vmulosh v20,v0,v8 /* v20: sum vector */
 bcctr 20,0
 
 L1300:
 vmulosh v21,v7,v15
-vsldoi v15,v15,v14,4 ; increment history
+vsldoi v15,v15,v14,4 /* increment history */
 vaddsws v20,v20,v21
 
 L1301:
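
The mtctr/bcctr pair above is the "branch mechanism" mentioned in the header comment: one of the labels L1300-L1307 is chosen up front according to the order, and every outer-loop iteration jumps into the unrolled multiply-accumulate chain at that depth, much like a Duff's device. A hypothetical scalar analog (illustrative only; the real code keeps coefficients and history in vector registers):

    #include <stdint.h>

    /* sketch for order <= 8; history points just past the newest sample */
    static int32_t lpc_sum_order_le8(const int32_t *qlp_coeff,
                                     const int32_t *history, unsigned order)
    {
        int32_t sum = 0;
        switch (order) {             /* enter the chain at the right depth */
        case 8: sum += qlp_coeff[7] * history[-8]; /* fall through */
        case 7: sum += qlp_coeff[6] * history[-7]; /* fall through */
        case 6: sum += qlp_coeff[5] * history[-6]; /* fall through */
        case 5: sum += qlp_coeff[4] * history[-5]; /* fall through */
        case 4: sum += qlp_coeff[3] * history[-4]; /* fall through */
        case 3: sum += qlp_coeff[2] * history[-3]; /* fall through */
        case 2: sum += qlp_coeff[1] * history[-2]; /* fall through */
        case 1: sum += qlp_coeff[0] * history[-1]; /* fall through */
        default: break;
        }
        return sum;
    }
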
Lines 267-339
 vaddsws v20,v20,v21
 
 L1307:
-vsumsws v20,v20,v16 ; v20[3]: sum
-vsraw v20,v20,v19 ; v20[3]: sum >> lp_quantization
+vsumsws v20,v20,v16 /* v20[3]: sum */
+vsraw v20,v20,v19 /* v20[3]: sum >> lp_quantization */
 
-lvewx v21,0,r3 ; v21[n]: *residual
-vperm v21,v21,v21,v18 ; v21[3]: *residual
-vaddsws v20,v21,v20 ; v20[3]: *residual + (sum >> lp_quantization)
-vsldoi v18,v18,v18,4 ; increment shift vector
+lvewx v21,0,r3 /* v21[n]: *residual */
+vperm v21,v21,v21,v18 /* v21[3]: *residual */
+vaddsws v20,v21,v20 /* v20[3]: *residual + (sum >> lp_quantization) */
+vsldoi v18,v18,v18,4 /* increment shift vector */
 
-vperm v21,v20,v20,v17 ; v21[n]: shift for storage
-vsldoi v17,v17,v17,12 ; increment shift vector
+vperm v21,v20,v20,v17 /* v21[n]: shift for storage */
+vsldoi v17,v17,v17,12 /* increment shift vector */
 stvewx v21,0,r8
 
 vsldoi v20,v20,v20,12
-vsldoi v8,v8,v20,4 ; insert value onto history
+vsldoi v8,v8,v20,4 /* insert value onto history */
 
 addi r3,r3,4
 addi r8,r8,4
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 12,0,L1200
 
 L1400:
-mtspr 256,r0 ; restore old vrsave
+mtspr 256,r0 /* restore old vrsave */
 lmw r31,-4(r1)
 blr
 
-_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
-; r3: residual[]
-; r4: data_len
-; r5: qlp_coeff[]
-; r6: order
-; r7: lp_quantization
-; r8: data[]
-
-; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
-; this version assumes order<=8; it uses fewer vector registers, which should
-; save time in context switches, and has less code, which may improve
-; instruction caching
-
+FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
+/* r3: residual[]
+ * r4: data_len
+ * r5: qlp_coeff[]
+ * r6: order
+ * r7: lp_quantization
+ * r8: data[]
+ *
+ * see FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
+ * this version assumes order<=8; it uses fewer vector registers, which should
+ * save time in context switches, and has less code, which may improve
+ * instruction caching
+ */
 stmw r31,-4(r1)
 
 addi r9,r1,-28
 li r31,0xf
-andc r9,r9,r31 ; for quadword-aligned stack data
+andc r9,r9,r31 /* for quadword-aligned stack data */
 
-slwi r6,r6,2 ; adjust for word size
+slwi r6,r6,2 /* adjust for word size */
 slwi r4,r4,2
-add r4,r4,r8 ; r4 = data+data_len
+add r4,r4,r8 /* r4 = data+data_len */
 
-mfspr r0,256 ; cache old vrsave
-addis r31,0,hi16(0xffc00000)
-ori r31,r31,lo16(0xffc00000)
-mtspr 256,r31 ; declare VRs in vrsave
+mfspr r0,256 /* cache old vrsave */
+addis r31,0,0xffc00000@h
+ori r31,r31,0xffc00000@l
+mtspr 256,r31 /* declare VRs in vrsave */
 
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 4,0,L2400
 
-; load coefficients into v0-v1 and initial history into v2-v3
+/* load coefficients into v0-v1 and initial history into v2-v3 */
 li r31,0xf
-and r31,r8,r31 ; r31: data%4
+and r31,r8,r31 /* r31: data%4 */
 li r11,16
-subf r31,r31,r11 ; r31: 4-(data%4)
-slwi r31,r31,3 ; convert to bits for vsro
+subf r31,r31,r11 /* r31: 4-(data%4) */
+slwi r31,r31,3 /* convert to bits for vsro */
 li r10,-4
 stw r31,-4(r9)
 lvewx v0,r10,r9
 vspltisb v6,-1
-vsro v6,v6,v0 ; v6: mask vector
+vsro v6,v6,v0 /* v6: mask vector */
 
 li r31,0x8
 lvsl v0,0,r31
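
For context, a hypothetical C call site showing how a caller would split work between the two entry points (libFLAC's decoder makes a similar order-based choice; this dispatch helper is illustrative, not libFLAC's actual code):

    #include <stdint.h>

    extern void FLAC__lpc_restore_signal_asm_ppc_altivec_16(
        const int32_t residual[], unsigned data_len, const int32_t qlp_coeff[],
        unsigned order, int lp_quantization, int32_t data[]);
    extern void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(
        const int32_t residual[], unsigned data_len, const int32_t qlp_coeff[],
        unsigned order, int lp_quantization, int32_t data[]);

    static void restore_signal_16(const int32_t residual[], unsigned data_len,
                                  const int32_t qlp_coeff[], unsigned order,
                                  int lp_quantization, int32_t data[])
    {
        if (order <= 8)  /* smaller variant: fewer live VRs, less code */
            FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(
                residual, data_len, qlp_coeff, order, lp_quantization, data);
        else
            FLAC__lpc_restore_signal_asm_ppc_altivec_16(
                residual, data_len, qlp_coeff, order, lp_quantization, data);
    }
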
Lines 343-356
 vspltisb v2,0
 vspltisb v3,-1
 vmrglw v2,v2,v3
-vsel v0,v1,v0,v2 ; v0: reversal permutation vector
+vsel v0,v1,v0,v2 /* v0: reversal permutation vector */
 
 add r10,r5,r6
-lvsl v5,0,r5 ; v5: coefficient alignment permutation vector
-vperm v5,v5,v5,v0 ; v5: reversal coefficient alignment permutation vector
+lvsl v5,0,r5 /* v5: coefficient alignment permutation vector */
+vperm v5,v5,v5,v0 /* v5: reversal coefficient alignment permutation vector */
 
 mr r11,r8
-lvsl v4,0,r11 ; v4: history alignment permutation vector
+lvsl v4,0,r11 /* v4: history alignment permutation vector */
 
 lvx v0,0,r5
 addi r5,r5,16
Lines 363-370
 cmplw cr0,r5,r10
 bc 12,0,L2101
 vand v0,v0,v6
-addis r31,0,hi16(L2301)
-ori r31,r31,lo16(L2301)
+addis r31,0,L2301@h
+ori r31,r31,L2301@l
 b L2199
 
 L2101:
Lines 375-399
 lvx v7,0,r11
 vperm v3,v7,v3,v4
 vand v1,v1,v6
-addis r31,0,hi16(L2300)
-ori r31,r31,lo16(L2300)
+addis r31,0,L2300@h
+ori r31,r31,L2300@l
 
 L2199:
 mtctr r31
 
-; set up invariant vectors
-vspltish v4,0 ; v4: zero vector
+/* set up invariant vectors */
+vspltish v4,0 /* v4: zero vector */
 
 li r10,-12
-lvsr v5,r10,r8 ; v5: result shift vector
-lvsl v6,r10,r3 ; v6: residual shift back vector
+lvsr v5,r10,r8 /* v5: result shift vector */
+lvsl v6,r10,r3 /* v6: residual shift back vector */
 
 li r10,-4
 stw r7,-4(r9)
-lvewx v7,r10,r9 ; v7: lp_quantization vector
+lvewx v7,r10,r9 /* v7: lp_quantization vector */
 
 L2200:
-vmulosh v8,v0,v2 ; v8: sum vector
+vmulosh v8,v0,v2 /* v8: sum vector */
 bcctr 20,0
 
 L2300:
Lines 402-428
 vaddsws v8,v8,v9
 
 L2301:
-vsumsws v8,v8,v4 ; v8[3]: sum
-vsraw v8,v8,v7 ; v8[3]: sum >> lp_quantization
+vsumsws v8,v8,v4 /* v8[3]: sum */
+vsraw v8,v8,v7 /* v8[3]: sum >> lp_quantization */
 
-lvewx v9,0,r3 ; v9[n]: *residual
-vperm v9,v9,v9,v6 ; v9[3]: *residual
-vaddsws v8,v9,v8 ; v8[3]: *residual + (sum >> lp_quantization)
-vsldoi v6,v6,v6,4 ; increment shift vector
+lvewx v9,0,r3 /* v9[n]: *residual */
+vperm v9,v9,v9,v6 /* v9[3]: *residual */
+vaddsws v8,v9,v8 /* v8[3]: *residual + (sum >> lp_quantization) */
+vsldoi v6,v6,v6,4 /* increment shift vector */
 
-vperm v9,v8,v8,v5 ; v9[n]: shift for storage
-vsldoi v5,v5,v5,12 ; increment shift vector
+vperm v9,v8,v8,v5 /* v9[n]: shift for storage */
+vsldoi v5,v5,v5,12 /* increment shift vector */
 stvewx v9,0,r8
 
 vsldoi v8,v8,v8,12
-vsldoi v2,v2,v8,4 ; insert value onto history
+vsldoi v2,v2,v8,4 /* insert value onto history */
 
 addi r3,r3,4
 addi r8,r8,4
-cmplw cr0,r8,r4 ; i<data_len
+cmplw cr0,r8,r4 /* i<data_len */
 bc 12,0,L2200
 
 L2400:
-mtspr 256,r0 ; restore old vrsave
+mtspr 256,r0 /* restore old vrsave */
 lmw r31,-4(r1)
 blr