Lines 39-44
Link Here
|
39 |
#define rounder(bias) {round (bias), round (bias)} |
39 |
#define rounder(bias) {round (bias), round (bias)} |
40 |
#define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)} |
40 |
#define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)} |
41 |
|
41 |
|
|
|
42 |
struct x4 {int32_t x[4];}; |
43 |
struct x16_8 {int16_t x[8];}; |
44 |
struct x16_4 {int16_t x[4];}; |
42 |
|
45 |
|
43 |
#if 0 |
46 |
#if 0 |
44 |
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */ |
47 |
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */ |
Lines 81-94
Link Here
|
81 |
|
84 |
|
82 |
|
85 |
|
83 |
/* SSE2 row IDCT */ |
86 |
/* SSE2 row IDCT */ |
84 |
#define sse2_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ |
87 |
#define sse2_table(c1,c2,c3,c4,c5,c6,c7) {{{ c4, c2, c4, c6, \ |
85 |
c4, -c6, c4, -c2, \ |
88 |
c4, -c6, c4, -c2}}, \ |
86 |
c4, c6, -c4, -c2, \ |
89 |
{{ c4, c6, -c4, -c2, \ |
87 |
-c4, c2, c4, -c6, \ |
90 |
-c4, c2, c4, -c6}}, \ |
88 |
c1, c3, c3, -c7, \ |
91 |
{{ c1, c3, c3, -c7, \ |
89 |
c5, -c1, c7, -c5, \ |
92 |
c5, -c1, c7, -c5}}, \ |
90 |
c5, c7, -c1, -c5, \ |
93 |
{{ c5, c7, -c1, -c5, \ |
91 |
c7, c3, c3, -c1 } |
94 |
c7, c3, c3, -c1}} } |
92 |
|
95 |
|
93 |
#define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do { \ |
96 |
#define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do { \ |
94 |
/* no scheduling: trust in out of order execution */ \ |
97 |
/* no scheduling: trust in out of order execution */ \ |
Lines 99-125
Link Here
|
99 |
pshufd_r2r (row1, xmm1, 0); /* 1: xmm1= x2 x0 x2 x0 x2 x0 x2 x0 */ \ |
102 |
pshufd_r2r (row1, xmm1, 0); /* 1: xmm1= x2 x0 x2 x0 x2 x0 x2 x0 */ \ |
100 |
pmaddwd_m2r (table[0], xmm1); /* 1: xmm1= x2*C + x0*C ... */ \ |
103 |
pmaddwd_m2r (table[0], xmm1); /* 1: xmm1= x2*C + x0*C ... */ \ |
101 |
pshufd_r2r (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1 x3 x1 x3 x1 */ \ |
104 |
pshufd_r2r (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1 x3 x1 x3 x1 */ \ |
102 |
pmaddwd_m2r (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ... */ \ |
105 |
pmaddwd_m2r (table[2], xmm3); /* 1: xmm3= x3*C + x1*C ... */ \ |
103 |
pshufd_r2r (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4 x6 x4 x6 x4 */ \ |
106 |
pshufd_r2r (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4 x6 x4 x6 x4 */ \ |
104 |
pshufd_r2r (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5 x7 x5 x7 x5 */ \ |
107 |
pshufd_r2r (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5 x7 x5 x7 x5 */ \ |
105 |
pmaddwd_m2r (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ... */ \ |
108 |
pmaddwd_m2r (table[1], xmm2); /* 1: xmm2= x6*C + x4*C ... */ \ |
106 |
paddd_m2r (round1, xmm1); /* 1: xmm1= x2*C + x0*C + round ... */ \ |
109 |
paddd_m2r (round1, xmm1); /* 1: xmm1= x2*C + x0*C + round ... */ \ |
107 |
pmaddwd_m2r (table[3*8], row1); /* 1: row1= x7*C + x5*C ... */ \ |
110 |
pmaddwd_m2r (table[3], row1); /* 1: row1= x7*C + x5*C ... */ \ |
108 |
pshufd_r2r (row2, xmm5, 0); /* 2: */ \ |
111 |
pshufd_r2r (row2, xmm5, 0); /* 2: */ \ |
109 |
pshufd_r2r (row2, xmm6, 0x55); /* 2: */ \ |
112 |
pshufd_r2r (row2, xmm6, 0x55); /* 2: */ \ |
110 |
pmaddwd_m2r (table[0], xmm5); /* 2: */ \ |
113 |
pmaddwd_m2r (table[0], xmm5); /* 2: */ \ |
111 |
paddd_r2r (xmm2, xmm1); /* 1: xmm1= a[] */ \ |
114 |
paddd_r2r (xmm2, xmm1); /* 1: xmm1= a[] */ \ |
112 |
movdqa_r2r (xmm1, xmm2); /* 1: xmm2= a[] */ \ |
115 |
movdqa_r2r (xmm1, xmm2); /* 1: xmm2= a[] */ \ |
113 |
pshufd_r2r (row2, xmm7, 0xaa); /* 2: */ \ |
116 |
pshufd_r2r (row2, xmm7, 0xaa); /* 2: */ \ |
114 |
pmaddwd_m2r (table[1*8], xmm6); /* 2: */ \ |
117 |
pmaddwd_m2r (table[1], xmm6); /* 2: */ \ |
115 |
paddd_r2r (xmm3, row1); /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \ |
118 |
paddd_r2r (xmm3, row1); /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \ |
116 |
pshufd_r2r (row2, row2, 0xff); /* 2: */ \ |
119 |
pshufd_r2r (row2, row2, 0xff); /* 2: */ \ |
117 |
psubd_r2r (row1, xmm2); /* 1: xmm2= a[] - b[] */ \ |
120 |
psubd_r2r (row1, xmm2); /* 1: xmm2= a[] - b[] */ \ |
118 |
pmaddwd_m2r (table[2*8], xmm7); /* 2: */ \ |
121 |
pmaddwd_m2r (table[2], xmm7); /* 2: */ \ |
119 |
paddd_r2r (xmm1, row1); /* 1: row1= a[] + b[] */ \ |
122 |
paddd_r2r (xmm1, row1); /* 1: row1= a[] + b[] */ \ |
120 |
psrad_i2r (ROW_SHIFT, xmm2); /* 1: xmm2= result 4...7 */ \ |
123 |
psrad_i2r (ROW_SHIFT, xmm2); /* 1: xmm2= result 4...7 */ \ |
121 |
paddd_m2r (round2, xmm5); /* 2: */ \ |
124 |
paddd_m2r (round2, xmm5); /* 2: */ \ |
122 |
pmaddwd_m2r (table[3*8], row2); /* 2: */ \ |
125 |
pmaddwd_m2r (table[3], row2); /* 2: */ \ |
123 |
paddd_r2r (xmm6, xmm5); /* 2: */ \ |
126 |
paddd_r2r (xmm6, xmm5); /* 2: */ \ |
124 |
movdqa_r2r (xmm5, xmm6); /* 2: */ \ |
127 |
movdqa_r2r (xmm5, xmm6); /* 2: */ \ |
125 |
psrad_i2r (ROW_SHIFT, row1); /* 1: row1= result 0...4 */ \ |
128 |
psrad_i2r (ROW_SHIFT, row1); /* 1: row1= result 0...4 */ \ |
Lines 137-190
Link Here
|
137 |
|
140 |
|
138 |
/* MMXEXT row IDCT */ |
141 |
/* MMXEXT row IDCT */ |
139 |
|
142 |
|
140 |
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ |
143 |
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) {{{ c4, c2, -c4, -c2}}, \ |
141 |
c4, c6, c4, c6, \ |
144 |
{{ c4, c6, c4, c6}}, \ |
142 |
c1, c3, -c1, -c5, \ |
145 |
{{ c1, c3, -c1, -c5}}, \ |
143 |
c5, c7, c3, -c7, \ |
146 |
{{ c5, c7, c3, -c7}}, \ |
144 |
c4, -c6, c4, -c6, \ |
147 |
{{ c4, -c6, c4, -c6}}, \ |
145 |
-c4, c2, c4, -c2, \ |
148 |
{{-c4, c2, c4, -c2}}, \ |
146 |
c5, -c1, c3, -c1, \ |
149 |
{{ c5, -c1, c3, -c1}}, \ |
147 |
c7, c3, c7, -c5 } |
150 |
{{ c7, c3, c7, -c5}}} |
148 |
|
151 |
|
149 |
static inline void mmxext_row_head (int16_t * const row, const int offset, |
152 |
static inline void mmxext_row_head (int16_t * const row, const int offset, |
150 |
const int16_t * const table) |
153 |
const struct x16_4 * const table) |
151 |
{ |
154 |
{ |
152 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
155 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
153 |
|
156 |
|
154 |
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
157 |
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
155 |
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ |
158 |
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ |
156 |
|
159 |
|
157 |
movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
160 |
movq_m2r (table[0], mm3); /* mm3 = -C2 -C4 C2 C4 */ |
158 |
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ |
161 |
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ |
159 |
|
162 |
|
160 |
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
163 |
movq_m2r (table[1], mm4); /* mm4 = C6 C4 C6 C4 */ |
161 |
pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
164 |
pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
162 |
|
165 |
|
163 |
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
166 |
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
164 |
} |
167 |
} |
165 |
|
168 |
|
166 |
static inline void mmxext_row (const int16_t * const table, |
169 |
static inline void mmxext_row (const struct x16_4 * const table, |
167 |
const int32_t * const rounder) |
170 |
const struct x4 * const rounder) |
168 |
{ |
171 |
{ |
169 |
movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ |
172 |
movq_m2r (table[2], mm1); /* mm1 = -C5 -C1 C3 C1 */ |
170 |
pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ |
173 |
pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ |
171 |
|
174 |
|
172 |
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
175 |
pmaddwd_m2r (table[4], mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ |
173 |
pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ |
176 |
pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ |
174 |
|
177 |
|
175 |
movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ |
178 |
movq_m2r (table[3], mm7); /* mm7 = -C7 C3 C7 C5 */ |
176 |
pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ |
179 |
pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ |
177 |
|
180 |
|
178 |
paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
181 |
paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
179 |
pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ |
182 |
pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ |
180 |
|
183 |
|
181 |
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ |
184 |
pmaddwd_m2r (table[5], mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ |
182 |
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ |
185 |
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ |
183 |
|
186 |
|
184 |
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
187 |
pmaddwd_m2r (table[6], mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ |
185 |
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ |
188 |
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ |
186 |
|
189 |
|
187 |
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
190 |
pmaddwd_m2r (table[7], mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ |
188 |
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ |
191 |
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ |
189 |
|
192 |
|
190 |
paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
193 |
paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
Lines 223-229
Link Here
|
223 |
|
226 |
|
224 |
static inline void mmxext_row_mid (int16_t * const row, const int store, |
227 |
static inline void mmxext_row_mid (int16_t * const row, const int store, |
225 |
const int offset, |
228 |
const int offset, |
226 |
const int16_t * const table) |
229 |
const struct x16_4 * const table) |
227 |
{ |
230 |
{ |
228 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
231 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
229 |
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
232 |
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
Lines 240-308
Link Here
|
240 |
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
243 |
movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ |
241 |
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ |
244 |
pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ |
242 |
|
245 |
|
243 |
movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ |
246 |
movq_m2r (table[0], mm3); /* mm3 = -C2 -C4 C2 C4 */ |
244 |
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ |
247 |
movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ |
245 |
|
248 |
|
246 |
pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
249 |
pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ |
247 |
|
250 |
|
248 |
movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ |
251 |
movq_m2r (table[1], mm4); /* mm4 = C6 C4 C6 C4 */ |
249 |
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
252 |
pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ |
250 |
} |
253 |
} |
251 |
|
254 |
|
252 |
|
255 |
|
253 |
/* MMX row IDCT */ |
256 |
/* MMX row IDCT */ |
254 |
|
257 |
|
255 |
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ |
258 |
#define mmx_table(c1,c2,c3,c4,c5,c6,c7) {{{ c4, c2, c4, c6}}, \ |
256 |
c4, c6, -c4, -c2, \ |
259 |
{{ c4, c6, -c4, -c2}}, \ |
257 |
c1, c3, c3, -c7, \ |
260 |
{{ c1, c3, c3, -c7}}, \ |
258 |
c5, c7, -c1, -c5, \ |
261 |
{{ c5, c7, -c1, -c5}}, \ |
259 |
c4, -c6, c4, -c2, \ |
262 |
{{ c4, -c6, c4, -c2}}, \ |
260 |
-c4, c2, c4, -c6, \ |
263 |
{{-c4, c2, c4, -c6}}, \ |
261 |
c5, -c1, c7, -c5, \ |
264 |
{{ c5, -c1, c7, -c5}}, \ |
262 |
c7, c3, c3, -c1 } |
265 |
{{ c7, c3, c3, -c1}}} |
263 |
|
266 |
|
264 |
static inline void mmx_row_head (int16_t * const row, const int offset, |
267 |
static inline void mmx_row_head (int16_t * const row, const int offset, |
265 |
const int16_t * const table) |
268 |
const struct x16_4 * const table) |
266 |
{ |
269 |
{ |
267 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
270 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
268 |
|
271 |
|
269 |
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
272 |
movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ |
270 |
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ |
273 |
movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ |
271 |
|
274 |
|
272 |
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
275 |
movq_m2r (table[0], mm3); /* mm3 = C6 C4 C2 C4 */ |
273 |
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ |
276 |
movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ |
274 |
|
277 |
|
275 |
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
278 |
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
276 |
|
279 |
|
277 |
movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
280 |
movq_m2r (table[1], mm4); /* mm4 = -C2 -C4 C6 C4 */ |
278 |
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ |
281 |
pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ |
279 |
|
282 |
|
280 |
movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
283 |
movq_m2r (table[2], mm1); /* mm1 = -C7 C3 C3 C1 */ |
281 |
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ |
284 |
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ |
282 |
} |
285 |
} |
283 |
|
286 |
|
284 |
static inline void mmx_row (const int16_t * const table, |
287 |
static inline void mmx_row (const struct x16_4 * const table, |
285 |
const int32_t * const rounder) |
288 |
const struct x4 * const rounder) |
286 |
{ |
289 |
{ |
287 |
pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ |
290 |
pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ |
288 |
punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ |
291 |
punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ |
289 |
|
292 |
|
290 |
pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
293 |
pmaddwd_m2r (table[4], mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ |
291 |
punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ |
294 |
punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ |
292 |
|
295 |
|
293 |
movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ |
296 |
movq_m2r (table[3], mm7); /* mm7 = -C5 -C1 C7 C5 */ |
294 |
pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ |
297 |
pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ |
295 |
|
298 |
|
296 |
paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
299 |
paddd_m2r (*rounder, mm3); /* mm3 += rounder */ |
297 |
pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ |
300 |
pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ |
298 |
|
301 |
|
299 |
pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
302 |
pmaddwd_m2r (table[5], mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ |
300 |
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ |
303 |
paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ |
301 |
|
304 |
|
302 |
pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
305 |
pmaddwd_m2r (table[6], mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ |
303 |
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ |
306 |
movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ |
304 |
|
307 |
|
305 |
pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
308 |
pmaddwd_m2r (table[7], mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ |
306 |
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ |
309 |
paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ |
307 |
|
310 |
|
308 |
paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
311 |
paddd_m2r (*rounder, mm0); /* mm0 += rounder */ |
Lines 346-352
Link Here
|
346 |
} |
349 |
} |
347 |
|
350 |
|
348 |
static inline void mmx_row_mid (int16_t * const row, const int store, |
351 |
static inline void mmx_row_mid (int16_t * const row, const int store, |
349 |
const int offset, const int16_t * const table) |
352 |
const int offset, const struct x16_4 * const table) |
350 |
{ |
353 |
{ |
351 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
354 |
movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ |
352 |
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
355 |
psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ |
Lines 366-378
Link Here
|
366 |
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
369 |
punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ |
367 |
psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ |
370 |
psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ |
368 |
|
371 |
|
369 |
movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ |
372 |
movq_m2r (table[0], mm3); /* mm3 = C6 C4 C2 C4 */ |
370 |
pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ |
373 |
pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ |
371 |
|
374 |
|
372 |
movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ |
375 |
movq_m2r (table[1], mm4); /* mm4 = -C2 -C4 C6 C4 */ |
373 |
por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ |
376 |
por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ |
374 |
|
377 |
|
375 |
movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ |
378 |
movq_m2r (table[2], mm1); /* mm1 = -C7 C3 C3 C1 */ |
376 |
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ |
379 |
punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ |
377 |
|
380 |
|
378 |
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
381 |
movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ |
Lines 457-462
Link Here
|
457 |
#define T3 43790 |
460 |
#define T3 43790 |
458 |
#define C4 23170 |
461 |
#define C4 23170 |
459 |
|
462 |
|
|
|
463 |
/* MMX-Ext. and SSE can share the constants */ |
464 |
static const struct x16_8 t1_vector ATTR_ALIGN(16) = {{T1,T1,T1,T1,T1,T1,T1,T1}}; |
465 |
static const struct x16_8 t2_vector ATTR_ALIGN(16) = {{T2,T2,T2,T2,T2,T2,T2,T2}}; |
466 |
static const struct x16_8 t3_vector ATTR_ALIGN(16) = {{T3,T3,T3,T3,T3,T3,T3,T3}}; |
467 |
static const struct x16_8 c4_vector ATTR_ALIGN(16) = {{C4,C4,C4,C4,C4,C4,C4,C4}}; |
460 |
|
468 |
|
461 |
/* SSE2 column IDCT */ |
469 |
/* SSE2 column IDCT */ |
462 |
static inline void sse2_idct_col (int16_t * const col) |
470 |
static inline void sse2_idct_col (int16_t * const col) |
Lines 464-491
Link Here
|
464 |
/* Almost identical to mmxext version: */ |
472 |
/* Almost identical to mmxext version: */ |
465 |
/* just do both 4x8 columns in paraller */ |
473 |
/* just do both 4x8 columns in paraller */ |
466 |
|
474 |
|
467 |
static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1}; |
|
|
468 |
static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2}; |
469 |
static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3}; |
470 |
static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4}; |
471 |
|
472 |
#if defined(__x86_64__) |
475 |
#if defined(__x86_64__) |
473 |
|
476 |
|
474 |
/* INPUT: block in xmm8 ... xmm15 */ |
477 |
/* INPUT: block in xmm8 ... xmm15 */ |
475 |
|
478 |
|
476 |
movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
479 |
movdqa_m2r (t1_vector, xmm0); /* xmm0 = T1 */ |
477 |
movdqa_r2r (xmm9, xmm1); /* xmm1 = x1 */ |
480 |
movdqa_r2r (xmm9, xmm1); /* xmm1 = x1 */ |
478 |
|
481 |
|
479 |
movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
482 |
movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
480 |
pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
483 |
pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
481 |
|
484 |
|
482 |
movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
485 |
movdqa_m2r (t3_vector, xmm5); /* xmm5 = T3 */ |
483 |
pmulhw_r2r (xmm15, xmm2); /* xmm2 = T1*x7 */ |
486 |
pmulhw_r2r (xmm15, xmm2); /* xmm2 = T1*x7 */ |
484 |
|
487 |
|
485 |
movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
488 |
movdqa_r2r (xmm5, xmm7); /* xmm7 = T3-1 */ |
486 |
psubsw_r2r (xmm15, xmm0); /* xmm0 = v17 */ |
489 |
psubsw_r2r (xmm15, xmm0); /* xmm0 = v17 */ |
487 |
|
490 |
|
488 |
movdqa_m2r (*t2_vector, xmm9); /* xmm9 = T2 */ |
491 |
movdqa_m2r (t2_vector, xmm9); /* xmm9 = T2 */ |
489 |
pmulhw_r2r (xmm11, xmm5); /* xmm5 = (T3-1)*x3 */ |
492 |
pmulhw_r2r (xmm11, xmm5); /* xmm5 = (T3-1)*x3 */ |
490 |
|
493 |
|
491 |
paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
494 |
paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
Lines 518-524
Link Here
|
518 |
movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
521 |
movdqa_r2r (xmm1, xmm7); /* xmm7 = u12 */ |
519 |
paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
522 |
paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
520 |
|
523 |
|
521 |
movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
524 |
movdqa_m2r (c4_vector, xmm0); /* xmm0 = C4/2 */ |
522 |
psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
525 |
psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
523 |
|
526 |
|
524 |
movdqa_r2r (xmm6, xmm4); /* xmm4 = b0 */ |
527 |
movdqa_r2r (xmm6, xmm4); /* xmm4 = b0 */ |
Lines 577-583
Link Here
|
577 |
/* OUTPUT: block in xmm8 ... xmm15 */ |
580 |
/* OUTPUT: block in xmm8 ... xmm15 */ |
578 |
|
581 |
|
579 |
#else |
582 |
#else |
580 |
movdqa_m2r (*t1_vector, xmm0); /* xmm0 = T1 */ |
583 |
movdqa_m2r (t1_vector, xmm0); /* xmm0 = T1 */ |
581 |
|
584 |
|
582 |
movdqa_m2r (*(col+1*8), xmm1); /* xmm1 = x1 */ |
585 |
movdqa_m2r (*(col+1*8), xmm1); /* xmm1 = x1 */ |
583 |
movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
586 |
movdqa_r2r (xmm0, xmm2); /* xmm2 = T1 */ |
Lines 585-591
Link Here
|
585 |
movdqa_m2r (*(col+7*8), xmm4); /* xmm4 = x7 */ |
588 |
movdqa_m2r (*(col+7*8), xmm4); /* xmm4 = x7 */ |
586 |
pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
589 |
pmulhw_r2r (xmm1, xmm0); /* xmm0 = T1*x1 */ |
587 |
|
590 |
|
588 |
movdqa_m2r (*t3_vector, xmm5); /* xmm5 = T3 */ |
591 |
movdqa_m2r (t3_vector, xmm5); /* xmm5 = T3 */ |
589 |
pmulhw_r2r (xmm4, xmm2); /* xmm2 = T1*x7 */ |
592 |
pmulhw_r2r (xmm4, xmm2); /* xmm2 = T1*x7 */ |
590 |
|
593 |
|
591 |
movdqa_m2r (*(col+5*8), xmm6); /* xmm6 = x5 */ |
594 |
movdqa_m2r (*(col+5*8), xmm6); /* xmm6 = x5 */ |
Lines 594-600
Link Here
|
594 |
movdqa_m2r (*(col+3*8), xmm3); /* xmm3 = x3 */ |
597 |
movdqa_m2r (*(col+3*8), xmm3); /* xmm3 = x3 */ |
595 |
psubsw_r2r (xmm4, xmm0); /* xmm0 = v17 */ |
598 |
psubsw_r2r (xmm4, xmm0); /* xmm0 = v17 */ |
596 |
|
599 |
|
597 |
movdqa_m2r (*t2_vector, xmm4); /* xmm4 = T2 */ |
600 |
movdqa_m2r (t2_vector, xmm4); /* xmm4 = T2 */ |
598 |
pmulhw_r2r (xmm3, xmm5); /* xmm5 = (T3-1)*x3 */ |
601 |
pmulhw_r2r (xmm3, xmm5); /* xmm5 = (T3-1)*x3 */ |
599 |
|
602 |
|
600 |
paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
603 |
paddsw_r2r (xmm2, xmm1); /* xmm1 = u17 */ |
Lines 632-638
Link Here
|
632 |
movdqa_m2r (*(col+0*8), xmm3); /* xmm3 = x0 */ |
635 |
movdqa_m2r (*(col+0*8), xmm3); /* xmm3 = x0 */ |
633 |
paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
636 |
paddsw_r2r (xmm5, xmm1); /* xmm1 = u12+v12 */ |
634 |
|
637 |
|
635 |
movdqa_m2r (*c4_vector, xmm0); /* xmm0 = C4/2 */ |
638 |
movdqa_m2r (c4_vector, xmm0); /* xmm0 = C4/2 */ |
636 |
psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
639 |
psubsw_r2r (xmm5, xmm7); /* xmm7 = u12-v12 */ |
637 |
|
640 |
|
638 |
movdqa_r2m (xmm6, *(col+5*8)); /* save b0 in scratch1 */ |
641 |
movdqa_r2m (xmm6, *(col+5*8)); /* save b0 in scratch1 */ |
Lines 710-724
Link Here
|
710 |
/* MMX column IDCT */ |
713 |
/* MMX column IDCT */ |
711 |
static inline void idct_col (int16_t * const col, const int offset) |
714 |
static inline void idct_col (int16_t * const col, const int offset) |
712 |
{ |
715 |
{ |
713 |
static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; |
|
|
714 |
static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; |
715 |
static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; |
716 |
static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; |
717 |
|
718 |
/* column code adapted from peter gubanov */ |
716 |
/* column code adapted from peter gubanov */ |
719 |
/* http://www.elecard.com/peter/idct.shtml */ |
717 |
/* http://www.elecard.com/peter/idct.shtml */ |
720 |
|
718 |
|
721 |
movq_m2r (*t1_vector, mm0); /* mm0 = T1 */ |
719 |
movq_m2r (t1_vector, mm0); /* mm0 = T1 */ |
722 |
|
720 |
|
723 |
movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ |
721 |
movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ |
724 |
movq_r2r (mm0, mm2); /* mm2 = T1 */ |
722 |
movq_r2r (mm0, mm2); /* mm2 = T1 */ |
Lines 726-732
Link Here
|
726 |
movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ |
724 |
movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ |
727 |
pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ |
725 |
pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ |
728 |
|
726 |
|
729 |
movq_m2r (*t3_vector, mm5); /* mm5 = T3 */ |
727 |
movq_m2r (t3_vector, mm5); /* mm5 = T3 */ |
730 |
pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ |
728 |
pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ |
731 |
|
729 |
|
732 |
movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ |
730 |
movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ |
Lines 735-741
Link Here
|
735 |
movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ |
733 |
movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ |
736 |
psubsw_r2r (mm4, mm0); /* mm0 = v17 */ |
734 |
psubsw_r2r (mm4, mm0); /* mm0 = v17 */ |
737 |
|
735 |
|
738 |
movq_m2r (*t2_vector, mm4); /* mm4 = T2 */ |
736 |
movq_m2r (t2_vector, mm4); /* mm4 = T2 */ |
739 |
pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ |
737 |
pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ |
740 |
|
738 |
|
741 |
paddsw_r2r (mm2, mm1); /* mm1 = u17 */ |
739 |
paddsw_r2r (mm2, mm1); /* mm1 = u17 */ |
Lines 773-779
Link Here
|
773 |
movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ |
771 |
movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ |
774 |
paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ |
772 |
paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ |
775 |
|
773 |
|
776 |
movq_m2r (*c4_vector, mm0); /* mm0 = C4/2 */ |
774 |
movq_m2r (c4_vector, mm0); /* mm0 = C4/2 */ |
777 |
psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ |
775 |
psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ |
778 |
|
776 |
|
779 |
movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ |
777 |
movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ |
Lines 846-897
Link Here
|
846 |
movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ |
844 |
movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ |
847 |
} |
845 |
} |
848 |
|
846 |
|
849 |
|
847 |
/* MMX and SSE can share these constants */ |
850 |
static const int32_t rounder0[] ATTR_ALIGN(8) = |
848 |
static const struct x4 rounder0 ATTR_ALIGN(16) = |
851 |
rounder ((1 << (COL_SHIFT - 1)) - 0.5); |
849 |
{rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5)}; |
852 |
static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); |
850 |
static const struct x4 rounder4 ATTR_ALIGN(16) = {rounder_sse2 (0)}; |
853 |
static const int32_t rounder1[] ATTR_ALIGN(8) = |
851 |
static const struct x4 rounder1 ATTR_ALIGN(16) = |
854 |
rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
852 |
{rounder_sse2 (1.25683487303)}; /* C1*(C1/C4+C1+C7)/2 */ |
855 |
static const int32_t rounder7[] ATTR_ALIGN(8) = |
853 |
static const struct x4 rounder7 ATTR_ALIGN(16) = |
856 |
rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
854 |
{rounder_sse2 (-0.25)}; /* C1*(C7/C4+C7-C1)/2 */ |
857 |
static const int32_t rounder2[] ATTR_ALIGN(8) = |
855 |
static const struct x4 rounder2 ATTR_ALIGN(16) = |
858 |
rounder (0.60355339059); /* C2 * (C6+C2)/2 */ |
856 |
{rounder_sse2 (0.60355339059)}; /* C2 * (C6+C2)/2 */ |
859 |
static const int32_t rounder6[] ATTR_ALIGN(8) = |
857 |
static const struct x4 rounder6 ATTR_ALIGN(16) = |
860 |
rounder (-0.25); /* C2 * (C6-C2)/2 */ |
858 |
{rounder_sse2 (-0.25)}; /* C2 * (C6-C2)/2 */ |
861 |
static const int32_t rounder3[] ATTR_ALIGN(8) = |
859 |
static const struct x4 rounder3 ATTR_ALIGN(16) = |
862 |
rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
860 |
{rounder_sse2 (0.087788325588)}; /* C3*(-C3/C4+C3+C5)/2 */ |
863 |
static const int32_t rounder5[] ATTR_ALIGN(8) = |
861 |
static const struct x4 rounder5 ATTR_ALIGN(16) = |
864 |
rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
862 |
{rounder_sse2 (-0.441341716183)}; /* C3*(-C5/C4+C5-C3)/2 */ |
865 |
|
|
|
866 |
|
863 |
|
867 |
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ |
864 |
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ |
868 |
static inline void idct (int16_t * const block) \ |
865 |
static inline void idct (int16_t * const block) \ |
869 |
{ \ |
866 |
{ \ |
870 |
static const int16_t table04[] ATTR_ALIGN(16) = \ |
867 |
static const struct x16_4 table04[8] ATTR_ALIGN(16) = \ |
871 |
table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ |
868 |
table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ |
872 |
static const int16_t table17[] ATTR_ALIGN(16) = \ |
869 |
static const struct x16_4 table17[8] ATTR_ALIGN(16) = \ |
873 |
table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ |
870 |
table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ |
874 |
static const int16_t table26[] ATTR_ALIGN(16) = \ |
871 |
static const struct x16_4 table26[8] ATTR_ALIGN(16) = \ |
875 |
table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ |
872 |
table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ |
876 |
static const int16_t table35[] ATTR_ALIGN(16) = \ |
873 |
static const struct x16_4 table35[8] ATTR_ALIGN(16) = \ |
877 |
table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ |
874 |
table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ |
878 |
\ |
875 |
\ |
879 |
idct_row_head (block, 0*8, table04); \ |
876 |
idct_row_head (block, 0*8, table04); \ |
880 |
idct_row (table04, rounder0); \ |
877 |
idct_row (table04, &rounder0); \ |
881 |
idct_row_mid (block, 0*8, 4*8, table04); \ |
878 |
idct_row_mid (block, 0*8, 4*8, table04); \ |
882 |
idct_row (table04, rounder4); \ |
879 |
idct_row (table04, &rounder4); \ |
883 |
idct_row_mid (block, 4*8, 1*8, table17); \ |
880 |
idct_row_mid (block, 4*8, 1*8, table17); \ |
884 |
idct_row (table17, rounder1); \ |
881 |
idct_row (table17, &rounder1); \ |
885 |
idct_row_mid (block, 1*8, 7*8, table17); \ |
882 |
idct_row_mid (block, 1*8, 7*8, table17); \ |
886 |
idct_row (table17, rounder7); \ |
883 |
idct_row (table17, &rounder7); \ |
887 |
idct_row_mid (block, 7*8, 2*8, table26); \ |
884 |
idct_row_mid (block, 7*8, 2*8, table26); \ |
888 |
idct_row (table26, rounder2); \ |
885 |
idct_row (table26, &rounder2); \ |
889 |
idct_row_mid (block, 2*8, 6*8, table26); \ |
886 |
idct_row_mid (block, 2*8, 6*8, table26); \ |
890 |
idct_row (table26, rounder6); \ |
887 |
idct_row (table26, &rounder6); \ |
891 |
idct_row_mid (block, 6*8, 3*8, table35); \ |
888 |
idct_row_mid (block, 6*8, 3*8, table35); \ |
892 |
idct_row (table35, rounder3); \ |
889 |
idct_row (table35, &rounder3); \ |
893 |
idct_row_mid (block, 3*8, 5*8, table35); \ |
890 |
idct_row_mid (block, 3*8, 5*8, table35); \ |
894 |
idct_row (table35, rounder5); \ |
891 |
idct_row (table35, &rounder5); \ |
895 |
idct_row_tail (block, 5*8); \ |
892 |
idct_row_tail (block, 5*8); \ |
896 |
\ |
893 |
\ |
897 |
idct_col (block, 0); \ |
894 |
idct_col (block, 0); \ |
Lines 900-971
Link Here
|
900 |
|
897 |
|
901 |
static inline void sse2_idct (int16_t * const block) |
898 |
static inline void sse2_idct (int16_t * const block) |
902 |
{ |
899 |
{ |
903 |
static const int16_t table04[] ATTR_ALIGN(16) = |
900 |
static const struct x16_8 table04[4] ATTR_ALIGN(16) = |
904 |
sse2_table (22725, 21407, 19266, 16384, 12873, 8867, 4520); |
901 |
sse2_table (22725, 21407, 19266, 16384, 12873, 8867, 4520); |
905 |
static const int16_t table17[] ATTR_ALIGN(16) = |
902 |
static const struct x16_8 table17[4] ATTR_ALIGN(16) = |
906 |
sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270); |
903 |
sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270); |
907 |
static const int16_t table26[] ATTR_ALIGN(16) = |
904 |
static const struct x16_8 table26[4] ATTR_ALIGN(16) = |
908 |
sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906); |
905 |
sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906); |
909 |
static const int16_t table35[] ATTR_ALIGN(16) = |
906 |
static const struct x16_8 table35[4] ATTR_ALIGN(16) = |
910 |
sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315); |
907 |
sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315); |
911 |
|
908 |
|
912 |
static const int32_t rounder0_128[] ATTR_ALIGN(16) = |
|
|
913 |
rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5); |
914 |
static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0); |
915 |
static const int32_t rounder1_128[] ATTR_ALIGN(16) = |
916 |
rounder_sse2 (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ |
917 |
static const int32_t rounder7_128[] ATTR_ALIGN(16) = |
918 |
rounder_sse2 (-0.25); /* C1*(C7/C4+C7-C1)/2 */ |
919 |
static const int32_t rounder2_128[] ATTR_ALIGN(16) = |
920 |
rounder_sse2 (0.60355339059); /* C2 * (C6+C2)/2 */ |
921 |
static const int32_t rounder6_128[] ATTR_ALIGN(16) = |
922 |
rounder_sse2 (-0.25); /* C2 * (C6-C2)/2 */ |
923 |
static const int32_t rounder3_128[] ATTR_ALIGN(16) = |
924 |
rounder_sse2 (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ |
925 |
static const int32_t rounder5_128[] ATTR_ALIGN(16) = |
926 |
rounder_sse2 (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ |
927 |
|
928 |
#if defined(__x86_64__) |
909 |
#if defined(__x86_64__) |
929 |
movdqa_m2r (block[0*8], xmm8); |
910 |
movdqa_m2r (block[0*8], xmm8); |
930 |
movdqa_m2r (block[4*8], xmm12); |
911 |
movdqa_m2r (block[4*8], xmm12); |
931 |
SSE2_IDCT_2ROW (table04, xmm8, xmm12, *rounder0_128, *rounder4_128); |
912 |
SSE2_IDCT_2ROW (table04, xmm8, xmm12, rounder0, rounder4); |
932 |
|
913 |
|
933 |
movdqa_m2r (block[1*8], xmm9); |
914 |
movdqa_m2r (block[1*8], xmm9); |
934 |
movdqa_m2r (block[7*8], xmm15); |
915 |
movdqa_m2r (block[7*8], xmm15); |
935 |
SSE2_IDCT_2ROW (table17, xmm9, xmm15, *rounder1_128, *rounder7_128); |
916 |
SSE2_IDCT_2ROW (table17, xmm9, xmm15, rounder1, rounder7); |
936 |
|
917 |
|
937 |
movdqa_m2r (block[2*8], xmm10); |
918 |
movdqa_m2r (block[2*8], xmm10); |
938 |
movdqa_m2r (block[6*8], xmm14); |
919 |
movdqa_m2r (block[6*8], xmm14); |
939 |
SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128); |
920 |
SSE2_IDCT_2ROW (table26, xmm10, xmm14, rounder2, rounder6); |
940 |
|
921 |
|
941 |
movdqa_m2r (block[3*8], xmm11); |
922 |
movdqa_m2r (block[3*8], xmm11); |
942 |
movdqa_m2r (block[5*8], xmm13); |
923 |
movdqa_m2r (block[5*8], xmm13); |
943 |
SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128); |
924 |
SSE2_IDCT_2ROW (table35, xmm11, xmm13, rounder3, rounder5); |
944 |
|
925 |
|
945 |
/* OUTPUT: block in xmm8 ... xmm15 */ |
926 |
/* OUTPUT: block in xmm8 ... xmm15 */ |
946 |
|
927 |
|
947 |
#else |
928 |
#else |
948 |
movdqa_m2r (block[0*8], xmm0); |
929 |
movdqa_m2r (block[0*8], xmm0); |
949 |
movdqa_m2r (block[4*8], xmm4); |
930 |
movdqa_m2r (block[4*8], xmm4); |
950 |
SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128); |
931 |
SSE2_IDCT_2ROW (table04, xmm0, xmm4, rounder0, rounder4); |
951 |
movdqa_r2m (xmm0, block[0*8]); |
932 |
movdqa_r2m (xmm0, block[0*8]); |
952 |
movdqa_r2m (xmm4, block[4*8]); |
933 |
movdqa_r2m (xmm4, block[4*8]); |
953 |
|
934 |
|
954 |
movdqa_m2r (block[1*8], xmm0); |
935 |
movdqa_m2r (block[1*8], xmm0); |
955 |
movdqa_m2r (block[7*8], xmm4); |
936 |
movdqa_m2r (block[7*8], xmm4); |
956 |
SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128); |
937 |
SSE2_IDCT_2ROW (table17, xmm0, xmm4, rounder1, rounder7); |
957 |
movdqa_r2m (xmm0, block[1*8]); |
938 |
movdqa_r2m (xmm0, block[1*8]); |
958 |
movdqa_r2m (xmm4, block[7*8]); |
939 |
movdqa_r2m (xmm4, block[7*8]); |
959 |
|
940 |
|
960 |
movdqa_m2r (block[2*8], xmm0); |
941 |
movdqa_m2r (block[2*8], xmm0); |
961 |
movdqa_m2r (block[6*8], xmm4); |
942 |
movdqa_m2r (block[6*8], xmm4); |
962 |
SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128); |
943 |
SSE2_IDCT_2ROW (table26, xmm0, xmm4, rounder2, rounder6); |
963 |
movdqa_r2m (xmm0, block[2*8]); |
944 |
movdqa_r2m (xmm0, block[2*8]); |
964 |
movdqa_r2m (xmm4, block[6*8]); |
945 |
movdqa_r2m (xmm4, block[6*8]); |
965 |
|
946 |
|
966 |
movdqa_m2r (block[3*8], xmm0); |
947 |
movdqa_m2r (block[3*8], xmm0); |
967 |
movdqa_m2r (block[5*8], xmm4); |
948 |
movdqa_m2r (block[5*8], xmm4); |
968 |
SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128); |
949 |
SSE2_IDCT_2ROW (table35, xmm0, xmm4, rounder3, rounder5); |
969 |
movdqa_r2m (xmm0, block[3*8]); |
950 |
movdqa_r2m (xmm0, block[3*8]); |
970 |
movdqa_r2m (xmm4, block[5*8]); |
951 |
movdqa_r2m (xmm4, block[5*8]); |
971 |
#endif |
952 |
#endif |