(-)libmpeg2-0.5.1.orig/libmpeg2/idct_mmx.c (-138 / +119 lines)
Lines 39-44
39
#define rounder(bias) {round (bias), round (bias)}
39
#define rounder(bias) {round (bias), round (bias)}
40
#define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)}
40
#define rounder_sse2(bias) {round (bias), round (bias), round (bias), round (bias)}
41
41
42
struct x4 {int32_t x[4];};
43
struct x16_8 {int16_t x[8];};
44
struct x16_4 {int16_t x[4];};
42
45
43
#if 0
46
#if 0
44
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
47
/* C row IDCT - it is just here to document the MMXEXT and MMX versions */
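Editorial note on the hunk above: the patch introduces the x4, x16_8 and x16_4 wrapper structs so that the coefficient tables and rounder constants can be declared as arrays of fixed-size groups, initialized with the double-brace style used by the table macros further down, instead of flat int16_t/int32_t arrays. A minimal stand-alone sketch of that idea; the name demo_table and its values are illustrative only, not part of the patch:

#include <assert.h>
#include <stdint.h>

struct x16_4 { int16_t x[4]; };

/* hypothetical two-row table, shaped like a row pair produced by the
   mmxext_table()/mmx_table() macros below; the values are arbitrary */
static const struct x16_4 demo_table[2] = {
    {{ 1,  2,  3,  4}},
    {{-1, -2, -3, -4}},
};

int main (void)
{
    /* each array element is a whole 4-short group, so demo_table[1]
       starts exactly one group (8 bytes) after demo_table[0] */
    assert (sizeof (struct x16_4) == 4 * sizeof (int16_t));
    assert ((const char *) &demo_table[1] - (const char *) demo_table == 8);
    return 0;
}
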
Lines 81-94
81
84
82
85
83
/* SSE2 row IDCT */
86
/* SSE2 row IDCT */
84
#define sse2_table(c1,c2,c3,c4,c5,c6,c7) {  c4,  c2,  c4,  c6,   \
87
#define sse2_table(c1,c2,c3,c4,c5,c6,c7) {{{ c4,  c2,  c4,  c6,   \
85
					    c4, -c6,  c4, -c2,   \
88
					     c4, -c6,  c4, -c2}}, \
86
					    c4,  c6, -c4, -c2,   \
89
					  {{ c4,  c6, -c4, -c2,   \
87
					   -c4,  c2,  c4, -c6,   \
90
					    -c4,  c2,  c4, -c6}}, \
88
					    c1,  c3,  c3, -c7,   \
91
					  {{ c1,  c3,  c3, -c7,   \
89
					    c5, -c1,  c7, -c5,   \
92
					     c5, -c1,  c7, -c5}}, \
90
					    c5,  c7, -c1, -c5,   \
93
					  {{ c5,  c7, -c1, -c5,   \
91
					    c7,  c3,  c3, -c1 }
94
					     c7,  c3,  c3, -c1}} }
92
95
93
#define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do {               \
96
#define SSE2_IDCT_2ROW(table, row1, row2, round1, round2) do {               \
94
    /* no scheduling: trust in out of order execution */                     \
97
    /* no scheduling: trust in out of order execution */                     \
Lines 99-125
99
    pshufd_r2r   (row1, xmm1, 0);    /* 1: xmm1= x2 x0 x2 x0  x2 x0 x2 x0 */ \
102
    pshufd_r2r   (row1, xmm1, 0);    /* 1: xmm1= x2 x0 x2 x0  x2 x0 x2 x0 */ \
100
    pmaddwd_m2r  (table[0], xmm1);   /* 1: xmm1= x2*C + x0*C ...          */ \
103
    pmaddwd_m2r  (table[0], xmm1);   /* 1: xmm1= x2*C + x0*C ...          */ \
101
    pshufd_r2r   (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1  x3 x1 x3 x1 */ \
104
    pshufd_r2r   (row1, xmm3, 0xaa); /* 1: xmm3= x3 x1 x3 x1  x3 x1 x3 x1 */ \
102
    pmaddwd_m2r  (table[2*8], xmm3); /* 1: xmm3= x3*C + x1*C ...          */ \
105
    pmaddwd_m2r  (table[2], xmm3);   /* 1: xmm3= x3*C + x1*C ...          */ \
103
    pshufd_r2r   (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4  x6 x4 x6 x4 */ \
106
    pshufd_r2r   (row1, xmm2, 0x55); /* 1: xmm2= x6 x4 x6 x4  x6 x4 x6 x4 */ \
104
    pshufd_r2r   (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5  x7 x5 x7 x5 */ \
107
    pshufd_r2r   (row1, row1, 0xff); /* 1: row1= x7 x5 x7 x5  x7 x5 x7 x5 */ \
105
    pmaddwd_m2r  (table[1*8], xmm2); /* 1: xmm2= x6*C + x4*C ...          */ \
108
    pmaddwd_m2r  (table[1], xmm2);   /* 1: xmm2= x6*C + x4*C ...          */ \
106
    paddd_m2r    (round1, xmm1);     /* 1: xmm1= x2*C + x0*C + round ...  */ \
109
    paddd_m2r    (round1, xmm1);     /* 1: xmm1= x2*C + x0*C + round ...  */ \
107
    pmaddwd_m2r  (table[3*8], row1); /* 1: row1= x7*C + x5*C ...          */ \
110
    pmaddwd_m2r  (table[3], row1);   /* 1: row1= x7*C + x5*C ...          */ \
108
    pshufd_r2r   (row2, xmm5, 0);    /*    2:                             */ \
111
    pshufd_r2r   (row2, xmm5, 0);    /*    2:                             */ \
109
    pshufd_r2r   (row2, xmm6, 0x55); /*    2:                             */ \
112
    pshufd_r2r   (row2, xmm6, 0x55); /*    2:                             */ \
110
    pmaddwd_m2r  (table[0], xmm5);   /*    2:                             */ \
113
    pmaddwd_m2r  (table[0], xmm5);   /*    2:                             */ \
111
    paddd_r2r    (xmm2, xmm1);       /* 1: xmm1= a[]                      */ \
114
    paddd_r2r    (xmm2, xmm1);       /* 1: xmm1= a[]                      */ \
112
    movdqa_r2r   (xmm1, xmm2);       /* 1: xmm2= a[]                      */ \
115
    movdqa_r2r   (xmm1, xmm2);       /* 1: xmm2= a[]                      */ \
113
    pshufd_r2r   (row2, xmm7, 0xaa); /*    2:                             */ \
116
    pshufd_r2r   (row2, xmm7, 0xaa); /*    2:                             */ \
114
    pmaddwd_m2r  (table[1*8], xmm6); /*    2:                             */ \
117
    pmaddwd_m2r  (table[1], xmm6);   /*    2:                             */ \
115
    paddd_r2r    (xmm3, row1);       /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \
118
    paddd_r2r    (xmm3, row1);       /* 1: row1= b[]= 7*C+5*C+3*C+1*C ... */ \
116
    pshufd_r2r   (row2, row2, 0xff); /*    2:                             */ \
119
    pshufd_r2r   (row2, row2, 0xff); /*    2:                             */ \
117
    psubd_r2r    (row1, xmm2);       /* 1: xmm2= a[] - b[]                */ \
120
    psubd_r2r    (row1, xmm2);       /* 1: xmm2= a[] - b[]                */ \
118
    pmaddwd_m2r  (table[2*8], xmm7); /*    2:                             */ \
121
    pmaddwd_m2r  (table[2], xmm7);   /*    2:                             */ \
119
    paddd_r2r    (xmm1, row1);       /* 1: row1= a[] + b[]                */ \
122
    paddd_r2r    (xmm1, row1);       /* 1: row1= a[] + b[]                */ \
120
    psrad_i2r    (ROW_SHIFT, xmm2);  /* 1: xmm2= result 4...7             */ \
123
    psrad_i2r    (ROW_SHIFT, xmm2);  /* 1: xmm2= result 4...7             */ \
121
    paddd_m2r    (round2, xmm5);     /*    2:                             */ \
124
    paddd_m2r    (round2, xmm5);     /*    2:                             */ \
122
    pmaddwd_m2r  (table[3*8], row2); /*    2:                             */ \
125
    pmaddwd_m2r  (table[3], row2);   /*    2:                             */ \
123
    paddd_r2r    (xmm6, xmm5);       /*    2:                             */ \
126
    paddd_r2r    (xmm6, xmm5);       /*    2:                             */ \
124
    movdqa_r2r   (xmm5, xmm6);       /*    2:                             */ \
127
    movdqa_r2r   (xmm5, xmm6);       /*    2:                             */ \
125
    psrad_i2r    (ROW_SHIFT, row1);  /* 1: row1= result 0...4             */ \
128
    psrad_i2r    (ROW_SHIFT, row1);  /* 1: row1= result 0...4             */ \
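Why table[2*8] can become table[2] in the hunk above: the SSE2 table is now an array of struct x16_8 (eight int16_t per element), so indexing by whole rows reaches the same bytes that the old flat int16_t indexing reached with a stride of 8. A small check of that equivalence, assuming the struct has no padding (it does not on the x86 ABIs this file targets); the names flat and rows are illustrative only:

#include <assert.h>
#include <stdint.h>

struct x16_8 { int16_t x[8]; };

static int16_t flat[4 * 8];        /* old layout: 32 consecutive shorts */
static struct x16_8 rows[4];       /* new layout: 4 groups of 8 shorts  */

int main (void)
{
    /* old operand table[2*8] and new operand table[2] name the same byte
       offset, 32 bytes from the start of the table */
    assert ((const char *) &flat[2 * 8] - (const char *) flat ==
            (const char *) &rows[2]     - (const char *) rows);
    return 0;
}
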
Lines 137-190
137
140
138
/* MMXEXT row IDCT */
141
/* MMXEXT row IDCT */
139
142
140
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2, -c4, -c2,	\
143
#define mmxext_table(c1,c2,c3,c4,c5,c6,c7)	{{{ c4,  c2, -c4, -c2}},	\
141
						   c4,  c6,  c4,  c6,	\
144
						 {{ c4,  c6,  c4,  c6}},	\
142
						   c1,  c3, -c1, -c5,	\
145
						 {{ c1,  c3, -c1, -c5}},	\
143
						   c5,  c7,  c3, -c7,	\
146
						 {{ c5,  c7,  c3, -c7}},	\
144
						   c4, -c6,  c4, -c6,	\
147
						 {{ c4, -c6,  c4, -c6}},	\
145
						  -c4,  c2,  c4, -c2,	\
148
						 {{-c4,  c2,  c4, -c2}},	\
146
						   c5, -c1,  c3, -c1,	\
149
						 {{ c5, -c1,  c3, -c1}},	\
147
						   c7,  c3,  c7, -c5 }
150
						 {{ c7,  c3,  c7, -c5}}}
148
151
149
static inline void mmxext_row_head (int16_t * const row, const int offset,
152
static inline void mmxext_row_head (int16_t * const row, const int offset,
150
				    const int16_t * const table)
153
				    const struct x16_4 * const table)
151
{
154
{
152
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
155
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
153
156
154
    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
157
    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
155
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
158
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
156
159
157
    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
160
    movq_m2r (table[0], mm3);		/* mm3 = -C2 -C4 C2 C4 */
158
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
161
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
159
162
160
    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
163
    movq_m2r (table[1], mm4);		/* mm4 = C6 C4 C6 C4 */
161
    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
164
    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
162
165
163
    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
166
    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
164
}
167
}
165
168
166
static inline void mmxext_row (const int16_t * const table,
169
static inline void mmxext_row (const struct x16_4 * const table,
167
			       const int32_t * const rounder)
170
			       const struct x4 * const rounder)
168
{
171
{
169
    movq_m2r (*(table+8), mm1);		/* mm1 = -C5 -C1 C3 C1 */
172
    movq_m2r (table[2], mm1);		/* mm1 = -C5 -C1 C3 C1 */
170
    pmaddwd_r2r (mm2, mm4);		/* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
173
    pmaddwd_r2r (mm2, mm4);		/* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
171
174
172
    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
175
    pmaddwd_m2r (table[4], mm0);	/* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
173
    pshufw_r2r (mm6, mm6, 0x4e);	/* mm6 = x3 x1 x7 x5 */
176
    pshufw_r2r (mm6, mm6, 0x4e);	/* mm6 = x3 x1 x7 x5 */
174
177
175
    movq_m2r (*(table+12), mm7);	/* mm7 = -C7 C3 C7 C5 */
178
    movq_m2r (table[3], mm7);		/* mm7 = -C7 C3 C7 C5 */
176
    pmaddwd_r2r (mm5, mm1);		/* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
179
    pmaddwd_r2r (mm5, mm1);		/* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
177
180
178
    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
181
    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
179
    pmaddwd_r2r (mm6, mm7);		/* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
182
    pmaddwd_r2r (mm6, mm7);		/* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
180
183
181
    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
184
    pmaddwd_m2r (table[5], mm2);	/* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
182
    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
185
    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
183
186
184
    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
187
    pmaddwd_m2r (table[6], mm5);	/* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
185
    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
188
    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
186
189
187
    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
190
    pmaddwd_m2r (table[7], mm6);	/* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
188
    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
191
    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
189
192
190
    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
193
    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
Lines 223-229
223
226
224
static inline void mmxext_row_mid (int16_t * const row, const int store,
227
static inline void mmxext_row_mid (int16_t * const row, const int store,
225
				   const int offset,
228
				   const int offset,
226
				   const int16_t * const table)
229
				   const struct x16_4 * const table)
227
{
230
{
228
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
231
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
229
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
232
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
Lines 240-308
240
    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
243
    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
241
    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
244
    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
242
245
243
    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
246
    movq_m2r (table[0], mm3);		/* mm3 = -C2 -C4 C2 C4 */
244
    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
247
    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
245
248
246
    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
249
    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
247
250
248
    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
251
    movq_m2r (table[1], mm4);		/* mm4 = C6 C4 C6 C4 */
249
    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
252
    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
250
}
253
}
251
254
252
255
253
/* MMX row IDCT */
256
/* MMX row IDCT */
254
257
255
#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{  c4,  c2,  c4,  c6,	\
258
#define mmx_table(c1,c2,c3,c4,c5,c6,c7)	{{{ c4,  c2,  c4,  c6}},	\
256
					   c4,  c6, -c4, -c2,	\
259
					 {{ c4,  c6, -c4, -c2}},	\
257
					   c1,  c3,  c3, -c7,	\
260
					 {{ c1,  c3,  c3, -c7}},	\
258
					   c5,  c7, -c1, -c5,	\
261
					 {{ c5,  c7, -c1, -c5}},	\
259
					   c4, -c6,  c4, -c2,	\
262
					 {{ c4, -c6,  c4, -c2}},	\
260
					  -c4,  c2,  c4, -c6,	\
263
					 {{-c4,  c2,  c4, -c6}},	\
261
					   c5, -c1,  c7, -c5,	\
264
					 {{ c5, -c1,  c7, -c5}},	\
262
					   c7,  c3,  c3, -c1 }
265
					 {{ c7,  c3,  c3, -c1}}}
263
266
264
static inline void mmx_row_head (int16_t * const row, const int offset,
267
static inline void mmx_row_head (int16_t * const row, const int offset,
265
				 const int16_t * const table)
268
				 const struct x16_4 * const table)
266
{
269
{
267
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
270
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
268
271
269
    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
272
    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
270
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
273
    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
271
274
272
    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
275
    movq_m2r (table[0], mm3);		/* mm3 = C6 C4 C2 C4 */
273
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
276
    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
274
277
275
    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
278
    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
276
279
277
    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
280
    movq_m2r (table[1], mm4);		/* mm4 = -C2 -C4 C6 C4 */
278
    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
281
    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
279
282
280
    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
283
    movq_m2r (table[2], mm1);		/* mm1 = -C7 C3 C3 C1 */
281
    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
284
    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
282
}
285
}
283
286
284
static inline void mmx_row (const int16_t * const table,
287
static inline void mmx_row (const struct x16_4 * const table,
285
			    const int32_t * const rounder)
288
			    const struct x4 * const rounder)
286
{
289
{
287
    pmaddwd_r2r (mm2, mm4);		/* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
290
    pmaddwd_r2r (mm2, mm4);		/* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
288
    punpckldq_r2r (mm5, mm5);		/* mm5 = x3 x1 x3 x1 */
291
    punpckldq_r2r (mm5, mm5);		/* mm5 = x3 x1 x3 x1 */
289
292
290
    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
293
    pmaddwd_m2r (table[4], mm0);	/* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
291
    punpckhdq_r2r (mm6, mm6);		/* mm6 = x7 x5 x7 x5 */
294
    punpckhdq_r2r (mm6, mm6);		/* mm6 = x7 x5 x7 x5 */
292
295
293
    movq_m2r (*(table+12), mm7);	/* mm7 = -C5 -C1 C7 C5 */
296
    movq_m2r (table[3], mm7);	/* mm7 = -C5 -C1 C7 C5 */
294
    pmaddwd_r2r (mm5, mm1);		/* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
297
    pmaddwd_r2r (mm5, mm1);		/* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
295
298
296
    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
299
    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
297
    pmaddwd_r2r (mm6, mm7);		/* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
300
    pmaddwd_r2r (mm6, mm7);		/* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
298
301
299
    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
302
    pmaddwd_m2r (table[5], mm2);	/* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
300
    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
303
    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
301
304
302
    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
305
    pmaddwd_m2r (table[6], mm5);	/* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
303
    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
306
    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
304
307
305
    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
308
    pmaddwd_m2r (table[7], mm6);	/* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
306
    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
309
    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
307
310
308
    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
311
    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
Lines 346-352
346
}
349
}
347
350
348
static inline void mmx_row_mid (int16_t * const row, const int store,
351
static inline void mmx_row_mid (int16_t * const row, const int store,
349
				const int offset, const int16_t * const table)
352
				const int offset, const struct x16_4 * const table)
350
{
353
{
351
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
354
    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
352
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
355
    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
Lines 366-378
366
    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
369
    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
367
    psrld_i2r (16, mm7);		/* mm7 = 0 y6 0 y4 */
370
    psrld_i2r (16, mm7);		/* mm7 = 0 y6 0 y4 */
368
371
369
    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
372
    movq_m2r (table[0], mm3);		/* mm3 = C6 C4 C2 C4 */
370
    pslld_i2r (16, mm1);		/* mm1 = y7 0 y5 0 */
373
    pslld_i2r (16, mm1);		/* mm1 = y7 0 y5 0 */
371
374
372
    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
375
    movq_m2r (table[1], mm4);		/* mm4 = -C2 -C4 C6 C4 */
373
    por_r2r (mm1, mm7);			/* mm7 = y7 y6 y5 y4 */
376
    por_r2r (mm1, mm7);			/* mm7 = y7 y6 y5 y4 */
374
377
375
    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
378
    movq_m2r (table[2], mm1);		/* mm1 = -C7 C3 C3 C1 */
376
    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
379
    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
377
380
378
    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
381
    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
Lines 457-462
457
#define T3 43790
460
#define T3 43790
458
#define C4 23170
461
#define C4 23170
459
462
463
/* MMX-Ext. and SSE can share the constants */
464
static const struct x16_8 t1_vector ATTR_ALIGN(16) = {{T1,T1,T1,T1,T1,T1,T1,T1}};
465
static const struct x16_8 t2_vector ATTR_ALIGN(16) = {{T2,T2,T2,T2,T2,T2,T2,T2}};
466
static const struct x16_8 t3_vector ATTR_ALIGN(16) = {{T3,T3,T3,T3,T3,T3,T3,T3}};
467
static const struct x16_8 c4_vector ATTR_ALIGN(16) = {{C4,C4,C4,C4,C4,C4,C4,C4}};
460
468
461
/* SSE2 column IDCT */
469
/* SSE2 column IDCT */
462
static inline void sse2_idct_col (int16_t * const col)
470
static inline void sse2_idct_col (int16_t * const col)
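The shared vectors added above broadcast the existing T1/T2/T3/C4 constants into 8-wide int16_t groups. These appear to be the usual fixed-point encodings for the column pass, roughly tan(k*pi/16) scaled by 2^16 for T1..T3 and cos(pi/4) scaled by 2^15 for C4; that is an observation, not something the patch states. A quick numeric check for the two constants visible in this hunk:

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

int main (void)
{
    /* prints roughly 43789.8 and 23170.5, which round to the 43790 and
       23170 used by the #define T3 / #define C4 lines above (assuming the
       tan/cos interpretation is the intended one) */
    printf ("T3 ~ %.1f\n", tan (3.0 * M_PI / 16.0) * 65536.0);
    printf ("C4 ~ %.1f\n", cos (M_PI / 4.0) * 32768.0);
    return 0;
}
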
Lines 464-491
464
    /* Almost identical to mmxext version:  */
472
    /* Almost identical to mmxext version:  */
465
    /* just do both 4x8 columns in parallel */
473
    /* just do both 4x8 columns in parallel */
466
474
467
    static const short t1_vector[] ATTR_ALIGN(16) = {T1,T1,T1,T1,T1,T1,T1,T1};
468
    static const short t2_vector[] ATTR_ALIGN(16) = {T2,T2,T2,T2,T2,T2,T2,T2};
469
    static const short t3_vector[] ATTR_ALIGN(16) = {T3,T3,T3,T3,T3,T3,T3,T3};
470
    static const short c4_vector[] ATTR_ALIGN(16) = {C4,C4,C4,C4,C4,C4,C4,C4};
471
472
#if defined(__x86_64__)
475
#if defined(__x86_64__)
473
476
474
    /* INPUT: block in xmm8 ... xmm15 */
477
    /* INPUT: block in xmm8 ... xmm15 */
475
478
476
    movdqa_m2r (*t1_vector, xmm0);	/* xmm0  = T1 */
479
    movdqa_m2r (t1_vector, xmm0);	/* xmm0  = T1 */
477
    movdqa_r2r (xmm9, xmm1);		/* xmm1  = x1 */
480
    movdqa_r2r (xmm9, xmm1);		/* xmm1  = x1 */
478
481
479
    movdqa_r2r (xmm0, xmm2);		/* xmm2  = T1 */
482
    movdqa_r2r (xmm0, xmm2);		/* xmm2  = T1 */
480
    pmulhw_r2r (xmm1, xmm0);		/* xmm0  = T1*x1 */
483
    pmulhw_r2r (xmm1, xmm0);		/* xmm0  = T1*x1 */
481
484
482
    movdqa_m2r (*t3_vector, xmm5);	/* xmm5  = T3 */
485
    movdqa_m2r (t3_vector, xmm5);	/* xmm5  = T3 */
483
    pmulhw_r2r (xmm15, xmm2);		/* xmm2  = T1*x7 */
486
    pmulhw_r2r (xmm15, xmm2);		/* xmm2  = T1*x7 */
484
487
485
    movdqa_r2r (xmm5, xmm7);		/* xmm7  = T3-1 */
488
    movdqa_r2r (xmm5, xmm7);		/* xmm7  = T3-1 */
486
    psubsw_r2r (xmm15, xmm0);		/* xmm0  = v17 */
489
    psubsw_r2r (xmm15, xmm0);		/* xmm0  = v17 */
487
490
488
    movdqa_m2r (*t2_vector, xmm9);	/* xmm9  = T2 */
491
    movdqa_m2r (t2_vector, xmm9);	/* xmm9  = T2 */
489
    pmulhw_r2r (xmm11, xmm5);		/* xmm5  = (T3-1)*x3 */
492
    pmulhw_r2r (xmm11, xmm5);		/* xmm5  = (T3-1)*x3 */
490
493
491
    paddsw_r2r (xmm2, xmm1);		/* xmm1  = u17 */
494
    paddsw_r2r (xmm2, xmm1);		/* xmm1  = u17 */
Lines 518-524
518
    movdqa_r2r (xmm1, xmm7);		/* xmm7  = u12 */
521
    movdqa_r2r (xmm1, xmm7);		/* xmm7  = u12 */
519
    paddsw_r2r (xmm5, xmm1);		/* xmm1  = u12+v12 */
522
    paddsw_r2r (xmm5, xmm1);		/* xmm1  = u12+v12 */
520
523
521
    movdqa_m2r (*c4_vector, xmm0);	/* xmm0  = C4/2 */
524
    movdqa_m2r (c4_vector, xmm0);	/* xmm0  = C4/2 */
522
    psubsw_r2r (xmm5, xmm7);		/* xmm7  = u12-v12 */
525
    psubsw_r2r (xmm5, xmm7);		/* xmm7  = u12-v12 */
523
526
524
    movdqa_r2r (xmm6, xmm4);		/* xmm4  = b0 */
527
    movdqa_r2r (xmm6, xmm4);		/* xmm4  = b0 */
Lines 577-583
577
    /* OUTPUT: block in xmm8 ... xmm15 */
580
    /* OUTPUT: block in xmm8 ... xmm15 */
578
581
579
#else
582
#else
580
    movdqa_m2r (*t1_vector, xmm0);	/* xmm0 = T1 */
583
    movdqa_m2r (t1_vector, xmm0);	/* xmm0 = T1 */
581
584
582
    movdqa_m2r (*(col+1*8), xmm1);	/* xmm1 = x1 */
585
    movdqa_m2r (*(col+1*8), xmm1);	/* xmm1 = x1 */
583
    movdqa_r2r (xmm0, xmm2);		/* xmm2 = T1 */
586
    movdqa_r2r (xmm0, xmm2);		/* xmm2 = T1 */
Lines 585-591
585
    movdqa_m2r (*(col+7*8), xmm4);	/* xmm4 = x7 */
588
    movdqa_m2r (*(col+7*8), xmm4);	/* xmm4 = x7 */
586
    pmulhw_r2r (xmm1, xmm0);		/* xmm0 = T1*x1 */
589
    pmulhw_r2r (xmm1, xmm0);		/* xmm0 = T1*x1 */
587
590
588
    movdqa_m2r (*t3_vector, xmm5);	/* xmm5 = T3 */
591
    movdqa_m2r (t3_vector, xmm5);	/* xmm5 = T3 */
589
    pmulhw_r2r (xmm4, xmm2);		/* xmm2 = T1*x7 */
592
    pmulhw_r2r (xmm4, xmm2);		/* xmm2 = T1*x7 */
590
593
591
    movdqa_m2r (*(col+5*8), xmm6);	/* xmm6 = x5 */
594
    movdqa_m2r (*(col+5*8), xmm6);	/* xmm6 = x5 */
Lines 594-600
594
    movdqa_m2r (*(col+3*8), xmm3);	/* xmm3 = x3 */
597
    movdqa_m2r (*(col+3*8), xmm3);	/* xmm3 = x3 */
595
    psubsw_r2r (xmm4, xmm0);		/* xmm0 = v17 */
598
    psubsw_r2r (xmm4, xmm0);		/* xmm0 = v17 */
596
599
597
    movdqa_m2r (*t2_vector, xmm4);	/* xmm4 = T2 */
600
    movdqa_m2r (t2_vector, xmm4);	/* xmm4 = T2 */
598
    pmulhw_r2r (xmm3, xmm5);		/* xmm5 = (T3-1)*x3 */
601
    pmulhw_r2r (xmm3, xmm5);		/* xmm5 = (T3-1)*x3 */
599
602
600
    paddsw_r2r (xmm2, xmm1);		/* xmm1 = u17 */
603
    paddsw_r2r (xmm2, xmm1);		/* xmm1 = u17 */
Lines 632-638
632
    movdqa_m2r (*(col+0*8), xmm3);	/* xmm3 = x0 */
635
    movdqa_m2r (*(col+0*8), xmm3);	/* xmm3 = x0 */
633
    paddsw_r2r (xmm5, xmm1);		/* xmm1 = u12+v12 */
636
    paddsw_r2r (xmm5, xmm1);		/* xmm1 = u12+v12 */
634
637
635
    movdqa_m2r (*c4_vector, xmm0);	/* xmm0 = C4/2 */
638
    movdqa_m2r (c4_vector, xmm0);	/* xmm0 = C4/2 */
636
    psubsw_r2r (xmm5, xmm7);		/* xmm7 = u12-v12 */
639
    psubsw_r2r (xmm5, xmm7);		/* xmm7 = u12-v12 */
637
640
638
    movdqa_r2m (xmm6, *(col+5*8));	/* save b0 in scratch1 */
641
    movdqa_r2m (xmm6, *(col+5*8));	/* save b0 in scratch1 */
Lines 710-724
710
/* MMX column IDCT */
713
/* MMX column IDCT */
711
static inline void idct_col (int16_t * const col, const int offset)
714
static inline void idct_col (int16_t * const col, const int offset)
712
{
715
{
713
    static const short t1_vector[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
714
    static const short t2_vector[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
715
    static const short t3_vector[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
716
    static const short c4_vector[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
717
718
    /* column code adapted from peter gubanov */
716
    /* column code adapted from peter gubanov */
719
    /* http://www.elecard.com/peter/idct.shtml */
717
    /* http://www.elecard.com/peter/idct.shtml */
720
718
721
    movq_m2r (*t1_vector, mm0);		/* mm0 = T1 */
719
    movq_m2r (t1_vector, mm0);		/* mm0 = T1 */
722
720
723
    movq_m2r (*(col+offset+1*8), mm1);	/* mm1 = x1 */
721
    movq_m2r (*(col+offset+1*8), mm1);	/* mm1 = x1 */
724
    movq_r2r (mm0, mm2);		/* mm2 = T1 */
722
    movq_r2r (mm0, mm2);		/* mm2 = T1 */
Lines 726-732
726
    movq_m2r (*(col+offset+7*8), mm4);	/* mm4 = x7 */
724
    movq_m2r (*(col+offset+7*8), mm4);	/* mm4 = x7 */
727
    pmulhw_r2r (mm1, mm0);		/* mm0 = T1*x1 */
725
    pmulhw_r2r (mm1, mm0);		/* mm0 = T1*x1 */
728
726
729
    movq_m2r (*t3_vector, mm5);		/* mm5 = T3 */
727
    movq_m2r (t3_vector, mm5);		/* mm5 = T3 */
730
    pmulhw_r2r (mm4, mm2);		/* mm2 = T1*x7 */
728
    pmulhw_r2r (mm4, mm2);		/* mm2 = T1*x7 */
731
729
732
    movq_m2r (*(col+offset+5*8), mm6);	/* mm6 = x5 */
730
    movq_m2r (*(col+offset+5*8), mm6);	/* mm6 = x5 */
Lines 735-741
735
    movq_m2r (*(col+offset+3*8), mm3);	/* mm3 = x3 */
733
    movq_m2r (*(col+offset+3*8), mm3);	/* mm3 = x3 */
736
    psubsw_r2r (mm4, mm0);		/* mm0 = v17 */
734
    psubsw_r2r (mm4, mm0);		/* mm0 = v17 */
737
735
738
    movq_m2r (*t2_vector, mm4);		/* mm4 = T2 */
736
    movq_m2r (t2_vector, mm4);		/* mm4 = T2 */
739
    pmulhw_r2r (mm3, mm5);		/* mm5 = (T3-1)*x3 */
737
    pmulhw_r2r (mm3, mm5);		/* mm5 = (T3-1)*x3 */
740
738
741
    paddsw_r2r (mm2, mm1);		/* mm1 = u17 */
739
    paddsw_r2r (mm2, mm1);		/* mm1 = u17 */
Lines 773-779
773
    movq_m2r (*(col+offset+0*8), mm3);	/* mm3 = x0 */
771
    movq_m2r (*(col+offset+0*8), mm3);	/* mm3 = x0 */
774
    paddsw_r2r (mm5, mm1);		/* mm1 = u12+v12 */
772
    paddsw_r2r (mm5, mm1);		/* mm1 = u12+v12 */
775
773
776
    movq_m2r (*c4_vector, mm0);		/* mm0 = C4/2 */
774
    movq_m2r (c4_vector, mm0);		/* mm0 = C4/2 */
777
    psubsw_r2r (mm5, mm7);		/* mm7 = u12-v12 */
775
    psubsw_r2r (mm5, mm7);		/* mm7 = u12-v12 */
778
776
779
    movq_r2m (mm6, *(col+offset+5*8));	/* save b0 in scratch1 */
777
    movq_r2m (mm6, *(col+offset+5*8));	/* save b0 in scratch1 */
Lines 846-897
846
    movq_r2m (mm4, *(col+offset+3*8));	/* save y3 */
844
    movq_r2m (mm4, *(col+offset+3*8));	/* save y3 */
847
}
845
}
848
846
849
847
/* MMX and SSE can share these constants */
850
static const int32_t rounder0[] ATTR_ALIGN(8) =
848
static const struct x4 rounder0 ATTR_ALIGN(16) =
851
    rounder ((1 << (COL_SHIFT - 1)) - 0.5);
849
   {rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5)};
852
static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
850
static const struct x4 rounder4 ATTR_ALIGN(16) = {rounder_sse2 (0)};
853
static const int32_t rounder1[] ATTR_ALIGN(8) =
851
static const struct x4 rounder1 ATTR_ALIGN(16) =
854
    rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
852
    {rounder_sse2 (1.25683487303)};	/* C1*(C1/C4+C1+C7)/2 */
855
static const int32_t rounder7[] ATTR_ALIGN(8) =
853
static const struct x4 rounder7 ATTR_ALIGN(16) =
856
    rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
854
    {rounder_sse2 (-0.25)};		/* C1*(C7/C4+C7-C1)/2 */
857
static const int32_t rounder2[] ATTR_ALIGN(8) =
855
static const struct x4 rounder2 ATTR_ALIGN(16) =
858
    rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
856
    {rounder_sse2 (0.60355339059)};	/* C2 * (C6+C2)/2 */
859
static const int32_t rounder6[] ATTR_ALIGN(8) =
857
static const struct x4 rounder6 ATTR_ALIGN(16) =
860
    rounder (-0.25);		/* C2 * (C6-C2)/2 */
858
    {rounder_sse2 (-0.25)};		/* C2 * (C6-C2)/2 */
861
static const int32_t rounder3[] ATTR_ALIGN(8) =
859
static const struct x4 rounder3 ATTR_ALIGN(16) =
862
    rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
860
    {rounder_sse2 (0.087788325588)};	/* C3*(-C3/C4+C3+C5)/2 */
863
static const int32_t rounder5[] ATTR_ALIGN(8) =
861
static const struct x4 rounder5 ATTR_ALIGN(16) =
864
    rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
862
    {rounder_sse2 (-0.441341716183)};	/* C3*(-C5/C4+C5-C3)/2 */
865
866
863
867
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
864
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
868
static inline void idct (int16_t * const block)				\
865
static inline void idct (int16_t * const block)				\
869
{									\
866
{									\
870
    static const int16_t table04[] ATTR_ALIGN(16) =			\
867
    static const struct x16_4 table04[8] ATTR_ALIGN(16) =			\
871
	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
868
	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
872
    static const int16_t table17[] ATTR_ALIGN(16) =			\
869
    static const struct x16_4 table17[8] ATTR_ALIGN(16) =			\
873
	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
870
	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
874
    static const int16_t table26[] ATTR_ALIGN(16) =			\
871
    static const struct x16_4 table26[8] ATTR_ALIGN(16) =			\
875
	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
872
	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
876
    static const int16_t table35[] ATTR_ALIGN(16) =			\
873
    static const struct x16_4 table35[8] ATTR_ALIGN(16) =			\
877
	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
874
	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
878
									\
875
									\
879
    idct_row_head (block, 0*8, table04);				\
876
    idct_row_head (block, 0*8, table04);				\
880
    idct_row (table04, rounder0);					\
877
    idct_row (table04, &rounder0);					\
881
    idct_row_mid (block, 0*8, 4*8, table04);				\
878
    idct_row_mid (block, 0*8, 4*8, table04);				\
882
    idct_row (table04, rounder4);					\
879
    idct_row (table04, &rounder4);					\
883
    idct_row_mid (block, 4*8, 1*8, table17);				\
880
    idct_row_mid (block, 4*8, 1*8, table17);				\
884
    idct_row (table17, rounder1);					\
881
    idct_row (table17, &rounder1);					\
885
    idct_row_mid (block, 1*8, 7*8, table17);				\
882
    idct_row_mid (block, 1*8, 7*8, table17);				\
886
    idct_row (table17, rounder7);					\
883
    idct_row (table17, &rounder7);					\
887
    idct_row_mid (block, 7*8, 2*8, table26);				\
884
    idct_row_mid (block, 7*8, 2*8, table26);				\
888
    idct_row (table26, rounder2);					\
885
    idct_row (table26, &rounder2);					\
889
    idct_row_mid (block, 2*8, 6*8, table26);				\
886
    idct_row_mid (block, 2*8, 6*8, table26);				\
890
    idct_row (table26, rounder6);					\
887
    idct_row (table26, &rounder6);					\
891
    idct_row_mid (block, 6*8, 3*8, table35);				\
888
    idct_row_mid (block, 6*8, 3*8, table35);				\
892
    idct_row (table35, rounder3);					\
889
    idct_row (table35, &rounder3);					\
893
    idct_row_mid (block, 3*8, 5*8, table35);				\
890
    idct_row_mid (block, 3*8, 5*8, table35);				\
894
    idct_row (table35, rounder5);					\
891
    idct_row (table35, &rounder5);					\
895
    idct_row_tail (block, 5*8);						\
892
    idct_row_tail (block, 5*8);						\
896
									\
893
									\
897
    idct_col (block, 0);						\
894
    idct_col (block, 0);						\
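The comments on the rounder constants above give each bias as an expression in the cosine constants Ck = cos(k*pi/16). As a sanity check (an editorial addition, not part of the patch), the first non-trivial one can be reproduced numerically:

#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

int main (void)
{
    const double c1 = cos (1.0 * M_PI / 16.0);
    const double c4 = cos (4.0 * M_PI / 16.0);
    const double c7 = cos (7.0 * M_PI / 16.0);

    /* prints approximately 1.25683487303, matching the rounder1 value and
       its C1*(C1/C4+C1+C7)/2 comment */
    printf ("%.11f\n", c1 * (c1 / c4 + c1 + c7) / 2.0);
    return 0;
}
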
Lines 900-971
900
897
901
static inline void sse2_idct (int16_t * const block)
898
static inline void sse2_idct (int16_t * const block)
902
{
899
{
903
    static const int16_t table04[] ATTR_ALIGN(16) =
900
    static const struct x16_8 table04[4] ATTR_ALIGN(16) =
904
	sse2_table (22725, 21407, 19266, 16384, 12873,  8867, 4520);
901
	sse2_table (22725, 21407, 19266, 16384, 12873,  8867, 4520);
905
    static const int16_t table17[] ATTR_ALIGN(16) =
902
    static const struct x16_8 table17[4] ATTR_ALIGN(16) =
906
	sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270);
903
	sse2_table (31521, 29692, 26722, 22725, 17855, 12299, 6270);
907
    static const int16_t table26[] ATTR_ALIGN(16) =
904
    static const struct x16_8 table26[4] ATTR_ALIGN(16) =
908
	sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906);
905
	sse2_table (29692, 27969, 25172, 21407, 16819, 11585, 5906);
909
    static const int16_t table35[] ATTR_ALIGN(16) =
906
    static const struct x16_8 table35[4] ATTR_ALIGN(16) =
910
	sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315);
907
	sse2_table (26722, 25172, 22654, 19266, 15137, 10426, 5315);
911
908
912
    static const int32_t rounder0_128[] ATTR_ALIGN(16) =
913
	rounder_sse2 ((1 << (COL_SHIFT - 1)) - 0.5);
914
    static const int32_t rounder4_128[] ATTR_ALIGN(16) = rounder_sse2 (0);
915
    static const int32_t rounder1_128[] ATTR_ALIGN(16) =
916
	rounder_sse2 (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
917
    static const int32_t rounder7_128[] ATTR_ALIGN(16) =
918
	rounder_sse2 (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
919
    static const int32_t rounder2_128[] ATTR_ALIGN(16) =
920
	rounder_sse2 (0.60355339059);	/* C2 * (C6+C2)/2 */
921
    static const int32_t rounder6_128[] ATTR_ALIGN(16) =
922
	rounder_sse2 (-0.25);		/* C2 * (C6-C2)/2 */
923
    static const int32_t rounder3_128[] ATTR_ALIGN(16) =
924
	rounder_sse2 (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
925
    static const int32_t rounder5_128[] ATTR_ALIGN(16) =
926
	rounder_sse2 (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
927
928
#if defined(__x86_64__)
909
#if defined(__x86_64__)
929
    movdqa_m2r (block[0*8], xmm8);
910
    movdqa_m2r (block[0*8], xmm8);
930
    movdqa_m2r (block[4*8], xmm12);
911
    movdqa_m2r (block[4*8], xmm12);
931
    SSE2_IDCT_2ROW (table04,  xmm8, xmm12, *rounder0_128, *rounder4_128);
912
    SSE2_IDCT_2ROW (table04,  xmm8, xmm12, rounder0, rounder4);
932
913
933
    movdqa_m2r (block[1*8], xmm9);
914
    movdqa_m2r (block[1*8], xmm9);
934
    movdqa_m2r (block[7*8], xmm15);
915
    movdqa_m2r (block[7*8], xmm15);
935
    SSE2_IDCT_2ROW (table17,  xmm9, xmm15, *rounder1_128, *rounder7_128);
916
    SSE2_IDCT_2ROW (table17,  xmm9, xmm15, rounder1, rounder7);
936
917
937
    movdqa_m2r (block[2*8], xmm10);
918
    movdqa_m2r (block[2*8], xmm10);
938
    movdqa_m2r (block[6*8], xmm14);
919
    movdqa_m2r (block[6*8], xmm14);
939
    SSE2_IDCT_2ROW (table26, xmm10, xmm14, *rounder2_128, *rounder6_128);
920
    SSE2_IDCT_2ROW (table26, xmm10, xmm14, rounder2, rounder6);
940
921
941
    movdqa_m2r (block[3*8], xmm11);
922
    movdqa_m2r (block[3*8], xmm11);
942
    movdqa_m2r (block[5*8], xmm13);
923
    movdqa_m2r (block[5*8], xmm13);
943
    SSE2_IDCT_2ROW (table35, xmm11, xmm13, *rounder3_128, *rounder5_128);
924
    SSE2_IDCT_2ROW (table35, xmm11, xmm13, rounder3, rounder5);
944
925
945
    /* OUTPUT: block in xmm8 ... xmm15 */
926
    /* OUTPUT: block in xmm8 ... xmm15 */
946
927
947
#else
928
#else
948
    movdqa_m2r (block[0*8], xmm0);
929
    movdqa_m2r (block[0*8], xmm0);
949
    movdqa_m2r (block[4*8], xmm4);
930
    movdqa_m2r (block[4*8], xmm4);
950
    SSE2_IDCT_2ROW (table04, xmm0, xmm4, *rounder0_128, *rounder4_128);
931
    SSE2_IDCT_2ROW (table04, xmm0, xmm4, rounder0, rounder4);
951
    movdqa_r2m (xmm0, block[0*8]);
932
    movdqa_r2m (xmm0, block[0*8]);
952
    movdqa_r2m (xmm4, block[4*8]);
933
    movdqa_r2m (xmm4, block[4*8]);
953
934
954
    movdqa_m2r (block[1*8], xmm0);
935
    movdqa_m2r (block[1*8], xmm0);
955
    movdqa_m2r (block[7*8], xmm4);
936
    movdqa_m2r (block[7*8], xmm4);
956
    SSE2_IDCT_2ROW (table17, xmm0, xmm4, *rounder1_128, *rounder7_128);
937
    SSE2_IDCT_2ROW (table17, xmm0, xmm4, rounder1, rounder7);
957
    movdqa_r2m (xmm0, block[1*8]);
938
    movdqa_r2m (xmm0, block[1*8]);
958
    movdqa_r2m (xmm4, block[7*8]);
939
    movdqa_r2m (xmm4, block[7*8]);
959
940
960
    movdqa_m2r (block[2*8], xmm0);
941
    movdqa_m2r (block[2*8], xmm0);
961
    movdqa_m2r (block[6*8], xmm4);
942
    movdqa_m2r (block[6*8], xmm4);
962
    SSE2_IDCT_2ROW (table26, xmm0, xmm4, *rounder2_128, *rounder6_128);
943
    SSE2_IDCT_2ROW (table26, xmm0, xmm4, rounder2, rounder6);
963
    movdqa_r2m (xmm0, block[2*8]);
944
    movdqa_r2m (xmm0, block[2*8]);
964
    movdqa_r2m (xmm4, block[6*8]);
945
    movdqa_r2m (xmm4, block[6*8]);
965
946
966
    movdqa_m2r (block[3*8], xmm0);
947
    movdqa_m2r (block[3*8], xmm0);
967
    movdqa_m2r (block[5*8], xmm4);
948
    movdqa_m2r (block[5*8], xmm4);
968
    SSE2_IDCT_2ROW (table35, xmm0, xmm4, *rounder3_128, *rounder5_128);
949
    SSE2_IDCT_2ROW (table35, xmm0, xmm4, rounder3, rounder5);
969
    movdqa_r2m (xmm0, block[3*8]);
950
    movdqa_r2m (xmm0, block[3*8]);
970
    movdqa_r2m (xmm4, block[5*8]);
951
    movdqa_r2m (xmm4, block[5*8]);
971
#endif
952
#endif
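One last note on the hunk above: with the per-function rounder*_128 copies removed, SSE2_IDCT_2ROW now takes its round1/round2 biases straight from the shared struct x4 rounders, which the patch declares with ATTR_ALIGN(16). That matters because movdqa and legacy-encoded SSE2 instructions with memory operands (such as the paddd behind paddd_m2r) fault on data that is not 16-byte aligned. A stand-alone sketch of that requirement, using a local stand-in for ATTR_ALIGN (assumed here to be the usual GCC aligned attribute) and an arbitrary bias value:

#include <assert.h>
#include <stdint.h>

#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))

struct x4 { int32_t x[4]; };

/* hypothetical rounder; the value is arbitrary, only the alignment matters */
static const struct x4 demo_rounder ATTR_ALIGN(16) = {{32768, 32768, 32768, 32768}};

int main (void)
{
    /* a 16-byte aligned address has its low four bits clear */
    assert (((uintptr_t) &demo_rounder & 15u) == 0);
    return 0;
}
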
