Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 68549
Collapse All | Expand All

(-)libtheora-1.0alpha3/lib/blockmap.c (-1 / +1 lines)
Lines 21-27 Link Here
21
                            ogg_uint32_t FirstSB,
21
                            ogg_uint32_t FirstSB,
22
                            ogg_uint32_t FirstFrag, ogg_uint32_t HFrags,
22
                            ogg_uint32_t FirstFrag, ogg_uint32_t HFrags,
23
                            ogg_uint32_t VFrags ){
23
                            ogg_uint32_t VFrags ){
24
  ogg_uint32_t i, j;
24
  ogg_uint32_t i, j = 0;
25
  ogg_uint32_t xpos;
25
  ogg_uint32_t xpos;
26
  ogg_uint32_t ypos;
26
  ogg_uint32_t ypos;
27
  ogg_uint32_t SBrow, SBcol;
27
  ogg_uint32_t SBrow, SBcol;
(-)libtheora-1.0alpha3/lib/cpu.c (+107 lines)
Line 0 Link Here
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function: run-time detection of x86 SIMD capabilities (MMX, 3DNow!, SSE)
14
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
15
16
 ********************************************************************/
17
18
#include "cpu.h"
19
20
ogg_uint32_t cpu_flags = 0;
21
22
#if 1
23
static ogg_uint32_t cpu_get_flags (void)
24
{
25
  ogg_uint32_t eax, ebx, ecx, edx;
26
  ogg_uint32_t flags;
27
28
#define cpuid(op,eax,ebx,ecx,edx)      \
29
  asm volatile ("pushl %%ebx   \n\t"   \
30
                "cpuid         \n\t"   \
31
                "movl %%ebx,%1 \n\t"   \
32
                "popl %%ebx"           \
33
              : "=a" (eax),            \
34
                "=r" (ebx),            \
35
                "=c" (ecx),            \
36
                "=d" (edx)             \
37
              : "a" (op)               \
38
              : "cc")
39
40
  asm volatile ("pushfl              \n\t"
41
                "pushfl              \n\t"
42
                "popl %0             \n\t"
43
                "movl %0,%1          \n\t"
44
                "xorl $0x200000,%0   \n\t"
45
                "pushl %0            \n\t"
46
                "popfl               \n\t"
47
                "pushfl              \n\t"
48
                "popl %0             \n\t"
49
                "popfl"
50
              : "=r" (eax),
51
                "=r" (ebx)
52
              :
53
              : "cc");
54
         
55
  if (eax == ebx)             /* no cpuid */
56
    return 0;
57
58
  cpuid(0, eax, ebx, ecx, edx);
59
60
  if (ebx == 0x756e6547 &&
61
      edx == 0x49656e69 &&
62
      ecx == 0x6c65746e) {
63
    /* intel */
64
65
  inteltest:
66
    cpuid(1, eax, ebx, ecx, edx);
67
    if ((edx & 0x00800000) == 0)
68
      return 0;
69
    flags = CPU_X86_MMX;
70
    if (edx & 0x02000000)
71
      flags |= CPU_X86_MMXEXT | CPU_X86_SSE;
72
    if (edx & 0x04000000)
73
      flags |= CPU_X86_SSE2;
74
    return flags;
75
  } else if (ebx == 0x68747541 &&
76
             edx == 0x69746e65 &&
77
             ecx == 0x444d4163) {
78
    /* AMD */
79
    cpuid(0x80000000, eax, ebx, ecx, edx);
80
    if ((unsigned)eax < 0x80000001)
81
      goto inteltest;
82
    cpuid(0x80000001, eax, ebx, ecx, edx);
83
    if ((edx & 0x00800000) == 0)
84
      return 0;
85
    flags = CPU_X86_MMX;
86
    if (edx & 0x80000000)
87
      flags |= CPU_X86_3DNOW;
88
    if (edx & 0x00400000)
89
      flags |= CPU_X86_MMXEXT;
90
    return flags;
91
  }
92
  else {
93
    /* implement me */
94
  }
95
96
  return flags;
97
}
98
#else
99
static ogg_uint32_t cpu_get_flags (void) {
100
  return 0;
101
}
102
#endif
103
104
void cpu_init () 
105
{
106
  cpu_flags = cpu_get_flags();
107
}
(-)libtheora-1.0alpha3/lib/cpu.h (+28 lines)
Line 0 Link Here
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function: CPU capability flags and detection entry point
14
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
15
16
 ********************************************************************/
17
18
#ifndef THEORA_CPU_H
#define THEORA_CPU_H

#include "encoder_internal.h"

/* Bitmask of CPU_X86_* values; assigned by cpu_init(). */
extern ogg_uint32_t cpu_flags;

/* Capability bits stored in cpu_flags. */
#define CPU_X86_MMX	(1<<0)
#define CPU_X86_3DNOW	(1<<1)
#define CPU_X86_MMXEXT	(1<<2)
#define CPU_X86_SSE	(1<<3)
#define CPU_X86_SSE2	(1<<4)

/* Detect CPU capabilities and store them in cpu_flags. */
void cpu_init (void);

#endif /* THEORA_CPU_H */
(-)libtheora-1.0alpha3/lib/dct.c (-1 / +11 lines)
Lines 16-21 Link Here
16
 ********************************************************************/
16
 ********************************************************************/
17
17
18
#include "encoder_internal.h"
18
#include "encoder_internal.h"
19
#include "cpu.h"
19
20
20
static ogg_int32_t xC1S7 = 64277;
21
static ogg_int32_t xC1S7 = 64277;
21
static ogg_int32_t xC2S6 = 60547;
22
static ogg_int32_t xC2S6 = 60547;
Lines 28-34 Link Here
28
#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
29
#define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
29
#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
30
#define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
30
31
31
void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
32
static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
32
  int loop;
33
  int loop;
33
34
34
  ogg_int32_t  is07, is12, is34, is56;
35
  ogg_int32_t  is07, is12, is34, is56;
Lines 251-253 Link Here
251
    op ++;
252
    op ++;
252
  }
253
  }
253
}
254
}
255
256
/*
 * Install the forward DCT implementation into the function table `funcs`.
 *
 * The portable C routine is installed first; when cpu_flags reports MMX,
 * dsp_i386_mmx_fdct_init() is then called on the same table.  Previously the
 * MMX initializer was always handed the global dsp_funcs, so a caller passing
 * any other table never received the MMX routine.
 */
void dsp_dct_init (DspFunctions *funcs)
{
  funcs->fdct_short = fdct_short__c;
  if (cpu_flags & CPU_X86_MMX) {
    dsp_i386_mmx_fdct_init(funcs);
  }
}
263
(-)libtheora-1.0alpha3/lib/dct_decode.c (-33 / +18 lines)
Lines 18-23 Link Here
18
#include <stdlib.h>
18
#include <stdlib.h>
19
#include <string.h>
19
#include <string.h>
20
#include "encoder_internal.h"
20
#include "encoder_internal.h"
21
#include "dsp.h"
21
22
22
23
23
#define GOLDEN_FRAME_THRESH_Q   50
24
#define GOLDEN_FRAME_THRESH_Q   50
Lines 112-133 Link Here
112
  SetupBoundingValueArray_Generic(pbi, FLimit);
113
  SetupBoundingValueArray_Generic(pbi, FLimit);
113
}
114
}
114
115
115
/*
 * Copy an 8x8 block of bytes from src to dest.
 *
 * Both pointers advance by srcstride bytes per row, so source and
 * destination are taken to share the same line pitch.  Rows are copied with
 * memcpy instead of 32-bit stores through cast pointers, so neither pointer
 * needs 4-byte alignment.
 */
void CopyBlock(unsigned char *src,
               unsigned char *dest,
               unsigned int srcstride){
  int row;

  for ( row = 0; row < 8; row++ ){
    memcpy(dest, src, 8);
    src += srcstride;
    dest += srcstride;
  }
}
130
131
static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
116
static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
132
  ogg_uint32_t ReconPixelsPerLine;
117
  ogg_uint32_t ReconPixelsPerLine;
133
  ogg_int32_t     ReconPixelIndex;
118
  ogg_int32_t     ReconPixelIndex;
Lines 160-167 Link Here
160
  ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
145
  ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
161
146
162
  /* Get the pixel index for the first pixel in the fragment. */
147
  /* Get the pixel index for the first pixel in the fragment. */
163
  ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
148
  dsp_static_recon_intra8x8 ((unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
164
              (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine );
149
              (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine);
165
150
166
}
151
}
167
152
Lines 237-246 Link Here
237
    /* Reconstruct the pixel data using the last frame reconstruction
222
    /* Reconstruct the pixel data using the last frame reconstruction
238
       and change data when the motion vector is (0,0), the recon is
223
       and change data when the motion vector is (0,0), the recon is
239
       based on the lastframe without loop filtering---- for testing */
224
       based on the lastframe without loop filtering---- for testing */
240
    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
225
    dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
241
                &pbi->LastFrameRecon[ReconPixelIndex],
226
                &pbi->LastFrameRecon[ReconPixelIndex],
242
                pbi->ReconDataBuffer, ReconPixelsPerLine );
227
                  pbi->ReconDataBuffer, ReconPixelsPerLine);
243
244
  }else if ( ModeUsesMC[pbi->CodingMode] ) {
228
  }else if ( ModeUsesMC[pbi->CodingMode] ) {
245
    /* The mode uses a motion vector. */
229
    /* The mode uses a motion vector. */
246
    /* Get vector from list */
230
    /* Get vector from list */
Lines 287-315 Link Here
287
    if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) {
271
    if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) {
288
      /* Reconstruct the pixel data from the reference frame and change data
272
      /* Reconstruct the pixel data from the reference frame and change data
289
         (no half pixel in this case as the two references were the same. */
273
         (no half pixel in this case as the two references were the same. */
290
      ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
274
      dsp_static_recon_inter8x8 (
275
		  &pbi->ThisFrameRecon[ReconPixelIndex],
291
                  LastFrameRecPtr, pbi->ReconDataBuffer,
276
                  LastFrameRecPtr, pbi->ReconDataBuffer,
292
                  ReconPixelsPerLine );
277
                  ReconPixelsPerLine);
293
    }else{
278
    }else{
294
      /* Fractional pixel reconstruction. */
279
      /* Fractional pixel reconstruction. */
295
      /* Note that we only use two pixels per reconstruction even for
280
      /* Note that we only use two pixels per reconstruction even for
296
         the diagonal. */
281
         the diagonal. */
297
      ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex],
282
      dsp_static_recon_inter8x8_half(&pbi->ThisFrameRecon[ReconPixelIndex],
298
                            LastFrameRecPtr, LastFrameRecPtr2,
283
                            LastFrameRecPtr, LastFrameRecPtr2,
299
                            pbi->ReconDataBuffer, ReconPixelsPerLine );
284
                            pbi->ReconDataBuffer, ReconPixelsPerLine);
300
    }
285
    }
301
  } else if ( pbi->CodingMode == CODE_USING_GOLDEN ){
286
  } else if ( pbi->CodingMode == CODE_USING_GOLDEN ){
302
    /* Golden frame with motion vector */
287
    /* Golden frame with motion vector */
303
    /* Reconstruct the pixel data using the golden frame
288
    /* Reconstruct the pixel data using the golden frame
304
       reconstruction and change data */
289
       reconstruction and change data */
305
    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
290
    dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
306
                &pbi->GoldenFrame[ ReconPixelIndex ],
291
                &pbi->GoldenFrame[ ReconPixelIndex ],
307
                pbi->ReconDataBuffer, ReconPixelsPerLine );
292
                  pbi->ReconDataBuffer, ReconPixelsPerLine);
308
  } else {
293
  } else {
309
    /* Simple Intra coding */
294
    /* Simple Intra coding */
310
    /* Get the pixel index for the first pixel in the fragment. */
295
    /* Get the pixel index for the first pixel in the fragment. */
311
    ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
296
    dsp_static_recon_intra8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
312
                pbi->ReconDataBuffer, ReconPixelsPerLine );
297
              pbi->ReconDataBuffer, ReconPixelsPerLine);
313
  }
298
  }
314
}
299
}
315
300
Lines 464-470 Link Here
464
      SrcPtr = &SrcReconPtr[ PixelIndex ];
449
      SrcPtr = &SrcReconPtr[ PixelIndex ];
465
      DestPtr = &DestReconPtr[ PixelIndex ];
450
      DestPtr = &DestReconPtr[ PixelIndex ];
466
451
467
      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
452
      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
468
    }
453
    }
469
  }
454
  }
470
455
Lines 476-482 Link Here
476
      SrcPtr = &SrcReconPtr[ PixelIndex ];
461
      SrcPtr = &SrcReconPtr[ PixelIndex ];
477
      DestPtr = &DestReconPtr[ PixelIndex ];
462
      DestPtr = &DestReconPtr[ PixelIndex ];
478
463
479
      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
464
      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
480
465
481
    }
466
    }
482
  }
467
  }
Lines 505-511 Link Here
505
      SrcPtr = &SrcReconPtr[ PixelIndex ];
490
      SrcPtr = &SrcReconPtr[ PixelIndex ];
506
      DestPtr = &DestReconPtr[ PixelIndex ];
491
      DestPtr = &DestReconPtr[ PixelIndex ];
507
492
508
      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
493
      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
509
    }
494
    }
510
  }
495
  }
511
496
Lines 517-523 Link Here
517
      SrcPtr = &SrcReconPtr[ PixelIndex ];
502
      SrcPtr = &SrcReconPtr[ PixelIndex ];
518
      DestPtr = &DestReconPtr[ PixelIndex ];
503
      DestPtr = &DestReconPtr[ PixelIndex ];
519
504
520
      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
505
      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
521
506
522
    }
507
    }
523
  }
508
  }
(-)libtheora-1.0alpha3/lib/dct_encode.c (-110 / +13 lines)
Lines 17-126 Link Here
17
17
18
#include <stdlib.h>
18
#include <stdlib.h>
19
#include "encoder_internal.h"
19
#include "encoder_internal.h"
20
#include "dsp.h"
20
21
21
static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
22
static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
22
23
23
static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr,
24
                  ogg_int16_t *DctInputPtr, unsigned char *old_ptr1,
25
                  unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine,
26
                  ogg_uint32_t ReconPixelsPerLine ) {
27
  int i;
28
29
  /* For each block row */
30
  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
31
    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - ((int)ReconPtr[0]) );
32
    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - ((int)ReconPtr[1]) );
33
    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - ((int)ReconPtr[2]) );
34
    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - ((int)ReconPtr[3]) );
35
    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - ((int)ReconPtr[4]) );
36
    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - ((int)ReconPtr[5]) );
37
    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - ((int)ReconPtr[6]) );
38
    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - ((int)ReconPtr[7]) );
39
40
    /* Update the screen canvas in one step*/
41
    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
42
    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
43
44
    /* Start next row */
45
    new_ptr1 += PixelsPerLine;
46
    old_ptr1 += PixelsPerLine;
47
    FiltPtr += PixelsPerLine;
48
    ReconPtr += ReconPixelsPerLine;
49
    DctInputPtr += BLOCK_HEIGHT_WIDTH;
50
  }
51
}
52
53
static void Sub8_128 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
54
                      unsigned char *old_ptr1, unsigned char *new_ptr1,
55
                      ogg_uint32_t PixelsPerLine ) {
56
  int i;
57
  /* For each block row */
58
  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
59
    /* INTRA mode so code raw image data */
60
    /* We convert the data to 8 bit signed (by subtracting 128) as
61
       this reduces the internal precision requirments in the DCT
62
       transform. */
63
    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - 128);
64
    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - 128);
65
    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - 128);
66
    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - 128);
67
    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - 128);
68
    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - 128);
69
    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - 128);
70
    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - 128);
71
72
    /* Update the screen canvas in one step */
73
    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
74
    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
75
76
    /* Start next row */
77
    new_ptr1 += PixelsPerLine;
78
    old_ptr1 += PixelsPerLine;
79
    FiltPtr += PixelsPerLine;
80
    DctInputPtr += BLOCK_HEIGHT_WIDTH;
81
  }
82
}
83
84
static void Sub8Av2 (unsigned char *FiltPtr, unsigned char *ReconPtr1,
85
                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
86
                     unsigned char *old_ptr1, unsigned char *new_ptr1,
87
                     ogg_uint32_t PixelsPerLine,
88
                     ogg_uint32_t ReconPixelsPerLine ) {
89
  int i;
90
91
  /* For each block row */
92
  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
93
    DctInputPtr[0] = (ogg_int16_t)
94
      ((int)(FiltPtr[0]) - (((int)ReconPtr1[0] + (int)ReconPtr2[0]) / 2) );
95
    DctInputPtr[1] = (ogg_int16_t)
96
      ((int)(FiltPtr[1]) - (((int)ReconPtr1[1] + (int)ReconPtr2[1]) / 2) );
97
    DctInputPtr[2] = (ogg_int16_t)
98
      ((int)(FiltPtr[2]) - (((int)ReconPtr1[2] + (int)ReconPtr2[2]) / 2) );
99
    DctInputPtr[3] = (ogg_int16_t)
100
      ((int)(FiltPtr[3]) - (((int)ReconPtr1[3] + (int)ReconPtr2[3]) / 2) );
101
    DctInputPtr[4] = (ogg_int16_t)
102
      ((int)(FiltPtr[4]) - (((int)ReconPtr1[4] + (int)ReconPtr2[4]) / 2) );
103
    DctInputPtr[5] = (ogg_int16_t)
104
      ((int)(FiltPtr[5]) - (((int)ReconPtr1[5] + (int)ReconPtr2[5]) / 2) );
105
    DctInputPtr[6] = (ogg_int16_t)
106
      ((int)(FiltPtr[6]) - (((int)ReconPtr1[6] + (int)ReconPtr2[6]) / 2) );
107
    DctInputPtr[7] = (ogg_int16_t)
108
      ((int)(FiltPtr[7]) - (((int)ReconPtr1[7] + (int)ReconPtr2[7]) / 2) );
109
110
    /* Update the screen canvas in one step */
111
    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
112
    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
113
114
    /* Start next row */
115
    new_ptr1 += PixelsPerLine;
116
    old_ptr1 += PixelsPerLine;
117
    FiltPtr += PixelsPerLine;
118
    ReconPtr1 += ReconPixelsPerLine;
119
    ReconPtr2 += ReconPixelsPerLine;
120
    DctInputPtr += BLOCK_HEIGHT_WIDTH;
121
  }
122
}
123
124
static unsigned char TokenizeDctValue (ogg_int16_t DataValue,
24
static unsigned char TokenizeDctValue (ogg_int16_t DataValue,
125
                                       ogg_uint32_t * TokenListPtr ){
25
                                       ogg_uint32_t * TokenListPtr ){
126
  unsigned char tokens_added = 0;
26
  unsigned char tokens_added = 0;
Lines 452-464 Link Here
452
352
453
  /* Is the MV offset exactly pixel aligned */
353
  /* Is the MV offset exactly pixel aligned */
454
  if ( AbsRefOffset == 0 ){
354
  if ( AbsRefOffset == 0 ){
455
    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
355
    dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
456
               PixelsPerLine, ReconPixelsPerLine );
356
               PixelsPerLine, ReconPixelsPerLine);
357
    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
457
  } else {
358
  } else {
458
    /* Fractional pixel MVs. */
359
    /* Fractional pixel MVs. */
459
    /* Note that we only use two pixel values even for the diagonal */
360
    /* Note that we only use two pixel values even for the diagonal */
460
    Sub8Av2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, old_ptr1,
361
    dsp_static_sub8x8avg2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr,
461
                 new_ptr1, PixelsPerLine, ReconPixelsPerLine );
362
                 PixelsPerLine, ReconPixelsPerLine);
363
    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
462
  }
364
  }
463
}
365
}
464
366
Lines 534-550 Link Here
534
        pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]];
436
        pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]];
535
    }
437
    }
536
438
537
    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
439
    dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
538
               PixelsPerLine, ReconPixelsPerLine );
440
               PixelsPerLine, ReconPixelsPerLine);
441
    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
539
  } else if ( cpi->pb.CodingMode==CODE_INTRA ) {
442
  } else if ( cpi->pb.CodingMode==CODE_INTRA ) {
540
    Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine);
443
    dsp_static_sub8x8_128(FiltPtr, DctInputPtr, PixelsPerLine);
541
444
    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
542
  }
445
  }
543
446
544
  /* Proceed to encode the data into the encode buffer if the encoder
447
  /* Proceed to encode the data into the encode buffer if the encoder
545
     is enabled. */
448
     is enabled. */
546
  /* Perform a 2D DCT transform on the data. */
449
  /* Perform a 2D DCT transform on the data. */
547
  fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
450
  dsp_static_fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
548
451
549
  /* Quantize that transform data. */
452
  /* Quantize that transform data. */
550
  quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
453
  quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
(-)libtheora-1.0alpha3/lib/decode.c (+3 lines)
Lines 796-801 Link Here
796
  /* Make a note of the number of coded blocks this frame */
796
  /* Make a note of the number of coded blocks this frame */
797
  pbi->CodedBlocksThisFrame = pbi->CodedBlockIndex;
797
  pbi->CodedBlocksThisFrame = pbi->CodedBlockIndex;
798
798
799
  dsp_static_save_fpu();
800
799
  /* Decode the modes data */
801
  /* Decode the modes data */
800
  DecodeModes( pbi, pbi->YSBRows, pbi->YSBCols);
802
  DecodeModes( pbi, pbi->YSBRows, pbi->YSBCols);
801
803
Lines 808-813 Link Here
808
  /* Reconstruct and display the frame */
810
  /* Reconstruct and display the frame */
809
  ReconRefFrames(pbi);
811
  ReconRefFrames(pbi);
810
812
813
  dsp_static_restore_fpu();
811
}
814
}
812
815
813
816
(-)libtheora-1.0alpha3/lib/dsp.c (+416 lines)
Line 0 Link Here
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function: C implementations of the 8x8 block DSP primitives
14
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
15
16
 ********************************************************************/
17
18
#include <stdlib.h>
19
#include "cpu.h"
20
#include "encoder_internal.h"
21
22
/* Mean of a and b, computed in int with integer division by 2. */
#define DSP_OP_AVG(a,b) (((int)(a) + (int)(b)) / 2)
/* Signed difference a - b, computed in int. */
#define DSP_OP_DIFF(a,b) ((int)(a) - (int)(b))
/* Absolute value of the int difference a - b. */
#define DSP_OP_ABS_DIFF(a,b) abs((int)(a) - (int)(b))
25
26
DspFunctions dsp_funcs;
27
28
static void sub8x8__c (unsigned char *FiltPtr, unsigned char *ReconPtr,
29
                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
30
                  ogg_uint32_t ReconPixelsPerLine) {
31
  int i;
32
33
  /* For each block row */
34
  for (i=8; i; i--) {
35
    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
36
    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
37
    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
38
    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
39
    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
40
    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
41
    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
42
    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
43
44
    /* Start next row */
45
    FiltPtr += PixelsPerLine;
46
    ReconPtr += ReconPixelsPerLine;
47
    DctInputPtr += 8;
48
  }
49
}
50
51
static void sub8x8_128__c (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
52
                      ogg_uint32_t PixelsPerLine) {
53
  int i;
54
  /* For each block row */
55
  for (i=8; i; i--) {
56
    /* INTRA mode so code raw image data */
57
    /* We convert the data to 8 bit signed (by subtracting 128) as
58
       this reduces the internal precision requirments in the DCT
59
       transform. */
60
    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
61
    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
62
    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
63
    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
64
    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
65
    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
66
    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
67
    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
68
69
    /* Start next row */
70
    FiltPtr += PixelsPerLine;
71
    DctInputPtr += 8;
72
  }
73
}
74
75
static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1,
76
                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
77
                     ogg_uint32_t PixelsPerLine,
78
                     ogg_uint32_t ReconPixelsPerLine) 
79
{
80
  int i;
81
82
  /* For each block row */
83
  for (i=8; i; i--) {
84
    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
85
    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
86
    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
87
    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
88
    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
89
    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
90
    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
91
    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
92
93
    /* Start next row */
94
    FiltPtr += PixelsPerLine;
95
    ReconPtr1 += ReconPixelsPerLine;
96
    ReconPtr2 += ReconPixelsPerLine;
97
    DctInputPtr += 8;
98
  }
99
}
100
101
/*
 * Compare two 8-pixel rows.  The sum of absolute differences is computed
 * separately for pixels 0-3 and pixels 4-7, and the larger of the two sums
 * is returned.
 */
static ogg_uint32_t row_sad8__c (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t half_sad[2] = {0, 0};
  int k;

  for (k = 0; k < 4; k++) {
    half_sad[0] += DSP_OP_ABS_DIFF (Src1[k], Src2[k]);
    half_sad[1] += DSP_OP_ABS_DIFF (Src1[k + 4], Src2[k + 4]);
  }

  if (half_sad[0] > half_sad[1])
    return half_sad[0];
  return half_sad[1];
}
120
121
/*
 * Compare two 8x8 blocks column by column.  For each of the 8 columns the
 * sum of absolute differences is accumulated separately over rows 0-3 and
 * rows 4-7; the largest of those 16 partial sums is returned.
 *
 * Both blocks use the same line pitch, `stride`.
 */
static ogg_uint32_t col_sad8x8__c (unsigned char *Src1, unsigned char *Src2,
		                    ogg_uint32_t stride)
{
  ogg_uint32_t partial[2][8] = {{0,0,0,0,0,0,0,0}, {0,0,0,0,0,0,0,0}};
  ogg_uint32_t worst = 0;
  ogg_uint32_t half, row, col;

  /* half 0 covers rows 0-3, half 1 covers rows 4-7 */
  for ( half = 0; half < 2; half++ ){
    for ( row = 0; row < 4; row++ ){
      for ( col = 0; col < 8; col++ )
        partial[half][col] += abs(Src1[col] - Src2[col]);

      Src1 += stride;
      Src2 += stride;
    }
  }

  for ( half = 0; half < 2; half++ ){
    for ( col = 0; col < 8; col++ ){
      if ( partial[half][col] > worst )
        worst = partial[half][col];
    }
  }

  return worst;
}
166
167
/*
 * Sum of absolute differences over an 8x8 block.
 *
 * ptr1 is read with a line pitch of stride1 and ptr2 with a line pitch of
 * stride2.  Returns the total over all 64 pixel pairs.
 */
static ogg_uint32_t sad8x8__c (unsigned char *ptr1, ogg_uint32_t stride1,
		       	    unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t  total = 0;
  int row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++)
      total += DSP_OP_ABS_DIFF(ptr1[col], ptr2[col]);

    /* Step to next row of block. */
    ptr1 += stride1;
    ptr2 += stride2;
  }

  return total;
}
190
191
/*
 * Sum of absolute differences over an 8x8 block with early termination.
 *
 * The running total is compared with `thres` after each complete row; once
 * it exceeds `thres` the remaining rows are skipped and the partial total is
 * returned.  ptr1 uses line pitch stride1, ptr2 uses stride2.
 */
static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1,
		       		  unsigned char *ptr2, ogg_uint32_t stride2,
			   	  ogg_uint32_t thres)
{
  ogg_uint32_t  total = 0;
  int row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++)
      total += DSP_OP_ABS_DIFF(ptr1[col], ptr2[col]);

    /* Give up as soon as the threshold is exceeded. */
    if (total > thres)
      return total;

    /* Step to next row of block. */
    ptr1 += stride1;
    ptr2 += stride2;
  }

  return total;
}
218
219
/*
 * Sum of absolute differences between an 8x8 source block and the
 * per-pixel average (DSP_OP_AVG) of two reference blocks, with early exit.
 *
 * SrcData, SrcStride        source block and its row step
 * RefDataPtr1, RefDataPtr2  the two reference blocks being averaged
 * RefStride                 row step applied to both reference pointers
 * thres                     once the running sum exceeds this value the
 *                           remaining rows are not examined
 *
 * Returns the sum accumulated so far.  The threshold is tested only after
 * a complete row.
 */
static ogg_uint32_t sad8x8_xy2_thres__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                         unsigned char *RefDataPtr1,
                                         unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                         ogg_uint32_t thres)
{
  ogg_uint32_t  total = 0;
  ogg_uint32_t  row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++)
      total += DSP_OP_ABS_DIFF(SrcData[col],
                               DSP_OP_AVG (RefDataPtr1[col], RefDataPtr2[col]));

    if (total > thres)
      break;

    /* Move all three pointers down to the next row. */
    SrcData += SrcStride;
    RefDataPtr1 += RefStride;
    RefDataPtr2 += RefStride;
  }

  return total;
}
248
249
/*
 * Intra error metric for one 8x8 block.
 *
 * DataPtr  top-left pixel of the block
 * Stride   amount added to DataPtr after each row
 *
 * Accumulates the sum and the sum of squares of all 64 pixels and returns
 * 64*sum(p^2) - (sum p)^2, i.e. the population variance scaled by 64*64.
 */
static ogg_uint32_t intra8x8_err__c (unsigned char *DataPtr, ogg_uint32_t Stride)
{
  ogg_uint32_t  pix_sum = 0;
  ogg_uint32_t  sq_sum = 0;
  ogg_uint32_t  row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++) {
      pix_sum += DataPtr[col];
      sq_sum += DataPtr[col]*DataPtr[col];
    }

    /* Move down to the next row of the block. */
    DataPtr += Stride;
  }

  return (sq_sum<<6) - pix_sum*pix_sum;
}
281
282
/*
 * Inter error metric for one 8x8 block against a reference block.
 *
 * SrcData, SrcStride     source block and its row step
 * RefDataPtr, RefStride  reference block and its row step
 *
 * For every pixel the signed difference d = src - ref is formed
 * (DSP_OP_DIFF).  Returns 64*sum(d^2) - (sum d)^2, i.e. the population
 * variance of the differences scaled by 64*64.  The sum of differences is
 * kept in an unsigned accumulator, so negative totals are held modulo 2^32;
 * the unsigned square still gives the intended value.
 */
static ogg_uint32_t inter8x8_err__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                     unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
  ogg_uint32_t  diff_sum = 0;
  ogg_uint32_t  sq_sum = 0;
  ogg_int32_t   delta;
  ogg_uint32_t  row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++) {
      delta = DSP_OP_DIFF (SrcData[col], RefDataPtr[col]);
      diff_sum += delta;
      sq_sum += delta*delta;
    }

    /* Move both pointers down to the next row. */
    SrcData += SrcStride;
    RefDataPtr += RefStride;
  }

  return (sq_sum<<6) - diff_sum*diff_sum;
}
331
332
/*
 * Inter error metric for one 8x8 block against the per-pixel average
 * (DSP_OP_AVG) of two reference blocks.
 *
 * SrcData, SrcStride        source block and its row step
 * RefDataPtr1, RefDataPtr2  the two reference blocks being averaged
 * RefStride                 row step applied to both reference pointers
 *
 * For every pixel the signed difference d = src - avg(ref1, ref2) is
 * formed.  Returns 64*sum(d^2) - (sum d)^2, i.e. the population variance of
 * the differences scaled by 64*64.  The sum of differences is kept in an
 * unsigned accumulator (modulo 2^32), which still squares to the intended
 * value.
 */
static ogg_uint32_t inter8x8_err_xy2__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                         unsigned char *RefDataPtr1,
                                         unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t  diff_sum = 0;
  ogg_uint32_t  sq_sum = 0;
  ogg_int32_t   delta;
  ogg_uint32_t  row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++) {
      delta = DSP_OP_DIFF(SrcData[col],
                          DSP_OP_AVG (RefDataPtr1[col], RefDataPtr2[col]));
      diff_sum += delta;
      sq_sum += delta*delta;
    }

    /* Move all three pointers down to the next row. */
    SrcData += SrcStride;
    RefDataPtr1 += RefStride;
    RefDataPtr2 += RefStride;
  }

  return (sq_sum<<6) - diff_sum*diff_sum;
}
383
384
/* Placeholder installed for the save_fpu / restore_fpu hooks: does nothing. */
static void nop (void)
{
}
385
386
/*
 * Fill the entries of *funcs that this file implements with the portable C
 * versions.  The FPU save/restore hooks are set to a do-nothing function.
 * Entries not listed here are left untouched.
 */
void dsp_init(DspFunctions *funcs)
{
  /* FPU state hooks: nothing to do for the C implementations. */
  funcs->restore_fpu = nop;
  funcs->save_fpu = nop;

  /* Block subtraction. */
  funcs->sub8x8 = sub8x8__c;
  funcs->sub8x8_128 = sub8x8_128__c;
  funcs->sub8x8avg2 = sub8x8avg2__c;

  /* Sum-of-absolute-difference metrics. */
  funcs->row_sad8 = row_sad8__c;
  funcs->col_sad8x8 = col_sad8x8__c;
  funcs->sad8x8 = sad8x8__c;
  funcs->sad8x8_thres = sad8x8_thres__c;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c;

  /* Variance-style error metrics. */
  funcs->intra8x8_err = intra8x8_err__c;
  funcs->inter8x8_err = inter8x8_err__c;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__c;
}
402
403
void dsp_static_init(void)
404
{
405
  cpu_init ();
406
  dsp_init (&dsp_funcs);
407
  dsp_recon_init (&dsp_funcs);
408
  dsp_dct_init (&dsp_funcs);
409
  if (cpu_flags & CPU_X86_MMX) {
410
    dsp_i386_mmx_init(&dsp_funcs);
411
  }
412
  if (cpu_flags & CPU_X86_MMXEXT) {
413
    dsp_i386_mmxext_init(&dsp_funcs);
414
  }
415
}
416
(-)libtheora-1.0alpha3/lib/dsp.h (+154 lines)
Line 0 Link Here
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
15
16
 ********************************************************************/
17
18
#ifndef DSP_H
19
#define DSP_H
20
21
#include <theora/theora.h>
22
23
typedef struct
24
{
25
  void   (*save_fpu)            (void);
26
  void   (*restore_fpu)         (void);
27
28
  void   (*sub8x8)  		(unsigned char *FiltPtr, unsigned char *ReconPtr,
29
	                   	 ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
30
				 ogg_uint32_t ReconPixelsPerLine);
31
32
  void   (*sub8x8_128) 		(unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
33
			         ogg_uint32_t PixelsPerLine);
34
35
  void   (*sub8x8avg2) 		(unsigned char *FiltPtr, unsigned char *ReconPtr1,
36
		                 unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
37
			         ogg_uint32_t PixelsPerLine,
38
			         ogg_uint32_t ReconPixelsPerLine); 
39
40
  void   (*copy8x8)  		(unsigned char *src, unsigned char *dest, 
41
		                 ogg_uint32_t stride);
42
43
  void   (*recon_intra8x8)  	(unsigned char *ReconPtr, ogg_int16_t *ChangePtr, 
44
		                 ogg_uint32_t LineStep);
45
46
  void   (*recon_inter8x8)  	(unsigned char *ReconPtr, unsigned char *RefPtr, 
47
		                 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep);
48
49
  void   (*recon_inter8x8_half)	(unsigned char *ReconPtr, unsigned char *RefPtr1, 
50
		  		 unsigned char *RefPtr2, ogg_int16_t *ChangePtr, 
51
				 ogg_uint32_t LineStep);
52
53
  void   (*fdct_short)          (ogg_int16_t *InputData, ogg_int16_t *OutputData);
54
55
  ogg_uint32_t (*row_sad8)	(unsigned char *Src1, unsigned char *Src2);
56
57
  ogg_uint32_t (*col_sad8x8)	(unsigned char *Src1, unsigned char *Src2,
58
		  		 ogg_uint32_t stride);
59
60
  ogg_uint32_t (*sad8x8)	(unsigned char *ptr1, ogg_uint32_t stride1,
61
		        	 unsigned char *ptr2, ogg_uint32_t stride2);
62
63
  ogg_uint32_t (*sad8x8_thres)	(unsigned char *ptr1, ogg_uint32_t stride1,
64
		       		 unsigned char *ptr2, ogg_uint32_t stride2, 
65
				 ogg_uint32_t thres);
66
67
  ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride,
68
		                 unsigned char *RefDataPtr1,
69
			         unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
70
				 ogg_uint32_t thres);
71
72
  ogg_uint32_t (*intra8x8_err)	(unsigned char *DataPtr, ogg_uint32_t Stride);
73
74
  ogg_uint32_t (*inter8x8_err)	(unsigned char *SrcData, ogg_uint32_t SrcStride,
75
		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride);
76
77
  ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride,
78
		                 unsigned char *RefDataPtr1,
79
			         unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
80
} DspFunctions;
81
82
/* Global function table; the dsp_static_* macros below dispatch through it. */
extern DspFunctions dsp_funcs;
83
84
/* Installs the reconstruction entries of the table (defined in another file). */
extern void dsp_recon_init (DspFunctions *funcs);
85
86
/* Installs the portable C implementations into *funcs. */
void dsp_init(DspFunctions *funcs);
87
/* Initialises the global dsp_funcs table, including CPU-specific overrides. */
void dsp_static_init(void);
88
89
/*
 * Dispatch macros.
 *
 * dsp_NAME(funcs, ...) calls entry NAME of the given DspFunctions object;
 * dsp_static_NAME(...) does the same through the global dsp_funcs table.
 *
 * The funcs argument is parenthesised in every expansion so that any
 * expression of struct type (for example *ptr) can be passed, not only a
 * plain identifier.
 */
#define dsp_save_fpu(funcs) ((funcs).save_fpu ())
#define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs)

#define dsp_restore_fpu(funcs) ((funcs).restore_fpu ())
#define dsp_static_restore_fpu() dsp_restore_fpu(dsp_funcs)

#define dsp_sub8x8(funcs,a1,a2,a3,a4,a5) ((funcs).sub8x8 (a1,a2,a3,a4,a5))
#define dsp_static_sub8x8(a1,a2,a3,a4,a5) dsp_sub8x8(dsp_funcs,a1,a2,a3,a4,a5)

#define dsp_sub8x8_128(funcs,a1,a2,a3) ((funcs).sub8x8_128 (a1,a2,a3))
#define dsp_static_sub8x8_128(a1,a2,a3) dsp_sub8x8_128(dsp_funcs,a1,a2,a3)

#define dsp_sub8x8avg2(funcs,a1,a2,a3,a4,a5,a6) ((funcs).sub8x8avg2 (a1,a2,a3,a4,a5,a6))
#define dsp_static_sub8x8avg2(a1,a2,a3,a4,a5,a6) dsp_sub8x8avg2(dsp_funcs,a1,a2,a3,a4,a5,a6)

#define dsp_copy8x8(funcs,ptr1,ptr2,str1) ((funcs).copy8x8 (ptr1,ptr2,str1))
#define dsp_static_copy8x8(ptr1,ptr2,str1) dsp_copy8x8(dsp_funcs,ptr1,ptr2,str1)

#define dsp_recon_intra8x8(funcs,ptr1,ptr2,str1) ((funcs).recon_intra8x8 (ptr1,ptr2,str1))
#define dsp_static_recon_intra8x8(ptr1,ptr2,str1) dsp_recon_intra8x8(dsp_funcs,ptr1,ptr2,str1)

#define dsp_recon_inter8x8(funcs,ptr1,ptr2,ptr3,str1) \
	((funcs).recon_inter8x8 (ptr1,ptr2,ptr3,str1))
#define dsp_static_recon_inter8x8(ptr1,ptr2,ptr3,str1) \
	dsp_recon_inter8x8(dsp_funcs,ptr1,ptr2,ptr3,str1)

#define dsp_recon_inter8x8_half(funcs,ptr1,ptr2,ptr3,ptr4,str1) \
	((funcs).recon_inter8x8_half (ptr1,ptr2,ptr3,ptr4,str1))
#define dsp_static_recon_inter8x8_half(ptr1,ptr2,ptr3,ptr4,str1) \
	dsp_recon_inter8x8_half(dsp_funcs,ptr1,ptr2,ptr3,ptr4,str1)

#define dsp_fdct_short(funcs,in,out) ((funcs).fdct_short (in,out))
#define dsp_static_fdct_short(in,out) dsp_fdct_short(dsp_funcs,in,out)

#define dsp_row_sad8(funcs,ptr1,ptr2) ((funcs).row_sad8 (ptr1,ptr2))
#define dsp_static_row_sad8(ptr1,ptr2) dsp_row_sad8(dsp_funcs,ptr1,ptr2)

#define dsp_col_sad8x8(funcs,ptr1,ptr2,str1) ((funcs).col_sad8x8 (ptr1,ptr2,str1))
#define dsp_static_col_sad8x8(ptr1,ptr2,str1) dsp_col_sad8x8(dsp_funcs,ptr1,ptr2,str1)

#define dsp_sad8x8(funcs,ptr1,str1,ptr2,str2) ((funcs).sad8x8 (ptr1,str1,ptr2,str2))
#define dsp_static_sad8x8(ptr1,str1,ptr2,str2) dsp_sad8x8(dsp_funcs,ptr1,str1,ptr2,str2)

#define dsp_sad8x8_thres(funcs,ptr1,str1,ptr2,str2,t) ((funcs).sad8x8_thres (ptr1,str1,ptr2,str2,t))
#define dsp_static_sad8x8_thres(ptr1,str1,ptr2,str2,t) dsp_sad8x8_thres(dsp_funcs,ptr1,str1,ptr2,str2,t)

#define dsp_sad8x8_xy2_thres(funcs,ptr1,str1,ptr2,ptr3,str2,t) \
	((funcs).sad8x8_xy2_thres (ptr1,str1,ptr2,ptr3,str2,t))
#define dsp_static_sad8x8_xy2_thres(ptr1,str1,ptr2,ptr3,str2,t) \
	dsp_sad8x8_xy2_thres(dsp_funcs,ptr1,str1,ptr2,ptr3,str2,t)

#define dsp_intra8x8_err(funcs,ptr1,str1) ((funcs).intra8x8_err (ptr1,str1))
#define dsp_static_intra8x8_err(ptr1,str1) dsp_intra8x8_err(dsp_funcs,ptr1,str1)

#define dsp_inter8x8_err(funcs,ptr1,str1,ptr2,str2) \
	((funcs).inter8x8_err (ptr1,str1,ptr2,str2))
#define dsp_static_inter8x8_err(ptr1,str1,ptr2,str2) \
	dsp_inter8x8_err(dsp_funcs,ptr1,str1,ptr2,str2)

#define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \
	((funcs).inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2))
#define dsp_static_inter8x8_err_xy2(ptr1,str1,ptr2,ptr3,str2) \
	dsp_inter8x8_err_xy2(dsp_funcs,ptr1,str1,ptr2,ptr3,str2)
152
153
154
#endif /* DSP_H */
(-)libtheora-1.0alpha3/lib/encode.c (-16 / +6 lines)
Lines 531-538 Link Here
531
531
532
static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi,
532
static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi,
533
                                     ogg_int32_t BlockIndex ) {
533
                                     ogg_int32_t BlockIndex ) {
534
  ogg_uint32_t  i;
534
  ogg_uint32_t  ErrorVal;
535
  ogg_uint32_t  ErrorVal = 0;
536
535
537
  unsigned char * SrcDataPtr =
536
  unsigned char * SrcDataPtr =
538
    &cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]];
537
    &cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]];
Lines 550-570 Link Here
550
    RecStride = cpi->pb.UVStride;
549
    RecStride = cpi->pb.UVStride;
551
  }
550
  }
552
551
552
  ErrorVal = dsp_static_sad8x8 (SrcDataPtr, SrcStride, RecDataPtr, RecStride);
553
553
554
  /* Decide on standard or MMX implementation */
555
  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
556
    ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) );
557
    ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) );
558
    ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) );
559
    ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) );
560
    ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) );
561
    ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) );
562
    ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) );
563
    ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) );
564
    /* Step to next row of block. */
565
    SrcDataPtr += SrcStride;
566
    RecDataPtr += RecStride;
567
  }
568
  return ErrorVal;
554
  return ErrorVal;
569
}
555
}
570
556
Lines 933-941 Link Here
933
    /* Zero Decoder EOB run count */
919
    /* Zero Decoder EOB run count */
934
    cpi->pb.EOB_Run = 0;
920
    cpi->pb.EOB_Run = 0;
935
921
922
    dsp_static_save_fpu ();
923
936
    /* Encode any fragments coded using DCT. */
924
    /* Encode any fragments coded using DCT. */
937
    coded_pixels += QuadCodeDisplayFragments (cpi);
925
    coded_pixels += QuadCodeDisplayFragments (cpi);
938
926
927
    dsp_static_restore_fpu ();
928
939
    return coded_pixels;
929
    return coded_pixels;
940
930
941
}
931
}
(-)libtheora-1.0alpha3/lib/encoder_internal.h (-15 / +2 lines)
Lines 24-29 Link Here
24
24
25
#include <theora/theora.h>
25
#include <theora/theora.h>
26
#include "huffman.h"
26
#include "huffman.h"
27
#include "dsp.h"
27
28
28
#ifndef LIBOGG2
29
#ifndef LIBOGG2
29
#define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
30
#define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
Lines 689-711 Link Here
689
                   ogg_int16_t *QuantMatrix,
690
                   ogg_int16_t *QuantMatrix,
690
                   ogg_int16_t * OutputData );
691
                   ogg_int16_t * OutputData );
691
692
692
extern void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
693
extern void dsp_recon_init (DspFunctions *funcs);
693
                        ogg_int16_t * ChangePtr, ogg_uint32_t LineStep );
694
695
extern void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
696
                        unsigned char * RefPtr, ogg_int16_t * ChangePtr,
697
                        ogg_uint32_t LineStep ) ;
698
699
extern void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
700
                                  unsigned char * RefPtr1,
701
                                  unsigned char * RefPtr2,
702
                                  ogg_int16_t * ChangePtr,
703
                                  ogg_uint32_t LineStep ) ;
704
694
705
extern void SetupLoopFilter(PB_INSTANCE *pbi);
695
extern void SetupLoopFilter(PB_INSTANCE *pbi);
706
extern void CopyBlock(unsigned char *src,
707
                      unsigned char *dest,
708
                      unsigned int srcstride);
709
extern void LoopFilter(PB_INSTANCE *pbi);
696
extern void LoopFilter(PB_INSTANCE *pbi);
710
extern void ReconRefFrames (PB_INSTANCE *pbi);
697
extern void ReconRefFrames (PB_INSTANCE *pbi);
711
extern void ExpandToken( Q_LIST_ENTRY * ExpandedBlock,
698
extern void ExpandToken( Q_LIST_ENTRY * ExpandedBlock,
(-)libtheora-1.0alpha3/lib/i386/dsp_mmx.c (+642 lines)
Line 0 Link Here
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
15
16
 ********************************************************************/
17
18
#include <stdlib.h>
19
#include "dsp.h"
20
21
/*
 * Four 16-bit words, each 0x0080 (128).  Referenced by symbol name from
 * inline asm via M(); the "used" attribute keeps it from being discarded.
 */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128w = 0x0080008000800080LL;
22
23
/*
 * M(sym) turns a C identifier into the string naming it in assembly:
 * a leading underscore is prepended on the platforms tested below.
 */
#if defined(__MINGW32__) || defined(__CYGWIN__) || \
24
    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
25
# define M(a) "_" #a
26
#else
27
# define M(a) #a
28
#endif
29
30
/* Pixel helpers evaluated in int: truncating average, difference, |difference|. */
#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
31
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
32
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
33
34
/*
 * MMX 8x8 block subtraction.
 *
 * The asm body is assembled 8 times (.rept 8), once per row.  Each row loads
 * 8 bytes from FiltPtr and 8 from ReconPtr, zero-extends them to 16-bit words
 * by unpacking against the zeroed mm7, computes FiltPtr - ReconPtr and writes
 * the 8 word results to DctInputPtr.  DctInputPtr then advances by 16 bytes,
 * FiltPtr by PixelsPerLine and ReconPtr by ReconPixelsPerLine.
 *
 * Uses mm0-mm3 and mm7; the MMX registers are not listed as clobbers and no
 * emms is issued in this function.
 */
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
35
                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
36
                  ogg_uint32_t ReconPixelsPerLine) 
37
{
38
  __asm__ __volatile__ (
39
    "  .balign 16                   \n\t"
40
41
    "  pxor        %%mm7, %%mm7     \n\t" 
42
43
    ".rept 8                        \n\t"
44
    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
45
    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr */
46
    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
47
    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
48
    /* convert from UINT8 to INT16 */
49
    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
50
    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr) */
51
    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
52
    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr) */
53
    /* start calculation */
54
    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */
55
    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */
56
    "  movq        %%mm0,  (%2)     \n\t" /* write answer out */
57
    "  movq        %%mm2, 8(%2)     \n\t" /* write answer out */
58
    /* Increment pointers */
59
    "  add         $16, %2           \n\t"
60
    "  add         %3, %0           \n\t"
61
    "  add         %4, %1           \n\t"
62
    ".endr                          \n\t"
63
64
     : "+r" (FiltPtr),
65
       "+r" (ReconPtr),
66
       "+r" (DctInputPtr)
67
     : "m" (PixelsPerLine),
68
       "m" (ReconPixelsPerLine) 
69
     : "memory"
70
  );
71
}
72
73
/*
 * MMX 8x8 "subtract 128" routine.
 *
 * mm1 is loaded once with V128w (the value 128 in each 16-bit word).  The
 * body is assembled 8 times (.rept 8), once per row: 8 bytes are loaded from
 * FiltPtr, zero-extended to 16-bit words against the zeroed mm7, 128 is
 * subtracted from each and the 8 word results are written to DctInputPtr.
 * DctInputPtr then advances by 16 bytes and FiltPtr by PixelsPerLine.
 *
 * Uses mm0-mm2 and mm7; the MMX registers are not listed as clobbers and no
 * emms is issued in this function.
 */
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
74
                      ogg_uint32_t PixelsPerLine) 
75
{
76
  __asm__ __volatile__ (
77
    "  .balign 16                   \n\t"
78
79
    "  pxor        %%mm7, %%mm7     \n\t" 
80
    "  movq      "M(V128w)", %%mm1  \n\t"
81
82
    ".rept 8                        \n\t"
83
    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
84
    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
85
    /* convert from UINT8 to INT16 */
86
    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
87
    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
88
    /* start calculation */
89
    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */
90
    "  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */
91
    "  movq        %%mm0,  (%1)     \n\t" /* write answer out */
92
    "  movq        %%mm2, 8(%1)     \n\t" /* write answer out */
93
    /* Increment pointers */
94
    "  add         $16, %1           \n\t"
95
    "  add         %2, %0           \n\t"
96
    ".endr                          \n\t"
97
98
     : "+r" (FiltPtr),
99
       "+r" (DctInputPtr)
100
     : "r" (PixelsPerLine)
101
     : "memory"
102
  );
103
}
104
105
/*
 * MMX 8x8 block subtraction against the average of two reference blocks.
 *
 * The body is assembled 8 times (.rept 8), once per row.  Each row loads
 * 8 bytes from FiltPtr, ReconPtr1 and ReconPtr2, zero-extends them to 16-bit
 * words against the zeroed mm7, forms (ReconPtr1 + ReconPtr2) >> 1, subtracts
 * that from FiltPtr and writes the 8 word results to DctInputPtr.
 * DctInputPtr then advances by 16 bytes, FiltPtr by PixelsPerLine and both
 * reference pointers by ReconPixelsPerLine.
 *
 * Uses mm0-mm5 and mm7; the MMX registers are not listed as clobbers and no
 * emms is issued in this function.
 */
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
106
                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
107
                     ogg_uint32_t PixelsPerLine,
108
                     ogg_uint32_t ReconPixelsPerLine) 
109
{
110
  __asm__ __volatile__ (
111
    "  .balign 16                   \n\t"
112
113
    "  pxor        %%mm7, %%mm7     \n\t" 
114
115
    ".rept 8                        \n\t"
116
    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
117
    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */
118
    "  movq        (%2), %%mm4      \n\t" /* mm4 = ReconPtr2 */
119
    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
120
    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
121
    "  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */
122
    /* convert from UINT8 to INT16 */
123
    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */
124
    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1) */
125
    "  punpcklbw   %%mm7, %%mm4     \n\t" /* mm4 = INT16(ReconPtr2) */
126
    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */
127
    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1) */
128
    "  punpckhbw   %%mm7, %%mm5     \n\t" /* mm5 = INT16(ReconPtr2) */
129
    /* average ReconPtr1 and ReconPtr2 */
130
    "  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
131
    "  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
132
    "  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
133
    "  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
134
    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
135
    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
136
    "  movq        %%mm0,  (%3)     \n\t" /* write answer out */
137
    "  movq        %%mm2, 8(%3)     \n\t" /* write answer out */
138
    /* Increment pointers */
139
    "  add         $16, %3           \n\t"
140
    "  add         %4, %0           \n\t"
141
    "  add         %5, %1           \n\t"
142
    "  add         %5, %2           \n\t"
143
    ".endr                          \n\t"
144
145
     : "+r" (FiltPtr),
146
       "+r" (ReconPtr1),
147
       "+r" (ReconPtr2),
148
       "+r" (DctInputPtr)
149
     : "m" (PixelsPerLine),
150
       "m" (ReconPixelsPerLine) 
151
     : "memory"
152
  );
153
}
154
155
/*
 * MMX row SAD over 8 pixels.
 *
 * Loads 8 bytes from Src1 and Src2 and forms the per-byte absolute
 * difference (two saturating subtractions OR-ed together).  The low four
 * and the high four differences are each zero-extended to words and summed
 * horizontally; the larger of the two 4-pixel sums is kept.
 *
 * Returns that maximum, masked to its low 16 bits.
 *
 * Uses mm0-mm3, mm6 and mm7; the MMX registers are not listed as clobbers
 * and no emms is issued in this function.
 */
static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
156
{
157
  ogg_uint32_t MaxSad;
158
159
  __asm__ __volatile__ (
160
    "  .balign 16                   \n\t"
161
162
    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
163
    "  pxor        %%mm7, %%mm7     \n\t" 	/* zero out mm7 for unpack */
164
    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
165
    "  movq        (%2), %%mm1      \n\t"
166
167
    "  movq        %%mm0, %%mm2     \n\t"
168
    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
169
    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
170
    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
171
172
    "  movq        %%mm0, %%mm1     \n\t"
173
174
    "  punpcklbw   %%mm6, %%mm0     \n\t"       /* ; unpack low four bytes to higher precision */
175
    "  punpckhbw   %%mm7, %%mm1     \n\t"       /* ; unpack high four bytes to higher precision */
176
177
    "  movq        %%mm0, %%mm2     \n\t"
178
    "  movq        %%mm1, %%mm3     \n\t"
179
    "  psrlq       $32, %%mm2       \n\t"	/* fold and add */
180
    "  psrlq       $32, %%mm3       \n\t"
181
    "  paddw       %%mm2, %%mm0     \n\t"
182
    "  paddw       %%mm3, %%mm1     \n\t"
183
    "  movq        %%mm0, %%mm2     \n\t"
184
    "  movq        %%mm1, %%mm3     \n\t"
185
    "  psrlq       $16, %%mm2       \n\t"
186
    "  psrlq       $16, %%mm3       \n\t"
187
    "  paddw       %%mm2, %%mm0     \n\t"
188
    "  paddw       %%mm3, %%mm1     \n\t"
189
190
    "  psubusw     %%mm0, %%mm1     \n\t"
191
    "  paddw       %%mm0, %%mm1     \n\t" 	/* mm1 = max(mm1, mm0) */
192
    "  movd        %%mm1, %0        \n\t"
193
    "  andl        $0xffff, %0      \n\t"
194
195
     : "=m" (MaxSad),
196
       "+r" (Src1), 
197
       "+r" (Src2) 
198
     :
199
     : "memory"
200
  );
201
  return MaxSad;
202
}
203
204
/*
 * MMX column SAD over an 8x8 block.
 *
 * Two loops of 4 rows each (counter in edi) accumulate, per column, the
 * absolute difference between Src1 and Src2: rows 0-3 go into mm4 (low four
 * columns) and mm5 (high four columns), rows 4-7 into mm6 and mm7.  Both
 * pointers advance by stride after every row.  The word-wise maximum of the
 * four accumulators is then reduced horizontally to a single word.
 *
 * Returns the largest per-column 4-row sum, masked to its low 16 bits.
 *
 * edi is declared as clobbered; the MMX registers (mm0-mm7) are not, and no
 * emms is issued in this function.
 */
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
205
		                    ogg_uint32_t stride)
206
{
207
  ogg_uint32_t MaxSad;
208
209
  __asm__ __volatile__ (
210
    "  .balign 16                   \n\t"
211
212
    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
213
    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 low sum */
214
    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 high sum */
215
    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 low sum */
216
    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 high sum */
217
    "  mov         $4, %%edi        \n\t"	/* 4 rows */
218
    "1:                             \n\t"
219
    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
220
    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
221
222
    "  movq        %%mm0, %%mm2     \n\t"
223
    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
224
    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
225
    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
226
    "  movq        %%mm0, %%mm1     \n\t"
227
228
    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
229
    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
230
    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
231
    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
232
    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
233
    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
234
235
    "  dec         %%edi            \n\t"
236
    "  jnz 1b                       \n\t"
237
238
    "  mov         $4, %%edi        \n\t"	/* 4 rows */
239
    "2:                             \n\t"
240
    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
241
    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
242
243
    "  movq        %%mm0, %%mm2     \n\t"
244
    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
245
    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
246
    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
247
    "  movq        %%mm0, %%mm1     \n\t"
248
249
    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
250
    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
251
    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
252
    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
253
    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
254
    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
255
256
    "  dec         %%edi            \n\t"
257
    "  jnz 2b                       \n\t"
258
259
    "  psubusw     %%mm6, %%mm7     \n\t"
260
    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm6) */
261
    "  psubusw     %%mm4, %%mm5     \n\t" 	
262
    "  paddw       %%mm4, %%mm5     \n\t" 	/* mm5 = max(mm5, mm4) */
263
    "  psubusw     %%mm5, %%mm7     \n\t" 	
264
    "  paddw       %%mm5, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
265
    "  movq        %%mm7, %%mm6     \n\t"
266
    "  psrlq       $32, %%mm6       \n\t"
267
    "  psubusw     %%mm6, %%mm7     \n\t" 	
268
    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm7 >> 32) */
269
    "  movq        %%mm7, %%mm6     \n\t"
270
    "  psrlq       $16, %%mm6       \n\t"
271
    "  psubusw     %%mm6, %%mm7     \n\t" 	
272
    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm7 >> 16) */
273
    "  movd        %%mm7, %0        \n\t"
274
    "  andl        $0xffff, %0      \n\t"
275
276
     : "=r" (MaxSad),
277
       "+r" (Src1), 
278
       "+r" (Src2) 
279
     : "r" (stride)
280
     : "memory", "edi"
281
  );
282
283
  return MaxSad;
284
}
285
286
/*
 * Sum of absolute differences (SAD) of two 8x8 byte blocks, plain MMX.
 *
 *   ptr1, stride1  first block and the byte step between its rows
 *   ptr2, stride2  second block and the byte step between its rows
 *
 * Returns the SAD (at most 64 * 255) masked to 16 bits.
 *
 * The MMX registers are left dirty: this function does not execute emms.
 */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                 unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t  DiffVal;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"
    "  pxor        %%mm6, %%mm6     \n\t"  /* mm6 = 0, used by the unpacks */
    "  pxor        %%mm7, %%mm7     \n\t"  /* mm7 = four 16-bit partial sums */
    ".rept 8                         \n\t" /* assembler-unrolled: one copy per row */
    "  movq        (%1), %%mm0      \n\t"  /* A: 8 bytes of block 1 */
    "  movq        (%2), %%mm1      \n\t"  /* B: 8 bytes of block 2 */
    "  movq        %%mm0, %%mm2     \n\t"

    "  psubusb     %%mm1, %%mm0     \n\t"  /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"  /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"  /* one side is 0, so OR == |A - B| */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm7     \n\t"  /* accumulate */
    "  punpckhbw   %%mm6, %%mm1     \n\t"  /* high four bytes -> words */
    "  add         %3, %1           \n\t"  /* next row of block 1 */
    "  paddw       %%mm1, %%mm7     \n\t"  /* accumulate */
    "  add         %4, %2           \n\t"  /* next row of block 2 */
    ".endr                          \n\t"

    /* Horizontal add: fold the four words of mm7 into its low word. */
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $16, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"  /* drop the garbage in the upper word */

     : "=m" (DiffVal),
       "+r" (ptr1), 
       "+r" (ptr2) 
     : "r" (stride1),
       "r" (stride2)
     : "memory"
  );

  return DiffVal;
}
332
333
/*
 * Threshold-taking form of the 8x8 SAD.
 *
 * This MMX version performs no early termination: it simply forwards to
 * sad8x8__mmx() and returns the complete SAD, so `thres` is intentionally
 * ignored.
 */
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                       unsigned char *ptr2, ogg_uint32_t stride2,
                                       ogg_uint32_t thres)
{
  (void) thres;  /* deliberately unused, see above */

  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
}
339
340
/*
 * 8x8 SAD between a source block and the byte-wise average of two reference
 * blocks, plain MMX.
 *
 *   SrcData, SrcStride       source block and its row step in bytes
 *   RefDataPtr1, RefDataPtr2 the two reference blocks (both use RefStride)
 *   thres                    ignored; the full SAD is always computed
 *
 * The average is formed as (a & b) + (((a ^ b) & 0xfe) >> 1), i.e. it
 * truncates: (a + b) >> 1 per byte.
 *
 * Returns the SAD masked to 16 bits.  Uses %edi as the row counter and
 * leaves the MMX registers dirty (no emms here).
 */
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                           unsigned char *RefDataPtr1,
                                           unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                           ogg_uint32_t thres)
{
  ogg_uint32_t  DiffVal;

  (void) thres;  /* no early-out is implemented */

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"

    "  pcmpeqd     %%mm5, %%mm5     \n\t"  /* mm5 = all ones ...            */
    "  paddb       %%mm5, %%mm5     \n\t"  /* ... doubled per byte = 0xfe.. */
   
    "  pxor        %%mm6, %%mm6     \n\t"  /* mm6 = 0, used by the unpacks */
    "  pxor        %%mm7, %%mm7     \n\t"  /* mm7 = four 16-bit partial sums */
    "  mov         $8, %%edi        \n\t"  /* 8 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t"  /* A: 8 source bytes */

    "  movq        (%2), %%mm2      \n\t"
    "  movq        (%3), %%mm3      \n\t"  /* average mm2 and mm3 into mm1 */
    "  movq        %%mm2, %%mm1     \n\t"
    "  pand        %%mm3, %%mm1     \n\t"  /* common bits */
    "  pxor        %%mm2, %%mm3     \n\t"  /* differing bits */
    "  pand        %%mm5, %%mm3     \n\t"  /* clear bit 0 so the 64-bit shift */
    "  psrlq       $1, %%mm3        \n\t"  /* cannot leak across byte lanes   */
    "  paddb       %%mm3, %%mm1     \n\t"  /* B = truncated average */

    "  movq        %%mm0, %%mm2     \n\t"

    "  psubusb     %%mm1, %%mm0     \n\t"  /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"  /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"  /* OR gives |A - B| */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm7     \n\t"  /* accumulate */
    "  punpckhbw   %%mm6, %%mm1     \n\t"  /* high four bytes -> words */
    "  add         %4, %1           \n\t"  /* next source row */
    "  paddw       %%mm1, %%mm7     \n\t"  /* accumulate */
    "  add         %5, %2           \n\t"  /* next row of reference 1 */
    "  add         %5, %3           \n\t"  /* next row of reference 2 */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    /* Horizontal add: fold the four words of mm7 into its low word. */
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $16, %%mm7       \n\t"
    "  paddw       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

     : "=m" (DiffVal),
       "+r" (SrcData), 
       "+r" (RefDataPtr1), 
       "+r" (RefDataPtr2) 
     : "m" (SrcStride),
       "m" (RefStride)
     : "edi", "memory"
  );

  return DiffVal;
}
406
407
/*
 * Intra error metric of one 8x8 byte block, plain MMX.
 *
 *   DataPtr  top-left byte of the block
 *   Stride   byte step between rows
 *
 * With X ranging over the 64 pixel values the function accumulates
 * XSum = sum(X) and XXSum = sum(X*X) and returns
 *
 *     64 * XXSum - XSum * XSum
 *
 * which is the population variance scaled by 64 * 64 (not the variance
 * itself).  Uses %edi as row counter and scratch; no emms is executed.
 */
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
{
  ogg_uint32_t  XSum;
  ogg_uint32_t  XXSum;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"

    "  pxor        %%mm5, %%mm5     \n\t"  /* mm5 = word sums of X */
    "  pxor        %%mm6, %%mm6     \n\t"  /* mm6 = 0, used by the unpacks */
    "  pxor        %%mm7, %%mm7     \n\t"  /* mm7 = dword sums of X*X */
    "  mov         $8, %%edi        \n\t"  /* 8 rows */
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t"  /* take 8 bytes */
    "  movq        %%mm0, %%mm2     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"  /* low four bytes -> words */
    "  punpckhbw   %%mm6, %%mm2     \n\t"  /* high four bytes -> words */

    "  paddw       %%mm0, %%mm5     \n\t"
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t"  /* squares, pairwise added to dwords */
    "  pmaddwd     %%mm2, %%mm2     \n\t"
    
    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %3, %2           \n\t"  /* next row */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    /* Fold the four words of mm5 into the low word -> XSum. */
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t"  /* keep only the low word, sign-extended */
    "  movl        %%edi, %0        \n\t"

    /* Fold the two dwords of mm7 into the low dword -> XXSum. */
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

     : "=r" (XSum),
       "=r" (XXSum),
       "+r" (DataPtr) 
     : "r" (Stride)
     : "edi", "memory"
  );

  /* 64*sum(X^2) - sum(X)^2: population variance times 64^2. */
  return (( (XXSum<<6) - XSum*XSum ) );
}
465
466
/*
 * Inter error metric between a source and a reference 8x8 block, plain MMX.
 *
 *   SrcData, SrcStride     source block and its row step in bytes
 *   RefDataPtr, RefStride  reference block and its row step in bytes
 *
 * With X = src - ref over the 64 pixel positions the function accumulates
 * XSum = sum(X) and XXSum = sum(X*X) and returns
 *
 *     64 * XXSum - XSum * XSum
 *
 * i.e. the population variance of the difference scaled by 64 * 64.
 * XSum may be negative; it is stored sign-extended in an unsigned variable
 * and its square is still correct modulo 2^32.
 * Uses %edi as row counter and scratch; no emms is executed.
 */
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                       unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
  ogg_uint32_t  XSum;
  ogg_uint32_t  XXSum;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"

    "  pxor        %%mm5, %%mm5     \n\t"  /* mm5 = word sums of X */
    "  pxor        %%mm6, %%mm6     \n\t"  /* mm6 = 0, used by the unpacks */
    "  pxor        %%mm7, %%mm7     \n\t"  /* mm7 = dword sums of X*X */
    "  mov         $8, %%edi        \n\t"  /* 8 rows */
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t"  /* 8 source bytes */
    "  movq        (%3), %%mm1      \n\t"  /* 8 reference bytes */
    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"  /* widen all four halves to words */
    "  punpcklbw   %%mm6, %%mm1     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"
    "  punpckhbw   %%mm6, %%mm3     \n\t"

    "  psubsw      %%mm1, %%mm0     \n\t"  /* X = src - ref (low half)  */
    "  psubsw      %%mm3, %%mm2     \n\t"  /* X = src - ref (high half) */

    "  paddw       %%mm0, %%mm5     \n\t"
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t"  /* squares, pairwise added to dwords */
    "  pmaddwd     %%mm2, %%mm2     \n\t"
    
    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %4, %2           \n\t"  /* next source row */
    "  add         %5, %3           \n\t"  /* next reference row */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    /* Fold the four words of mm5 into the low word -> XSum. */
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t"  /* keep only the low word, sign-extended */
    "  movl        %%edi, %0        \n\t"

    /* Fold the two dwords of mm7 into the low dword -> XXSum. */
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

     : "=m" (XSum),
       "=m" (XXSum),
       "+r" (SrcData), 
       "+r" (RefDataPtr) 
     : "m" (SrcStride),
       "m" (RefStride)
     : "edi", "memory"
  );

  /* 64*sum(X^2) - sum(X)^2: population variance times 64^2. */
  return (( (XXSum<<6) - XSum*XSum ));
}
535
536
/*
 * Inter error metric between a source 8x8 block and the byte-wise average
 * of two reference blocks, plain MMX.
 *
 *   SrcData, SrcStride       source block and its row step in bytes
 *   RefDataPtr1, RefDataPtr2 the two reference blocks (both use RefStride)
 *
 * The reference is (a & b) + (((a ^ b) & 0xfe) >> 1), a truncating
 * (a + b) >> 1 per byte.  With X = src - avg the function returns
 *
 *     64 * sum(X*X) - sum(X) * sum(X)
 *
 * i.e. the population variance of the difference scaled by 64 * 64.
 * Uses %edi as row counter and scratch; no emms is executed.
 */
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                           unsigned char *RefDataPtr1,
                                           unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"

    "  pcmpeqd     %%mm4, %%mm4     \n\t"  /* mm4 = all ones ...            */
    "  paddb       %%mm4, %%mm4     \n\t"  /* ... doubled per byte = 0xfe.. */
    "  pxor        %%mm5, %%mm5     \n\t"  /* mm5 = word sums of X */
    "  pxor        %%mm6, %%mm6     \n\t"  /* mm6 = 0, used by the unpacks */
    "  pxor        %%mm7, %%mm7     \n\t"  /* mm7 = dword sums of X*X */
    "  mov         $8, %%edi        \n\t"  /* 8 rows */
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t"  /* 8 source bytes */

    "  movq        (%3), %%mm2      \n\t"
    "  movq        (%4), %%mm3      \n\t"  /* average mm2 and mm3 into mm1 */
    "  movq        %%mm2, %%mm1     \n\t"
    "  pand        %%mm3, %%mm1     \n\t"  /* common bits */
    "  pxor        %%mm2, %%mm3     \n\t"  /* differing bits */
    "  pand        %%mm4, %%mm3     \n\t"  /* clear bit 0 so the 64-bit shift */
    "  psrlq       $1, %%mm3        \n\t"  /* cannot leak across byte lanes   */
    "  paddb       %%mm3, %%mm1     \n\t"  /* truncated average */

    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"  /* widen all four halves to words */
    "  punpcklbw   %%mm6, %%mm1     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"
    "  punpckhbw   %%mm6, %%mm3     \n\t"

    "  psubsw      %%mm1, %%mm0     \n\t"  /* X = src - avg (low half)  */
    "  psubsw      %%mm3, %%mm2     \n\t"  /* X = src - avg (high half) */

    "  paddw       %%mm0, %%mm5     \n\t"
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t"  /* squares, pairwise added to dwords */
    "  pmaddwd     %%mm2, %%mm2     \n\t"
    
    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %5, %2           \n\t"  /* next source row */
    "  add         %6, %3           \n\t"  /* next row of reference 1 */
    "  add         %6, %4           \n\t"  /* next row of reference 2 */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    /* Fold the four words of mm5 into the low word -> XSum. */
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t"  /* keep only the low word, sign-extended */
    "  movl        %%edi, %0        \n\t"

    /* Fold the two dwords of mm7 into the low dword -> XXSum. */
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

     : "=m" (XSum),
       "=m" (XXSum),
       "+r" (SrcData), 
       "+r" (RefDataPtr1),
       "+r" (RefDataPtr2) 
     : "m" (SrcStride),
       "m" (RefStride)
     : "edi", "memory"
  );

  /* 64*sum(X^2) - sum(X)^2: population variance times 64^2. */
  return (( (XXSum<<6) - XSum*XSum ));
}
619
620
/*
 * Execute emms, which marks the x87 register stack empty again after MMX
 * instructions have used it.  None of the MMX routines in this file issue
 * emms themselves.
 */
static void restore_fpu (void)
{
  __asm__ __volatile__ (
    "  emms                         \n\t"
  );
}
626
627
/*
 * Point the entries of *funcs at the plain-MMX implementations defined in
 * this file.  Only the members assigned below are written; `funcs` must be
 * non-NULL.
 */
void dsp_i386_mmx_init(DspFunctions *funcs)
{
  funcs->restore_fpu = restore_fpu;

  /* block subtraction */
  funcs->sub8x8 = sub8x8__mmx;
  funcs->sub8x8_128 = sub8x8_128__mmx;
  funcs->sub8x8avg2 = sub8x8avg2__mmx;

  /* sums of absolute differences */
  funcs->row_sad8 = row_sad8__mmx;
  funcs->col_sad8x8 = col_sad8x8__mmx;
  funcs->sad8x8 = sad8x8__mmx;
  funcs->sad8x8_thres = sad8x8_thres__mmx;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;

  /* variance-style error metrics */
  funcs->intra8x8_err = intra8x8_err__mmx;
  funcs->inter8x8_err = inter8x8_err__mmx;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
}
642
(-)libtheora-1.0alpha3/lib/i386/dsp_mmxext.c (+316 lines)
Line 0 Link Here
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
15
16
 ********************************************************************/
17
18
#include <stdlib.h>
19
#include "dsp.h"
20
21
/*
 * Sum of absolute differences (SAD) of two 8x8 byte blocks using the
 * MMXEXT psadbw instruction (one instruction per 8-byte row).
 *
 *   ptr1, stride1  first block and the byte step between its rows
 *   ptr2, stride2  second block and the byte step between its rows
 *
 * Seven rows are processed by the assembler-unrolled .rept body; the eighth
 * is written out separately so the final, unneeded pointer increments are
 * omitted.
 *
 * Returns the SAD.  The MMX registers are left dirty (no emms here).
 */
static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
                                    unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t  DiffVal;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"
    "  pxor %%mm7, %%mm7            \n\t"  /* mm7 accumulates the result */

    ".rept 7                        \n\t"
    "  movq (%1), %%mm0             \n\t"  /* 8 bytes of block 1 */
    "  movq (%2), %%mm1             \n\t"  /* 8 bytes of block 2 */
    "  psadbw %%mm1, %%mm0          \n\t"  /* row SAD in the low word of mm0 */
    "  add %3, %1                   \n\t"  /* next row of block 1 */
    "  paddw %%mm0, %%mm7           \n\t"  /* accumulate */
    "  add %4, %2                   \n\t"  /* next row of block 2 */
    ".endr                          \n\t"

    "  movq (%1), %%mm0             \n\t"  /* eighth row, no pointer update */
    "  movq (%2), %%mm1             \n\t"
    "  psadbw %%mm1, %%mm0          \n\t"
    "  paddw %%mm0, %%mm7           \n\t"  /* accumulate */
    "  movd %%mm7, %0               \n\t"

     : "=r" (DiffVal),
       "+r" (ptr1), 
       "+r" (ptr2) 
     : "r" (stride1),
       "r" (stride2)
     : "memory"
  );

  return DiffVal;
}
55
56
/*
 * Threshold-taking form of the 8x8 SAD, MMXEXT (psadbw) version.
 *
 *   ptr1, stride1  first block and the byte step between its rows
 *   ptr2, stride2  second block and the byte step between its rows
 *   thres          ignored; no early termination is implemented and the
 *                  complete SAD is always returned
 *
 * All eight rows go through the assembler-unrolled .rept body.
 * The MMX registers are left dirty (no emms here).
 */
static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
                                          unsigned char *ptr2, ogg_uint32_t stride2,
                                          ogg_uint32_t thres)
{
  ogg_uint32_t  DiffVal;

  (void) thres;  /* deliberately unused, see above */

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"
    "  pxor %%mm7, %%mm7            \n\t"  /* mm7 accumulates the result */

    ".rept 8                        \n\t"
    "  movq (%1), %%mm0             \n\t"  /* 8 bytes of block 1 */
    "  movq (%2), %%mm1             \n\t"  /* 8 bytes of block 2 */
    "  psadbw %%mm1, %%mm0          \n\t"  /* row SAD in the low word of mm0 */
    "  add %3, %1                   \n\t"  /* next row of block 1 */
    "  paddw %%mm0, %%mm7           \n\t"  /* accumulate */
    "  add %4, %2                   \n\t"  /* next row of block 2 */
    ".endr                          \n\t"

    "  movd %%mm7, %0               \n\t"

     : "=r" (DiffVal),
       "+r" (ptr1), 
       "+r" (ptr2) 
     : "r" (stride1),
       "r" (stride2)
     : "memory"
  );

  return DiffVal;
}
87
88
/*
 * 8x8 SAD between a source block and the byte-wise average of two reference
 * blocks, MMXEXT version (pavgb + psadbw).
 *
 *   SrcData, SrcStride       source block and its row step in bytes
 *   RefDataPtr1, RefDataPtr2 the two reference blocks (both use RefStride)
 *   thres                    ignored; the full SAD is always computed
 *
 * NOTE: pavgb rounds up, (a + b + 1) >> 1, whereas sad8x8_xy2_thres__mmx
 * builds a truncating (a + b) >> 1 average, so the two variants can return
 * different values for the same input.
 *
 * The MMX registers are left dirty (no emms here).
 */
static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                              unsigned char *RefDataPtr1,
                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                              ogg_uint32_t thres)
{
  ogg_uint32_t  DiffVal;

  (void) thres;  /* no early-out is implemented */

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"
    "  pxor %%mm7, %%mm7            \n\t"  /* mm7 accumulates the result */
    ".rept 8                        \n\t"
    "  movq (%1), %%mm0             \n\t"  /* 8 source bytes */
    "  movq (%2), %%mm1             \n\t"  /* 8 bytes of reference 1 */
    "  movq (%3), %%mm2             \n\t"  /* 8 bytes of reference 2 */
    "  pavgb %%mm2, %%mm1           \n\t"  /* rounded average of the references */
    "  psadbw %%mm1, %%mm0          \n\t"  /* row SAD in the low word of mm0 */

    "  add %4, %1                   \n\t"  /* next source row */
    "  paddw %%mm0, %%mm7           \n\t"  /* accumulate */
    "  add %5, %2                   \n\t"  /* next row of reference 1 */
    "  add %5, %3                   \n\t"  /* next row of reference 2 */
    ".endr                          \n\t"

    "  movd %%mm7, %0               \n\t"
     : "=m" (DiffVal),
       "+r" (SrcData), 
       "+r" (RefDataPtr1), 
       "+r" (RefDataPtr2) 
     : "m" (SrcStride),
       "m" (RefStride)
     : "memory"
  );

  return DiffVal;
}
123
		
124
/*
 * Row SAD of 8 bytes, MMXEXT version.
 *
 * The 8 bytes at Src1 and Src2 are treated as two groups of four.  A SAD is
 * computed for each group with psadbw and the larger of the two is
 * returned, masked to 16 bits.
 *
 * The MMX registers are left dirty (no emms here).
 */
static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"

    "  movd        (%1), %%mm0      \n\t"  /* bytes 0..3 (upper half zeroed) */
    "  movd        (%2), %%mm1      \n\t"
    "  psadbw      %%mm0, %%mm1     \n\t"  /* mm1 = SAD of the first group */
    "  movd        4(%1), %%mm2     \n\t"  /* bytes 4..7 */
    "  movd        4(%2), %%mm3     \n\t"
    "  psadbw      %%mm2, %%mm3     \n\t"  /* mm3 = SAD of the second group */

    "  pmaxsw      %%mm1, %%mm3     \n\t"  /* keep the larger one */
    "  movd        %%mm3, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

     : "=m" (MaxSad),
       "+r" (Src1), 
       "+r" (Src2) 
     :
     : "memory"
  );

  return MaxSad;
}
151
152
/*
 * Column SAD of two 8x8 byte blocks, MMXEXT version.
 *
 *   Src1, Src2  the two blocks; both advance by `stride` bytes per row
 *
 * Absolute differences are summed per column, separately for rows 0..3
 * (mm4 = columns 0..3, mm5 = columns 4..7) and rows 4..7 (mm6, mm7).  The
 * largest of those sixteen column sums is returned, masked to 16 bits.
 *
 * Uses %edi as the row counter; the MMX registers are left dirty.
 */
static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
                                        ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"

    "  pxor        %%mm3, %%mm3     \n\t"  /* mm3 = 0, used by the unpacks */
    "  pxor        %%mm4, %%mm4     \n\t"  /* rows 0..3, columns 0..3 */
    "  pxor        %%mm5, %%mm5     \n\t"  /* rows 0..3, columns 4..7 */
    "  pxor        %%mm6, %%mm6     \n\t"  /* rows 4..7, columns 0..3 */
    "  pxor        %%mm7, %%mm7     \n\t"  /* rows 4..7, columns 4..7 */
    "  mov         $4, %%edi        \n\t"  /* first 4 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t"  /* A: 8 bytes of Src1 */
    "  movq        (%2), %%mm1      \n\t"  /* B: 8 bytes of Src2 */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t"  /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"  /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"  /* OR gives |A - B| */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm4     \n\t"  /* accumulate */
    "  punpckhbw   %%mm3, %%mm1     \n\t"  /* high four bytes -> words */
    "  paddw       %%mm1, %%mm5     \n\t"  /* accumulate */
    "  add         %3, %1           \n\t"  /* next row of Src1 */
    "  add         %3, %2           \n\t"  /* next row of Src2 */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    "  mov         $4, %%edi        \n\t"  /* last 4 rows */
    "2:                             \n\t"
    "  movq        (%1), %%mm0      \n\t"  /* A: 8 bytes of Src1 */
    "  movq        (%2), %%mm1      \n\t"  /* B: 8 bytes of Src2 */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t"  /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"  /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"  /* OR gives |A - B| */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t"  /* low four bytes -> words */
    "  paddw       %%mm0, %%mm6     \n\t"  /* accumulate */
    "  punpckhbw   %%mm3, %%mm1     \n\t"  /* high four bytes -> words */
    "  paddw       %%mm1, %%mm7     \n\t"  /* accumulate */
    "  add         %3, %1           \n\t"  /* next row of Src1 */
    "  add         %3, %2           \n\t"  /* next row of Src2 */

    "  dec         %%edi            \n\t"
    "  jnz 2b                       \n\t"

    /* Word-wise maximum over the four accumulators, then across mm7. */
    "  pmaxsw      %%mm6, %%mm7     \n\t"
    "  pmaxsw      %%mm4, %%mm5     \n\t"
    "  pmaxsw      %%mm5, %%mm7     \n\t"
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $32, %%mm6       \n\t"
    "  pmaxsw      %%mm6, %%mm7     \n\t"
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $16, %%mm6       \n\t"
    "  pmaxsw      %%mm6, %%mm7     \n\t"
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"

     : "=r" (MaxSad),
       "+r" (Src1), 
       "+r" (Src2) 
     : "r" (stride)
     : "memory", "edi"
  );

  return MaxSad;
}
228
229
/*
 * Inter error metric between a source 8x8 block and the byte-wise average
 * of two reference blocks, MMXEXT version (pavgb).
 *
 *   SrcData, SrcStride       source block and its row step in bytes
 *   RefDataPtr1, RefDataPtr2 the two reference blocks (both use RefStride)
 *
 * With X = src - avg(ref1, ref2) the function returns
 *
 *     64 * sum(X*X) - sum(X) * sum(X)
 *
 * i.e. the population variance of the difference scaled by 64 * 64.
 *
 * NOTE: pavgb rounds up, (a + b + 1) >> 1, whereas inter8x8_err_xy2__mmx
 * uses a truncating average, so the two variants can differ.
 *
 * Uses %edi as row counter and scratch; no emms is executed.
 */
static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                              unsigned char *RefDataPtr1,
                                              unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"

    "  pxor        %%mm4, %%mm4     \n\t"  /* mm4 = 0, unpack for the average */
    "  pxor        %%mm5, %%mm5     \n\t"  /* mm5 = word sums of X */
    "  pxor        %%mm6, %%mm6     \n\t"  /* mm6 = 0, unpack for the source */
    "  pxor        %%mm7, %%mm7     \n\t"  /* mm7 = dword sums of X*X */
    "  mov         $8, %%edi        \n\t"  /* 8 rows */
    "1:                             \n\t"
    "  movq        (%2), %%mm0      \n\t"  /* 8 source bytes */

    "  movq        (%3), %%mm2      \n\t"
    "  movq        (%4), %%mm1      \n\t"  /* take average of mm2 and mm1 */
    "  pavgb       %%mm2, %%mm1     \n\t"

    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"  /* widen all four halves to words */
    "  punpcklbw   %%mm4, %%mm1     \n\t"
    "  punpckhbw   %%mm6, %%mm2     \n\t"
    "  punpckhbw   %%mm4, %%mm3     \n\t"

    "  psubsw      %%mm1, %%mm0     \n\t"  /* X = src - avg (low half)  */
    "  psubsw      %%mm3, %%mm2     \n\t"  /* X = src - avg (high half) */

    "  paddw       %%mm0, %%mm5     \n\t"
    "  paddw       %%mm2, %%mm5     \n\t"

    "  pmaddwd     %%mm0, %%mm0     \n\t"  /* squares, pairwise added to dwords */
    "  pmaddwd     %%mm2, %%mm2     \n\t"
    
    "  paddd       %%mm0, %%mm7     \n\t"
    "  paddd       %%mm2, %%mm7     \n\t"

    "  add         %5, %2           \n\t"  /* next source row */
    "  add         %6, %3           \n\t"  /* next row of reference 1 */
    "  add         %6, %4           \n\t"  /* next row of reference 2 */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    /* Fold the four words of mm5 into the low word -> XSum. */
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $32, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movq        %%mm5, %%mm0     \n\t"
    "  psrlq       $16, %%mm5       \n\t"
    "  paddw       %%mm0, %%mm5     \n\t"
    "  movd        %%mm5, %%edi     \n\t"
    "  movsx       %%di, %%edi      \n\t"  /* keep only the low word, sign-extended */
    "  movl        %%edi, %0        \n\t"

    /* Fold the two dwords of mm7 into the low dword -> XXSum. */
    "  movq        %%mm7, %%mm0     \n\t"
    "  psrlq       $32, %%mm7       \n\t"
    "  paddd       %%mm0, %%mm7     \n\t"
    "  movd        %%mm7, %1        \n\t"

     : "=m" (XSum),
       "=m" (XXSum),
       "+r" (SrcData), 
       "+r" (RefDataPtr1),
       "+r" (RefDataPtr2) 
     : "m" (SrcStride),
       "m" (RefStride)
     : "edi", "memory"
  );

  /* 64*sum(X^2) - sum(X)^2: population variance times 64^2. */
  return (( (XXSum<<6) - XSum*XSum ));
}
306
307
/*
 * Point six entries of *funcs at the MMXEXT implementations defined in this
 * file.  All other members are left untouched, so the table presumably has
 * to be populated by another initialiser first (TODO confirm against the
 * caller).  `funcs` must be non-NULL.
 */
void dsp_i386_mmxext_init(DspFunctions *funcs)
{
  funcs->row_sad8 = row_sad8__mmxext;
  funcs->col_sad8x8 = col_sad8x8__mmxext;
  funcs->sad8x8 = sad8x8__mmxext;
  funcs->sad8x8_thres = sad8x8_thres__mmxext;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
}
316
(-)libtheora-1.0alpha3/lib/i386/fdct_mmx.c (+340 lines)
Line 0 Link Here
1
;//==========================================================================
2
;//
3
;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
4
;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
5
;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
6
;//  PURPOSE.
7
;//
8
;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
9
;//
10
;//--------------------------------------------------------------------------
11
12
#include <theora/theora.h>
13
#include "dsp.h"
14
15
/*
 * Fixed-point DCT multipliers.  Each constant holds one 16-bit value
 * replicated into all four words of an 8-byte-aligned 64-bit quantity.
 * The value of xC<n>S<m> is cos(n*pi/16) (== sin(m*pi/16)) scaled by 65536,
 * e.g. 0xb505 = 46341 ~= 0.70711 * 65536.
 *
 * The `used` attribute keeps the objects in the output even when no C code
 * references them (presumably they are referenced by name from inline
 * assembly via M() -- confirm at the use sites).
 */
static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;

/*
 * M(a) turns identifier `a` into a string literal, with a leading
 * underscore prepended on the targets listed below.
 */
#if defined(__MINGW32__) || defined(__CYGWIN__) || \
    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
# define M(a) "_" #a
#else
# define M(a) #a
#endif
29
30
/***********************************************************************
31
 *	File:			fdct_m.asm
32
 *
33
 *	Description:
34
 *					This function performs a 2-D forward DCT on an 8x8 block
35
 *					
36
 *
37
 *	Input:			Pointers to input source data buffer and destination 
38
 *					buffer.
39
 *
40
 *	Note:			none
41
 *
42
 *	Special Notes:	We try to do the truncation right to match the result 
43
 *					of the c version. 
44
 *
45
 ************************************************************************/
46
47
/*
 * Fdct_mmx(ip0..ip7, temp): execute stage 1 of forward DCT.
 *
 * Expands to the inline-asm text of one 1-D forward DCT pass.  ip0..ip7 are
 * memory operands of four 16-bit values each; the pass works on the four
 * word lanes in parallel and writes the results back over the same operands
 * (op[n] is stored to ipn).  temp is an 8-byte scratch memory operand.
 * All of mm0-mm7 are overwritten.
 *
 * The xCnSm constants are referenced by symbol name through M().  For the
 * constants that are >= 0x8000, pmulhw sees a negative multiplier and yields
 * "c*x - x", so x is added back afterwards (xC6S2 and xC7S1 need no such
 * correction).  After each multiply the sign bit of the multiplicand
 * (psrlw $15) is added in; per the file header this is meant to make the
 * truncation match the C version.
 */
48
#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
49
  "  movq      " #ip0 ", %%mm0      \n\t"                                     \
50
  "  movq      " #ip1 ", %%mm1      \n\t"                                     \
51
  "  movq      " #ip3 ", %%mm2      \n\t"                                     \
52
  "  movq      " #ip5 ", %%mm3      \n\t"                                     \
53
  "  movq        %%mm0, %%mm4       \n\t"                                     \
54
  "  movq        %%mm1, %%mm5       \n\t"                                     \
55
  "  movq        %%mm2, %%mm6       \n\t"                                     \
56
  "  movq        %%mm3, %%mm7       \n\t"                                     \
57
                                                                              \
58
  "  paddsw    " #ip7 ", %%mm0      \n\t" /* mm0 = ip0 + ip7 = is07 */        \
59
  "  paddsw    " #ip2 ", %%mm1      \n\t" /* mm1 = ip1 + ip2 = is12 */        \
60
  "  paddsw    " #ip4 ", %%mm2      \n\t" /* mm2 = ip3 + ip4 = is34 */        \
61
  "  paddsw    " #ip6 ", %%mm3      \n\t" /* mm3 = ip5 + ip6 = is56 */        \
62
  "  psubsw    " #ip7 ", %%mm4      \n\t" /* mm4 = ip0 - ip7 = id07 */        \
63
  "  psubsw    " #ip2 ", %%mm5      \n\t" /* mm5 = ip1 - ip2 = id12 */        \
64
                                                                              \
65
  "  psubsw      %%mm2, %%mm0       \n\t" /* mm0 = is07 - is34 */             \
66
                                                                              \
67
  "  paddsw      %%mm2, %%mm2       \n\t"                                     \
68
                                                                              \
69
  "  psubsw    " #ip4 ", %%mm6      \n\t" /* mm6 = ip3 - ip4 = id34 */        \
70
                                                                              \
71
  "  paddsw      %%mm0, %%mm2       \n\t" /* mm2 = is07 + is34 = is0734 */    \
72
  "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
73
  "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
74
  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
75
  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + is56 = is1256 */    \
76
                                                                              \
77
  "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
78
  /* ------------------------------------------------------------------- */   \
79
  "  psubsw      %%mm7, %%mm5       \n\t" /* mm5 = id12 - id56 */             \
80
  "  paddsw      %%mm7, %%mm7       \n\t"                                     \
81
  "  paddsw      %%mm5, %%mm7       \n\t" /* mm7 = id12 + id56 */             \
82
  /* ------------------------------------------------------------------- */   \
83
  "  psubsw      %%mm3, %%mm2       \n\t" /* mm2 = is0734 - is1256 */         \
84
  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
85
                                                                              \
86
  "  movq        %%mm2, %%mm0       \n\t" /* make a copy */                   \
87
  "  paddsw      %%mm2, %%mm3       \n\t" /* mm3 = is0734 + is1256 */         \
88
                                                                              \
89
  "  pmulhw   "M(xC4S4)", %%mm0     \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
90
  "  paddw       %%mm2, %%mm0       \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
91
  "  psrlw       $15, %%mm2         \n\t"                                     \
92
  "  paddw       %%mm2, %%mm0       \n\t" /* Truncate mm0, now it is op[4] */ \
93
                                                                              \
94
  "  movq        %%mm3, %%mm2       \n\t"                                     \
95
  "  movq        %%mm0," #ip4 "     \n\t" /* save ip4, now mm0,mm2 are free */ \
96
                                                                              \
97
  "  movq        %%mm3, %%mm0       \n\t"                                     \
98
  "  pmulhw   "M(xC4S4)", %%mm3     \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
99
                                                                              \
100
  "  psrlw       $15, %%mm2         \n\t"                                     \
101
  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 )	 */ \
102
  "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
103
                                                                              \
104
  "  movq        %%mm3," #ip0 "     \n\t"                                     \
105
  /* ------------------------------------------------------------------- */   \
106
  "  movq      " #temp ", %%mm3     \n\t" /* mm3 = irot_input_y */            \
107
  "  pmulhw   "M(xC2S6)", %%mm3     \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
108
                                                                              \
109
  "  movq      " #temp ", %%mm2     \n\t"                                     \
110
  "  movq        %%mm2, %%mm0       \n\t"                                     \
111
                                                                              \
112
  "  psrlw       $15, %%mm2         \n\t"                                     \
113
  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC2S6 * irot_input_y */    \
114
                                                                              \
115
  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
116
  "  movq        %%mm5, %%mm0       \n\t"                                     \
117
                                                                              \
118
  "  movq        %%mm5, %%mm2       \n\t"                                     \
119
  "  pmulhw   "M(xC6S2)", %%mm0     \n\t" /* mm0 = xC6S2 * irot_input_x */    \
120
                                                                              \
121
  "  psrlw       $15, %%mm2         \n\t"                                     \
122
  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
123
                                                                              \
124
  "  paddsw      %%mm0, %%mm3       \n\t" /* ip[2] */                         \
125
  "  movq        %%mm3," #ip2 "     \n\t" /* Save ip2 */                      \
126
                                                                              \
127
  "  movq        %%mm5, %%mm0       \n\t"                                     \
128
  "  movq        %%mm5, %%mm2       \n\t"                                     \
129
                                                                              \
130
  "  pmulhw   "M(xC2S6)", %%mm5     \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
131
  "  psrlw       $15, %%mm2         \n\t"                                     \
132
                                                                              \
133
  "  movq      " #temp ", %%mm3     \n\t"                                     \
134
  "  paddw       %%mm0, %%mm5       \n\t" /* mm5 = xC2S6 * irot_input_x */    \
135
                                                                              \
136
  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
137
  "  movq        %%mm3, %%mm2       \n\t"                                     \
138
                                                                              \
139
  "  pmulhw   "M(xC6S2)", %%mm3     \n\t" /* mm3 = xC6S2 * irot_input_y */    \
140
  "  psrlw       $15, %%mm2         \n\t"                                     \
141
                                                                              \
142
  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
143
  "  psubsw      %%mm5, %%mm3       \n\t"                                     \
144
                                                                              \
145
  "  movq        %%mm3," #ip6 "     \n\t"                                     \
146
  /* ------------------------------------------------------------------- */   \
147
  "  movq     "M(xC4S4)", %%mm0     \n\t"                                     \
148
  "  movq        %%mm1, %%mm2       \n\t"                                     \
149
  "  movq        %%mm1, %%mm3       \n\t"                                     \
150
                                                                              \
151
  "  pmulhw      %%mm0, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
152
  "  psrlw       $15, %%mm2         \n\t"				      \
153
                                                                              \
154
  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC4S4 * ( is12 - is56 ) */ \
155
  "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
156
                                                                              \
157
  "  movq        %%mm7, %%mm2       \n\t"                                     \
158
  "  movq        %%mm7, %%mm3       \n\t"			              \
159
                                                                              \
160
  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
161
  "  psrlw       $15, %%mm2         \n\t"			              \
162
                                                                              \
163
  "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
164
  "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
165
  /* ------------------------------------------------------------------- */   \
166
  "  pxor        %%mm0, %%mm0       \n\t" /* Clear mm0 */                     \
167
  "  psubsw      %%mm6, %%mm0       \n\t" /* mm0 = - id34 */                  \
168
                                                                              \
169
  "  psubsw      %%mm7, %%mm0       \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
170
  "  paddsw      %%mm6, %%mm6       \n\t"                                     \
171
  "  paddsw      %%mm0, %%mm6       \n\t" /* mm6 = id34 - icommon_product2 */ \
172
                                                                              \
173
  "  psubsw      %%mm1, %%mm4       \n\t" /* mm4 = id07 - icommon_product1 */ \
174
  "  paddsw      %%mm1, %%mm1       \n\t"                                     \
175
  "  paddsw      %%mm4, %%mm1       \n\t" /* mm1 = id07 + icommon_product1 */ \
176
  /* ------------------------------------------------------------------- */   \
177
  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
178
  "  movq        %%mm1, %%mm2       \n\t"                                     \
179
                                                                              \
180
  "  movq        %%mm1, %%mm3       \n\t"                                     \
181
  "  pmulhw      %%mm7, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
182
                                                                              \
183
  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
184
  "  psrlw       $15, %%mm2         \n\t"                                     \
185
                                                                              \
186
  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x */    \
187
  "  paddw       %%mm2, %%mm1       \n\t" /* Truncated */                     \
188
                                                                              \
189
  "  pmulhw      %%mm7, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x */    \
190
  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
191
                                                                              \
192
  "  movq        %%mm0, %%mm5       \n\t"                                     \
193
  "  movq        %%mm0, %%mm2       \n\t"                                     \
194
                                                                              \
195
  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
196
  "  pmulhw      %%mm7, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
197
                                                                              \
198
  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
199
  "  psrlw       $15, %%mm2         \n\t"                                     \
200
                                                                              \
201
  "  paddw       %%mm5, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y */    \
202
  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
203
                                                                              \
204
  "  pmulhw      %%mm7, %%mm5       \n\t" /* mm5 = xC7S1 * irot_input_y */    \
205
  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
206
                                                                              \
207
  "  psubsw      %%mm5, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
208
  "  paddsw      %%mm0, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 */ \
209
                                                                              \
210
  "  movq        %%mm1," #ip1 "     \n\t"                                     \
211
  "  movq        %%mm3," #ip7 "     \n\t"                                     \
212
  /* ------------------------------------------------------------------- */   \
213
  "  movq     "M(xC3S5)", %%mm0     \n\t"                                     \
214
  "  movq     "M(xC5S3)", %%mm1     \n\t"                                     \
215
                                                                              \
216
  "  movq        %%mm6, %%mm5       \n\t"                                     \
217
  "  movq        %%mm6, %%mm7       \n\t"                                     \
218
                                                                              \
219
  "  movq        %%mm4, %%mm2       \n\t"                                     \
220
  "  movq        %%mm4, %%mm3       \n\t"                                     \
221
                                                                              \
222
  "  pmulhw      %%mm0, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
223
  "  pmulhw      %%mm1, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
224
                                                                              \
225
  "  psrlw       $15, %%mm2         \n\t"                                     \
226
  "  psrlw       $15, %%mm5         \n\t"                                     \
227
                                                                              \
228
  "  paddw       %%mm3, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x */    \
229
  "  paddw       %%mm7, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y */    \
230
                                                                              \
231
  "  paddw       %%mm2, %%mm4       \n\t" /* Truncated */                     \
232
  "  paddw       %%mm5, %%mm6       \n\t" /* Truncated */                     \
233
                                                                              \
234
  "  psubsw      %%mm6, %%mm4       \n\t" /* ip3 */                           \
235
  "  movq        %%mm4," #ip3 "     \n\t"                                     \
236
                                                                              \
237
  "  movq        %%mm3, %%mm4       \n\t"                                     \
238
  "  movq        %%mm7, %%mm6       \n\t"                                     \
239
                                                                              \
240
  "  pmulhw      %%mm1, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
241
  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
242
                                                                              \
243
  "  paddw       %%mm2, %%mm4       \n\t"                                     \
244
  "  paddw       %%mm5, %%mm6       \n\t"                                     \
245
                                                                              \
246
  "  paddw       %%mm4, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x */    \
247
  "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
248
                                                                              \
249
  "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
250
  "  movq        %%mm3," #ip5 "     \n\t" 
251
252
/*
 * Transpose_mmx(ip0..ip7, op0..op7): expands to the inline-asm text that
 * transposes two 4x4 blocks of 16-bit words.
 *
 * ip0..ip3 (rows a,b,c,d) are transposed into op0..op3 and ip4..ip7
 * (rows e,f,g,h) into op4..op7; every operand is an 8-byte memory operand.
 * op0 and op1 are also used as temporary storage for rows a and b, and op1
 * is written before ip7 has been read, so op1 must not be the same location
 * as ip7.  All of mm0-mm7 are overwritten.
 */
#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
253
		      op0,op1,op2,op3,op4,op5,op6,op7)                  \
254
  "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a0 a1 a2 a3 */       \
255
  "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e4 e5 e6 e7 */       \
256
  "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b0 b1 b2 b3 */       \
257
  "  movq      " #ip5 ", %%mm5      \n\t" /* mm5 = f4 f5 f6 f7 */       \
258
  "  movq      " #ip2 ", %%mm2      \n\t" /* mm2 = c0 c1 c2 c3 */       \
259
  "  movq      " #ip6 ", %%mm6      \n\t" /* mm6 = g4 g5 g6 g7 */       \
260
  "  movq      " #ip3 ", %%mm3      \n\t" /* mm3 = d0 d1 d2 d3 */       \
261
  "  movq        %%mm1," #op1 "     \n\t" /* save  b0 b1 b2 b3 */       \
262
  "  movq      " #ip7 ", %%mm7      \n\t" /* mm7 = h0 h1 h2 h3 */       \
263
   /* Transpose 2x8 block */                                            \
264
  "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
265
  "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
266
  "  movq        %%mm0," #op0 "     \n\t" /* save a3 a2 a1 a0  */       \
267
  "  punpckhwd	 %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
268
  "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
269
  "  punpcklwd	 %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
270
  "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
271
  "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
272
  "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
273
  "  movq        %%mm1, %%mm6       \n\t" /* mm6 = f3 e3 f2 e2 */       \
274
  "  movq        %%mm4," #op4 "     \n\t"                               \
275
  "  punpckhwd   %%mm7, %%mm0       \n\t" /* mm0 = h3 g3 h2 g2 */       \
276
  "  movq        %%mm5," #op5 "     \n\t"                               \
277
  "  punpckhdq   %%mm0, %%mm6       \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
278
  "  movq      " #op0 ", %%mm4      \n\t" /* mm4 = a3 a2 a1 a0 */       \
279
  "  punpckldq   %%mm0, %%mm1       \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
280
  "  movq      " #op1 ", %%mm5      \n\t" /* mm5 = b3 b2 b1 b0 */       \
281
  "  movq        %%mm4, %%mm0       \n\t" /* mm0 = a3 a2 a1 a0 */       \
282
  "  movq        %%mm6," #op7 "     \n\t"                               \
283
  "  punpcklwd   %%mm5, %%mm0       \n\t" /* mm0 = b1 a1 b0 a0 */       \
284
  "  movq        %%mm1," #op6 "     \n\t"                               \
285
  "  punpckhwd   %%mm5, %%mm4       \n\t" /* mm4 = b3 a3 b2 a2 */       \
286
  "  movq        %%mm2, %%mm5       \n\t" /* mm5 = c3 c2 c1 c0 */       \
287
  "  punpcklwd   %%mm3, %%mm2       \n\t" /* mm2 = d1 c1 d0 c0 */       \
288
  "  movq        %%mm0, %%mm1       \n\t" /* mm1 = b1 a1 b0 a0 */       \
289
  "  punpckldq   %%mm2, %%mm0       \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
290
  "  punpckhdq   %%mm2, %%mm1       \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
291
  "  movq        %%mm4, %%mm2       \n\t" /* mm2 = b3 a3 b2 a2 */       \
292
  "  movq        %%mm0," #op0 "     \n\t"                               \
293
  "  punpckhwd   %%mm3, %%mm5       \n\t" /* mm5 = d3 c3 d2 c2 */       \
294
  "  movq        %%mm1," #op1 "     \n\t"                               \
295
  "  punpckhdq   %%mm5, %%mm4       \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
296
  "  punpckldq   %%mm5, %%mm2       \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
297
  "  movq        %%mm4," #op3 "     \n\t"                               \
298
  "  movq        %%mm2," #op2 "     \n\t"
299
300
301
/*
 * 2-D forward DCT of one 8x8 block of 16-bit values, using MMX.
 *
 * InputData   64 input values (128 bytes), read only.
 * OutputData  64 output values (128 bytes).
 *
 * The work is done as four Transpose_mmx + Fdct_mmx pairs.  The first two
 * pairs read from InputData and leave their results in OutputData; the last
 * two pairs work on OutputData in place.  A small 8-byte-aligned stack
 * buffer provides the scratch quadword that Fdct_mmx needs.  The MMX state
 * is released with emms before returning.
 */
static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
{
  ogg_int64_t __attribute__((aligned(8))) align_tmp[16];
  /* Cast with the same ogg type as the declaration; the bare int16_t used
     previously relies on <stdint.h> names this file never asks for. */
  ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;

  __asm__ __volatile__ (
    "  .balign 16                   \n\t"
    /*
     * Input data is an 8x8 block.  To make processing of the data more
     * efficient we will transpose the block of data to two 4x8 blocks???
     */
    Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
		     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
    Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))

    Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
		   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
    Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))

    /* Second dimension: operate in place on the intermediate result. */
    Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
		    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
    Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))

    Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
		    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
    Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))

    "  emms                         \n\t"

    : "+r" (InputData),
      "+r" (OutputData)
    : "r" (temp)
    : "memory"
  );
}
336
337
/* Install the MMX forward DCT into the fdct_short slot of *funcs. */
void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
{
  funcs->fdct_short = &fdct_short__mmx;
}
(-)libtheora-1.0alpha3/lib/i386/recon_mmx.c (+185 lines)
Line 0 Link Here
1
/********************************************************************
2
 *                                                                  *
3
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7
 *                                                                  *
8
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
9
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
10
 *                                                                  *
11
 ********************************************************************
12
13
  function:
14
  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
15
16
 ********************************************************************/
17
18
#include "encoder_internal.h"
19
20
/*
 * 0x80 in each of the eight bytes.  recon_intra8x8__mmx XORs packed signed
 * bytes with this value to convert them to unsigned (same as adding 128).
 * Referenced only by name from inline assembly, hence the "used" attribute.
 */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
21
22
/*
 * M(a) turns the C identifier a into the string used to name it inside an
 * asm template; the targets listed in the #if get a leading underscore.
 */
#if defined(__MINGW32__) || defined(__CYGWIN__) || \
23
	    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
24
# define M(a) "_" #a
25
#else
26
# define M(a) #a
27
#endif
28
29
/*
 * Copy an 8x8 block of bytes with MMX.
 *
 * src     first byte of the source block
 * dest    first byte of the destination block
 * stride  distance in bytes between consecutive rows, used for both
 *         source and destination
 *
 * Eight rows of eight bytes are copied, four rows at a time.  emms is not
 * executed here.
 *
 * The asm advances both pointers, so both are declared as read-write
 * operands (an asm must not modify an input-only operand).  The stride is
 * widened to unsigned long so it has the same register width as the
 * pointers it is combined with in the address expressions, and the
 * 3*stride scratch value lives in a compiler-chosen register.
 */
static void copy8x8__mmx (unsigned char *src,
                          unsigned char *dest,
                          unsigned int stride)
{
  unsigned long step = stride;  /* row pitch at register width */
  unsigned long step3;          /* scratch: 3 * step, set by the asm */

  __asm__ __volatile__ (
    "  .balign 16                      \n\t"

    "  lea         (%3, %3, 2), %2     \n\t" /* step3 = 3 * step */

    /* rows 0-3 */
    "  movq        (%1), %%mm0         \n\t"
    "  movq        (%1, %3), %%mm1     \n\t"
    "  movq        (%1, %3, 2), %%mm2  \n\t"
    "  movq        (%1, %2), %%mm3     \n\t"

    "  lea         (%1, %3, 4), %1     \n\t" /* src += 4 rows */

    "  movq        %%mm0, (%0)         \n\t"
    "  movq        %%mm1, (%0, %3)     \n\t"
    "  movq        %%mm2, (%0, %3, 2)  \n\t"
    "  movq        %%mm3, (%0, %2)     \n\t"

    "  lea         (%0, %3, 4), %0     \n\t" /* dest += 4 rows */

    /* rows 4-7 */
    "  movq        (%1), %%mm0         \n\t"
    "  movq        (%1, %3), %%mm1     \n\t"
    "  movq        (%1, %3, 2), %%mm2  \n\t"
    "  movq        (%1, %2), %%mm3     \n\t"

    "  movq        %%mm0, (%0)         \n\t"
    "  movq        %%mm1, (%0, %3)     \n\t"
    "  movq        %%mm2, (%0, %3, 2)  \n\t"
    "  movq        %%mm3, (%0, %2)     \n\t"
      : "+r" (dest),
        "+r" (src),
        "=&r" (step3)
      : "r" (step)
      : "memory"
  );
}
67
68
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
69
		      ogg_uint32_t LineStep)
70
{
71
  __asm__ __volatile__ (
72
    "  .balign 16                      \n\t"
73
74
    "  movq     "M(V128)", %%mm0       \n\t" /* Set mm0 to 0x8080808080808080 */
75
76
    "  lea         128(%1), %%edi      \n\t" /* Endpoint in input buffer */
77
    "1:                                \n\t" 
78
    "  movq         (%1), %%mm2        \n\t" /* First four input values */
79
80
    "  packsswb    8(%1), %%mm2        \n\t" /* pack with next(high) four values */
81
    "  por         %%mm0, %%mm0        \n\t" 
82
    "  pxor        %%mm0, %%mm2        \n\t" /* Convert result to unsigned (same as add 128) */
83
    "  lea         16(%1), %1          \n\t" /* Step source buffer */
84
    "  cmp         %%edi, %1           \n\t" /* are we done */
85
86
    "  movq        %%mm2, (%0)         \n\t" /* store results */
87
88
    "  lea         (%0, %2), %0        \n\t" /* Step output buffer */
89
    "  jc          1b                  \n\t" /* Loop back if we are not done */
90
      : "+r" (ReconPtr)
91
      : "r" (ChangePtr),
92
        "r" (LineStep)
93
      : "memory", "edi"
94
  );
95
}
96
97
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
98
		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
99
{
100
  __asm__ __volatile__ (
101
    "  .balign 16                      \n\t"
102
103
    "  pxor        %%mm0, %%mm0        \n\t"
104
    "  lea         128(%1), %%edi      \n\t"
105
106
    "1:                                \n\t"
107
    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
108
109
    "  movq        (%1), %%mm4         \n\t" /* first 4 changes */
110
    "  movq        %%mm2, %%mm3        \n\t"
111
    "  movq        8(%1), %%mm5        \n\t" /* last 4 changes */
112
    "  punpcklbw   %%mm0, %%mm2        \n\t" /* turn first 4 refs into positive 16-bit #s */
113
    "  paddsw      %%mm4, %%mm2        \n\t" /* add in first 4 changes */
114
    "  punpckhbw   %%mm0, %%mm3        \n\t" /* turn last 4 refs into positive 16-bit #s */
115
    "  paddsw      %%mm5, %%mm3        \n\t" /* add in last 4 changes */
116
    "  add         %3, %2              \n\t" /* next row of reference pixels */
117
    "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
118
    "  lea         16(%1), %1          \n\t" /* next row of changes */
119
    "  cmp         %%edi, %1            \n\t" /* are we done? */
120
121
    "  movq        %%mm2, (%0)         \n\t" /* store result */
122
123
    "  lea         (%0, %3), %0        \n\t" /* next row of output */
124
    "  jc          1b                  \n\t"
125
      : "+r" (ReconPtr)
126
      : "r" (ChangePtr),
127
        "r" (RefPtr),
128
        "r" (LineStep)
129
      : "memory", "edi"
130
  );
131
}
132
133
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
134
		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
135
			   ogg_uint32_t LineStep)
136
{
137
  __asm__ __volatile__ (
138
    "  .balign 16                      \n\t"
139
140
    "  pxor        %%mm0, %%mm0        \n\t"
141
    "  lea         128(%1), %%edi      \n\t"
142
143
    "1:                                \n\t"
144
    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
145
    "  movq        (%3), %%mm4         \n\t" /* (+3 misaligned) 8 reference pixels */
146
147
    "  movq        %%mm2, %%mm3        \n\t"
148
    "  punpcklbw   %%mm0, %%mm2        \n\t" /* mm2 = start ref1 as positive 16-bit #s */
149
    "  movq        %%mm4, %%mm5        \n\t"
150
    "  movq        (%1), %%mm6         \n\t" /* first 4 changes */
151
    "  punpckhbw   %%mm0, %%mm3        \n\t" /* mm3 = end ref1 as positive 16-bit #s */
152
    "  movq        8(%1), %%mm7        \n\t" /* last 4 changes */
153
    "  punpcklbw   %%mm0, %%mm4        \n\t" /* mm4 = start ref2 as positive 16-bit #s */
154
    "  punpckhbw   %%mm0, %%mm5        \n\t" /* mm5 = end ref2 as positive 16-bit #s */
155
    "  paddw       %%mm4, %%mm2        \n\t" /* mm2 = start (ref1 + ref2) */
156
    "  paddw       %%mm5, %%mm3        \n\t" /* mm3 = end (ref1 + ref2) */
157
    "  psrlw       $1, %%mm2           \n\t" /* mm2 = start (ref1 + ref2)/2 */
158
    "  psrlw       $1, %%mm3           \n\t" /* mm3 = end (ref1 + ref2)/2 */
159
    "  paddw       %%mm6, %%mm2        \n\t" /* add changes to start */
160
    "  paddw       %%mm7, %%mm3        \n\t" /* add changes to end */
161
    "  lea         16(%1), %1          \n\t" /* next row of changes */
162
    "  packuswb    %%mm3, %%mm2        \n\t" /* pack start|end to unsigned 8-bit */
163
    "  add         %4, %2              \n\t" /* next row of reference pixels */
164
    "  add         %4, %3              \n\t" /* next row of reference pixels */
165
    "  movq        %%mm2, (%0)         \n\t" /* store result */
166
    "  add         %4, %0              \n\t" /* next row of output */
167
    "  cmp         %%edi, %1           \n\t" /* are we done? */
168
    "  jc          1b                  \n\t"
169
      : "+r" (ReconPtr)
170
      : "r" (ChangePtr),
171
        "r" (RefPtr1),
172
        "r" (RefPtr2),
173
        "m" (LineStep)
174
      : "memory", "edi"
175
  );
176
}
177
178
/*
 * Install the MMX reconstruction routines of this file into *funcs:
 * the block copy and the intra, inter and half-pel inter reconstructors.
 */
void dsp_i386_mmx_recon_init(DspFunctions *funcs)
{
  funcs->recon_inter8x8_half = &recon_inter8x8_half__mmx;
  funcs->recon_inter8x8      = &recon_inter8x8__mmx;
  funcs->recon_intra8x8      = &recon_intra8x8__mmx;
  funcs->copy8x8             = &copy8x8__mmx;
}
185
(-)libtheora-1.0alpha3/lib/Makefile.am (-1 / +2 lines)
Lines 6-12 Link Here
6
6
7
libtheora_la_SOURCES = encode.c hufftables.h quant_lookup.h \
7
libtheora_la_SOURCES = encode.c hufftables.h quant_lookup.h \
8
	encoder_internal.h idct.c reconstruct.c block_inline.h \
8
	encoder_internal.h idct.c reconstruct.c block_inline.h \
9
	encoder_lookup.h mcomp.c scan.c blockmap.c misc_common.c \
9
	encoder_lookup.h cpu.c dsp.h dsp.c i386/dsp_mmx.c i386/dsp_mmxext.c \
10
	i386/recon_mmx.c i386/fdct_mmx.c mcomp.c scan.c blockmap.c misc_common.c \
10
	dct.c frarray.c pb.c dct_decode.c frinit.c pp.c dct_encode.c \
11
	dct.c frarray.c pb.c dct_decode.c frinit.c pp.c dct_encode.c \
11
	huffman.c pp.h toplevel.c decode.c huffman.h quant.c \
12
	huffman.c pp.h toplevel.c decode.c huffman.h quant.c \
12
	comment.c toplevel_lookup.h mcomp.h
13
	comment.c toplevel_lookup.h mcomp.h
(-)libtheora-1.0alpha3/lib/mcomp.c (-249 / +69 lines)
Lines 17-22 Link Here
17
17
18
#include <stdlib.h>
18
#include <stdlib.h>
19
#include <stdio.h>
19
#include <stdio.h>
20
#include "dsp.h"
20
#include "encoder_internal.h"
21
#include "encoder_internal.h"
21
22
22
/* Initialises motion compentsation. */
23
/* Initialises motion compentsation. */
Lines 100-260 Link Here
100
                          unsigned char * RefDataPtr1,
101
                          unsigned char * RefDataPtr1,
101
                          unsigned char * RefDataPtr2,
102
                          unsigned char * RefDataPtr2,
102
                          ogg_uint32_t PixelsPerLine ) {
103
                          ogg_uint32_t PixelsPerLine ) {
103
  ogg_uint32_t  i;
104
  ogg_int32_t   XSum=0;
105
  ogg_int32_t   XXSum=0;
106
  ogg_int32_t   DiffVal;
104
  ogg_int32_t   DiffVal;
107
  ogg_int32_t   AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2));
105
  ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
106
  ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
108
107
109
  /* Mode of interpolation chosen based upon on the offset of the
108
  /* Mode of interpolation chosen based upon on the offset of the
110
     second reference pointer */
109
     second reference pointer */
111
  if ( AbsRefOffset == 0 ) {
110
  if ( RefOffset == 0 ) {
112
    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
111
    DiffVal = dsp_static_inter8x8_err (NewDataPtr, PixelsPerLine,
113
      DiffVal = ((int)NewDataPtr[0]) - (int)RefDataPtr1[0];
112
		          RefDataPtr1, RefPixelsPerLine);
114
      XSum += DiffVal;
115
116
      /* negative array indexes are strictly forbidden by ANSI C and C99 */
117
      XXSum += DiffVal*DiffVal;
118
119
      DiffVal = ((int)NewDataPtr[1]) - (int)RefDataPtr1[1];
120
      XSum += DiffVal;
121
      XXSum += DiffVal*DiffVal;
122
123
      DiffVal = ((int)NewDataPtr[2]) - (int)RefDataPtr1[2];
124
      XSum += DiffVal;
125
      XXSum += DiffVal*DiffVal;
126
127
      DiffVal = ((int)NewDataPtr[3]) - (int)RefDataPtr1[3];
128
      XSum += DiffVal;
129
      XXSum += DiffVal*DiffVal;
130
131
      DiffVal = ((int)NewDataPtr[4]) - (int)RefDataPtr1[4];
132
      XSum += DiffVal;
133
      XXSum += DiffVal*DiffVal;
134
135
      DiffVal = ((int)NewDataPtr[5]) - (int)RefDataPtr1[5];
136
      XSum += DiffVal;
137
      XXSum += DiffVal*DiffVal;
138
139
      DiffVal = ((int)NewDataPtr[6]) - (int)RefDataPtr1[6];
140
      XSum += DiffVal;
141
      XXSum += DiffVal*DiffVal;
142
143
      DiffVal = ((int)NewDataPtr[7]) - (int)RefDataPtr1[7];
144
      XSum += DiffVal;
145
      XXSum += DiffVal*DiffVal;
146
147
      /* Step to next row of block. */
148
      NewDataPtr += PixelsPerLine;
149
      RefDataPtr1 += STRIDE_EXTRA + PixelsPerLine;
150
    }
151
152
  }else{
113
  }else{
153
114
    DiffVal = dsp_static_inter8x8_err_xy2 (NewDataPtr, PixelsPerLine,
154
    /* Simple two reference interpolation */
115
		          RefDataPtr1, 
155
    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
116
		          RefDataPtr2, RefPixelsPerLine);
156
      DiffVal = ((int)NewDataPtr[0]) -
157
        (((int)RefDataPtr1[0] + (int)RefDataPtr2[0]) / 2);
158
      XSum += DiffVal;
159
      XXSum += DiffVal*DiffVal;
160
161
      DiffVal = ((int)NewDataPtr[1]) -
162
        (((int)RefDataPtr1[1] + (int)RefDataPtr2[1]) / 2);
163
      XSum += DiffVal;
164
      XXSum += DiffVal*DiffVal;
165
166
      DiffVal = ((int)NewDataPtr[2]) -
167
        (((int)RefDataPtr1[2] + (int)RefDataPtr2[2]) / 2);
168
      XSum += DiffVal;
169
      XXSum += DiffVal*DiffVal;
170
171
      DiffVal = ((int)NewDataPtr[3]) -
172
        (((int)RefDataPtr1[3] + (int)RefDataPtr2[3]) / 2);
173
      XSum += DiffVal;
174
      XXSum += DiffVal*DiffVal;
175
176
      DiffVal = ((int)NewDataPtr[4]) -
177
        (((int)RefDataPtr1[4] + (int)RefDataPtr2[4]) / 2);
178
      XSum += DiffVal;
179
      XXSum += DiffVal*DiffVal;
180
181
      DiffVal = ((int)NewDataPtr[5]) -
182
        (((int)RefDataPtr1[5] + (int)RefDataPtr2[5]) / 2);
183
      XSum += DiffVal;
184
      XXSum += DiffVal*DiffVal;
185
186
      DiffVal = ((int)NewDataPtr[6]) -
187
        (((int)RefDataPtr1[6] + (int)RefDataPtr2[6]) / 2);
188
      XSum += DiffVal;
189
      XXSum += DiffVal*DiffVal;
190
191
      DiffVal = ((int)NewDataPtr[7]) -
192
        (((int)RefDataPtr1[7] + (int)RefDataPtr2[7]) / 2);
193
      XSum += DiffVal;
194
      XXSum += DiffVal*DiffVal;
195
196
      /* Step to next row of block. */
197
      NewDataPtr += PixelsPerLine;
198
      RefDataPtr1 += STRIDE_EXTRA+PixelsPerLine;
199
      RefDataPtr2 += STRIDE_EXTRA+PixelsPerLine;
200
    }
201
  }
117
  }
202
118
203
  /* Compute and return population variance as mis-match metric. */
119
  /* Compute and return population variance as mis-match metric. */
204
  return (( (XXSum<<6) - XSum*XSum ));
205
}
206
207
static ogg_uint32_t GetSumAbsDiffs  (unsigned char * NewDataPtr,
208
                              unsigned char  * RefDataPtr,
209
                              ogg_uint32_t PixelsPerLine,
210
                              ogg_uint32_t ErrorSoFar) {
211
  ogg_uint32_t  i;
212
  ogg_uint32_t  DiffVal = ErrorSoFar;
213
214
  /* Decide on standard or MMX implementation */
215
  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
216
    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
217
    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
218
    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
219
    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
220
    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
221
    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
222
    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
223
    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
224
225
    /* Step to next row of block. */
226
    NewDataPtr += PixelsPerLine;
227
    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
228
  }
229
230
  return DiffVal;
231
}
232
233
static ogg_uint32_t GetNextSumAbsDiffs (unsigned char * NewDataPtr,
234
                                 unsigned char * RefDataPtr,
235
                                 ogg_uint32_t PixelsPerLine,
236
                                 ogg_uint32_t ErrorSoFar,
237
                                 ogg_uint32_t BestSoFar ) {
238
  ogg_uint32_t  i;
239
  ogg_uint32_t  DiffVal = ErrorSoFar;
240
241
  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
242
    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
243
    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
244
    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
245
    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
246
    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
247
    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
248
    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
249
    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
250
251
    if ( DiffVal > BestSoFar )break;
252
253
    /* Step to next row of block. */
254
    NewDataPtr += PixelsPerLine;
255
    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
256
  }
257
258
  return DiffVal;
120
  return DiffVal;
259
}
121
}
260
122
Lines 265-382 Link Here
265
                                      ogg_uint32_t ErrorSoFar,
127
                                      ogg_uint32_t ErrorSoFar,
266
                                      ogg_uint32_t BestSoFar ) {
128
                                      ogg_uint32_t BestSoFar ) {
267
129
268
  ogg_uint32_t  i;
269
  ogg_uint32_t  DiffVal = ErrorSoFar;
130
  ogg_uint32_t  DiffVal = ErrorSoFar;
270
  ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
131
  ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
271
  ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
132
  ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
272
133
273
  if ( RefOffset == 0 ) {
134
  if ( RefOffset == 0 ) {
274
    /* Simple case as for non 0.5 pixel */
135
    /* Simple case as for non 0.5 pixel */
275
    DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine,
136
    DiffVal += dsp_static_sad8x8 (SrcData, PixelsPerLine, 
276
                               ErrorSoFar);
137
		               RefDataPtr1, RefPixelsPerLine);
277
  } else  {
138
  } else  {
278
    for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
139
    DiffVal += dsp_static_sad8x8_xy2_thres (SrcData, PixelsPerLine, 
279
      DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] +
140
		               RefDataPtr1, 
280
                                            (int)RefDataPtr2[0]) / 2) );
141
		               RefDataPtr2, RefPixelsPerLine, BestSoFar);
281
      DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] +
282
                                            (int)RefDataPtr2[1]) / 2) );
283
      DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] +
284
                                            (int)RefDataPtr2[2]) / 2) );
285
      DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] +
286
                                            (int)RefDataPtr2[3]) / 2) );
287
      DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] +
288
                                            (int)RefDataPtr2[4]) / 2) );
289
      DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] +
290
                                            (int)RefDataPtr2[5]) / 2) );
291
      DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] +
292
                                            (int)RefDataPtr2[6]) / 2) );
293
      DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] +
294
                                            (int)RefDataPtr2[7]) / 2) );
295
296
      if ( DiffVal > BestSoFar ) break;
297
298
      /* Step to next row of block. */
299
      SrcData += PixelsPerLine;
300
      RefDataPtr1 += RefPixelsPerLine;
301
      RefDataPtr2 += RefPixelsPerLine;
302
    }
303
  }
142
  }
304
143
305
  return DiffVal;
144
  return DiffVal;
306
}
145
}
307
146
308
static ogg_uint32_t GetIntraError (unsigned char * DataPtr,
309
                            ogg_uint32_t PixelsPerLine ) {
310
  ogg_uint32_t  i;
311
  ogg_uint32_t  XSum=0;
312
  ogg_uint32_t  XXSum=0;
313
  unsigned char *DiffPtr;
314
315
  /* Loop expanded out for speed. */
316
  DiffPtr = DataPtr;
317
318
  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
319
320
    /* Examine alternate pixel locations. */
321
    XSum += DiffPtr[0];
322
    XXSum += DiffPtr[0]*DiffPtr[0];
323
    XSum += DiffPtr[1];
324
    XXSum += DiffPtr[1]*DiffPtr[1];
325
    XSum += DiffPtr[2];
326
    XXSum += DiffPtr[2]*DiffPtr[2];
327
    XSum += DiffPtr[3];
328
    XXSum += DiffPtr[3]*DiffPtr[3];
329
    XSum += DiffPtr[4];
330
    XXSum += DiffPtr[4]*DiffPtr[4];
331
    XSum += DiffPtr[5];
332
    XXSum += DiffPtr[5]*DiffPtr[5];
333
    XSum += DiffPtr[6];
334
    XXSum += DiffPtr[6]*DiffPtr[6];
335
    XSum += DiffPtr[7];
336
    XXSum += DiffPtr[7]*DiffPtr[7];
337
338
    /* Step to next row of block. */
339
    DiffPtr += PixelsPerLine;
340
  }
341
342
  /* Compute population variance as mis-match metric. */
343
  return (( (XXSum<<6) - XSum*XSum ) );
344
}
345
346
ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex,
147
ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex,
347
                              ogg_uint32_t PixelsPerLine ) {
148
                              ogg_uint32_t PixelsPerLine ) {
348
  ogg_uint32_t  LocalFragIndex = FragIndex;
149
  ogg_uint32_t  LocalFragIndex = FragIndex;
349
  ogg_uint32_t  IntraError = 0;
150
  ogg_uint32_t  IntraError = 0;
350
151
152
  dsp_static_save_fpu ();
153
351
  /* Add together the intra errors for those blocks in the macro block
154
  /* Add together the intra errors for those blocks in the macro block
352
     that are coded (Y only) */
155
     that are coded (Y only) */
353
  if ( cpi->pb.display_fragments[LocalFragIndex] )
156
  if ( cpi->pb.display_fragments[LocalFragIndex] )
354
    IntraError +=
157
    IntraError +=
355
      GetIntraError(&cpi->
158
      dsp_static_intra8x8_err (&cpi->
356
                    ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
159
                    ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
357
                    PixelsPerLine );
160
                    PixelsPerLine);
358
359
161
360
  LocalFragIndex++;
162
  LocalFragIndex++;
361
  if ( cpi->pb.display_fragments[LocalFragIndex] )
163
  if ( cpi->pb.display_fragments[LocalFragIndex] )
362
    IntraError +=
164
    IntraError +=
363
      GetIntraError(&cpi->
165
      dsp_static_intra8x8_err (&cpi->
364
                    ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
166
                    ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
365
                    PixelsPerLine );
167
                    PixelsPerLine);
366
168
367
  LocalFragIndex = FragIndex + cpi->pb.HFragments;
169
  LocalFragIndex = FragIndex + cpi->pb.HFragments;
368
  if ( cpi->pb.display_fragments[LocalFragIndex] )
170
  if ( cpi->pb.display_fragments[LocalFragIndex] )
369
    IntraError +=
171
    IntraError +=
370
      GetIntraError(&cpi->
172
      dsp_static_intra8x8_err (&cpi->
371
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
173
                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
372
                     PixelsPerLine );
174
                    PixelsPerLine);
373
175
374
  LocalFragIndex++;
176
  LocalFragIndex++;
375
  if ( cpi->pb.display_fragments[LocalFragIndex] )
177
  if ( cpi->pb.display_fragments[LocalFragIndex] )
376
    IntraError +=
178
    IntraError +=
377
      GetIntraError(&cpi->
179
      dsp_static_intra8x8_err (&cpi->
378
                    ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
180
                    ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
379
                    PixelsPerLine );
181
                    PixelsPerLine);
182
183
  dsp_static_restore_fpu ();
380
184
381
  return IntraError;
185
  return IntraError;
382
}
186
}
Lines 400-405 Link Here
400
  unsigned char * SrcPtr1;
204
  unsigned char * SrcPtr1;
401
  unsigned char * RefPtr1;
205
  unsigned char * RefPtr1;
402
206
207
  dsp_static_save_fpu ();
208
403
  /* Work out pixel offset into source buffer. */
209
  /* Work out pixel offset into source buffer. */
404
  PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex];
210
  PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex];
405
211
Lines 462-467 Link Here
462
    InterError += GetInterErr( SrcPtr1, RefPtr1,
268
    InterError += GetInterErr( SrcPtr1, RefPtr1,
463
                                 &RefPtr1[RefPtr2Offset], PixelsPerLine );
269
                                 &RefPtr1[RefPtr2Offset], PixelsPerLine );
464
  }
270
  }
271
272
  dsp_static_restore_fpu ();
273
465
  return InterError;
274
  return InterError;
466
}
275
}
467
276
Lines 496-501 Link Here
496
  unsigned char * RefDataPtr1;
305
  unsigned char * RefDataPtr1;
497
  unsigned char * RefDataPtr2;
306
  unsigned char * RefDataPtr2;
498
307
308
  dsp_static_save_fpu ();
309
499
  /* Note which of the four blocks in the macro block are to be
310
  /* Note which of the four blocks in the macro block are to be
500
     included in the search. */
311
     included in the search. */
501
  MBlockDispFrags[0] =
312
  MBlockDispFrags[0] =
Lines 518-537 Link Here
518
329
519
  /* Check the 0,0 candidate. */
330
  /* Check the 0,0 candidate. */
520
  if ( MBlockDispFrags[0] ) {
331
  if ( MBlockDispFrags[0] ) {
521
    Error = GetSumAbsDiffs( SrcPtr[0], RefPtr,
332
    Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, RefPtr,
522
                         PixelsPerLine, Error);
333
                         PixelsPerLine + STRIDE_EXTRA);
523
  }
334
  }
524
  if ( MBlockDispFrags[1] ) {
335
  if ( MBlockDispFrags[1] ) {
525
    Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8,
336
    Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, RefPtr + 8,
526
                         PixelsPerLine, Error);
337
                         PixelsPerLine + STRIDE_EXTRA);
527
  }
338
  }
528
  if ( MBlockDispFrags[2] ) {
339
  if ( MBlockDispFrags[2] ) {
529
    Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset,
340
    Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset,
530
                         PixelsPerLine, Error);
341
                         PixelsPerLine + STRIDE_EXTRA);
531
  }
342
  }
532
  if ( MBlockDispFrags[3] ) {
343
  if ( MBlockDispFrags[3] ) {
533
    Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8,
344
    Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8,
534
                         PixelsPerLine, Error);
345
                         PixelsPerLine + STRIDE_EXTRA);
535
  }
346
  }
536
347
537
  /* Set starting values to results of 0, 0 vector. */
348
  /* Set starting values to results of 0, 0 vector. */
Lines 554-577 Link Here
554
365
555
      /* Get the score for the current offset */
366
      /* Get the score for the current offset */
556
      if ( MBlockDispFrags[0] ) {
367
      if ( MBlockDispFrags[0] ) {
557
        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
368
        Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
558
                             PixelsPerLine, Error);
369
                             PixelsPerLine + STRIDE_EXTRA);
559
      }
370
      }
560
371
561
      if ( MBlockDispFrags[1] && (Error < MinError) ) {
372
      if ( MBlockDispFrags[1] && (Error < MinError) ) {
562
        Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
373
        Error += dsp_static_sad8x8_thres (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
563
                                 PixelsPerLine, Error, MinError );
374
                             PixelsPerLine + STRIDE_EXTRA, MinError);
564
      }
375
      }
565
376
566
      if ( MBlockDispFrags[2] && (Error < MinError) ) {
377
      if ( MBlockDispFrags[2] && (Error < MinError) ) {
567
        Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
378
        Error += dsp_static_sad8x8_thres (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
568
                                 PixelsPerLine, Error, MinError );
379
                             PixelsPerLine + STRIDE_EXTRA, MinError);
569
      }
380
      }
570
381
571
      if ( MBlockDispFrags[3] && (Error < MinError) ) {
382
      if ( MBlockDispFrags[3] && (Error < MinError) ) {
572
        Error = GetNextSumAbsDiffs( SrcPtr[3],
383
        Error += dsp_static_sad8x8_thres (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
573
                                 CandidateBlockPtr + RefRow2Offset + 8,
384
                             PixelsPerLine + STRIDE_EXTRA, MinError);
574
                                 PixelsPerLine, Error, MinError );
575
      }
385
      }
576
386
577
      if ( Error < MinError ) {
387
      if ( Error < MinError ) {
Lines 652-657 Link Here
652
  InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
462
  InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
653
                                  FragIndex, MV->x, MV->y, PixelsPerLine );
463
                                  FragIndex, MV->x, MV->y, PixelsPerLine );
654
464
465
  dsp_static_restore_fpu ();
466
655
  /* Return score of best matching block. */
467
  /* Return score of best matching block. */
656
  return InterMVError;
468
  return InterMVError;
657
}
469
}
Lines 684-689 Link Here
684
  unsigned char * RefDataPtr1;
496
  unsigned char * RefDataPtr1;
685
  unsigned char * RefDataPtr2;
497
  unsigned char * RefDataPtr2;
686
498
499
  dsp_static_save_fpu ();
500
687
  /* Note which of the four blocks in the macro block are to be
501
  /* Note which of the four blocks in the macro block are to be
688
     included in the search. */
502
     included in the search. */
689
  MBlockDispFrags[0] = cpi->
503
  MBlockDispFrags[0] = cpi->
Lines 717-736 Link Here
717
531
718
      /* Summ errors for each block. */
532
      /* Summ errors for each block. */
719
      if ( MBlockDispFrags[0] ) {
533
      if ( MBlockDispFrags[0] ) {
720
        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
534
        Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
721
                             PixelsPerLine, Error);
535
                             PixelsPerLine + STRIDE_EXTRA);
722
      }
536
      }
723
      if ( MBlockDispFrags[1] ){
537
      if ( MBlockDispFrags[1] ){
724
        Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
538
        Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
725
                             PixelsPerLine, Error);
539
                             PixelsPerLine + STRIDE_EXTRA);
726
      }
540
      }
727
      if ( MBlockDispFrags[2] ){
541
      if ( MBlockDispFrags[2] ){
728
        Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
542
        Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
729
                             PixelsPerLine, Error);
543
                             PixelsPerLine + STRIDE_EXTRA);
730
      }
544
      }
731
      if ( MBlockDispFrags[3] ){
545
      if ( MBlockDispFrags[3] ){
732
        Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8,
546
        Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
733
                             PixelsPerLine, Error);
547
                             PixelsPerLine + STRIDE_EXTRA);
734
      }
548
      }
735
549
736
      /* Was this the best so far */
550
      /* Was this the best so far */
Lines 808-813 Link Here
808
  InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
622
  InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
809
                                  FragIndex, MV->x, MV->y, PixelsPerLine );
623
                                  FragIndex, MV->x, MV->y, PixelsPerLine );
810
624
625
  dsp_static_restore_fpu ();
626
811
  /* Return score of best matching block. */
627
  /* Return score of best matching block. */
812
  return InterMVError;
628
  return InterMVError;
813
}
629
}
Lines 850-857 Link Here
850
666
851
    for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){
667
    for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){
852
      /* Get the block error score. */
668
      /* Get the block error score. */
853
      Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr,
669
      Error = dsp_static_sad8x8 (SrcPtr, PixelsPerLine, CandidateBlockPtr,
854
                           PixelsPerLine, 0);
670
                             PixelsPerLine + STRIDE_EXTRA);
855
671
856
      /* Was this the best so far */
672
      /* Was this the best so far */
857
      if ( Error < MinError ) {
673
      if ( Error < MinError ) {
Lines 911-916 Link Here
911
                                        MOTION_VECTOR *MV ) {
727
                                        MOTION_VECTOR *MV ) {
912
  ogg_uint32_t  InterMVError;
728
  ogg_uint32_t  InterMVError;
913
729
730
  dsp_static_save_fpu ();
731
914
  /* For the moment the 4MV mode is only deemd to be valid if all four
732
  /* For the moment the 4MV mode is only deemd to be valid if all four
915
     Y blocks are to be updated */
733
     Y blocks are to be updated */
916
  /* This May be adapted later. */
734
  /* This May be adapted later. */
Lines 941-946 Link Here
941
    InterMVError = HUGE_ERROR;
759
    InterMVError = HUGE_ERROR;
942
  }
760
  }
943
761
762
  dsp_static_restore_fpu ();
763
944
  /* Return score of best matching block. */
764
  /* Return score of best matching block. */
945
  return InterMVError;
765
  return InterMVError;
946
}
766
}
(-)libtheora-1.0alpha3/lib/pp.c (-4 / +6 lines)
Lines 19-24 Link Here
19
#include <string.h>
19
#include <string.h>
20
#include "encoder_internal.h"
20
#include "encoder_internal.h"
21
#include "pp.h"
21
#include "pp.h"
22
#include "dsp.h"
22
23
23
#define MAX(a, b) ((a>b)?a:b)
24
#define MAX(a, b) ((a>b)?a:b)
24
#define MIN(a, b) ((a<b)?a:b)
25
#define MIN(a, b) ((a<b)?a:b)
Lines 490-496 Link Here
490
491
491
      } else {
492
      } else {
492
493
493
        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
494
        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
494
495
495
      }
496
      }
496
497
Lines 529-535 Link Here
529
        DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
530
        DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
530
                        LineLength,Quality,QuantScale);
531
                        LineLength,Quality,QuantScale);
531
      }else{
532
      }else{
532
        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
533
        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
533
      }
534
      }
534
535
535
      ++Block;
536
      ++Block;
Lines 565-571 Link Here
565
        DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
566
        DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
566
                        LineLength,Quality,QuantScale);
567
                        LineLength,Quality,QuantScale);
567
      }else{
568
      }else{
568
        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
569
        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
569
      }
570
      }
570
571
571
      ++Block;
572
      ++Block;
Lines 913-919 Link Here
913
}
914
}
914
915
915
void PostProcess(PB_INSTANCE *pbi){
916
void PostProcess(PB_INSTANCE *pbi){
916
917
  dsp_static_save_fpu ();
917
  switch (pbi->PostProcessingLevel){
918
  switch (pbi->PostProcessingLevel){
918
  case 8:
919
  case 8:
919
    /* on a slow machine, use a simpler and faster deblocking filter */
920
    /* on a slow machine, use a simpler and faster deblocking filter */
Lines 947-951 Link Here
947
    DeringFrame(pbi, pbi->PostProcessBuffer, pbi->PostProcessBuffer);
948
    DeringFrame(pbi, pbi->PostProcessBuffer, pbi->PostProcessBuffer);
948
    break;
949
    break;
949
  }
950
  }
951
  dsp_static_restore_fpu ();
950
}
952
}
951
953
(-)libtheora-1.0alpha3/lib/reconstruct.c (-16 / +41 lines)
Lines 16-27 Link Here
16
 ********************************************************************/
16
 ********************************************************************/
17
17
18
#include "encoder_internal.h"
18
#include "encoder_internal.h"
19
#include "dsp.h"
20
#include "cpu.h"
19
21
20
void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
22
static void copy8x8__c (unsigned char *src,
21
                 ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
23
	                unsigned char *dest,
24
	                unsigned int stride)
25
{
26
  int j;
27
  for ( j = 0; j < 8; j++ ){
28
    ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];
29
    ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];
30
    src+=stride;
31
    dest+=stride;
32
  }
33
}
34
35
static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
36
		      ogg_uint32_t LineStep)
37
{
22
  ogg_uint32_t i;
38
  ogg_uint32_t i;
23
39
24
  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
40
  for (i = 8; i; i--){
25
    /* Convert the data back to 8 bit unsigned */
41
    /* Convert the data back to 8 bit unsigned */
26
    /* Saturate the output to unsigend 8 bit values */
42
    /* Saturate the output to unsigend 8 bit values */
27
    ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
43
    ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
Lines 34-50 Link Here
34
    ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
50
    ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
35
51
36
    ReconPtr += LineStep;
52
    ReconPtr += LineStep;
37
    ChangePtr += BLOCK_HEIGHT_WIDTH;
53
    ChangePtr += 8;
38
  }
54
  }
39
40
}
55
}
41
56
42
void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
57
static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr,
43
                 unsigned char * RefPtr, ogg_int16_t * ChangePtr,
58
		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
44
                 ogg_uint32_t LineStep ) {
59
{
45
  ogg_uint32_t i;
60
  ogg_uint32_t i;
46
61
47
  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) {
62
  for (i = 8; i; i--){
48
    ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]);
63
    ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]);
49
    ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]);
64
    ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]);
50
    ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]);
65
    ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]);
Lines 54-72 Link Here
54
    ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]);
69
    ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]);
55
    ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]);
70
    ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]);
56
71
57
    ChangePtr += BLOCK_HEIGHT_WIDTH;
72
    ChangePtr += 8;
58
    ReconPtr += LineStep;
73
    ReconPtr += LineStep;
59
    RefPtr += LineStep;
74
    RefPtr += LineStep;
60
  }
75
  }
61
62
}
76
}
63
77
64
void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
78
static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1,
65
                           unsigned char * RefPtr1, unsigned char * RefPtr2,
79
		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
66
                           ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
80
			   ogg_uint32_t LineStep)
81
{
67
  ogg_uint32_t  i;
82
  ogg_uint32_t  i;
68
83
69
  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
84
  for (i = 8; i; i--){
70
    ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] );
85
    ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] );
71
    ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] );
86
    ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] );
72
    ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] );
87
    ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] );
Lines 76-85 Link Here
76
    ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] );
91
    ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] );
77
    ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] );
92
    ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] );
78
93
79
    ChangePtr += BLOCK_HEIGHT_WIDTH;
94
    ChangePtr += 8;
80
    ReconPtr += LineStep;
95
    ReconPtr += LineStep;
81
    RefPtr1 += LineStep;
96
    RefPtr1 += LineStep;
82
    RefPtr2 += LineStep;
97
    RefPtr2 += LineStep;
83
  }
98
  }
99
}
84
100
101
/* Fill in the copy/reconstruction entries of a DSP function table.

   funcs: table to initialise.  The portable C routines are installed
          first; when cpu_flags has CPU_X86_MMX set, the i386 MMX
          initialiser is then run on the same table so it can replace
          those entries. */
void dsp_recon_init (DspFunctions *funcs)
{
  funcs->copy8x8 = copy8x8__c;
  funcs->recon_intra8x8 = recon_intra8x8__c;
  funcs->recon_inter8x8 = recon_inter8x8__c;
  funcs->recon_inter8x8_half = recon_inter8x8_half__c;
  if (cpu_flags & CPU_X86_MMX) {
    /* Pass the caller's table through rather than the global dsp_funcs,
       so the overrides land in the table that was just initialised. */
    dsp_i386_mmx_recon_init(funcs);
  }
}
(-)libtheora-1.0alpha3/lib/scan.c (-85 / +35 lines)
Lines 19-27 Link Here
19
#include <math.h>
19
#include <math.h>
20
#include <string.h>
20
#include <string.h>
21
#include "encoder_internal.h"
21
#include "encoder_internal.h"
22
#include "dsp.h"
22
23
23
#define MAX_SEARCH_LINE_LEN                   7
24
#define MAX_SEARCH_LINE_LEN                   7
24
25
26
/* Fill the 8 bytes starting at ptr with 0x00, 0x01 or 0x08 respectively.
   memset() replaces the former pair of 32-bit stores through a cast
   pointer, so ptr needs no particular alignment and no object is written
   through an incompatible type.  Each macro expands to exactly one
   statement, which makes it safe under an unbraced if/else; invoke it
   with a trailing semicolon. */
#define SET8_0(ptr) do { memset((ptr), 0x00, 8); } while (0)
#define SET8_1(ptr) do { memset((ptr), 0x01, 8); } while (0)
#define SET8_8(ptr) do { memset((ptr), 0x08, 8); } while (0)
35
25
static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
36
static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
26
  0, 0, 0, 0, 2, 4, 12, 24
37
  0, 0, 0, 0, 2, 4, 12, 24
27
};
38
};
Lines 384-452 Link Here
384
  ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4));
395
  ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4));
385
}
396
}
386
397
387
/*
 * Row SAD for one 8-pixel line.
 *
 * The 8 pixels are split into a left half (0..3) and a right half (4..7).
 * The sum of absolute differences between Src1 and Src2 is computed for
 * each half, and the larger of the two sums is returned.
 */
static ogg_uint32_t ScalarRowSAD( unsigned char * Src1,
                                  unsigned char * Src2 ){
  ogg_uint32_t HalfSad[2] = { 0, 0 };
  int Px;

  /* Px >> 2 is 0 for pixels 0..3 and 1 for pixels 4..7. */
  for ( Px = 0; Px < 8; Px++ )
    HalfSad[Px >> 2] += abs( (int)Src1[Px] - (int)Src2[Px] );

  if ( HalfSad[0] > HalfSad[1] )
    return HalfSad[0];
  return HalfSad[1];
}
402
403
/*
 * Column SAD for an 8x8 block.
 *
 * The block is split into an upper band (rows 0..3) and a lower band
 * (rows 4..7).  For each of the 8 columns the absolute differences
 * between Src1 and Src2 are summed separately within each band, and the
 * largest of those 16 sums is returned.
 *
 * ppi:  only ppi->PlaneStride is read; it is the step from one row to
 *       the next in both source buffers.
 * Src1, Src2: top-left pixel of the two blocks being compared.
 */
static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi,
                           unsigned char * Src1,
                           unsigned char * Src2 ){
  ogg_uint32_t BandSad[2][8] = { {0,0,0,0,0,0,0,0}, {0,0,0,0,0,0,0,0} };
  ogg_uint32_t Largest = 0;
  int Band, Row, Col;

  /* Accumulate per-column sums, four rows per band. */
  for ( Band = 0; Band < 2; Band++ ){
    for ( Row = 0; Row < 4; Row++ ){
      for ( Col = 0; Col < 8; Col++ )
        BandSad[Band][Col] += abs( (int)Src1[Col] - (int)Src2[Col] );

      Src1 += ppi->PlaneStride;
      Src2 += ppi->PlaneStride;
    }
  }

  /* Pick the worst column sum across both bands. */
  for ( Col = 0; Col < 8; Col++ ){
    for ( Band = 0; Band < 2; Band++ ){
      if ( BandSad[Band][Col] > Largest )
        Largest = BandSad[Band][Col];
    }
  }

  return Largest;
}
448
449
450
static int RowSadScan( PP_INSTANCE *ppi,
398
static int RowSadScan( PP_INSTANCE *ppi,
451
                       unsigned char * YuvPtr1,
399
                       unsigned char * YuvPtr1,
452
                       unsigned char * YuvPtr2,
400
                       unsigned char * YuvPtr2,
Lines 475-481 Link Here
475
    for ( i = 0; i < ppi->PlaneHFragments; i ++ ){
423
    for ( i = 0; i < ppi->PlaneHFragments; i ++ ){
476
      if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
424
      if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
477
        /* Calculate the SAD score for the block row */
425
        /* Calculate the SAD score for the block row */
478
        GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2);
426
        GrpSad = dsp_static_row_sad8(LocalYuvPtr1,LocalYuvPtr2);
479
427
480
        /* Now test the group SAD score */
428
        /* Now test the group SAD score */
481
        if ( GrpSad > LocalGrpLowSadThresh ){
429
        if ( GrpSad > LocalGrpLowSadThresh ){
Lines 532-538 Link Here
532
    /* Skip if block already marked to be coded. */
480
    /* Skip if block already marked to be coded. */
533
    if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
481
    if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
534
      /* Calculate the SAD score for the block column */
482
      /* Calculate the SAD score for the block column */
535
      MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 );
483
      MaxSad = dsp_static_col_sad8x8(LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride );
536
484
537
      /* Now test the group SAD score */
485
      /* Now test the group SAD score */
538
      if ( MaxSad > LocalGrpLowSadThresh ){
486
      if ( MaxSad > LocalGrpLowSadThresh ){
Lines 758-764 Link Here
758
      if (*DispFragPtr == CANDIDATE_BLOCK){
706
      if (*DispFragPtr == CANDIDATE_BLOCK){
759
707
760
        /* Clear down entries in changed locals array */
708
        /* Clear down entries in changed locals array */
761
        memset(ChLocalsPtr,0,8);
709
        SET8_0(ChLocalsPtr);
762
710
763
        for ( j = 0; j < HFRAGPIXELS; j++ ){
711
        for ( j = 0; j < HFRAGPIXELS; j++ ){
764
          /* Take a local copy of the measured difference. */
712
          /* Take a local copy of the measured difference. */
Lines 777-786 Link Here
777
      }else{
725
      }else{
778
        /* If we are breaking out here mark all pixels as changed. */
726
        /* If we are breaking out here mark all pixels as changed. */
779
        if ( *DispFragPtr > BLOCK_NOT_CODED ){
727
        if ( *DispFragPtr > BLOCK_NOT_CODED ){
780
          memset(bits_map_ptr,1,8);
728
          SET8_1(bits_map_ptr);
781
          memset(ChLocalsPtr,8,8);
729
          SET8_8(ChLocalsPtr);
782
        }else{
730
        }else{
783
          memset(ChLocalsPtr,0,8);
731
          SET8_0(ChLocalsPtr);
784
        }
732
        }
785
      }
733
      }
786
734
Lines 816-822 Link Here
816
    /* Test for break out conditions to save time. */
764
    /* Test for break out conditions to save time. */
817
    if (*DispFragPtr == CANDIDATE_BLOCK){
765
    if (*DispFragPtr == CANDIDATE_BLOCK){
818
      /* Clear down entries in changed locals array */
766
      /* Clear down entries in changed locals array */
819
      memset(ChLocalsPtr,0,8);
767
      SET8_0(ChLocalsPtr);
820
768
821
      for ( j = 0; j < HFRAGPIXELS; j++ ){
769
      for ( j = 0; j < HFRAGPIXELS; j++ ){
822
        /* Take a local copy of the measured difference. */
770
        /* Take a local copy of the measured difference. */
Lines 839-848 Link Here
839
    }else{
787
    }else{
840
      /* If we are breaking out here mark all pixels as changed. */
788
      /* If we are breaking out here mark all pixels as changed. */
841
      if ( *DispFragPtr > BLOCK_NOT_CODED ){
789
      if ( *DispFragPtr > BLOCK_NOT_CODED ){
842
        memset(bits_map_ptr,1,8);
790
        SET8_1(bits_map_ptr);
843
        memset(ChLocalsPtr,8,8);
791
        SET8_8(ChLocalsPtr);
844
      }else{
792
      }else{
845
        memset(ChLocalsPtr,0,8);
793
        SET8_0(ChLocalsPtr);
846
      }
794
      }
847
    }
795
    }
848
796
Lines 876-882 Link Here
876
      /* Test for break out conditions to save time. */
824
      /* Test for break out conditions to save time. */
877
      if (*DispFragPtr == CANDIDATE_BLOCK){
825
      if (*DispFragPtr == CANDIDATE_BLOCK){
878
        /* Clear down entries in changed locals array */
826
        /* Clear down entries in changed locals array */
879
        memset(ChLocalsPtr,0,8);
827
        SET8_0(ChLocalsPtr);
880
        for ( j = 0; j < HFRAGPIXELS; j++ ){
828
        for ( j = 0; j < HFRAGPIXELS; j++ ){
881
          /* Take a local copy of the measured difference. */
829
          /* Take a local copy of the measured difference. */
882
          Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
830
          Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
Lines 899-908 Link Here
899
      }else{
847
      }else{
900
        /* If we are breaking out here mark all pixels as changed. */
848
        /* If we are breaking out here mark all pixels as changed. */
901
        if ( *DispFragPtr > BLOCK_NOT_CODED ){
849
        if ( *DispFragPtr > BLOCK_NOT_CODED ){
902
          memset(bits_map_ptr,1,8);
850
          SET8_1(bits_map_ptr);
903
          memset(ChLocalsPtr,8,8);
851
          SET8_8(ChLocalsPtr);
904
        }else{
852
        }else{
905
          memset(ChLocalsPtr,0,8);
853
          SET8_0(ChLocalsPtr);
906
        }
854
        }
907
      }
855
      }
908
856
Lines 935-941 Link Here
935
    /* Test for break out conditions to save time. */
883
    /* Test for break out conditions to save time. */
936
    if (*DispFragPtr == CANDIDATE_BLOCK){
884
    if (*DispFragPtr == CANDIDATE_BLOCK){
937
      /* Clear down entries in changed locals array */
885
      /* Clear down entries in changed locals array */
938
      memset(ChLocalsPtr,0,8);
886
      SET8_0(ChLocalsPtr);
939
887
940
      for ( j = 0; j < HFRAGPIXELS; j++ ){
888
      for ( j = 0; j < HFRAGPIXELS; j++ ){
941
        /* Take a local copy of the measured difference. */
889
        /* Take a local copy of the measured difference. */
Lines 959-968 Link Here
959
    }else{
907
    }else{
960
      /* If we are breaking out here mark all pixels as changed.*/
908
      /* If we are breaking out here mark all pixels as changed.*/
961
      if ( *DispFragPtr > BLOCK_NOT_CODED ) {
909
      if ( *DispFragPtr > BLOCK_NOT_CODED ) {
962
          memset(bits_map_ptr,1,8);
910
          SET8_1(bits_map_ptr);
963
          memset(ChLocalsPtr,8,8);
911
          SET8_8(ChLocalsPtr);
964
        }else{
912
        }else{
965
          memset(ChLocalsPtr,0,8);
913
          SET8_0(ChLocalsPtr);
966
        }
914
        }
967
    }
915
    }
968
    /* If we have a lot of changed pixels for this fragment on this
916
    /* If we have a lot of changed pixels for this fragment on this
Lines 1071-1077 Link Here
1071
        }
1019
        }
1072
      }else{
1020
      }else{
1073
        if ( *DispFragPtr > BLOCK_NOT_CODED )
1021
        if ( *DispFragPtr > BLOCK_NOT_CODED )
1074
          memset(ChLocalsPtr,0,8);
1022
          SET8_0(ChLocalsPtr);
1075
1023
1076
        /* Step pointers */
1024
        /* Step pointers */
1077
        ChLocalsPtr += HFRAGPIXELS;
1025
        ChLocalsPtr += HFRAGPIXELS;
Lines 1133-1139 Link Here
1133
        }
1081
        }
1134
      }else{
1082
      }else{
1135
        if ( *DispFragPtr > BLOCK_NOT_CODED )
1083
        if ( *DispFragPtr > BLOCK_NOT_CODED )
1136
          memset(ChLocalsPtr,0,8);
1084
          SET8_0(ChLocalsPtr);
1137
1085
1138
        /* Step pointers */
1086
        /* Step pointers */
1139
        ChLocalsPtr += HFRAGPIXELS;
1087
        ChLocalsPtr += HFRAGPIXELS;
Lines 2126-2135 Link Here
2126
    /* Fast break out test for obvious yes and no cases in this row of
2074
    /* Fast break out test for obvious yes and no cases in this row of
2127
       blocks */
2075
       blocks */
2128
    if ( i < ppi->PlaneVFragments ){
2076
    if ( i < ppi->PlaneVFragments ){
2077
      dsp_static_save_fpu ();
2129
      UpdatedOrCandidateBlocks =
2078
      UpdatedOrCandidateBlocks =
2130
        RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
2079
        RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
2131
      if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) )
2080
      UpdatedOrCandidateBlocks |=
2132
        UpdatedOrCandidateBlocks = 1;
2081
        ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
2082
      dsp_static_restore_fpu ();
2133
    }else{
2083
    }else{
2134
      /* Make sure we still call other functions if RowSadScan() disabled */
2084
      /* Make sure we still call other functions if RowSadScan() disabled */
2135
      UpdatedOrCandidateBlocks = 1;
2085
      UpdatedOrCandidateBlocks = 1;
(-)libtheora-1.0alpha3/lib/toplevel.c (+4 lines)
Lines 787-792 Link Here
787
787
788
  CP_INSTANCE *cpi;
788
  CP_INSTANCE *cpi;
789
789
790
  dsp_static_init ();
791
790
  memset(th, 0, sizeof(*th));
792
  memset(th, 0, sizeof(*th));
791
  th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
793
  th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
792
794
Lines 1446-1451 Link Here
1446
  PB_INSTANCE *pbi;
1448
  PB_INSTANCE *pbi;
1447
  codec_setup_info *ci;
1449
  codec_setup_info *ci;
1448
1450
1451
  dsp_static_init ();
1452
1449
  ci=(codec_setup_info *)c->codec_setup;
1453
  ci=(codec_setup_info *)c->codec_setup;
1450
  th->internal_decode=pbi=_ogg_calloc(1,sizeof(*pbi));
1454
  th->internal_decode=pbi=_ogg_calloc(1,sizeof(*pbi));
1451
1455

Return to bug 68549