Attachment 42417 Details for Bug 68549 – libtheora mmx patch

[patch] libtheora mmx patch

libtheora-1.0alpha3-mmx.patch (text/plain), 141.58 KB, created by Zaheer Abbas Merali (RETIRED) on 2004-10-22 07:33:11 UTC

(hide)

Description:

Filename:

MIME Type:

Creator: Zaheer Abbas Merali (RETIRED)

Created: 2004-10-22 07:33:11 UTC

Size: 141.58 KB

patch

obsolete

>diff -Naur libtheora-1.0alpha3/lib/blockmap.c libtheora-1.0alpha3.mmx/lib/blockmap.c
>--- libtheora-1.0alpha3/lib/blockmap.c	2003-12-03 09:59:39.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/blockmap.c	2004-10-06 17:48:22.202433112 +0200
>@@ -21,7 +21,7 @@
>                             ogg_uint32_t FirstSB,
>                             ogg_uint32_t FirstFrag, ogg_uint32_t HFrags,
>                             ogg_uint32_t VFrags ){
>-  ogg_uint32_t i, j;
>+  ogg_uint32_t i, j = 0;
>   ogg_uint32_t xpos;
>   ogg_uint32_t ypos;
>   ogg_uint32_t SBrow, SBcol;
>diff -Naur libtheora-1.0alpha3/lib/cpu.c libtheora-1.0alpha3.mmx/lib/cpu.c
>--- libtheora-1.0alpha3/lib/cpu.c	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/cpu.c	2004-10-06 17:48:22.203432960 +0200
>@@ -0,0 +1,107 @@
>+/********************************************************************
>+ *                                                                  *
>+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
>+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
>+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
>+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
>+ *                                                                  *
>+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
>+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
>+ *                                                                  *
>+ ********************************************************************
>+
>+  function:
>+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
>+
>+ ********************************************************************/
>+
>+#include "cpu.h"
>+
>+ogg_uint32_t cpu_flags = 0;
>+
>+#if 1
>+static ogg_uint32_t cpu_get_flags (void)
>+{
>+  ogg_uint32_t eax, ebx, ecx, edx;
>+  ogg_uint32_t flags = 0;     /* stays 0 (no extensions) for unrecognised vendors */
>+  /* Probe the CPU with CPUID and return a mask of CPU_X86_* feature bits. */
>+#define cpuid(op,eax,ebx,ecx,edx)      \
>+  asm volatile ("pushl %%ebx   \n\t"   \
>+                "cpuid         \n\t"   \
>+                "movl %%ebx,%1 \n\t"   \
>+                "popl %%ebx"           \
>+              : "=a" (eax),            \
>+                "=r" (ebx),            \
>+                "=c" (ecx),            \
>+                "=d" (edx)             \
>+              : "a" (op)               \
>+              : "cc")
>+
>+  asm volatile ("pushfl              \n\t"
>+                "pushfl              \n\t"
>+                "popl %0             \n\t"
>+                "movl %0,%1          \n\t"
>+                "xorl $0x200000,%0   \n\t"
>+                "pushl %0            \n\t"
>+                "popfl               \n\t"
>+                "pushfl              \n\t"
>+                "popl %0             \n\t"
>+                "popfl"
>+              : "=r" (eax),
>+                "=r" (ebx)
>+              :
>+              : "cc");
>+  /* ebx = original EFLAGS, eax = EFLAGS read back after flipping bit 0x200000 */
>+  if (eax == ebx)             /* bit could not be toggled: no cpuid */
>+    return 0;
>+
>+  cpuid(0, eax, ebx, ecx, edx);
>+
>+  if (ebx == 0x756e6547 &&    /* "Genu" */
>+      edx == 0x49656e69 &&    /* "ineI" */
>+      ecx == 0x6c65746e) {    /* "ntel" */
>+    /* intel */
>+
>+  inteltest:
>+    cpuid(1, eax, ebx, ecx, edx);
>+    if ((edx & 0x00800000) == 0)    /* bit 23 clear: no MMX, report nothing */
>+      return 0;
>+    flags = CPU_X86_MMX;
>+    if (edx & 0x02000000)           /* bit 25 */
>+      flags |= CPU_X86_MMXEXT | CPU_X86_SSE;
>+    if (edx & 0x04000000)           /* bit 26 */
>+      flags |= CPU_X86_SSE2;
>+    return flags;
>+  } else if (ebx == 0x68747541 &&   /* "Auth" */
>+             edx == 0x69746e65 &&   /* "enti" */
>+             ecx == 0x444d4163) {   /* "cAMD" */
>+    /* AMD */
>+    cpuid(0x80000000, eax, ebx, ecx, edx);
>+    if ((unsigned)eax < 0x80000001) /* no extended leaf: use the standard test */
>+      goto inteltest;
>+    cpuid(0x80000001, eax, ebx, ecx, edx);
>+    if ((edx & 0x00800000) == 0)    /* bit 23 clear: no MMX, report nothing */
>+      return 0;
>+    flags = CPU_X86_MMX;
>+    if (edx & 0x80000000)           /* bit 31 */
>+      flags |= CPU_X86_3DNOW;
>+    if (edx & 0x00400000)           /* bit 22 */
>+      flags |= CPU_X86_MMXEXT;
>+    return flags;
>+  }
>+  else {
>+    /* other vendors: not implemented, fall through with flags == 0 */
>+  }
>+
>+  return flags;
>+}
>+#else
>+static ogg_uint32_t cpu_get_flags (void) {
>+  return 0;
>+}
>+#endif
>+
>+void cpu_init (void)
>+{
>+  cpu_flags = cpu_get_flags();   /* cache the detected feature mask in the global cpu_flags */
>+}
>diff -Naur libtheora-1.0alpha3/lib/cpu.h libtheora-1.0alpha3.mmx/lib/cpu.h
>--- libtheora-1.0alpha3/lib/cpu.h	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/cpu.h	2004-10-06 17:48:22.243426880 +0200
>@@ -0,0 +1,28 @@
>+/********************************************************************
>+ *                                                                  *
>+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
>+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
>+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
>+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
>+ *                                                                  *
>+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
>+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
>+ *                                                                  *
>+ ********************************************************************
>+
>+  function:
>+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
>+
>+ ********************************************************************/
>+
>+#include "encoder_internal.h"
>+
>+extern ogg_uint32_t cpu_flags;
>+
>+#define CPU_X86_MMX	(1<<0)
>+#define CPU_X86_3DNOW	(1<<1)
>+#define CPU_X86_MMXEXT	(1<<2)
>+#define CPU_X86_SSE	(1<<3)
>+#define CPU_X86_SSE2	(1<<4)
>+
>+void cpu_init () ;
>diff -Naur libtheora-1.0alpha3/lib/dct.c libtheora-1.0alpha3.mmx/lib/dct.c
>--- libtheora-1.0alpha3/lib/dct.c	2003-12-03 09:59:39.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/dct.c	2004-10-06 17:48:22.244426728 +0200
>@@ -16,6 +16,7 @@
>  ********************************************************************/
> 
> #include "encoder_internal.h"
>+#include "cpu.h"
> 
> static ogg_int32_t xC1S7 = 64277;
> static ogg_int32_t xC2S6 = 60547;
>@@ -28,7 +29,7 @@
> #define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31)
> #define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) )
> 
>-void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
>+static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){
>   int loop;
> 
>   ogg_int32_t  is07, is12, is34, is56;
>@@ -251,3 +252,12 @@
>     op ++;
>   }
> }
>+
>+void dsp_dct_init (DspFunctions *funcs)
>+{
>+  funcs->fdct_short = fdct_short__c;   /* install the portable C forward DCT first */
>+  if (cpu_flags & CPU_X86_MMX) {       /* MMX detected: run the MMX init on the same table */
>+    dsp_i386_mmx_fdct_init(funcs);     /* the caller's table, not the global dsp_funcs */
>+  }
>+}
>+
>diff -Naur libtheora-1.0alpha3/lib/dct_decode.c libtheora-1.0alpha3.mmx/lib/dct_decode.c
>--- libtheora-1.0alpha3/lib/dct_decode.c	2004-03-18 18:10:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/dct_decode.c	2004-10-06 17:48:22.284420648 +0200
>@@ -18,6 +18,7 @@
> #include <stdlib.h>
> #include <string.h>
> #include "encoder_internal.h"
>+#include "dsp.h"
> 
> 
> #define GOLDEN_FRAME_THRESH_Q   50
>@@ -112,22 +113,6 @@
>   SetupBoundingValueArray_Generic(pbi, FLimit);
> }
> 
>-void CopyBlock(unsigned char *src,
>-               unsigned char *dest,
>-               unsigned int srcstride){
>-  unsigned char *s = src;
>-  unsigned char *d = dest;
>-  unsigned int stride = srcstride;
>-
>-  int j;
>-  for ( j = 0; j < 8; j++ ){
>-    ((ogg_uint32_t*)d)[0] = ((ogg_uint32_t*)s)[0];
>-    ((ogg_uint32_t*)d)[1] = ((ogg_uint32_t*)s)[1];
>-    s+=stride;
>-    d+=stride;
>-  }
>-}
>-
> static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){
>   ogg_uint32_t ReconPixelsPerLine;
>   ogg_int32_t     ReconPixelIndex;
>@@ -160,8 +145,8 @@
>   ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber];
> 
>   /* Get the pixel index for the first pixel in the fragment. */
>-  ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
>-              (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine );
>+  dsp_static_recon_intra8x8 ((unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]),
>+              (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine);
> 
> }
> 
>@@ -237,10 +222,9 @@
>     /* Reconstruct the pixel data using the last frame reconstruction
>        and change data when the motion vector is (0,0), the recon is
>        based on the lastframe without loop filtering---- for testing */
>-    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
>+    dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
>                 &pbi->LastFrameRecon[ReconPixelIndex],
>-                pbi->ReconDataBuffer, ReconPixelsPerLine );
>-
>+                  pbi->ReconDataBuffer, ReconPixelsPerLine);
>   }else if ( ModeUsesMC[pbi->CodingMode] ) {
>     /* The mode uses a motion vector. */
>     /* Get vector from list */
>@@ -287,29 +271,30 @@
>     if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) {
>       /* Reconstruct the pixel dats from the reference frame and change data
>          (no half pixel in this case as the two references were the same. */
>-      ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
>+      dsp_static_recon_inter8x8 (
>+		  &pbi->ThisFrameRecon[ReconPixelIndex],
>                   LastFrameRecPtr, pbi->ReconDataBuffer,
>-                  ReconPixelsPerLine );
>+                  ReconPixelsPerLine);
>     }else{
>       /* Fractional pixel reconstruction. */
>       /* Note that we only use two pixels per reconstruction even for
>          the diagonal. */
>-      ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex],
>+      dsp_static_recon_inter8x8_half(&pbi->ThisFrameRecon[ReconPixelIndex],
>                             LastFrameRecPtr, LastFrameRecPtr2,
>-                            pbi->ReconDataBuffer, ReconPixelsPerLine );
>+                            pbi->ReconDataBuffer, ReconPixelsPerLine);
>     }
>   } else if ( pbi->CodingMode == CODE_USING_GOLDEN ){
>     /* Golden frame with motion vector */
>     /* Reconstruct the pixel data using the golden frame
>        reconstruction and change data */
>-    ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
>+    dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
>                 &pbi->GoldenFrame[ ReconPixelIndex ],
>-                pbi->ReconDataBuffer, ReconPixelsPerLine );
>+                  pbi->ReconDataBuffer, ReconPixelsPerLine);
>   } else {
>     /* Simple Intra coding */
>     /* Get the pixel index for the first pixel in the fragment. */
>-    ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex],
>-                pbi->ReconDataBuffer, ReconPixelsPerLine );
>+    dsp_static_recon_intra8x8 (&pbi->ThisFrameRecon[ReconPixelIndex],
>+              pbi->ReconDataBuffer, ReconPixelsPerLine);
>   }
> }
> 
>@@ -464,7 +449,7 @@
>       SrcPtr = &SrcReconPtr[ PixelIndex ];
>       DestPtr = &DestReconPtr[ PixelIndex ];
> 
>-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
>+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
>     }
>   }
> 
>@@ -476,7 +461,7 @@
>       SrcPtr = &SrcReconPtr[ PixelIndex ];
>       DestPtr = &DestReconPtr[ PixelIndex ];
> 
>-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
>+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
> 
>     }
>   }
>@@ -505,7 +490,7 @@
>       SrcPtr = &SrcReconPtr[ PixelIndex ];
>       DestPtr = &DestReconPtr[ PixelIndex ];
> 
>-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
>+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
>     }
>   }
> 
>@@ -517,7 +502,7 @@
>       SrcPtr = &SrcReconPtr[ PixelIndex ];
>       DestPtr = &DestReconPtr[ PixelIndex ];
> 
>-      CopyBlock(SrcPtr, DestPtr, PlaneLineStep);
>+      dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep);
> 
>     }
>   }
>diff -Naur libtheora-1.0alpha3/lib/dct_encode.c libtheora-1.0alpha3.mmx/lib/dct_encode.c
>--- libtheora-1.0alpha3/lib/dct_encode.c	2003-06-10 03:31:33.000000000 +0200
>+++ libtheora-1.0alpha3.mmx/lib/dct_encode.c	2004-10-06 17:48:22.285420496 +0200
>@@ -17,110 +17,10 @@
> 
> #include <stdlib.h>
> #include "encoder_internal.h"
>+#include "dsp.h"
> 
> static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 };
> 
>-static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr,
>-                  ogg_int16_t *DctInputPtr, unsigned char *old_ptr1,
>-                  unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine,
>-                  ogg_uint32_t ReconPixelsPerLine ) {
>-  int i;
>-
>-  /* For each block row */
>-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
>-    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - ((int)ReconPtr[0]) );
>-    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - ((int)ReconPtr[1]) );
>-    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - ((int)ReconPtr[2]) );
>-    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - ((int)ReconPtr[3]) );
>-    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - ((int)ReconPtr[4]) );
>-    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - ((int)ReconPtr[5]) );
>-    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - ((int)ReconPtr[6]) );
>-    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - ((int)ReconPtr[7]) );
>-
>-    /* Update the screen canvas in one step*/
>-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
>-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
>-
>-    /* Start next row */
>-    new_ptr1 += PixelsPerLine;
>-    old_ptr1 += PixelsPerLine;
>-    FiltPtr += PixelsPerLine;
>-    ReconPtr += ReconPixelsPerLine;
>-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
>-  }
>-}
>-
>-static void Sub8_128 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
>-                      unsigned char *old_ptr1, unsigned char *new_ptr1,
>-                      ogg_uint32_t PixelsPerLine ) {
>-  int i;
>-  /* For each block row */
>-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){
>-    /* INTRA mode so code raw image data */
>-    /* We convert the data to 8 bit signed (by subtracting 128) as
>-       this reduces the internal precision requirments in the DCT
>-       transform. */
>-    DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - 128);
>-    DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - 128);
>-    DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - 128);
>-    DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - 128);
>-    DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - 128);
>-    DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - 128);
>-    DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - 128);
>-    DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - 128);
>-
>-    /* Update the screen canvas in one step */
>-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
>-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
>-
>-    /* Start next row */
>-    new_ptr1 += PixelsPerLine;
>-    old_ptr1 += PixelsPerLine;
>-    FiltPtr += PixelsPerLine;
>-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
>-  }
>-}
>-
>-static void Sub8Av2 (unsigned char *FiltPtr, unsigned char *ReconPtr1,
>-                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
>-                     unsigned char *old_ptr1, unsigned char *new_ptr1,
>-                     ogg_uint32_t PixelsPerLine,
>-                     ogg_uint32_t ReconPixelsPerLine ) {
>-  int i;
>-
>-  /* For each block row */
>-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
>-    DctInputPtr[0] = (ogg_int16_t)
>-      ((int)(FiltPtr[0]) - (((int)ReconPtr1[0] + (int)ReconPtr2[0]) / 2) );
>-    DctInputPtr[1] = (ogg_int16_t)
>-      ((int)(FiltPtr[1]) - (((int)ReconPtr1[1] + (int)ReconPtr2[1]) / 2) );
>-    DctInputPtr[2] = (ogg_int16_t)
>-      ((int)(FiltPtr[2]) - (((int)ReconPtr1[2] + (int)ReconPtr2[2]) / 2) );
>-    DctInputPtr[3] = (ogg_int16_t)
>-      ((int)(FiltPtr[3]) - (((int)ReconPtr1[3] + (int)ReconPtr2[3]) / 2) );
>-    DctInputPtr[4] = (ogg_int16_t)
>-      ((int)(FiltPtr[4]) - (((int)ReconPtr1[4] + (int)ReconPtr2[4]) / 2) );
>-    DctInputPtr[5] = (ogg_int16_t)
>-      ((int)(FiltPtr[5]) - (((int)ReconPtr1[5] + (int)ReconPtr2[5]) / 2) );
>-    DctInputPtr[6] = (ogg_int16_t)
>-      ((int)(FiltPtr[6]) - (((int)ReconPtr1[6] + (int)ReconPtr2[6]) / 2) );
>-    DctInputPtr[7] = (ogg_int16_t)
>-      ((int)(FiltPtr[7]) - (((int)ReconPtr1[7] + (int)ReconPtr2[7]) / 2) );
>-
>-    /* Update the screen canvas in one step */
>-    ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0];
>-    ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1];
>-
>-    /* Start next row */
>-    new_ptr1 += PixelsPerLine;
>-    old_ptr1 += PixelsPerLine;
>-    FiltPtr += PixelsPerLine;
>-    ReconPtr1 += ReconPixelsPerLine;
>-    ReconPtr2 += ReconPixelsPerLine;
>-    DctInputPtr += BLOCK_HEIGHT_WIDTH;
>-  }
>-}
>-
> static unsigned char TokenizeDctValue (ogg_int16_t DataValue,
>                                        ogg_uint32_t * TokenListPtr ){
>   unsigned char tokens_added = 0;
>@@ -452,13 +352,15 @@
> 
>   /* Is the MV offset exactly pixel alligned */
>   if ( AbsRefOffset == 0 ){
>-    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
>-               PixelsPerLine, ReconPixelsPerLine );
>+    dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
>+               PixelsPerLine, ReconPixelsPerLine);
>+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
>   } else {
>     /* Fractional pixel MVs. */
>     /* Note that we only use two pixel values even for the diagonal */
>-    Sub8Av2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, old_ptr1,
>-                 new_ptr1, PixelsPerLine, ReconPixelsPerLine );
>+    dsp_static_sub8x8avg2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr,
>+                 PixelsPerLine, ReconPixelsPerLine);
>+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
>   }
> }
> 
>@@ -534,17 +436,18 @@
>         pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]];
>     }
> 
>-    Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1,
>-               PixelsPerLine, ReconPixelsPerLine );
>+    dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr,
>+               PixelsPerLine, ReconPixelsPerLine);
>+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
>   } else if ( cpi->pb.CodingMode==CODE_INTRA ) {
>-    Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine);
>-
>+    dsp_static_sub8x8_128(FiltPtr, DctInputPtr, PixelsPerLine);
>+    dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine);
>   }
> 
>   /* Proceed to encode the data into the encode buffer if the encoder
>      is enabled. */
>   /* Perform a 2D DCT transform on the data. */
>-  fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
>+  dsp_static_fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes );
> 
>   /* Quantize that transform data. */
>   quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] );
>diff -Naur libtheora-1.0alpha3/lib/decode.c libtheora-1.0alpha3.mmx/lib/decode.c
>--- libtheora-1.0alpha3/lib/decode.c	2003-12-06 19:06:20.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/decode.c	2004-10-06 17:48:22.324414568 +0200
>@@ -796,6 +796,8 @@
>   /* Make a not of the number of coded blocks this frame */
>   pbi->CodedBlocksThisFrame = pbi->CodedBlockIndex;
> 
>+  dsp_static_save_fpu();
>+
>   /* Decode the modes data */
>   DecodeModes( pbi, pbi->YSBRows, pbi->YSBCols);
> 
>@@ -808,6 +810,7 @@
>   /* Reconstruct and display the frame */
>   ReconRefFrames(pbi);
> 
>+  dsp_static_restore_fpu();
> }
> 
> 
>diff -Naur libtheora-1.0alpha3/lib/dsp.c libtheora-1.0alpha3.mmx/lib/dsp.c
>--- libtheora-1.0alpha3/lib/dsp.c	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/dsp.c	2004-10-06 17:48:22.363408640 +0200
>@@ -0,0 +1,416 @@
>+/********************************************************************
>+ *                                                                  *
>+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
>+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
>+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
>+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
>+ *                                                                  *
>+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
>+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
>+ *                                                                  *
>+ ********************************************************************
>+
>+  function:
>+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
>+
>+ ********************************************************************/
>+
>+#include <stdlib.h>
>+#include "cpu.h"
>+#include "encoder_internal.h"
>+
>+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
>+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
>+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
>+
>+DspFunctions dsp_funcs;
>+
>+static void sub8x8__c (unsigned char *FiltPtr, unsigned char *ReconPtr,
>+                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
>+                  ogg_uint32_t ReconPixelsPerLine) {
>+  int i;
>+
>+  /* For each block row */
>+  for (i=8; i; i--) {
>+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
>+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
>+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
>+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
>+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
>+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
>+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
>+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
>+
>+    /* Start next row */
>+    FiltPtr += PixelsPerLine;
>+    ReconPtr += ReconPixelsPerLine;
>+    DctInputPtr += 8;
>+  }
>+}
>+
>+static void sub8x8_128__c (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
>+                      ogg_uint32_t PixelsPerLine) {
>+  int i;
>+  /* Write (pixel - 128) for an 8x8 block into DctInputPtr; one iteration per block row */
>+  for (i=8; i; i--) {
>+    /* INTRA mode so code raw image data */
>+    /* We convert the data to 8 bit signed (by subtracting 128) as
>+       this reduces the internal precision requirements in the DCT
>+       transform. */
>+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
>+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
>+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
>+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
>+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
>+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
>+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
>+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
>+
>+    /* Start next row: source advances by its stride, output is packed 8 per row */
>+    FiltPtr += PixelsPerLine;
>+    DctInputPtr += 8;
>+  }
>+}
>+
>+static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1,
>+                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
>+                     ogg_uint32_t PixelsPerLine,
>+                     ogg_uint32_t ReconPixelsPerLine) 
>+{
>+  int i;
>+
>+  /* For each block row */
>+  for (i=8; i; i--) {
>+    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0]));
>+    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1]));
>+    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2]));
>+    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3]));
>+    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4]));
>+    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5]));
>+    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6]));
>+    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7]));
>+
>+    /* Start next row */
>+    FiltPtr += PixelsPerLine;
>+    ReconPtr1 += ReconPixelsPerLine;
>+    ReconPtr2 += ReconPixelsPerLine;
>+    DctInputPtr += 8;
>+  }
>+}
>+
>+static ogg_uint32_t row_sad8__c (unsigned char *Src1, unsigned char *Src2)
>+{
>+  ogg_uint32_t SadValue;
>+  ogg_uint32_t SadValue1;
>+
>+  SadValue    = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + 
>+	        DSP_OP_ABS_DIFF (Src1[1], Src2[1]) +
>+	        DSP_OP_ABS_DIFF (Src1[2], Src2[2]) +
>+	        DSP_OP_ABS_DIFF (Src1[3], Src2[3]);
>+
>+  SadValue1   = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + 
>+	        DSP_OP_ABS_DIFF (Src1[5], Src2[5]) +
>+	        DSP_OP_ABS_DIFF (Src1[6], Src2[6]) +
>+	        DSP_OP_ABS_DIFF (Src1[7], Src2[7]);
>+
>+  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
>+
>+  return SadValue;
>+}
>+
>+static ogg_uint32_t col_sad8x8__c (unsigned char *Src1, unsigned char *Src2,
>+		                    ogg_uint32_t stride)
>+{
>+  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
>+  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
>+  ogg_uint32_t MaxSad = 0;
>+  ogg_uint32_t i;
>+
>+  for ( i = 0; i < 4; i++ ){
>+    SadValue[0] += abs(Src1[0] - Src2[0]);
>+    SadValue[1] += abs(Src1[1] - Src2[1]);
>+    SadValue[2] += abs(Src1[2] - Src2[2]);
>+    SadValue[3] += abs(Src1[3] - Src2[3]);
>+    SadValue[4] += abs(Src1[4] - Src2[4]);
>+    SadValue[5] += abs(Src1[5] - Src2[5]);
>+    SadValue[6] += abs(Src1[6] - Src2[6]);
>+    SadValue[7] += abs(Src1[7] - Src2[7]);
>+    
>+    Src1 += stride;
>+    Src2 += stride;
>+  }
>+
>+  for ( i = 0; i < 4; i++ ){
>+    SadValue2[0] += abs(Src1[0] - Src2[0]);
>+    SadValue2[1] += abs(Src1[1] - Src2[1]);
>+    SadValue2[2] += abs(Src1[2] - Src2[2]);
>+    SadValue2[3] += abs(Src1[3] - Src2[3]);
>+    SadValue2[4] += abs(Src1[4] - Src2[4]);
>+    SadValue2[5] += abs(Src1[5] - Src2[5]);
>+    SadValue2[6] += abs(Src1[6] - Src2[6]);
>+    SadValue2[7] += abs(Src1[7] - Src2[7]);
>+    
>+    Src1 += stride;
>+    Src2 += stride;
>+  }
>+    
>+  for ( i = 0; i < 8; i++ ){
>+    if ( SadValue[i] > MaxSad )
>+      MaxSad = SadValue[i];
>+    if ( SadValue2[i] > MaxSad )
>+      MaxSad = SadValue2[i];
>+  }
>+    
>+  return MaxSad;
>+}
>+
>+static ogg_uint32_t sad8x8__c (unsigned char *ptr1, ogg_uint32_t stride1,
>+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
>+{
>+  ogg_uint32_t  i;
>+  ogg_uint32_t  sad = 0;
>+
>+  for (i=8; i; i--) {
>+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
>+
>+    /* Step to next row of block. */
>+    ptr1 += stride1;
>+    ptr2 += stride2;
>+  }
>+
>+  return sad;
>+}
>+
>+static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1,
>+		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
>+			   	  ogg_uint32_t thres)
>+{
>+  ogg_uint32_t  i;
>+  ogg_uint32_t  sad = 0;
>+
>+  for (i=8; i; i--) {
>+    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
>+    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
>+
>+    if (sad > thres )
>+      break;
>+
>+    /* Step to next row of block. */
>+    ptr1 += stride1;
>+    ptr2 += stride2;
>+  }
>+
>+  return sad;
>+}
>+
>+static ogg_uint32_t sad8x8_xy2_thres__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                      unsigned char *RefDataPtr1,
>+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
>+			              ogg_uint32_t thres)
>+{
>+  ogg_uint32_t  i;
>+  ogg_uint32_t  sad = 0;
>+
>+  for (i=8; i; i--) {
>+    sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
>+    sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
>+    sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
>+    sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
>+    sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
>+    sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
>+    sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
>+    sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
>+
>+    if ( sad > thres )
>+      break;
>+
>+    /* Step to next row of block. */
>+    SrcData += SrcStride;
>+    RefDataPtr1 += RefStride;
>+    RefDataPtr2 += RefStride;
>+  }
>+
>+  return sad;
>+}
>+
>+static ogg_uint32_t intra8x8_err__c (unsigned char *DataPtr, ogg_uint32_t Stride)
>+{
>+  ogg_uint32_t  i;
>+  ogg_uint32_t  XSum=0;
>+  ogg_uint32_t  XXSum=0;
>+
>+  for (i=8; i; i--) {
>+     /* Accumulate the sum and the sum of squares of all eight pixels in this row. */
>+     XSum += DataPtr[0];
>+     XXSum += DataPtr[0]*DataPtr[0];
>+     XSum += DataPtr[1];
>+     XXSum += DataPtr[1]*DataPtr[1];
>+     XSum += DataPtr[2];
>+     XXSum += DataPtr[2]*DataPtr[2];
>+     XSum += DataPtr[3];
>+     XXSum += DataPtr[3]*DataPtr[3];
>+     XSum += DataPtr[4];
>+     XXSum += DataPtr[4]*DataPtr[4];
>+     XSum += DataPtr[5];
>+     XXSum += DataPtr[5]*DataPtr[5];
>+     XSum += DataPtr[6];
>+     XXSum += DataPtr[6]*DataPtr[6];
>+     XSum += DataPtr[7];
>+     XXSum += DataPtr[7]*DataPtr[7];
>+
>+     /* Step to next row of block. */
>+     DataPtr += Stride;
>+   }
>+
>+   /* Return 64*sum(x^2) - sum(x)^2: the 64-pixel population variance scaled by 64*64. */
>+   return (( (XXSum<<6) - XSum*XSum ) );
>+}
>+
>+static ogg_uint32_t inter8x8_err__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
>+{
>+  ogg_uint32_t  i;
>+  ogg_uint32_t  XSum=0;
>+  ogg_uint32_t  XXSum=0;
>+  ogg_int32_t   DiffVal;
>+
>+  for (i=8; i; i--) {
>+    DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+        
>+    DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+        
>+    DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+        
>+    DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+        
>+    DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]);
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+        
>+    /* Step to next row of block. */
>+    SrcData += SrcStride;
>+    RefDataPtr += RefStride;
>+  }
>+
>+  /* Compute and return population variance as mis-match metric. */
>+  return (( (XXSum<<6) - XSum*XSum ));
>+}
>+
>+static ogg_uint32_t inter8x8_err_xy2__c (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                     unsigned char *RefDataPtr1,
>+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
>+{ /* Portable C version: same metric as inter8x8_err__c, but each source pixel is compared against DSP_OP_AVG of the two reference pixels. */
>+  ogg_uint32_t  i;
>+  ogg_uint32_t  XSum=0;  /* sum of the differences; negative values wrap modulo 2^32 */
>+  ogg_uint32_t  XXSum=0; /* sum of the squared differences */
>+  ogg_int32_t   DiffVal;
>+
>+  for (i=8; i; i--) { /* one pass per row; the 8 pixels of the row are unrolled below */
>+    DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); /* both macros are defined outside this function; presumably a - b and (a + b) / 2 -- TODO confirm */
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
>+    XSum += DiffVal;
>+    XXSum += DiffVal*DiffVal;
>+
>+    /* Step to next row of block; both reference pointers advance by the same RefStride. */
>+    SrcData += SrcStride;
>+    RefDataPtr1 += RefStride;
>+    RefDataPtr2 += RefStride;
>+  }
>+
>+  /* Return 64*XXSum - XSum*XSum (unsigned, modulo 2^32): the population variance of the 64 differences scaled by 64*64, used as mis-match metric. */
>+  return (( (XXSum<<6) - XSum*XSum ));
>+}
>+
>+static void nop (void) { /* Intentionally empty: installed by dsp_init as the default save_fpu/restore_fpu hook. */ }
>+
>+void dsp_init(DspFunctions *funcs)
>+{ /* Point the entries listed below at the portable C implementations; the FPU hooks default to no-ops. */
>+  funcs->save_fpu = nop;
>+  funcs->restore_fpu = nop;
>+  funcs->sub8x8 = sub8x8__c;
>+  funcs->sub8x8_128 = sub8x8_128__c;
>+  funcs->sub8x8avg2 = sub8x8avg2__c;
>+  funcs->row_sad8 = row_sad8__c;
>+  funcs->col_sad8x8 = col_sad8x8__c;
>+  funcs->sad8x8 = sad8x8__c;
>+  funcs->sad8x8_thres = sad8x8_thres__c;
>+  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c;
>+  funcs->intra8x8_err = intra8x8_err__c;
>+  funcs->inter8x8_err = inter8x8_err__c;
>+  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__c; /* NOTE(review): copy8x8, recon_* and fdct_short are not assigned here; presumably dsp_recon_init/dsp_dct_init do it -- confirm */
>+}
>+
>+void dsp_static_init(void)
>+{ /* Initialise the global dsp_funcs table: generic initialisers first, then CPU-specific ones selected by cpu_flags. */
>+  cpu_init (); /* presumably fills in cpu_flags (defined outside this block) -- TODO confirm */
>+  dsp_init (&dsp_funcs);
>+  dsp_recon_init (&dsp_funcs);
>+  dsp_dct_init (&dsp_funcs);
>+  if (cpu_flags & CPU_X86_MMX) { /* NOTE(review): no build-time x86 guard is visible around these calls -- confirm non-x86 builds still link */
>+    dsp_i386_mmx_init(&dsp_funcs);
>+  }
>+  if (cpu_flags & CPU_X86_MMXEXT) { /* runs after the MMX initialiser, so any entries it assigns take precedence */
>+    dsp_i386_mmxext_init(&dsp_funcs);
>+  }
>+}
>+
>diff -Naur libtheora-1.0alpha3/lib/dsp.h libtheora-1.0alpha3.mmx/lib/dsp.h
>--- libtheora-1.0alpha3/lib/dsp.h	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/dsp.h	2004-10-06 17:48:22.364408488 +0200
>@@ -0,0 +1,154 @@
>+/********************************************************************
>+ *                                                                  *
>+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
>+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
>+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
>+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
>+ *                                                                  *
>+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
>+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
>+ *                                                                  *
>+ ********************************************************************
>+
>+  function: table of DSP function pointers and the macros that call through it
>+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
>+
>+ ********************************************************************/
>+
>+#ifndef DSP_H
>+#define DSP_H
>+
>+#include <theora/theora.h>
>+
>+typedef struct
>+{ /* Table of function pointers for the 8x8 block primitives; filled in at run time by the dsp_*_init functions. */
>+  void   (*save_fpu)            (void);
>+  void   (*restore_fpu)         (void);
>+  /* sub8x8: 16-bit differences FiltPtr - ReconPtr for an 8x8 block, written to DctInputPtr. */
>+  void   (*sub8x8)  		(unsigned char *FiltPtr, unsigned char *ReconPtr,
>+	                   	 ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
>+				 ogg_uint32_t ReconPixelsPerLine);
>+  /* sub8x8_128: as sub8x8, but the constant 128 is subtracted instead of a reference block. */
>+  void   (*sub8x8_128) 		(unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
>+			         ogg_uint32_t PixelsPerLine);
>+  /* sub8x8avg2: as sub8x8, but the average of ReconPtr1 and ReconPtr2 is subtracted. */
>+  void   (*sub8x8avg2) 		(unsigned char *FiltPtr, unsigned char *ReconPtr1,
>+		                 unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
>+			         ogg_uint32_t PixelsPerLine,
>+			         ogg_uint32_t ReconPixelsPerLine); 
>+  /* NOTE(review): copy8x8, recon_* and fdct_short are not assigned by dsp_init; presumably dsp_recon_init/dsp_dct_init set them -- confirm */
>+  void   (*copy8x8)  		(unsigned char *src, unsigned char *dest, 
>+		                 ogg_uint32_t stride);
>+
>+  void   (*recon_intra8x8)  	(unsigned char *ReconPtr, ogg_int16_t *ChangePtr, 
>+		                 ogg_uint32_t LineStep);
>+
>+  void   (*recon_inter8x8)  	(unsigned char *ReconPtr, unsigned char *RefPtr, 
>+		                 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep);
>+
>+  void   (*recon_inter8x8_half)	(unsigned char *ReconPtr, unsigned char *RefPtr1, 
>+		  		 unsigned char *RefPtr2, ogg_int16_t *ChangePtr, 
>+				 ogg_uint32_t LineStep);
>+
>+  void   (*fdct_short)          (ogg_int16_t *InputData, ogg_int16_t *OutputData);
>+  /* row_sad8: the MMX version returns the larger of the two 4-pixel SADs of one 8-pixel row. */
>+  ogg_uint32_t (*row_sad8)	(unsigned char *Src1, unsigned char *Src2);
>+  /* col_sad8x8: the MMX version returns the largest per-column SAD, taken over rows 0-3 and rows 4-7 separately. */
>+  ogg_uint32_t (*col_sad8x8)	(unsigned char *Src1, unsigned char *Src2,
>+		  		 ogg_uint32_t stride);
>+  /* sad8x8: sum of absolute differences between two 8x8 blocks, each with its own stride. */
>+  ogg_uint32_t (*sad8x8)	(unsigned char *ptr1, ogg_uint32_t stride1,
>+		        	 unsigned char *ptr2, ogg_uint32_t stride2);
>+  /* sad8x8_thres: as sad8x8 with an extra thres argument; the MMX version ignores thres. */
>+  ogg_uint32_t (*sad8x8_thres)	(unsigned char *ptr1, ogg_uint32_t stride1,
>+		       		 unsigned char *ptr2, ogg_uint32_t stride2, 
>+				 ogg_uint32_t thres);
>+  /* sad8x8_xy2_thres: SAD of the source block against the average of two reference blocks. */
>+  ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                 unsigned char *RefDataPtr1,
>+			         unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
>+				 ogg_uint32_t thres);
>+  /* intra8x8_err: 64*sum(x*x) - sum(x)*sum(x) over the 64 pixels of a block. */
>+  ogg_uint32_t (*intra8x8_err)	(unsigned char *DataPtr, ogg_uint32_t Stride);
>+  /* inter8x8_err: the same metric computed on source-minus-reference differences. */
>+  ogg_uint32_t (*inter8x8_err)	(unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride);
>+  /* inter8x8_err_xy2: as inter8x8_err, with the reference being the average of two blocks. */
>+  ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                 unsigned char *RefDataPtr1,
>+			         unsigned char *RefDataPtr2, ogg_uint32_t RefStride);
>+} DspFunctions;
>+
>+extern DspFunctions dsp_funcs;
>+
>+extern void dsp_recon_init (DspFunctions *funcs);
>+
>+void dsp_init(DspFunctions *funcs);
>+void dsp_static_init(void);
>+
>+#define dsp_save_fpu(funcs) ((funcs).save_fpu ()) /* dsp_X(funcs,...) calls entry X of a DspFunctions object; dsp_static_X(...) does the same on the global dsp_funcs */
>+#define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs)
>+/* The funcs argument is parenthesized in every expansion so that expressions such as *ptr can be passed safely. */
>+#define dsp_restore_fpu(funcs) ((funcs).restore_fpu ())
>+#define dsp_static_restore_fpu() dsp_restore_fpu(dsp_funcs)
>+
>+#define dsp_sub8x8(funcs,a1,a2,a3,a4,a5) ((funcs).sub8x8 (a1,a2,a3,a4,a5))
>+#define dsp_static_sub8x8(a1,a2,a3,a4,a5) dsp_sub8x8(dsp_funcs,a1,a2,a3,a4,a5)
>+
>+#define dsp_sub8x8_128(funcs,a1,a2,a3) ((funcs).sub8x8_128 (a1,a2,a3))
>+#define dsp_static_sub8x8_128(a1,a2,a3) dsp_sub8x8_128(dsp_funcs,a1,a2,a3)
>+
>+#define dsp_sub8x8avg2(funcs,a1,a2,a3,a4,a5,a6) ((funcs).sub8x8avg2 (a1,a2,a3,a4,a5,a6))
>+#define dsp_static_sub8x8avg2(a1,a2,a3,a4,a5,a6) dsp_sub8x8avg2(dsp_funcs,a1,a2,a3,a4,a5,a6)
>+
>+#define dsp_copy8x8(funcs,ptr1,ptr2,str1) ((funcs).copy8x8 (ptr1,ptr2,str1))
>+#define dsp_static_copy8x8(ptr1,ptr2,str1) dsp_copy8x8(dsp_funcs,ptr1,ptr2,str1)
>+
>+#define dsp_recon_intra8x8(funcs,ptr1,ptr2,str1) ((funcs).recon_intra8x8 (ptr1,ptr2,str1))
>+#define dsp_static_recon_intra8x8(ptr1,ptr2,str1) dsp_recon_intra8x8(dsp_funcs,ptr1,ptr2,str1)
>+
>+#define dsp_recon_inter8x8(funcs,ptr1,ptr2,ptr3,str1) \
>+	((funcs).recon_inter8x8 (ptr1,ptr2,ptr3,str1))
>+#define dsp_static_recon_inter8x8(ptr1,ptr2,ptr3,str1) \
>+	dsp_recon_inter8x8(dsp_funcs,ptr1,ptr2,ptr3,str1)
>+
>+#define dsp_recon_inter8x8_half(funcs,ptr1,ptr2,ptr3,ptr4,str1) \
>+	((funcs).recon_inter8x8_half (ptr1,ptr2,ptr3,ptr4,str1))
>+#define dsp_static_recon_inter8x8_half(ptr1,ptr2,ptr3,ptr4,str1) \
>+	dsp_recon_inter8x8_half(dsp_funcs,ptr1,ptr2,ptr3,ptr4,str1)
>+
>+#define dsp_fdct_short(funcs,in,out) ((funcs).fdct_short (in,out))
>+#define dsp_static_fdct_short(in,out) dsp_fdct_short(dsp_funcs,in,out)
>+
>+#define dsp_row_sad8(funcs,ptr1,ptr2) ((funcs).row_sad8 (ptr1,ptr2))
>+#define dsp_static_row_sad8(ptr1,ptr2) dsp_row_sad8(dsp_funcs,ptr1,ptr2)
>+
>+#define dsp_col_sad8x8(funcs,ptr1,ptr2,str1) ((funcs).col_sad8x8 (ptr1,ptr2,str1))
>+#define dsp_static_col_sad8x8(ptr1,ptr2,str1) dsp_col_sad8x8(dsp_funcs,ptr1,ptr2,str1)
>+
>+#define dsp_sad8x8(funcs,ptr1,str1,ptr2,str2) ((funcs).sad8x8 (ptr1,str1,ptr2,str2))
>+#define dsp_static_sad8x8(ptr1,str1,ptr2,str2) dsp_sad8x8(dsp_funcs,ptr1,str1,ptr2,str2)
>+
>+#define dsp_sad8x8_thres(funcs,ptr1,str1,ptr2,str2,t) ((funcs).sad8x8_thres (ptr1,str1,ptr2,str2,t))
>+#define dsp_static_sad8x8_thres(ptr1,str1,ptr2,str2,t) dsp_sad8x8_thres(dsp_funcs,ptr1,str1,ptr2,str2,t)
>+
>+#define dsp_sad8x8_xy2_thres(funcs,ptr1,str1,ptr2,ptr3,str2,t) \
>+	((funcs).sad8x8_xy2_thres (ptr1,str1,ptr2,ptr3,str2,t))
>+#define dsp_static_sad8x8_xy2_thres(ptr1,str1,ptr2,ptr3,str2,t) \
>+	dsp_sad8x8_xy2_thres(dsp_funcs,ptr1,str1,ptr2,ptr3,str2,t)
>+
>+#define dsp_intra8x8_err(funcs,ptr1,str1) ((funcs).intra8x8_err (ptr1,str1))
>+#define dsp_static_intra8x8_err(ptr1,str1) dsp_intra8x8_err(dsp_funcs,ptr1,str1)
>+
>+#define dsp_inter8x8_err(funcs,ptr1,str1,ptr2,str2) \
>+	((funcs).inter8x8_err (ptr1,str1,ptr2,str2))
>+#define dsp_static_inter8x8_err(ptr1,str1,ptr2,str2) \
>+	dsp_inter8x8_err(dsp_funcs,ptr1,str1,ptr2,str2)
>+
>+#define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \
>+	((funcs).inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2))
>+#define dsp_static_inter8x8_err_xy2(ptr1,str1,ptr2,ptr3,str2) \
>+	dsp_inter8x8_err_xy2(dsp_funcs,ptr1,str1,ptr2,ptr3,str2)
>+
>+
>+#endif /* DSP_H */
>diff -Naur libtheora-1.0alpha3/lib/encode.c libtheora-1.0alpha3.mmx/lib/encode.c
>--- libtheora-1.0alpha3/lib/encode.c	2004-03-18 15:25:25.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/encode.c	2004-10-06 17:48:22.401402864 +0200
>@@ -531,8 +531,7 @@
> 
> static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi,
>                                      ogg_int32_t BlockIndex ) {
>-  ogg_uint32_t  i;
>-  ogg_uint32_t  ErrorVal = 0;
>+  ogg_uint32_t  ErrorVal;
> 
>   unsigned char * SrcDataPtr =
>     &cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]];
>@@ -550,21 +549,8 @@
>     RecStride = cpi->pb.UVStride;
>   }
> 
>+  ErrorVal = dsp_static_sad8x8 (SrcDataPtr, SrcStride, RecDataPtr, RecStride);
> 
>-  /* Decide on standard or MMX implementation */
>-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
>-    ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) );
>-    ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) );
>-    ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) );
>-    ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) );
>-    ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) );
>-    ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) );
>-    ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) );
>-    ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) );
>-    /* Step to next row of block. */
>-    SrcDataPtr += SrcStride;
>-    RecDataPtr += RecStride;
>-  }
>   return ErrorVal;
> }
> 
>@@ -933,9 +919,13 @@
>     /* Zero Decoder EOB run count */
>     cpi->pb.EOB_Run = 0;
> 
>+    dsp_static_save_fpu ();
>+
>     /* Encode any fragments coded using DCT. */
>     coded_pixels += QuadCodeDisplayFragments (cpi);
> 
>+    dsp_static_restore_fpu ();
>+
>     return coded_pixels;
> 
> }
>diff -Naur libtheora-1.0alpha3/lib/encoder_internal.h libtheora-1.0alpha3.mmx/lib/encoder_internal.h
>--- libtheora-1.0alpha3/lib/encoder_internal.h	2004-03-09 03:02:56.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/encoder_internal.h	2004-10-06 17:48:22.436397544 +0200
>@@ -24,6 +24,7 @@
> 
> #include <theora/theora.h>
> #include "huffman.h"
>+#include "dsp.h"
> 
> #ifndef LIBOGG2
> #define theora_read(x,y,z) ( *z = oggpackB_read(x,y) )
>@@ -689,23 +690,9 @@
>                    ogg_int16_t *QuantMatrix,
>                    ogg_int16_t * OutputData );
> 
>-extern void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
>-                        ogg_int16_t * ChangePtr, ogg_uint32_t LineStep );
>-
>-extern void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
>-                        unsigned char * RefPtr, ogg_int16_t * ChangePtr,
>-                        ogg_uint32_t LineStep ) ;
>-
>-extern void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
>-                                  unsigned char * RefPtr1,
>-                                  unsigned char * RefPtr2,
>-                                  ogg_int16_t * ChangePtr,
>-                                  ogg_uint32_t LineStep ) ;
>+extern void dsp_recon_init (DspFunctions *funcs);
> 
> extern void SetupLoopFilter(PB_INSTANCE *pbi);
>-extern void CopyBlock(unsigned char *src,
>-                      unsigned char *dest,
>-                      unsigned int srcstride);
> extern void LoopFilter(PB_INSTANCE *pbi);
> extern void ReconRefFrames (PB_INSTANCE *pbi);
> extern void ExpandToken( Q_LIST_ENTRY * ExpandedBlock,
>diff -Naur libtheora-1.0alpha3/lib/i386/dsp_mmx.c libtheora-1.0alpha3.mmx/lib/i386/dsp_mmx.c
>--- libtheora-1.0alpha3/lib/i386/dsp_mmx.c	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/i386/dsp_mmx.c	2004-10-06 17:48:22.472392072 +0200
>@@ -0,0 +1,642 @@
>+/********************************************************************
>+ *                                                                  *
>+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
>+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
>+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
>+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
>+ *                                                                  *
>+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
>+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
>+ *                                                                  *
>+ ********************************************************************
>+
>+  function: MMX (i386 inline assembly) versions of the DSP block primitives
>+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
>+
>+ ********************************************************************/
>+
>+#include <stdlib.h>
>+#include "dsp.h"
>+
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128w = 0x0080008000800080LL; /* four 16-bit words of 128; referenced by name from the asm in sub8x8_128__mmx */
>+/* M(sym) spells the assembler-level name of a C symbol: a leading underscore is added on the targets tested below. */
>+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
>+    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
>+# define M(a) "_" #a
>+#else
>+# define M(a) #a
>+#endif
>+/* Scalar helpers: truncating average, signed difference and absolute difference, all computed on int. */
>+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
>+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
>+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
>+
>+static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
>+                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
>+                  ogg_uint32_t ReconPixelsPerLine) 
>+{ /* MMX: writes FiltPtr - ReconPtr as 16-bit words to DctInputPtr, 8 rows of 8 pixels; no emms is issued here. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm7, %%mm7     \n\t" 
>+    /* mm7 stays zero and is the high half for the byte-to-word unpacks; the body below is emitted 8 times, once per row */
>+    ".rept 8                        \n\t"
>+    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
>+    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr */
>+    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
>+    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
>+    /* convert from UINT8 to INT16 */
>+    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), pixels 0-3 */
>+    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr), pixels 0-3 */
>+    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), pixels 4-7 */
>+    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr), pixels 4-7 */
>+    /* start calculation */
>+    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */
>+    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */
>+    "  movq        %%mm0,  (%2)     \n\t" /* write answer out */
>+    "  movq        %%mm2, 8(%2)     \n\t" /* write answer out */
>+    /* Increment pointers: the output advances by the 16 bytes just written, the inputs by their strides */
>+    "  add         $16, %2           \n\t"
>+    "  add         %3, %0           \n\t"
>+    "  add         %4, %1           \n\t"
>+    ".endr                          \n\t"
>+
>+     : "+r" (FiltPtr),
>+       "+r" (ReconPtr),
>+       "+r" (DctInputPtr)
>+     : "m" (PixelsPerLine), /* the strides are memory operands, read directly by the add instructions */
>+       "m" (ReconPixelsPerLine) 
>+     : "memory"
>+  );
>+}
>+
>+static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
>+                      ogg_uint32_t PixelsPerLine) 
>+{ /* MMX: writes FiltPtr - 128 as 16-bit words to DctInputPtr, 8 rows of 8 pixels; no emms is issued here. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm7, %%mm7     \n\t" 
>+    "  movq      "M(V128w)", %%mm1  \n\t" /* mm1 = four words of 128, loaded by symbol name */
>+    /* the body below is emitted 8 times, once per row */
>+    ".rept 8                        \n\t"
>+    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
>+    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
>+    /* convert from UINT8 to INT16 */
>+    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), pixels 0-3 */
>+    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), pixels 4-7 */
>+    /* start calculation */
>+    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */
>+    "  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */
>+    "  movq        %%mm0,  (%1)     \n\t" /* write answer out */
>+    "  movq        %%mm2, 8(%1)     \n\t" /* write answer out */
>+    /* Increment pointers: the output advances by the 16 bytes just written, the input by its stride */
>+    "  add         $16, %1           \n\t"
>+    "  add         %2, %0           \n\t"
>+    ".endr                          \n\t"
>+
>+     : "+r" (FiltPtr),
>+       "+r" (DctInputPtr)
>+     : "r" (PixelsPerLine)
>+     : "memory"
>+  );
>+}
>+
>+static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
>+                     unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
>+                     ogg_uint32_t PixelsPerLine,
>+                     ogg_uint32_t ReconPixelsPerLine) 
>+{ /* MMX: writes FiltPtr - ((ReconPtr1 + ReconPtr2) >> 1) as 16-bit words to DctInputPtr, 8 rows of 8 pixels; no emms is issued here. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm7, %%mm7     \n\t" 
>+    /* mm7 stays zero and is the high half for the byte-to-word unpacks; the body below is emitted 8 times, once per row */
>+    ".rept 8                        \n\t"
>+    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
>+    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr1 */
>+    "  movq        (%2), %%mm4      \n\t" /* mm4 = ReconPtr2 */
>+    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */
>+    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */
>+    "  movq        %%mm4, %%mm5     \n\t" /* dup to prepare for up conversion */
>+    /* convert from UINT8 to INT16 */
>+    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr), pixels 0-3 */
>+    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr1), pixels 0-3 */
>+    "  punpcklbw   %%mm7, %%mm4     \n\t" /* mm4 = INT16(ReconPtr2), pixels 0-3 */
>+    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr), pixels 4-7 */
>+    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr1), pixels 4-7 */
>+    "  punpckhbw   %%mm7, %%mm5     \n\t" /* mm5 = INT16(ReconPtr2), pixels 4-7 */
>+    /* average ReconPtr1 and ReconPtr2 */
>+    "  paddw       %%mm4, %%mm1     \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
>+    "  paddw       %%mm5, %%mm3     \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
>+    "  psrlw       $1, %%mm1        \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
>+    "  psrlw       $1, %%mm3        \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
>+    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
>+    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
>+    "  movq        %%mm0,  (%3)     \n\t" /* write answer out */
>+    "  movq        %%mm2, 8(%3)     \n\t" /* write answer out */
>+    /* Increment pointers: both reference pointers advance by the same stride (%5) */
>+    "  add         $16, %3           \n\t"
>+    "  add         %4, %0           \n\t"
>+    "  add         %5, %1           \n\t"
>+    "  add         %5, %2           \n\t"
>+    ".endr                          \n\t"
>+
>+     : "+r" (FiltPtr),
>+       "+r" (ReconPtr1),
>+       "+r" (ReconPtr2),
>+       "+r" (DctInputPtr)
>+     : "m" (PixelsPerLine), /* the strides are memory operands, read directly by the add instructions */
>+       "m" (ReconPixelsPerLine) 
>+     : "memory"
>+  );
>+}
>+
>+static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
>+{ /* MMX: for one row of 8 pixels, returns the larger of the SAD of pixels 0-3 and the SAD of pixels 4-7. */
>+  ogg_uint32_t MaxSad;
>+
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
>+    "  pxor        %%mm7, %%mm7     \n\t" 	/* zero out mm7 for unpack */
>+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        (%2), %%mm1      \n\t"
>+
>+    "  movq        %%mm0, %%mm2     \n\t"
>+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B (unsigned saturating, so 0 where B > A) */
>+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A (unsigned saturating, so 0 where A > B) */
>+    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
>+
>+    "  movq        %%mm0, %%mm1     \n\t"
>+
>+    "  punpcklbw   %%mm6, %%mm0     \n\t"       /* ; unpack low four bytes to higher precision */
>+    "  punpckhbw   %%mm7, %%mm1     \n\t"       /* ; unpack high four bytes to higher precision */
>+
>+    "  movq        %%mm0, %%mm2     \n\t"
>+    "  movq        %%mm1, %%mm3     \n\t"
>+    "  psrlq       $32, %%mm2       \n\t"	/* fold and add */
>+    "  psrlq       $32, %%mm3       \n\t"
>+    "  paddw       %%mm2, %%mm0     \n\t"
>+    "  paddw       %%mm3, %%mm1     \n\t"
>+    "  movq        %%mm0, %%mm2     \n\t"
>+    "  movq        %%mm1, %%mm3     \n\t"
>+    "  psrlq       $16, %%mm2       \n\t"
>+    "  psrlq       $16, %%mm3       \n\t"
>+    "  paddw       %%mm2, %%mm0     \n\t"	/* low word of mm0 = sum of the four low abs differences */
>+    "  paddw       %%mm3, %%mm1     \n\t"	/* low word of mm1 = sum of the four high abs differences */
>+
>+    "  psubusw     %%mm0, %%mm1     \n\t"
>+    "  paddw       %%mm0, %%mm1     \n\t" 	/* mm1 = max(mm1, mm0) */
>+    "  movd        %%mm1, %0        \n\t"
>+    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the result */
>+
>+     : "=m" (MaxSad),
>+       "+r" (Src1), 
>+       "+r" (Src2) 
>+     :
>+     : "memory"
>+  );
>+  return MaxSad;
>+}
>+
>+static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
>+		                    ogg_uint32_t stride)
>+{ /* MMX: accumulates per-column SADs over rows 0-3 and over rows 4-7 separately and returns the largest of those 16 sums. */
>+  ogg_uint32_t MaxSad;
>+
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
>+    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4: sums for columns 0-3, rows 0-3 */
>+    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5: sums for columns 4-7, rows 0-3 */
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6: sums for columns 0-3, rows 4-7 */
>+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7: sums for columns 4-7, rows 4-7 */
>+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
>+    "1:                             \n\t"
>+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
>+
>+    "  movq        %%mm0, %%mm2     \n\t"
>+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
>+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
>+    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
>+    "  movq        %%mm0, %%mm1     \n\t"
>+
>+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
>+    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
>+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
>+    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
>+    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
>+    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 1b                       \n\t"
>+
>+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
>+    "2:                             \n\t"
>+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
>+
>+    "  movq        %%mm0, %%mm2     \n\t"
>+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
>+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
>+    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
>+    "  movq        %%mm0, %%mm1     \n\t"
>+
>+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
>+    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
>+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
>+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
>+    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
>+    "  add         %3, %2           \n\t"	/* Inc pointer into the new data */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 2b                       \n\t"
>+
>+    "  psubusw     %%mm6, %%mm7     \n\t"
>+    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm6) */
>+    "  psubusw     %%mm4, %%mm5     \n\t" 	
>+    "  paddw       %%mm4, %%mm5     \n\t" 	/* mm5 = max(mm5, mm4) */
>+    "  psubusw     %%mm5, %%mm7     \n\t" 	
>+    "  paddw       %%mm5, %%mm7     \n\t" 	/* mm7 = max(mm5, mm7) */
>+    "  movq        %%mm7, %%mm6     \n\t"
>+    "  psrlq       $32, %%mm6       \n\t"
>+    "  psubusw     %%mm6, %%mm7     \n\t" 	
>+    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm7 >> 32): words 0-1 now cover all four words */
>+    "  movq        %%mm7, %%mm6     \n\t"
>+    "  psrlq       $16, %%mm6       \n\t"
>+    "  psubusw     %%mm6, %%mm7     \n\t" 	
>+    "  paddw       %%mm6, %%mm7     \n\t" 	/* mm7 = max(mm7, mm7 >> 16): word 0 now holds the overall maximum */
>+    "  movd        %%mm7, %0        \n\t"
>+    "  andl        $0xffff, %0      \n\t"	/* keep only word 0 */
>+
>+     : "=r" (MaxSad),
>+       "+r" (Src1), 
>+       "+r" (Src2) 
>+     : "r" (stride)
>+     : "memory", "edi"
>+  );
>+
>+  return MaxSad;
>+}
>+
>+static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
>+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
>+{ /* MMX: returns the sum of absolute differences over the 8x8 blocks at ptr1 and ptr2. */
>+  ogg_uint32_t  DiffVal;
>+
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
>+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
>+    ".rept 8                         \n\t"
>+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        (%2), %%mm1      \n\t"
>+    "  movq        %%mm0, %%mm2     \n\t"
>+
>+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
>+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
>+    "  por         %%mm1, %%mm0     \n\t"      	/* and or gives abs difference */
>+    "  movq        %%mm0, %%mm1     \n\t"
>+
>+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
>+    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
>+    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
>+    "  add         %3, %1           \n\t"	/* Inc pointer into the new data */
>+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
>+    "  add         %4, %2           \n\t"	/* Inc pointer into ref data */
>+    ".endr                          \n\t"
>+    /* horizontal add: fold the four 16-bit partial sums of mm7 into its low word */
>+    "  movq        %%mm7, %%mm0     \n\t"
>+    "  psrlq       $32, %%mm7       \n\t"
>+    "  paddw       %%mm0, %%mm7     \n\t"
>+    "  movq        %%mm7, %%mm0     \n\t"
>+    "  psrlq       $16, %%mm7       \n\t"
>+    "  paddw       %%mm0, %%mm7     \n\t"
>+    "  movd        %%mm7, %0        \n\t"
>+    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the total */
>+
>+     : "=m" (DiffVal),
>+       "+r" (ptr1), 
>+       "+r" (ptr2) 
>+     : "r" (stride1),
>+       "r" (stride2)
>+     : "memory"
>+  );
>+
>+  return DiffVal;
>+}
>+
>+static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
>+		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
>+			   	  ogg_uint32_t thres)
>+{ /* thres is ignored: this version always computes the full 8x8 SAD via sad8x8__mmx. */
>+  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
>+}
>+
>+static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                      unsigned char *RefDataPtr1,
>+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
>+			              ogg_uint32_t thres)
>+{ /* MMX: returns the 8x8 SAD of SrcData against the truncating per-byte average of the two reference blocks; thres is not used. */
>+  ogg_uint32_t  DiffVal;
>+
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pcmpeqd     %%mm5, %%mm5     \n\t"	/* fefefefefefefefe in mm5 */
>+    "  paddb       %%mm5, %%mm5     \n\t"
>+   
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* zero out mm6 for unpack */
>+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 contains the result */
>+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
>+    "1:                             \n\t"
>+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
>+    /* per-byte average without widening: (a & b) + (((a ^ b) & 0xfe) >> 1); the 0xfe mask keeps the 64-bit shift from leaking bits between bytes */
>+    "  movq        (%2), %%mm2      \n\t"
>+    "  movq        (%3), %%mm3      \n\t"	/* take average of mm2 and mm3 */
>+    "  movq        %%mm2, %%mm1     \n\t"
>+    "  pand        %%mm3, %%mm1     \n\t"
>+    "  pxor        %%mm2, %%mm3     \n\t"
>+    "  pand        %%mm5, %%mm3     \n\t"
>+    "  psrlq       $1, %%mm3        \n\t"
>+    "  paddb       %%mm3, %%mm1     \n\t"
>+
>+    "  movq        %%mm0, %%mm2     \n\t"
>+
>+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
>+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
>+    "  por         %%mm1, %%mm0     \n\t"    	/* and or gives abs difference */
>+    "  movq        %%mm0, %%mm1     \n\t"
>+
>+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
>+    "  paddw       %%mm0, %%mm7     \n\t"	/* accumulate difference... */
>+    "  punpckhbw   %%mm6, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
>+    "  add         %4, %1           \n\t"	/* Inc pointer into the new data */
>+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
>+    "  add         %5, %2           \n\t"	/* Inc pointer into ref data */
>+    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 1b                       \n\t"
>+    /* horizontal add: fold the four 16-bit partial sums of mm7 into its low word */
>+    "  movq        %%mm7, %%mm0     \n\t"
>+    "  psrlq       $32, %%mm7       \n\t"
>+    "  paddw       %%mm0, %%mm7     \n\t"
>+    "  movq        %%mm7, %%mm0     \n\t"
>+    "  psrlq       $16, %%mm7       \n\t"
>+    "  paddw       %%mm0, %%mm7     \n\t"
>+    "  movd        %%mm7, %0        \n\t"
>+    "  andl        $0xffff, %0      \n\t"	/* keep only the low word, which holds the total */
>+
>+     : "=m" (DiffVal),
>+       "+r" (SrcData), 
>+       "+r" (RefDataPtr1), 
>+       "+r" (RefDataPtr2) 
>+     : "m" (SrcStride), /* the strides are memory operands, read directly by the add instructions */
>+       "m" (RefStride)
>+     : "edi", "memory"
>+  );
>+
>+  return DiffVal;
>+}
>+
>+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
>+{ /* MMX: sums the 64 pixels of an 8x8 block (XSum) and their squares (XXSum), then returns 64*XXSum - XSum*XSum. */
>+  ogg_uint32_t  XSum;
>+  ogg_uint32_t  XXSum;
>+
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5: four 16-bit partial sums of the pixels */
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6: zero, used for the byte-to-word unpacks */
>+    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7: two 32-bit partial sums of the squares */
>+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
>+    "1:                             \n\t"
>+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        %%mm0, %%mm2     \n\t"
>+
>+    "  punpcklbw   %%mm6, %%mm0     \n\t"
>+    "  punpckhbw   %%mm6, %%mm2     \n\t"
>+
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  paddw       %%mm2, %%mm5     \n\t"
>+
>+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square each word and add adjacent pairs into dwords */
>+    "  pmaddwd     %%mm2, %%mm2     \n\t"
>+    
>+    "  paddd       %%mm0, %%mm7     \n\t"
>+    "  paddd       %%mm2, %%mm7     \n\t"
>+
>+    "  add         %3, %2           \n\t"	/* Inc pointer into src data */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 1b                       \n\t"
>+    /* fold the four word sums of mm5 into its low word */
>+    "  movq        %%mm5, %%mm0     \n\t"
>+    "  psrlq       $32, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movq        %%mm5, %%mm0     \n\t"
>+    "  psrlq       $16, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movd        %%mm5, %%edi     \n\t"
>+    "  movsx       %%di, %%edi      \n\t"	/* keep only the low 16-bit sum, sign-extended to 32 bits */
>+    "  movl        %%edi, %0        \n\t"
>+    /* fold the two dword sums of mm7 into its low dword */
>+    "  movq        %%mm7, %%mm0     \n\t"
>+    "  psrlq       $32, %%mm7       \n\t"
>+    "  paddd       %%mm0, %%mm7     \n\t"
>+    "  movd        %%mm7, %1        \n\t"
>+
>+     : "=r" (XSum),
>+       "=r" (XXSum),
>+       "+r" (DataPtr) 
>+     : "r" (Stride)
>+     : "edi", "memory"
>+  );
>+
>+  /* Return 64*XXSum - XSum*XSum: the population variance of the 64 pixels scaled by 64*64, used as mis-match metric. */
>+  return (( (XXSum<<6) - XSum*XSum ) );
>+}
>+
>+static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                 unsigned char *RefDataPtr, ogg_uint32_t RefStride)
>+{
>+  ogg_uint32_t  XSum;	/* sum of the 64 (src - ref) differences */
>+  ogg_uint32_t  XXSum;	/* sum of the 64 squared differences */
>+  /* Walk 8 rows of 8 bytes of both blocks, summing d = src - ref and d*d. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5 = four 16 bit partial sums of d */
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0, used to widen bytes to words */
>+    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7 = two 32 bit partial sums of d*d */
>+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
>+    "1:                             \n\t"
>+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        (%3), %%mm1      \n\t"	/* 8 bytes of reference data */
>+    "  movq        %%mm0, %%mm2     \n\t"	/* copies for the high four bytes */
>+    "  movq        %%mm1, %%mm3     \n\t"
>+
>+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* src bytes 0-3 as words */
>+    "  punpcklbw   %%mm6, %%mm1     \n\t"	/* ref bytes 0-3 as words */
>+    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* src bytes 4-7 as words */
>+    "  punpckhbw   %%mm6, %%mm3     \n\t"	/* ref bytes 4-7 as words */
>+
>+    "  psubsw      %%mm1, %%mm0     \n\t"	/* d = src - ref, bytes 0-3 */
>+    "  psubsw      %%mm3, %%mm2     \n\t"	/* d = src - ref, bytes 4-7 */
>+
>+    "  paddw       %%mm0, %%mm5     \n\t"	/* accumulate sum of d */
>+    "  paddw       %%mm2, %%mm5     \n\t"
>+
>+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square each word, add adjacent pairs */
>+    "  pmaddwd     %%mm2, %%mm2     \n\t"
>+    
>+    "  paddd       %%mm0, %%mm7     \n\t"	/* accumulate sum of d*d */
>+    "  paddd       %%mm2, %%mm7     \n\t"
>+
>+    "  add         %4, %2           \n\t"	/* Inc pointer into src data */
>+    "  add         %5, %3           \n\t"	/* Inc pointer into ref data */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 1b                       \n\t"
>+
>+    "  movq        %%mm5, %%mm0     \n\t"	/* fold the four word sums into word 0 */
>+    "  psrlq       $32, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movq        %%mm5, %%mm0     \n\t"
>+    "  psrlq       $16, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movd        %%mm5, %%edi     \n\t"
>+    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16 bit total */
>+    "  movl        %%edi, %0        \n\t"	/* store XSum */
>+
>+    "  movq        %%mm7, %%mm0     \n\t"	/* add the two dword sums together */
>+    "  psrlq       $32, %%mm7       \n\t"
>+    "  paddd       %%mm0, %%mm7     \n\t"
>+    "  movd        %%mm7, %1        \n\t"	/* store XXSum */
>+
>+     : "=m" (XSum),
>+       "=m" (XXSum),
>+       "+r" (SrcData), 
>+       "+r" (RefDataPtr) 
>+     : "m" (SrcStride),
>+       "m" (RefStride)
>+     : "edi", "memory"
>+  );
>+
>+  /* Compute and return population variance as mis-match metric: 64*XXSum - XSum*XSum. */
>+  return (( (XXSum<<6) - XSum*XSum ));
>+}
>+
>+static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                     unsigned char *RefDataPtr1,
>+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
>+{
>+  ogg_uint32_t XSum;	/* sum of the 64 (src - avg) differences */
>+  ogg_uint32_t XXSum;	/* sum of the 64 squared differences */
>+  /* Like inter8x8_err__mmx, but the reference is the bytewise average of two blocks. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pcmpeqd     %%mm4, %%mm4     \n\t"	/* fefefefefefefefe in mm4 */
>+    "  paddb       %%mm4, %%mm4     \n\t"	/* 0xff + 0xff = 0xfe in every byte */
>+    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5 = four 16 bit partial sums of d */
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0, used to widen bytes to words */
>+    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7 = two 32 bit partial sums of d*d */
>+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
>+    "1:                             \n\t"
>+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
>+
>+    "  movq        (%3), %%mm2      \n\t"
>+    "  movq        (%4), %%mm3      \n\t"	/* take average of mm2 and mm3 */
>+    "  movq        %%mm2, %%mm1     \n\t"
>+    "  pand        %%mm3, %%mm1     \n\t"	/* mm1 = a & b */
>+    "  pxor        %%mm2, %%mm3     \n\t"	/* mm3 = a ^ b */
>+    "  pand        %%mm4, %%mm3     \n\t"	/* drop bit 0 so the shift stays inside each byte */
>+    "  psrlq       $1, %%mm3        \n\t"	/* mm3 = (a ^ b) >> 1 per byte */
>+    "  paddb       %%mm3, %%mm1     \n\t"	/* mm1 = (a + b) >> 1, rounded down */
>+
>+    "  movq        %%mm0, %%mm2     \n\t"	/* copies for the high four bytes */
>+    "  movq        %%mm1, %%mm3     \n\t"
>+
>+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* src bytes 0-3 as words */
>+    "  punpcklbw   %%mm6, %%mm1     \n\t"	/* avg bytes 0-3 as words */
>+    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* src bytes 4-7 as words */
>+    "  punpckhbw   %%mm6, %%mm3     \n\t"	/* avg bytes 4-7 as words */
>+
>+    "  psubsw      %%mm1, %%mm0     \n\t"	/* d = src - avg, bytes 0-3 */
>+    "  psubsw      %%mm3, %%mm2     \n\t"	/* d = src - avg, bytes 4-7 */
>+
>+    "  paddw       %%mm0, %%mm5     \n\t"	/* accumulate sum of d */
>+    "  paddw       %%mm2, %%mm5     \n\t"
>+
>+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square each word, add adjacent pairs */
>+    "  pmaddwd     %%mm2, %%mm2     \n\t"
>+    
>+    "  paddd       %%mm0, %%mm7     \n\t"	/* accumulate sum of d*d */
>+    "  paddd       %%mm2, %%mm7     \n\t"
>+
>+    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
>+    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
>+    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 1b                       \n\t"
>+
>+    "  movq        %%mm5, %%mm0     \n\t"	/* fold the four word sums into word 0 */
>+    "  psrlq       $32, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movq        %%mm5, %%mm0     \n\t"
>+    "  psrlq       $16, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movd        %%mm5, %%edi     \n\t"
>+    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16 bit total */
>+    "  movl        %%edi, %0        \n\t"	/* store XSum */
>+
>+    "  movq        %%mm7, %%mm0     \n\t"	/* add the two dword sums together */
>+    "  psrlq       $32, %%mm7       \n\t"
>+    "  paddd       %%mm0, %%mm7     \n\t"
>+    "  movd        %%mm7, %1        \n\t"	/* store XXSum */
>+
>+     : "=m" (XSum),
>+       "=m" (XXSum),
>+       "+r" (SrcData), 
>+       "+r" (RefDataPtr1),
>+       "+r" (RefDataPtr2) 
>+     : "m" (SrcStride),
>+       "m" (RefStride)
>+     : "edi", "memory"
>+  );
>+
>+  /* Compute and return population variance as mis-match metric: 64*XXSum - XSum*XSum. */
>+  return (( (XXSum<<6) - XSum*XSum ));
>+}
>+
>+static void restore_fpu (void)
>+{
>+  __asm__ __volatile__ (
>+    "  emms                         \n\t"	/* empty the MMX state so x87 FPU code can run again */
>+  );
>+}
>+
>+void dsp_i386_mmx_init(DspFunctions *funcs)
>+{	/* Point each funcs entry assigned below at its __mmx routine. */
>+  funcs->restore_fpu = restore_fpu;
>+  funcs->sub8x8 = sub8x8__mmx;
>+  funcs->sub8x8_128 = sub8x8_128__mmx;
>+  funcs->sub8x8avg2 = sub8x8avg2__mmx;
>+  funcs->row_sad8 = row_sad8__mmx;
>+  funcs->col_sad8x8 = col_sad8x8__mmx;
>+  funcs->sad8x8 = sad8x8__mmx;
>+  funcs->sad8x8_thres = sad8x8_thres__mmx;
>+  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
>+  funcs->intra8x8_err = intra8x8_err__mmx;
>+  funcs->inter8x8_err = inter8x8_err__mmx;
>+  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
>+}
>+
>diff -Naur libtheora-1.0alpha3/lib/i386/dsp_mmxext.c libtheora-1.0alpha3.mmx/lib/i386/dsp_mmxext.c
>--- libtheora-1.0alpha3/lib/i386/dsp_mmxext.c	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/i386/dsp_mmxext.c	2004-10-06 17:48:22.474391768 +0200
>@@ -0,0 +1,316 @@
>+/********************************************************************
>+ *                                                                  *
>+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
>+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
>+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
>+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
>+ *                                                                  *
>+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
>+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
>+ *                                                                  *
>+ ********************************************************************
>+
>+  function:
>+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
>+
>+ ********************************************************************/
>+
>+#include <stdlib.h>
>+#include "dsp.h"
>+
>+static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
>+		       	    unsigned char *ptr2, ogg_uint32_t stride2)
>+{
>+  ogg_uint32_t  DiffVal;
>+  /* Sum of absolute byte differences over 8 rows of 8 bytes from ptr1 and ptr2. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
>+
>+    ".rept 7                        \n\t"	/* rows 0-6, unrolled by the assembler */
>+    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
>+    "  movq (%2), %%mm1             \n\t"
>+    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |ptr1[i] - ptr2[i]| for this row */
>+    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
>+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
>+    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
>+    ".endr                          \n\t"
>+
>+    "  movq (%1), %%mm0             \n\t"	/* row 7: take 8 bytes, pointers are not advanced again */
>+    "  movq (%2), %%mm1             \n\t"
>+    "  psadbw %%mm1, %%mm0          \n\t"
>+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
>+    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
>+
>+     : "=r" (DiffVal),
>+       "+r" (ptr1), 
>+       "+r" (ptr2) 
>+     : "r" (stride1),
>+       "r" (stride2)
>+     : "memory"
>+  );
>+
>+  return DiffVal;
>+}
>+
>+static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1,
>+		       		  unsigned char *ptr2, ogg_uint32_t stride2, 
>+			   	  ogg_uint32_t thres)
>+{
>+  ogg_uint32_t  DiffVal;
>+  /* Full 8x8 sum of absolute differences; thres is never read in this body. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
>+
>+    ".rept 8                        \n\t"	/* all 8 rows, unrolled by the assembler */
>+    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
>+    "  movq (%2), %%mm1             \n\t"
>+    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |ptr1[i] - ptr2[i]| for this row */
>+    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
>+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
>+    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
>+    ".endr                          \n\t"
>+
>+    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
>+
>+     : "=r" (DiffVal),
>+       "+r" (ptr1), 
>+       "+r" (ptr2) 
>+     : "r" (stride1),
>+       "r" (stride2)
>+     : "memory"
>+  );
>+
>+  return DiffVal;
>+}
>+
>+static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                      unsigned char *RefDataPtr1,
>+			              unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
>+			              ogg_uint32_t thres)
>+{
>+  ogg_uint32_t  DiffVal;
>+  /* 8x8 SAD of SrcData against the pavgb average of both refs; thres is never read. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
>+    ".rept 8                        \n\t"	/* all 8 rows, unrolled by the assembler */
>+    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
>+    "  movq (%2), %%mm1             \n\t"
>+    "  movq (%3), %%mm2             \n\t"
>+    "  pavgb %%mm2, %%mm1           \n\t"	/* mm1 = (ref1 + ref2 + 1) >> 1 per byte */
>+    "  psadbw %%mm1, %%mm0          \n\t"	/* mm0 = sum of |src[i] - avg[i]| for this row */
>+
>+    "  add %4, %1                   \n\t"	/* Inc pointer into the new data */
>+    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
>+    "  add %5, %2                   \n\t"	/* Inc pointer into ref data */
>+    "  add %5, %3                   \n\t"	/* Inc pointer into ref data */
>+    ".endr                          \n\t"
>+
>+    "  movd %%mm7, %0               \n\t"	/* low 32 bits of mm7 -> DiffVal */
>+     : "=m" (DiffVal),
>+       "+r" (SrcData), 
>+       "+r" (RefDataPtr1), 
>+       "+r" (RefDataPtr2) 
>+     : "m" (SrcStride),
>+       "m" (RefStride)
>+     : "memory"
>+  );
>+
>+  return DiffVal;
>+}
>+		
>+static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2)
>+{
>+  ogg_uint32_t MaxSad;
>+  /* SAD of bytes 0-3 and of bytes 4-7 of one 8 byte row; returns the larger one. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  movd        (%1), %%mm0      \n\t"	/* bytes 0-3 of Src1, upper half zero */
>+    "  movd        (%2), %%mm1      \n\t"	/* bytes 0-3 of Src2, upper half zero */
>+    "  psadbw      %%mm0, %%mm1     \n\t"	/* mm1 = SAD of bytes 0-3 */
>+    "  movd        4(%1), %%mm2     \n\t"	/* bytes 4-7 of Src1 */
>+    "  movd        4(%2), %%mm3     \n\t"	/* bytes 4-7 of Src2 */
>+    "  psadbw      %%mm2, %%mm3     \n\t"	/* mm3 = SAD of bytes 4-7 */
>+
>+    "  pmaxsw      %%mm1, %%mm3     \n\t"	/* keep the larger of the two SADs */
>+    "  movd        %%mm3, %0        \n\t"
>+    "  andl        $0xffff, %0      \n\t"	/* keep only the low 16 bits */
>+
>+     : "=m" (MaxSad),
>+       "+r" (Src1), 
>+       "+r" (Src2) 
>+     :
>+     : "memory"
>+  );
>+
>+  return MaxSad;
>+}
>+
>+static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2,
>+		                    ogg_uint32_t stride)
>+{
>+  ogg_uint32_t MaxSad;
>+  /* Per-column SADs over rows 0-3 and over rows 4-7; returns the largest of the 16. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm3, %%mm3     \n\t"	/* zero out mm3 for unpack */
>+    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 = sums for columns 0-3, rows 0-3 */
>+    "  pxor        %%mm5, %%mm5     \n\t" 	/* mm5 = sums for columns 4-7, rows 0-3 */
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = sums for columns 0-3, rows 4-7 */
>+    "  pxor        %%mm7, %%mm7     \n\t" 	/* mm7 = sums for columns 4-7, rows 4-7 */
>+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
>+    "1:                             \n\t"
>+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
>+
>+    "  movq        %%mm0, %%mm2     \n\t"
>+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
>+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
>+    "  por         %%mm1, %%mm0     \n\t"      	/* OR of both saturated results gives abs difference */
>+    "  movq        %%mm0, %%mm1     \n\t"
>+
>+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
>+    "  paddw       %%mm0, %%mm4     \n\t"	/* accumulate difference... */
>+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
>+    "  paddw       %%mm1, %%mm5     \n\t"	/* accumulate difference... */
>+    "  add         %3, %1           \n\t"	/* advance Src1 by stride */
>+    "  add         %3, %2           \n\t"	/* advance Src2 by stride */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 1b                       \n\t"
>+
>+    "  mov         $4, %%edi        \n\t"	/* 4 rows */
>+    "2:                             \n\t"
>+    "  movq        (%1), %%mm0      \n\t"	/* take 8 bytes */
>+    "  movq        (%2), %%mm1      \n\t"	/* take 8 bytes */
>+
>+    "  movq        %%mm0, %%mm2     \n\t"
>+    "  psubusb     %%mm1, %%mm0     \n\t" 	/* A - B */
>+    "  psubusb     %%mm2, %%mm1     \n\t"	/* B - A */
>+    "  por         %%mm1, %%mm0     \n\t"      	/* OR of both saturated results gives abs difference */
>+    "  movq        %%mm0, %%mm1     \n\t"
>+
>+    "  punpcklbw   %%mm3, %%mm0     \n\t"	/* unpack to higher precision for accumulation */
>+    "  paddw       %%mm0, %%mm6     \n\t"	/* accumulate difference... */
>+    "  punpckhbw   %%mm3, %%mm1     \n\t"	/* unpack high four bytes to higher precision */
>+    "  paddw       %%mm1, %%mm7     \n\t"	/* accumulate difference... */
>+    "  add         %3, %1           \n\t"	/* advance Src1 by stride */
>+    "  add         %3, %2           \n\t"	/* advance Src2 by stride */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 2b                       \n\t"
>+
>+    "  pmaxsw      %%mm6, %%mm7     \n\t"	/* word-wise max of the rows 4-7 sums */
>+    "  pmaxsw      %%mm4, %%mm5     \n\t"	/* word-wise max of the rows 0-3 sums */
>+    "  pmaxsw      %%mm5, %%mm7     \n\t"	/* mm7 = word-wise max of all four accumulators */
>+    "  movq        %%mm7, %%mm6     \n\t"	/* fold the four words down to a max in word 0 */
>+    "  psrlq       $32, %%mm6       \n\t"
>+    "  pmaxsw      %%mm6, %%mm7     \n\t"
>+    "  movq        %%mm7, %%mm6     \n\t"
>+    "  psrlq       $16, %%mm6       \n\t"
>+    "  pmaxsw      %%mm6, %%mm7     \n\t"
>+    "  movd        %%mm7, %0        \n\t"
>+    "  andl        $0xffff, %0      \n\t"	/* keep only the low 16 bits */
>+
>+     : "=r" (MaxSad),
>+       "+r" (Src1), 
>+       "+r" (Src2) 
>+     : "r" (stride)
>+     : "memory", "edi"
>+  );
>+
>+  return MaxSad;
>+}
>+
>+static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride,
>+		                     unsigned char *RefDataPtr1,
>+				     unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
>+{
>+  ogg_uint32_t XSum;	/* sum of the 64 (src - avg) differences */
>+  ogg_uint32_t XXSum;	/* sum of the 64 squared differences */
>+  /* NOTE(review): pavgb rounds up, while inter8x8_err_xy2__mmx truncates its average. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+
>+    "  pxor        %%mm4, %%mm4     \n\t"	/* mm4 = 0, widens the averaged ref bytes */
>+    "  pxor        %%mm5, %%mm5     \n\t"	/* mm5 = four 16 bit partial sums of d */
>+    "  pxor        %%mm6, %%mm6     \n\t"	/* mm6 = 0, widens the src bytes */
>+    "  pxor        %%mm7, %%mm7     \n\t"	/* mm7 = two 32 bit partial sums of d*d */
>+    "  mov         $8, %%edi        \n\t"	/* 8 rows */
>+    "1:                             \n\t"
>+    "  movq        (%2), %%mm0      \n\t"	/* take 8 bytes */
>+
>+    "  movq        (%3), %%mm2      \n\t"
>+    "  movq        (%4), %%mm1      \n\t"	/* take average of mm2 and mm1 */
>+    "  pavgb       %%mm2, %%mm1     \n\t"	/* mm1 = (ref1 + ref2 + 1) >> 1 per byte */
>+
>+    "  movq        %%mm0, %%mm2     \n\t"	/* copies for the high four bytes */
>+    "  movq        %%mm1, %%mm3     \n\t"
>+
>+    "  punpcklbw   %%mm6, %%mm0     \n\t"	/* src bytes 0-3 as words */
>+    "  punpcklbw   %%mm4, %%mm1     \n\t"	/* avg bytes 0-3 as words */
>+    "  punpckhbw   %%mm6, %%mm2     \n\t"	/* src bytes 4-7 as words */
>+    "  punpckhbw   %%mm4, %%mm3     \n\t"	/* avg bytes 4-7 as words */
>+
>+    "  psubsw      %%mm1, %%mm0     \n\t"	/* d = src - avg, bytes 0-3 */
>+    "  psubsw      %%mm3, %%mm2     \n\t"	/* d = src - avg, bytes 4-7 */
>+
>+    "  paddw       %%mm0, %%mm5     \n\t"	/* accumulate sum of d */
>+    "  paddw       %%mm2, %%mm5     \n\t"
>+
>+    "  pmaddwd     %%mm0, %%mm0     \n\t"	/* square each word, add adjacent pairs */
>+    "  pmaddwd     %%mm2, %%mm2     \n\t"
>+    
>+    "  paddd       %%mm0, %%mm7     \n\t"	/* accumulate sum of d*d */
>+    "  paddd       %%mm2, %%mm7     \n\t"
>+
>+    "  add         %5, %2           \n\t"	/* Inc pointer into src data */
>+    "  add         %6, %3           \n\t"	/* Inc pointer into ref data */
>+    "  add         %6, %4           \n\t"	/* Inc pointer into ref data */
>+
>+    "  dec         %%edi            \n\t"
>+    "  jnz 1b                       \n\t"
>+
>+    "  movq        %%mm5, %%mm0     \n\t"	/* fold the four word sums into word 0 */
>+    "  psrlq       $32, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movq        %%mm5, %%mm0     \n\t"
>+    "  psrlq       $16, %%mm5       \n\t"
>+    "  paddw       %%mm0, %%mm5     \n\t"
>+    "  movd        %%mm5, %%edi     \n\t"
>+    "  movsx       %%di, %%edi      \n\t"	/* sign-extend the 16 bit total */
>+    "  movl        %%edi, %0        \n\t"	/* store XSum */
>+
>+    "  movq        %%mm7, %%mm0     \n\t"	/* add the two dword sums together */
>+    "  psrlq       $32, %%mm7       \n\t"
>+    "  paddd       %%mm0, %%mm7     \n\t"
>+    "  movd        %%mm7, %1        \n\t"	/* store XXSum */
>+
>+     : "=m" (XSum),
>+       "=m" (XXSum),
>+       "+r" (SrcData), 
>+       "+r" (RefDataPtr1),
>+       "+r" (RefDataPtr2) 
>+     : "m" (SrcStride),
>+       "m" (RefStride)
>+     : "edi", "memory"
>+  );
>+
>+  /* Compute and return population variance as mis-match metric: 64*XXSum - XSum*XSum. */
>+  return (( (XXSum<<6) - XSum*XSum ));
>+}
>+
>+void dsp_i386_mmxext_init(DspFunctions *funcs)
>+{	/* Overwrites only the six entries below; all other funcs fields are left untouched. */
>+  funcs->row_sad8 = row_sad8__mmxext;
>+  funcs->col_sad8x8 = col_sad8x8__mmxext;
>+  funcs->sad8x8 = sad8x8__mmxext;
>+  funcs->sad8x8_thres = sad8x8_thres__mmxext;
>+  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
>+  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext;
>+}
>+
>diff -Naur libtheora-1.0alpha3/lib/i386/fdct_mmx.c libtheora-1.0alpha3.mmx/lib/i386/fdct_mmx.c
>--- libtheora-1.0alpha3/lib/i386/fdct_mmx.c	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/i386/fdct_mmx.c	2004-10-06 17:48:22.509386448 +0200
>@@ -0,0 +1,340 @@
>+;//==========================================================================
>+;//
>+;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
>+;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
>+;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
>+;//  PURPOSE.
>+;//
>+;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
>+;//
>+;//--------------------------------------------------------------------------
>+
>+#include <theora/theora.h>
>+#include "dsp.h"
>+
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;	/* 4 x 0xfb15 = cos(1*pi/16) * 65536 */
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;	/* 4 x 0xec83 = cos(2*pi/16) * 65536 */
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;	/* 4 x 0xd4db = cos(3*pi/16) * 65536 */
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;	/* 4 x 0xb505 = cos(4*pi/16) * 65536 */
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;	/* 4 x 0x8e3a = cos(5*pi/16) * 65536 */
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;	/* 4 x 0x61f8 = cos(6*pi/16) * 65536 */
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;	/* 4 x 0x31f1 = cos(7*pi/16) * 65536 */
>+
>+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
>+    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
>+# define M(a) "_" #a	/* symbol name as an asm string, with a leading underscore on these targets */
>+#else
>+# define M(a) #a	/* symbol name as an asm string, unchanged */
>+#endif
>+
>+/***********************************************************************
>+ *	File:			fdct_m.asm
>+ *
>+ *	Description:
>+ *					This function performs a 2-D forward DCT on an 8x8 block
>+ *					
>+ *
>+ *	Input:			Pointers to input source data buffer and destination 
>+ *					buffer.
>+ *
>+ *	Note:			none
>+ *
>+ *	Special Notes:	We try to do the truncation right to match the result 
>+ *					of the c version. 
>+ *
>+ ************************************************************************/
>+
>+/* execute stage 1 of forward DCT */
>+#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
>+  "  movq      " #ip0 ", %%mm0      \n\t"                                     \
>+  "  movq      " #ip1 ", %%mm1      \n\t"                                     \
>+  "  movq      " #ip3 ", %%mm2      \n\t"                                     \
>+  "  movq      " #ip5 ", %%mm3      \n\t"                                     \
>+  "  movq        %%mm0, %%mm4       \n\t"                                     \
>+  "  movq        %%mm1, %%mm5       \n\t"                                     \
>+  "  movq        %%mm2, %%mm6       \n\t"                                     \
>+  "  movq        %%mm3, %%mm7       \n\t"                                     \
>+                                                                              \
>+  "  paddsw    " #ip7 ", %%mm0      \n\t" /* mm0 = ip0 + ip7 = is07 */        \
>+  "  paddsw    " #ip2 ", %%mm1      \n\t" /* mm1 = ip1 + ip2 = is12 */        \
>+  "  paddsw    " #ip4 ", %%mm2      \n\t" /* mm2 = ip3 + ip4 = is34 */        \
>+  "  paddsw    " #ip6 ", %%mm3      \n\t" /* mm3 = ip5 + ip6 = is56 */        \
>+  "  psubsw    " #ip7 ", %%mm4      \n\t" /* mm4 = ip0 - ip7 = id07 */        \
>+  "  psubsw    " #ip2 ", %%mm5      \n\t" /* mm5 = ip1 - ip2 = id12 */        \
>+                                                                              \
>+  "  psubsw      %%mm2, %%mm0       \n\t" /* mm0 = is07 - is34 */             \
>+                                                                              \
>+  "  paddsw      %%mm2, %%mm2       \n\t"                                     \
>+                                                                              \
>+  "  psubsw    " #ip4 ", %%mm6      \n\t" /* mm6 = ip3 - ip4 = id34 */        \
>+                                                                              \
>+  "  paddsw      %%mm0, %%mm2       \n\t" /* mm2 = is07 + is34 = is0734 */    \
>+  "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
>+  "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
>+  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
>+  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + 1s56	= is1256 */   \
>+                                                                              \
>+  "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
>+  /* ------------------------------------------------------------------- */   \
>+  "  psubsw      %%mm7, %%mm5       \n\t" /* mm5 = id12 - id56 */             \
>+  "  paddsw      %%mm7, %%mm7       \n\t"                                     \
>+  "  paddsw      %%mm5, %%mm7       \n\t" /* mm7 = id12 + id56 */             \
>+  /* ------------------------------------------------------------------- */   \
>+  "  psubsw      %%mm3, %%mm2       \n\t" /* mm2 = is0734 - is1256 */         \
>+  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
>+                                                                              \
>+  "  movq        %%mm2, %%mm0       \n\t" /* make a copy */                   \
>+  "  paddsw      %%mm2, %%mm3       \n\t" /* mm3 = is0734 + is1256 */         \
>+                                                                              \
>+  "  pmulhw   "M(xC4S4)", %%mm0     \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
>+  "  paddw       %%mm2, %%mm0       \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+  "  paddw       %%mm2, %%mm0       \n\t" /* Truncate mm0, now it is op[4] */ \
>+                                                                              \
>+  "  movq        %%mm3, %%mm2       \n\t"                                     \
>+  "  movq        %%mm0," #ip4 "     \n\t" /* save ip4, now mm0,mm2 are free */ \
>+                                                                              \
>+  "  movq        %%mm3, %%mm0       \n\t"                                     \
>+  "  pmulhw   "M(xC4S4)", %%mm3     \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
>+                                                                              \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 )	 */ \
>+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
>+                                                                              \
>+  "  movq        %%mm3," #ip0 "     \n\t"                                     \
>+  /* ------------------------------------------------------------------- */   \
>+  "  movq      " #temp ", %%mm3     \n\t" /* mm3 = irot_input_y */            \
>+  "  pmulhw   "M(xC2S6)", %%mm3     \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
>+                                                                              \
>+  "  movq      " #temp ", %%mm2     \n\t"                                     \
>+  "  movq        %%mm2, %%mm0       \n\t"                                     \
>+                                                                              \
>+  "  psrlw       $15, %%mm2         \n\t" /* mm3 = xC2S6 * irot_input_y */    \
>+  "  paddw       %%mm0, %%mm3       \n\t"                                     \
>+                                                                              \
>+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
>+  "  movq        %%mm5, %%mm0       \n\t"                                     \
>+                                                                              \
>+  "  movq        %%mm5, %%mm2       \n\t"                                     \
>+  "  pmulhw   "M(xC6S2)", %%mm0     \n\t" /* mm0 = xC6S2 * irot_input_x */    \
>+                                                                              \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
>+                                                                              \
>+  "  paddsw      %%mm0, %%mm3       \n\t" /* ip[2] */                         \
>+  "  movq        %%mm3," #ip2 "     \n\t" /* Save ip2 */                      \
>+                                                                              \
>+  "  movq        %%mm5, %%mm0       \n\t"                                     \
>+  "  movq        %%mm5, %%mm2       \n\t"                                     \
>+                                                                              \
>+  "  pmulhw   "M(xC2S6)", %%mm5     \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+                                                                              \
>+  "  movq      " #temp ", %%mm3     \n\t"                                     \
>+  "  paddw       %%mm0, %%mm5       \n\t" /* mm5 = xC2S6 * irot_input_x */    \
>+                                                                              \
>+  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
>+  "  movq        %%mm3, %%mm2       \n\t"                                     \
>+                                                                              \
>+  "  pmulhw   "M(xC6S2)", %%mm3     \n\t" /* mm3 = xC6S2 * irot_input_y */    \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+                                                                              \
>+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
>+  "  psubsw      %%mm5, %%mm3       \n\t"                                     \
>+                                                                              \
>+  "  movq        %%mm3," #ip6 "     \n\t"                                     \
>+  /* ------------------------------------------------------------------- */   \
>+  "  movq     "M(xC4S4)", %%mm0     \n\t"                                     \
>+  "  movq        %%mm1, %%mm2       \n\t"                                     \
>+  "  movq        %%mm1, %%mm3       \n\t"                                     \
>+                                                                              \
>+  "  pmulhw      %%mm0, %%mm1       \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
>+  "  psrlw       $15, %%mm2         \n\t"				      \
>+                                                                              \
>+  "  paddw       %%mm3, %%mm1       \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
>+  "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
>+                                                                              \
>+  "  movq        %%mm7, %%mm2       \n\t"                                     \
>+  "  movq        %%mm7, %%mm3       \n\t"			              \
>+                                                                              \
>+  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
>+  "  psrlw       $15, %%mm2         \n\t"			              \
>+                                                                              \
>+  "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
>+  "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
>+  /* ------------------------------------------------------------------- */   \
>+  "  pxor        %%mm0, %%mm0       \n\t" /* Clear mm0 */                     \
>+  "  psubsw      %%mm6, %%mm0       \n\t" /* mm0 = - id34 */                  \
>+                                                                              \
>+  "  psubsw      %%mm7, %%mm0       \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
>+  "  paddsw      %%mm6, %%mm6       \n\t"                                     \
>+  "  paddsw      %%mm0, %%mm6       \n\t" /* mm6 = id34 - icommon_product2 */ \
>+                                                                              \
>+  "  psubsw      %%mm1, %%mm4       \n\t" /* mm4 = id07 - icommon_product1 */ \
>+  "  paddsw      %%mm1, %%mm1       \n\t"                                     \
>+  "  paddsw      %%mm4, %%mm1       \n\t" /* mm1 = id07 + icommon_product1 */ \
>+  /* ------------------------------------------------------------------- */   \
>+  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
>+  "  movq        %%mm1, %%mm2       \n\t"                                     \
>+                                                                              \
>+  "  movq        %%mm1, %%mm3       \n\t"                                     \
>+  "  pmulhw      %%mm7, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
>+                                                                              \
>+  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+                                                                              \
>+  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x */    \
>+  "  paddw       %%mm2, %%mm1       \n\t" /* Truncated */                     \
>+                                                                              \
>+  "  pmulhw      %%mm7, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x */    \
>+  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
>+                                                                              \
>+  "  movq        %%mm0, %%mm5       \n\t"                                     \
>+  "  movq        %%mm0, %%mm2       \n\t"                                     \
>+                                                                              \
>+  "  movq     "M(xC1S7)", %%mm7     \n\t"                                     \
>+  "  pmulhw      %%mm7, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
>+                                                                              \
>+  "  movq     "M(xC7S1)", %%mm7     \n\t"                                     \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+                                                                              \
>+  "  paddw       %%mm5, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y */    \
>+  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
>+                                                                              \
>+  "  pmulhw      %%mm7, %%mm5       \n\t" /* mm5 = xC7S1 * irot_input_y */    \
>+  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
>+                                                                              \
>+  "  psubsw      %%mm5, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
>+  "  paddsw      %%mm0, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x + xC1S7 * irot_input_y = ip7 */ \
>+                                                                              \
>+  "  movq        %%mm1," #ip1 "     \n\t"                                     \
>+  "  movq        %%mm3," #ip7 "     \n\t"                                     \
>+  /* ------------------------------------------------------------------- */   \
>+  "  movq     "M(xC3S5)", %%mm0     \n\t"                                     \
>+  "  movq     "M(xC5S3)", %%mm1     \n\t"                                     \
>+                                                                              \
>+  "  movq        %%mm6, %%mm5       \n\t"                                     \
>+  "  movq        %%mm6, %%mm7       \n\t"                                     \
>+                                                                              \
>+  "  movq        %%mm4, %%mm2       \n\t"                                     \
>+  "  movq        %%mm4, %%mm3       \n\t"                                     \
>+                                                                              \
>+  "  pmulhw      %%mm0, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
>+  "  pmulhw      %%mm1, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
>+                                                                              \
>+  "  psrlw       $15, %%mm2         \n\t"                                     \
>+  "  psrlw       $15, %%mm5         \n\t"                                     \
>+                                                                              \
>+  "  paddw       %%mm3, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x */    \
>+  "  paddw       %%mm7, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y */    \
>+                                                                              \
>+  "  paddw       %%mm2, %%mm4       \n\t" /* Truncated */                     \
>+  "  paddw       %%mm5, %%mm6       \n\t" /* Truncated */                     \
>+                                                                              \
>+  "  psubsw      %%mm6, %%mm4       \n\t" /* ip3 */                           \
>+  "  movq        %%mm4," #ip3 "     \n\t"                                     \
>+                                                                              \
>+  "  movq        %%mm3, %%mm4       \n\t"                                     \
>+  "  movq        %%mm7, %%mm6       \n\t"                                     \
>+                                                                              \
>+  "  pmulhw      %%mm1, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
>+  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
>+                                                                              \
>+  "  paddw       %%mm2, %%mm4       \n\t"                                     \
>+  "  paddw       %%mm5, %%mm6       \n\t"                                     \
>+                                                                              \
>+  "  paddw       %%mm4, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x */    \
>+  "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
>+                                                                              \
>+  "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
>+  "  movq        %%mm3," #ip5 "     \n\t" 
>+/* Transpose_mmx: transpose two 4x4 tiles of 16-bit words (ip0-ip3 -> op0-op3, ip4-ip7 -> op4-op7); uses mm0-mm7. */
>+#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
>+		      op0,op1,op2,op3,op4,op5,op6,op7)                  \
>+  "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a3 a2 a1 a0 */       \
>+  "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e3 e2 e1 e0 */       \
>+  "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b3 b2 b1 b0 */       \
>+  "  movq      " #ip5 ", %%mm5      \n\t" /* mm5 = f3 f2 f1 f0 */       \
>+  "  movq      " #ip2 ", %%mm2      \n\t" /* mm2 = c3 c2 c1 c0 */       \
>+  "  movq      " #ip6 ", %%mm6      \n\t" /* mm6 = g3 g2 g1 g0 */       \
>+  "  movq      " #ip3 ", %%mm3      \n\t" /* mm3 = d3 d2 d1 d0 */       \
>+  "  movq        %%mm1," #op1 "     \n\t" /* park b in op1 (scratch) */ \
>+  "  movq      " #ip7 ", %%mm7      \n\t" /* mm7 = h3 h2 h1 h0; read after op1 was written */ \
>+   /* Tile e..h -> op4..op7 (words are listed high to low) */           \
>+  "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
>+  "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
>+  "  movq        %%mm0," #op0 "     \n\t" /* park a in op0 (scratch) */ \
>+  "  punpckhwd	 %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
>+  "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
>+  "  punpcklwd	 %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
>+  "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
>+  "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 -> op4 */ \
>+  "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 -> op5 */ \
>+  "  movq        %%mm1, %%mm6       \n\t" /* mm6 = f3 e3 f2 e2 */       \
>+  "  movq        %%mm4," #op4 "     \n\t" /* op4 = h0 g0 f0 e0 */       \
>+  "  punpckhwd   %%mm7, %%mm0       \n\t" /* mm0 = h3 g3 h2 g2 */       \
>+  "  movq        %%mm5," #op5 "     \n\t" /* op5 = h1 g1 f1 e1 */       \
>+  "  punpckhdq   %%mm0, %%mm6       \n\t" /* mm6 = h3 g3 f3 e3 -> op7 */ \
>+  "  movq      " #op0 ", %%mm4      \n\t" /* mm4 = a3 a2 a1 a0 (reloaded from op0) */ \
>+  "  punpckldq   %%mm0, %%mm1       \n\t" /* mm1 = h2 g2 f2 e2 -> op6 */ \
>+  "  movq      " #op1 ", %%mm5      \n\t" /* mm5 = b3 b2 b1 b0 (reloaded from op1) */ \
>+  "  movq        %%mm4, %%mm0       \n\t" /* mm0 = a3 a2 a1 a0 */       \
>+  "  movq        %%mm6," #op7 "     \n\t" /* op7 = h3 g3 f3 e3 */       \
>+  "  punpcklwd   %%mm5, %%mm0       \n\t" /* mm0 = b1 a1 b0 a0 */       \
>+  "  movq        %%mm1," #op6 "     \n\t" /* op6 = h2 g2 f2 e2 */       \
>+  "  punpckhwd   %%mm5, %%mm4       \n\t" /* mm4 = b3 a3 b2 a2 */       \
>+  "  movq        %%mm2, %%mm5       \n\t" /* mm5 = c3 c2 c1 c0 */       \
>+  "  punpcklwd   %%mm3, %%mm2       \n\t" /* mm2 = d1 c1 d0 c0 */       \
>+  "  movq        %%mm0, %%mm1       \n\t" /* mm1 = b1 a1 b0 a0 */       \
>+  "  punpckldq   %%mm2, %%mm0       \n\t" /* mm0 = d0 c0 b0 a0 -> op0 */ \
>+  "  punpckhdq   %%mm2, %%mm1       \n\t" /* mm1 = d1 c1 b1 a1 -> op1 */ \
>+  "  movq        %%mm4, %%mm2       \n\t" /* mm2 = b3 a3 b2 a2 */       \
>+  "  movq        %%mm0," #op0 "     \n\t" /* op0 = d0 c0 b0 a0 */       \
>+  "  punpckhwd   %%mm3, %%mm5       \n\t" /* mm5 = d3 c3 d2 c2 */       \
>+  "  movq        %%mm1," #op1 "     \n\t" /* op1 = d1 c1 b1 a1 */       \
>+  "  punpckhdq   %%mm5, %%mm4       \n\t" /* mm4 = d3 c3 b3 a3 -> op3 */ \
>+  "  punpckldq   %%mm5, %%mm2       \n\t" /* mm2 = d2 c2 b2 a2 -> op2 */ \
>+  "  movq        %%mm4," #op3 "     \n\t" /* op3 = d3 c3 b3 a3 */       \
>+  "  movq        %%mm2," #op2 "     \n\t" /* op2 = d2 c2 b2 a2 */
>+
>+
>+static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
>+{
>+  ogg_int64_t __attribute__((aligned(8))) align_tmp[16];
>+  ogg_int16_t *const temp= (ogg_int16_t*)align_tmp;
>+  /* MMX forward DCT of the 8x8 block at InputData into OutputData; temp (8-byte aligned) is Fdct_mmx's last argument, presumably scratch. */
>+  __asm__ __volatile__ (
>+    "  .balign 16                   \n\t"
>+    /* Rows are 16 bytes (two quadwords).  Passes 1-2 take rows 0-3, then rows
>+     * 4-7, from InputData: transpose into OutputData, then Fdct_mmx there.
>+     * Passes 3-4 redo transpose + Fdct_mmx in place on the two column halves
>+     * of OutputData (byte offsets 0,16,..,112 and 8,24,..,120). */
>+    Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
>+		     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
>+    Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))
>+
>+    Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
>+		   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
>+    Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
>+
>+    Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
>+		    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
>+    Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
>+
>+    Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
>+		    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
>+    Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
>+
>+    "  emms                         \n\t" /* clear MMX state so x87 code can run afterwards */
>+    
>+    : "+r" (InputData),
>+      "+r" (OutputData)
>+    : "r" (temp)
>+    : "memory"
>+  );
>+}
>+
>+void dsp_i386_mmx_fdct_init(DspFunctions *funcs)
>+{
>+  funcs->fdct_short = fdct_short__mmx; /* point the fdct_short slot of *funcs at the MMX forward DCT in this file */
>+}
>diff -Naur libtheora-1.0alpha3/lib/i386/recon_mmx.c libtheora-1.0alpha3.mmx/lib/i386/recon_mmx.c
>--- libtheora-1.0alpha3/lib/i386/recon_mmx.c	1970-01-01 01:00:00.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/i386/recon_mmx.c	2004-10-06 17:48:22.510386296 +0200
>@@ -0,0 +1,185 @@
>+/********************************************************************
>+ *                                                                  *
>+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
>+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
>+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
>+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
>+ *                                                                  *
>+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
>+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
>+ *                                                                  *
>+ ********************************************************************
>+
>+  function:
>+  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
>+
>+ ********************************************************************/
>+
>+#include "encoder_internal.h"
>+
>+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL; /* 0x80 in every byte; loaded by symbol name from inline asm via M(V128) */
>+/* M(a): string form of symbol a for use inside asm text; gets a leading underscore on the targets tested below. */
>+#if defined(__MINGW32__) || defined(__CYGWIN__) || \
>+	    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
>+# define M(a) "_" #a
>+#else
>+# define M(a) #a
>+#endif
>+
>+static void copy8x8__mmx (unsigned char *src,
>+	                unsigned char *dest,
>+	                unsigned int stride)
>+{
>+  __asm__ __volatile__ (
>+    "  .balign 16                      \n\t"
>+    /* Copy 8 rows of 8 bytes from src to dest; consecutive rows are stride bytes apart in both buffers. */
>+    "  lea         (%2, %2, 2), %%edi  \n\t" /* edi = 3 * stride */
>+    /* rows 0-3: load */
>+    "  movq        (%1), %%mm0         \n\t"
>+    "  movq        (%1, %2), %%mm1     \n\t"
>+    "  movq        (%1, %2, 2), %%mm2  \n\t"
>+    "  movq        (%1, %%edi), %%mm3  \n\t"
>+    /* src += 4 * stride */
>+    "  lea         (%1, %2, 4), %1     \n\t" 
>+    /* rows 0-3: store */
>+    "  movq        %%mm0, (%0)         \n\t"
>+    "  movq        %%mm1, (%0, %2)     \n\t"
>+    "  movq        %%mm2, (%0, %2, 2)  \n\t"
>+    "  movq        %%mm3, (%0, %%edi)  \n\t"
>+    /* dest += 4 * stride */
>+    "  lea         (%0, %2, 4), %0     \n\t" 
>+    /* rows 4-7: load */
>+    "  movq        (%1), %%mm0         \n\t"
>+    "  movq        (%1, %2), %%mm1     \n\t"
>+    "  movq        (%1, %2, 2), %%mm2  \n\t"
>+    "  movq        (%1, %%edi), %%mm3  \n\t"
>+    /* rows 4-7: store */
>+    "  movq        %%mm0, (%0)         \n\t"
>+    "  movq        %%mm1, (%0, %2)     \n\t"
>+    "  movq        %%mm2, (%0, %2, 2)  \n\t"
>+    "  movq        %%mm3, (%0, %%edi)  \n\t"
>+      : "+a" (dest),
>+        "+c" (src) /* advanced by the asm, so it must be a read-write operand */
>+      : "d" (stride)
>+      : "memory", "edi"
>+  );
>+}
>+
>+static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
>+		      ogg_uint32_t LineStep)
>+{
>+  __asm__ __volatile__ (
>+    "  .balign 16                      \n\t"
>+    /* Intra reconstruction: per row, 8 values from ChangePtr are saturated to bytes, offset by 128, stored at ReconPtr. */
>+    "  movq     "M(V128)", %%mm0       \n\t" /* Set mm0 to 0x8080808080808080 */
>+    /* The loop consumes 16 bytes of ChangePtr per row and stops 128 bytes in. */
>+    "  lea         128(%1), %%edi      \n\t" /* Endpoint in input buffer */
>+    "1:                                \n\t" 
>+    "  movq         (%1), %%mm2        \n\t" /* First four input values */
>+    /* packsswb saturates the 16-bit values to signed bytes */
>+    "  packsswb    8(%1), %%mm2        \n\t" /* pack with next(high) four values */
>+    "  por         %%mm0, %%mm0        \n\t" /* mm0 | mm0: leaves mm0 unchanged */
>+    "  pxor        %%mm0, %%mm2        \n\t" /* Convert result to unsigned (same as add 128) */
>+    "  lea         16(%1), %1          \n\t" /* Step source buffer */
>+    "  cmp         %%edi, %1           \n\t" /* are we done */
>+    /* movq and lea below leave the flags from cmp intact for the jc */
>+    "  movq        %%mm2, (%0)         \n\t" /* store results */
>+    /* LineStep is the byte distance between output rows */
>+    "  lea         (%0, %2), %0        \n\t" /* Step output buffer */
>+    "  jc          1b                  \n\t" /* Loop back if we are not done */
>+      : "+r" (ReconPtr),
>+        "+r" (ChangePtr) /* stepped inside the loop, so it must be a read-write operand */
>+      : "r" (LineStep)
>+      : "memory", "edi", "cc"
>+  );
>+}
>+
>+static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
>+		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
>+{
>+  __asm__ __volatile__ (
>+    "  .balign 16                      \n\t"
>+    /* Inter reconstruction: per row, ReconPtr[0..7] = clamp to 0..255 of RefPtr[0..7] + ChangePtr[0..7]. */
>+    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to zero-extend bytes to words */
>+    "  lea         128(%1), %%edi      \n\t" /* end of the change buffer (8 rows * 16 bytes) */
>+    /* RefPtr and ReconPtr both advance by LineStep bytes per row. */
>+    "1:                                \n\t"
>+    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
>+
>+    "  movq        (%1), %%mm4         \n\t" /* first 4 changes */
>+    "  movq        %%mm2, %%mm3        \n\t"
>+    "  movq        8(%1), %%mm5        \n\t" /* last 4 changes */
>+    "  punpcklbw   %%mm0, %%mm2        \n\t" /* turn first 4 refs into positive 16-bit #s */
>+    "  paddsw      %%mm4, %%mm2        \n\t" /* add in first 4 changes */
>+    "  punpckhbw   %%mm0, %%mm3        \n\t" /* turn last 4 refs into positive 16-bit #s */
>+    "  paddsw      %%mm5, %%mm3        \n\t" /* add in last 4 changes */
>+    "  add         %3, %2              \n\t" /* next row of reference pixels */
>+    "  packuswb    %%mm3, %%mm2        \n\t" /* pack result to unsigned 8-bit values */
>+    "  lea         16(%1), %1          \n\t" /* next row of changes */
>+    "  cmp         %%edi, %1            \n\t" /* are we done? */
>+    /* movq and lea below leave the flags from cmp intact for the jc */
>+    "  movq        %%mm2, (%0)         \n\t" /* store result */
>+
>+    "  lea         (%0, %3), %0        \n\t" /* next row of output */
>+    "  jc          1b                  \n\t" /* loop while ChangePtr is below the end address */
>+      : "+r" (ReconPtr),
>+        "+r" (ChangePtr), /* both pointers below are stepped by the asm, */
>+        "+r" (RefPtr)     /* so they must be read-write operands */
>+      : "r" (LineStep)
>+      : "memory", "edi", "cc"
>+  );
>+}
>+
>+static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
>+		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
>+			   ogg_uint32_t LineStep)
>+{
>+  __asm__ __volatile__ (
>+    "  .balign 16                      \n\t"
>+    /* Half-pel inter reconstruction: per row, ReconPtr = pack to 0..255 of ((RefPtr1 + RefPtr2) >> 1) + ChangePtr. */
>+    "  pxor        %%mm0, %%mm0        \n\t" /* mm0 = 0, used to zero-extend bytes to words */
>+    "  lea         128(%1), %%edi      \n\t" /* end of the change buffer (8 rows * 16 bytes) */
>+    /* Both references and the output advance by LineStep bytes per row. */
>+    "1:                                \n\t"
>+    "  movq        (%2), %%mm2         \n\t" /* (+3 misaligned) 8 reference pixels */
>+    "  movq        (%3), %%mm4         \n\t" /* (+3 misaligned) 8 reference pixels */
>+
>+    "  movq        %%mm2, %%mm3        \n\t"
>+    "  punpcklbw   %%mm0, %%mm2        \n\t" /* mm2 = start ref1 as positive 16-bit #s */
>+    "  movq        %%mm4, %%mm5        \n\t"
>+    "  movq        (%1), %%mm6         \n\t" /* first 4 changes */
>+    "  punpckhbw   %%mm0, %%mm3        \n\t" /* mm3 = end ref1 as positive 16-bit #s */
>+    "  movq        8(%1), %%mm7        \n\t" /* last 4 changes */
>+    "  punpcklbw   %%mm0, %%mm4        \n\t" /* mm4 = start ref2 as positive 16-bit #s */
>+    "  punpckhbw   %%mm0, %%mm5        \n\t" /* mm5 = end ref2 as positive 16-bit #s */
>+    "  paddw       %%mm4, %%mm2        \n\t" /* mm2 = start (ref1 + ref2) */
>+    "  paddw       %%mm5, %%mm3        \n\t" /* mm3 = end (ref1 + ref2) */
>+    "  psrlw       $1, %%mm2           \n\t" /* mm2 = start (ref1 + ref2)/2 */
>+    "  psrlw       $1, %%mm3           \n\t" /* mm3 = end (ref1 + ref2)/2 */
>+    "  paddw       %%mm6, %%mm2        \n\t" /* add changes to start */
>+    "  paddw       %%mm7, %%mm3        \n\t" /* add changes to end */
>+    "  lea         16(%1), %1          \n\t" /* next row of changes */
>+    "  packuswb    %%mm3, %%mm2        \n\t" /* pack start|end to unsigned 8-bit */
>+    "  add         %4, %2              \n\t" /* next row of reference pixels */
>+    "  add         %4, %3              \n\t" /* next row of reference pixels */
>+    "  movq        %%mm2, (%0)         \n\t" /* store result */
>+    "  add         %4, %0              \n\t" /* next row of output */
>+    "  cmp         %%edi, %1           \n\t" /* are we done? */
>+    "  jc          1b                  \n\t" /* loop while ChangePtr is below the end address */
>+      : "+r" (ReconPtr),
>+        "+r" (ChangePtr), /* the three pointers below are stepped by the asm, */
>+        "+r" (RefPtr1),   /* so they must be read-write operands */
>+        "+r" (RefPtr2)
>+      : "m" (LineStep)
>+      : "memory", "edi", "cc"
>+  );
>+}
>+
>+void dsp_i386_mmx_recon_init(DspFunctions *funcs)
>+{ /* Point the four reconstruction slots of *funcs at the MMX routines defined in this file. */
>+  funcs->copy8x8 = copy8x8__mmx;                         /* 8x8 byte block copy */
>+  funcs->recon_intra8x8 = recon_intra8x8__mmx;           /* changes only, offset by 128 */
>+  funcs->recon_inter8x8 = recon_inter8x8__mmx;           /* one reference + changes */
>+  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx; /* average of two references + changes */
>+}
>+
>diff -Naur libtheora-1.0alpha3/lib/Makefile.am libtheora-1.0alpha3.mmx/lib/Makefile.am
>--- libtheora-1.0alpha3/lib/Makefile.am	2003-06-15 02:56:42.000000000 +0200
>+++ libtheora-1.0alpha3.mmx/lib/Makefile.am	2004-10-06 17:48:22.510386296 +0200
>@@ -6,7 +6,8 @@
> 
> libtheora_la_SOURCES = encode.c hufftables.h quant_lookup.h \
> 	encoder_internal.h idct.c reconstruct.c block_inline.h \
>-	encoder_lookup.h mcomp.c scan.c blockmap.c misc_common.c \
>+	encoder_lookup.h cpu.c dsp.h dsp.c i386/dsp_mmx.c i386/dsp_mmxext.c \
>+	i386/recon_mmx.c i386/fdct_mmx.c mcomp.c scan.c blockmap.c misc_common.c \
> 	dct.c frarray.c pb.c dct_decode.c frinit.c pp.c dct_encode.c \
> 	huffman.c pp.h toplevel.c decode.c huffman.h quant.c \
> 	comment.c toplevel_lookup.h mcomp.h
>diff -Naur libtheora-1.0alpha3/lib/mcomp.c libtheora-1.0alpha3.mmx/lib/mcomp.c
>--- libtheora-1.0alpha3/lib/mcomp.c	2003-12-03 09:59:41.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/mcomp.c	2004-10-06 17:48:22.543381280 +0200
>@@ -17,6 +17,7 @@
> 
> #include <stdlib.h>
> #include <stdio.h>
>+#include "dsp.h"
> #include "encoder_internal.h"
> 
> /* Initialises motion compentsation. */
>@@ -100,161 +101,22 @@
>                           unsigned char * RefDataPtr1,
>                           unsigned char * RefDataPtr2,
>                           ogg_uint32_t PixelsPerLine ) {
>-  ogg_uint32_t  i;
>-  ogg_int32_t   XSum=0;
>-  ogg_int32_t   XXSum=0;
>   ogg_int32_t   DiffVal;
>-  ogg_int32_t   AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2));
>+  ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
>+  ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
> 
>   /* Mode of interpolation chosen based upon on the offset of the
>      second reference pointer */
>-  if ( AbsRefOffset == 0 ) {
>-    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
>-      DiffVal = ((int)NewDataPtr[0]) - (int)RefDataPtr1[0];
>-      XSum += DiffVal;
>-
>-      /* negative array indexes are strictly forbidden by ANSI C and C99 */
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[1]) - (int)RefDataPtr1[1];
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[2]) - (int)RefDataPtr1[2];
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[3]) - (int)RefDataPtr1[3];
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[4]) - (int)RefDataPtr1[4];
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[5]) - (int)RefDataPtr1[5];
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[6]) - (int)RefDataPtr1[6];
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[7]) - (int)RefDataPtr1[7];
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      /* Step to next row of block. */
>-      NewDataPtr += PixelsPerLine;
>-      RefDataPtr1 += STRIDE_EXTRA + PixelsPerLine;
>-    }
>-
>+  if ( RefOffset == 0 ) {
>+    DiffVal = dsp_static_inter8x8_err (NewDataPtr, PixelsPerLine,
>+		          RefDataPtr1, RefPixelsPerLine);
>   }else{
>-
>-    /* Simple two reference interpolation */
>-    for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
>-      DiffVal = ((int)NewDataPtr[0]) -
>-        (((int)RefDataPtr1[0] + (int)RefDataPtr2[0]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[1]) -
>-        (((int)RefDataPtr1[1] + (int)RefDataPtr2[1]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[2]) -
>-        (((int)RefDataPtr1[2] + (int)RefDataPtr2[2]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[3]) -
>-        (((int)RefDataPtr1[3] + (int)RefDataPtr2[3]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[4]) -
>-        (((int)RefDataPtr1[4] + (int)RefDataPtr2[4]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[5]) -
>-        (((int)RefDataPtr1[5] + (int)RefDataPtr2[5]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[6]) -
>-        (((int)RefDataPtr1[6] + (int)RefDataPtr2[6]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      DiffVal = ((int)NewDataPtr[7]) -
>-        (((int)RefDataPtr1[7] + (int)RefDataPtr2[7]) / 2);
>-      XSum += DiffVal;
>-      XXSum += DiffVal*DiffVal;
>-
>-      /* Step to next row of block. */
>-      NewDataPtr += PixelsPerLine;
>-      RefDataPtr1 += STRIDE_EXTRA+PixelsPerLine;
>-      RefDataPtr2 += STRIDE_EXTRA+PixelsPerLine;
>-    }
>+    DiffVal = dsp_static_inter8x8_err_xy2 (NewDataPtr, PixelsPerLine,
>+		          RefDataPtr1, 
>+		          RefDataPtr2, RefPixelsPerLine);
>   }
> 
>   /* Compute and return population variance as mis-match metric. */
>-  return (( (XXSum<<6) - XSum*XSum ));
>-}
>-
>-static ogg_uint32_t GetSumAbsDiffs  (unsigned char * NewDataPtr,
>-                              unsigned char  * RefDataPtr,
>-                              ogg_uint32_t PixelsPerLine,
>-                              ogg_uint32_t ErrorSoFar) {
>-  ogg_uint32_t  i;
>-  ogg_uint32_t  DiffVal = ErrorSoFar;
>-
>-  /* Decide on standard or MMX implementation */
>-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
>-    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
>-    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
>-    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
>-    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
>-    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
>-    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
>-    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
>-    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
>-
>-    /* Step to next row of block. */
>-    NewDataPtr += PixelsPerLine;
>-    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
>-  }
>-
>-  return DiffVal;
>-}
>-
>-static ogg_uint32_t GetNextSumAbsDiffs (unsigned char * NewDataPtr,
>-                                 unsigned char * RefDataPtr,
>-                                 ogg_uint32_t PixelsPerLine,
>-                                 ogg_uint32_t ErrorSoFar,
>-                                 ogg_uint32_t BestSoFar ) {
>-  ogg_uint32_t  i;
>-  ogg_uint32_t  DiffVal = ErrorSoFar;
>-
>-  for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
>-    DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) );
>-    DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) );
>-    DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) );
>-    DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) );
>-    DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) );
>-    DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) );
>-    DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) );
>-    DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) );
>-
>-    if ( DiffVal > BestSoFar )break;
>-
>-    /* Step to next row of block. */
>-    NewDataPtr += PixelsPerLine;
>-    RefDataPtr += STRIDE_EXTRA+PixelsPerLine;
>-  }
>-
>   return DiffVal;
> }
> 
>@@ -265,118 +127,60 @@
>                                       ogg_uint32_t ErrorSoFar,
>                                       ogg_uint32_t BestSoFar ) {
> 
>-  ogg_uint32_t  i;
>   ogg_uint32_t  DiffVal = ErrorSoFar;
>   ogg_int32_t   RefOffset = (int)(RefDataPtr1 - RefDataPtr2);
>   ogg_uint32_t  RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA;
> 
>   if ( RefOffset == 0 ) {
>     /* Simple case as for non 0.5 pixel */
>-    DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine,
>-                               ErrorSoFar);
>+    DiffVal += dsp_static_sad8x8 (SrcData, PixelsPerLine, 
>+		               RefDataPtr1, RefPixelsPerLine);
>   } else  {
>-    for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) {
>-      DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] +
>-                                            (int)RefDataPtr2[0]) / 2) );
>-      DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] +
>-                                            (int)RefDataPtr2[1]) / 2) );
>-      DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] +
>-                                            (int)RefDataPtr2[2]) / 2) );
>-      DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] +
>-                                            (int)RefDataPtr2[3]) / 2) );
>-      DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] +
>-                                            (int)RefDataPtr2[4]) / 2) );
>-      DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] +
>-                                            (int)RefDataPtr2[5]) / 2) );
>-      DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] +
>-                                            (int)RefDataPtr2[6]) / 2) );
>-      DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] +
>-                                            (int)RefDataPtr2[7]) / 2) );
>-
>-      if ( DiffVal > BestSoFar ) break;
>-
>-      /* Step to next row of block. */
>-      SrcData += PixelsPerLine;
>-      RefDataPtr1 += RefPixelsPerLine;
>-      RefDataPtr2 += RefPixelsPerLine;
>-    }
>+    DiffVal += dsp_static_sad8x8_xy2_thres (SrcData, PixelsPerLine, 
>+		               RefDataPtr1, 
>+		               RefDataPtr2, RefPixelsPerLine, BestSoFar);
>   }
> 
>   return DiffVal;
> }
> 
>-static ogg_uint32_t GetIntraError (unsigned char * DataPtr,
>-                            ogg_uint32_t PixelsPerLine ) {
>-  ogg_uint32_t  i;
>-  ogg_uint32_t  XSum=0;
>-  ogg_uint32_t  XXSum=0;
>-  unsigned char *DiffPtr;
>-
>-  /* Loop expanded out for speed. */
>-  DiffPtr = DataPtr;
>-
>-  for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) {
>-
>-    /* Examine alternate pixel locations. */
>-    XSum += DiffPtr[0];
>-    XXSum += DiffPtr[0]*DiffPtr[0];
>-    XSum += DiffPtr[1];
>-    XXSum += DiffPtr[1]*DiffPtr[1];
>-    XSum += DiffPtr[2];
>-    XXSum += DiffPtr[2]*DiffPtr[2];
>-    XSum += DiffPtr[3];
>-    XXSum += DiffPtr[3]*DiffPtr[3];
>-    XSum += DiffPtr[4];
>-    XXSum += DiffPtr[4]*DiffPtr[4];
>-    XSum += DiffPtr[5];
>-    XXSum += DiffPtr[5]*DiffPtr[5];
>-    XSum += DiffPtr[6];
>-    XXSum += DiffPtr[6]*DiffPtr[6];
>-    XSum += DiffPtr[7];
>-    XXSum += DiffPtr[7]*DiffPtr[7];
>-
>-    /* Step to next row of block. */
>-    DiffPtr += PixelsPerLine;
>-  }
>-
>-  /* Compute population variance as mis-match metric. */
>-  return (( (XXSum<<6) - XSum*XSum ) );
>-}
>-
> ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex,
>                               ogg_uint32_t PixelsPerLine ) {
>   ogg_uint32_t  LocalFragIndex = FragIndex;
>   ogg_uint32_t  IntraError = 0;
> 
>+  dsp_static_save_fpu ();
>+
>   /* Add together the intra errors for those blocks in the macro block
>      that are coded (Y only) */
>   if ( cpi->pb.display_fragments[LocalFragIndex] )
>     IntraError +=
>-      GetIntraError(&cpi->
>+      dsp_static_intra8x8_err (&cpi->
>                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
>-                    PixelsPerLine );
>-
>+                    PixelsPerLine);
> 
>   LocalFragIndex++;
>   if ( cpi->pb.display_fragments[LocalFragIndex] )
>     IntraError +=
>-      GetIntraError(&cpi->
>+      dsp_static_intra8x8_err (&cpi->
>                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
>-                    PixelsPerLine );
>+                    PixelsPerLine);
> 
>   LocalFragIndex = FragIndex + cpi->pb.HFragments;
>   if ( cpi->pb.display_fragments[LocalFragIndex] )
>     IntraError +=
>-      GetIntraError(&cpi->
>+      dsp_static_intra8x8_err (&cpi->
>                      ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
>-                     PixelsPerLine );
>+                    PixelsPerLine);
> 
>   LocalFragIndex++;
>   if ( cpi->pb.display_fragments[LocalFragIndex] )
>     IntraError +=
>-      GetIntraError(&cpi->
>+      dsp_static_intra8x8_err (&cpi->
>                     ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]],
>-                    PixelsPerLine );
>+                    PixelsPerLine);
>+
>+  dsp_static_restore_fpu ();
> 
>   return IntraError;
> }
>@@ -400,6 +204,8 @@
>   unsigned char * SrcPtr1;
>   unsigned char * RefPtr1;
> 
>+  dsp_static_save_fpu ();
>+
>   /* Work out pixel offset into source buffer. */
>   PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex];
> 
>@@ -462,6 +268,9 @@
>     InterError += GetInterErr( SrcPtr1, RefPtr1,
>                                  &RefPtr1[RefPtr2Offset], PixelsPerLine );
>   }
>+
>+  dsp_static_restore_fpu ();
>+
>   return InterError;
> }
> 
>@@ -496,6 +305,8 @@
>   unsigned char * RefDataPtr1;
>   unsigned char * RefDataPtr2;
> 
>+  dsp_static_save_fpu ();
>+
>   /* Note which of the four blocks in the macro block are to be
>      included in the search. */
>   MBlockDispFrags[0] =
>@@ -518,20 +329,20 @@
> 
>   /* Check the 0,0 candidate. */
>   if ( MBlockDispFrags[0] ) {
>-    Error = GetSumAbsDiffs( SrcPtr[0], RefPtr,
>-                         PixelsPerLine, Error);
>+    Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, RefPtr,
>+                         PixelsPerLine + STRIDE_EXTRA);
>   }
>   if ( MBlockDispFrags[1] ) {
>-    Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8,
>-                         PixelsPerLine, Error);
>+    Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, RefPtr + 8,
>+                         PixelsPerLine + STRIDE_EXTRA);
>   }
>   if ( MBlockDispFrags[2] ) {
>-    Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset,
>-                         PixelsPerLine, Error);
>+    Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset,
>+                         PixelsPerLine + STRIDE_EXTRA);
>   }
>   if ( MBlockDispFrags[3] ) {
>-    Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8,
>-                         PixelsPerLine, Error);
>+    Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8,
>+                         PixelsPerLine + STRIDE_EXTRA);
>   }
> 
>   /* Set starting values to results of 0, 0 vector. */
>@@ -554,24 +365,23 @@
> 
>       /* Get the score for the current offset */
>       if ( MBlockDispFrags[0] ) {
>-        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
>-                             PixelsPerLine, Error);
>+        Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
>+                             PixelsPerLine + STRIDE_EXTRA);
>       }
> 
>       if ( MBlockDispFrags[1] && (Error < MinError) ) {
>-        Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
>-                                 PixelsPerLine, Error, MinError );
>+        Error += dsp_static_sad8x8_thres (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
>+                             PixelsPerLine + STRIDE_EXTRA, MinError);
>       }
> 
>       if ( MBlockDispFrags[2] && (Error < MinError) ) {
>-        Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
>-                                 PixelsPerLine, Error, MinError );
>+        Error += dsp_static_sad8x8_thres (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
>+                             PixelsPerLine + STRIDE_EXTRA, MinError);
>       }
> 
>       if ( MBlockDispFrags[3] && (Error < MinError) ) {
>-        Error = GetNextSumAbsDiffs( SrcPtr[3],
>-                                 CandidateBlockPtr + RefRow2Offset + 8,
>-                                 PixelsPerLine, Error, MinError );
>+        Error += dsp_static_sad8x8_thres (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
>+                             PixelsPerLine + STRIDE_EXTRA, MinError);
>       }
> 
>       if ( Error < MinError ) {
>@@ -652,6 +462,8 @@
>   InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
>                                   FragIndex, MV->x, MV->y, PixelsPerLine );
> 
>+  dsp_static_restore_fpu ();
>+
>   /* Return score of best matching block. */
>   return InterMVError;
> }
>@@ -684,6 +496,8 @@
>   unsigned char * RefDataPtr1;
>   unsigned char * RefDataPtr2;
> 
>+  dsp_static_save_fpu ();
>+
>   /* Note which of the four blocks in the macro block are to be
>      included in the search. */
>   MBlockDispFrags[0] = cpi->
>@@ -717,20 +531,20 @@
> 
>       /* Summ errors for each block. */
>       if ( MBlockDispFrags[0] ) {
>-        Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr,
>-                             PixelsPerLine, Error);
>+        Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr,
>+                             PixelsPerLine + STRIDE_EXTRA);
>       }
>       if ( MBlockDispFrags[1] ){
>-        Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8,
>-                             PixelsPerLine, Error);
>+        Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8,
>+                             PixelsPerLine + STRIDE_EXTRA);
>       }
>       if ( MBlockDispFrags[2] ){
>-        Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset,
>-                             PixelsPerLine, Error);
>+        Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset,
>+                             PixelsPerLine + STRIDE_EXTRA);
>       }
>       if ( MBlockDispFrags[3] ){
>-        Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8,
>-                             PixelsPerLine, Error);
>+        Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8,
>+                             PixelsPerLine + STRIDE_EXTRA);
>       }
> 
>       /* Was this the best so far */
>@@ -808,6 +622,8 @@
>   InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr,
>                                   FragIndex, MV->x, MV->y, PixelsPerLine );
> 
>+  dsp_static_restore_fpu ();
>+
>   /* Return score of best matching block. */
>   return InterMVError;
> }
>@@ -850,8 +666,8 @@
> 
>     for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){
>       /* Get the block error score. */
>-      Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr,
>-                           PixelsPerLine, 0);
>+      Error = dsp_static_sad8x8 (SrcPtr, PixelsPerLine, CandidateBlockPtr,
>+                             PixelsPerLine + STRIDE_EXTRA);
> 
>       /* Was this the best so far */
>       if ( Error < MinError ) {
>@@ -911,6 +727,8 @@
>                                         MOTION_VECTOR *MV ) {
>   ogg_uint32_t  InterMVError;
> 
>+  dsp_static_save_fpu ();
>+
>   /* For the moment the 4MV mode is only deemd to be valid if all four
>      Y blocks are to be updated */
>   /* This May be adapted later. */
>@@ -941,6 +759,8 @@
>     InterMVError = HUGE_ERROR;
>   }
> 
>+  dsp_static_restore_fpu ();
>+
>   /* Return score of best matching block. */
>   return InterMVError;
> }
>diff -Naur libtheora-1.0alpha3/lib/pp.c libtheora-1.0alpha3.mmx/lib/pp.c
>--- libtheora-1.0alpha3/lib/pp.c	2003-12-03 09:59:41.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/pp.c	2004-10-06 17:48:22.545380976 +0200
>@@ -19,6 +19,7 @@
> #include <string.h>
> #include "encoder_internal.h"
> #include "pp.h"
>+#include "dsp.h"
> 
> #define MAX(a, b) ((a>b)?a:b)
> #define MIN(a, b) ((a<b)?a:b)
>@@ -490,7 +491,7 @@
> 
>       } else {
> 
>-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
>+        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
> 
>       }
> 
>@@ -529,7 +530,7 @@
>         DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
>                         LineLength,Quality,QuantScale);
>       }else{
>-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
>+        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
>       }
> 
>       ++Block;
>@@ -565,7 +566,7 @@
>         DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col,
>                         LineLength,Quality,QuantScale);
>       }else{
>-        CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
>+        dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength);
>       }
> 
>       ++Block;
>@@ -913,7 +914,7 @@
> }
> 
> void PostProcess(PB_INSTANCE *pbi){
>-
>+  dsp_static_save_fpu ();
>   switch (pbi->PostProcessingLevel){
>   case 8:
>     /* on a slow machine, use a simpler and faster deblocking filter */
>@@ -947,5 +948,6 @@
>     DeringFrame(pbi, pbi->PostProcessBuffer, pbi->PostProcessBuffer);
>     break;
>   }
>+  dsp_static_restore_fpu ();
> }
> 
>diff -Naur libtheora-1.0alpha3/lib/reconstruct.c libtheora-1.0alpha3.mmx/lib/reconstruct.c
>--- libtheora-1.0alpha3/lib/reconstruct.c	2003-12-03 09:59:41.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/reconstruct.c	2004-10-06 17:48:22.574376568 +0200
>@@ -16,12 +16,28 @@
>  ********************************************************************/
> 
> #include "encoder_internal.h"
>+#include "dsp.h"
>+#include "cpu.h"
> 
>-void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr,
>-                 ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
>+static void copy8x8__c (unsigned char *src,   /* first byte of the block to read */
>+	                unsigned char *dest,  /* first byte of the block to write */
>+	                unsigned int stride)  /* added to BOTH pointers after each row */
>+{
>+  int j;
>+  for ( j = 0; j < 8; j++ ){  /* one iteration per row, 8 rows */
>+    ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];  /* row is copied as two ogg_uint32_t words */
>+    ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];  /* NOTE(review): presumably needs src/dest aligned for ogg_uint32_t -- confirm */
>+    src+=stride;
>+    dest+=stride;
>+  }
>+}
>+
>+static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
>+		      ogg_uint32_t LineStep)
>+{
>   ogg_uint32_t i;
> 
>-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
>+  for (i = 8; i; i--){
>     /* Convert the data back to 8 bit unsigned */
>     /* Saturate the output to unsigend 8 bit values */
>     ReconPtr[0] = clamp255( ChangePtr[0] + 128 );
>@@ -34,17 +50,16 @@
>     ReconPtr[7] = clamp255( ChangePtr[7] + 128 );
> 
>     ReconPtr += LineStep;
>-    ChangePtr += BLOCK_HEIGHT_WIDTH;
>+    ChangePtr += 8;
>   }
>-
> }
> 
>-void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr,
>-                 unsigned char * RefPtr, ogg_int16_t * ChangePtr,
>-                 ogg_uint32_t LineStep ) {
>+static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr,
>+		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
>+{
>   ogg_uint32_t i;
> 
>-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) {
>+  for (i = 8; i; i--){
>     ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]);
>     ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]);
>     ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]);
>@@ -54,19 +69,19 @@
>     ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]);
>     ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]);
> 
>-    ChangePtr += BLOCK_HEIGHT_WIDTH;
>+    ChangePtr += 8;
>     ReconPtr += LineStep;
>     RefPtr += LineStep;
>   }
>-
> }
> 
>-void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr,
>-                           unsigned char * RefPtr1, unsigned char * RefPtr2,
>-                           ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) {
>+static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1,
>+		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
>+			   ogg_uint32_t LineStep)
>+{
>   ogg_uint32_t  i;
> 
>-  for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){
>+  for (i = 8; i; i--){
>     ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] );
>     ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] );
>     ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] );
>@@ -76,10 +91,20 @@
>     ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] );
>     ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] );
> 
>-    ChangePtr += BLOCK_HEIGHT_WIDTH;
>+    ChangePtr += 8;
>     ReconPtr += LineStep;
>     RefPtr1 += LineStep;
>     RefPtr2 += LineStep;
>   }
>+}
> 
>+void dsp_recon_init (DspFunctions *funcs)  /* fill *funcs with the copy/recon routines */
>+{
>+  funcs->copy8x8 = copy8x8__c;  /* portable C versions are installed first */
>+  funcs->recon_intra8x8 = recon_intra8x8__c;
>+  funcs->recon_inter8x8 = recon_inter8x8__c;
>+  funcs->recon_inter8x8_half = recon_inter8x8_half__c;
>+  if (cpu_flags & CPU_X86_MMX) {  /* MMX reported: let the i386 code override entries */
>+    dsp_i386_mmx_recon_init(funcs);  /* same table the caller passed, not the global dsp_funcs */
>+  }
> }
>diff -Naur libtheora-1.0alpha3/lib/scan.c libtheora-1.0alpha3.mmx/lib/scan.c
>--- libtheora-1.0alpha3/lib/scan.c	2003-12-03 09:59:41.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/scan.c	2004-10-06 17:48:22.609371248 +0200
>@@ -19,9 +19,20 @@
> #include <math.h>
> #include <string.h>
> #include "encoder_internal.h"
>+#include "dsp.h"
> 
> #define MAX_SEARCH_LINE_LEN                   7
> 
>+#define SET8_0(ptr) do { /* zero 8 bytes at ptr; do/while(0) makes this ONE statement, safe under an unbraced if */ \
>+  ((ogg_uint32_t *)(ptr))[0] = 0x00000000; \
>+  ((ogg_uint32_t *)(ptr))[1] = 0x00000000; } while (0)
>+#define SET8_1(ptr) do { /* set each of the 8 bytes at ptr to 0x01 (single statement) */ \
>+  ((ogg_uint32_t *)(ptr))[0] = 0x01010101; \
>+  ((ogg_uint32_t *)(ptr))[1] = 0x01010101; } while (0)
>+#define SET8_8(ptr) do { /* set each of the 8 bytes at ptr to 0x08 (single statement) */ \
>+  ((ogg_uint32_t *)(ptr))[0] = 0x08080808; \
>+  ((ogg_uint32_t *)(ptr))[1] = 0x08080808; } while (0)
>+
> static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = {
>   0, 0, 0, 0, 2, 4, 12, 24
> };
>@@ -384,69 +395,6 @@
>   ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4));
> }
> 
>-static ogg_uint32_t ScalarRowSAD( unsigned char * Src1,
>-                                  unsigned char * Src2 ){
>-  ogg_uint32_t SadValue;
>-  ogg_uint32_t SadValue1;
>-
>-  SadValue    = abs( Src1[0] - Src2[0] ) + abs( Src1[1] - Src2[1] ) +
>-    abs( Src1[2] - Src2[2] ) + abs( Src1[3] - Src2[3] );
>-
>-  SadValue1   = abs( Src1[4] - Src2[4] ) + abs( Src1[5] - Src2[5] ) +
>-    abs( Src1[6] - Src2[6] ) + abs( Src1[7] - Src2[7] );
>-
>-  SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1;
>-
>-  return SadValue;
>-}
>-
>-static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi,
>-                           unsigned char * Src1,
>-                           unsigned char * Src2 ){
>-  ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0};
>-  ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0};
>-  ogg_uint32_t MaxSad = 0;
>-  ogg_uint32_t i;
>-
>-  for ( i = 0; i < 4; i++ ){
>-    SadValue[0] += abs(Src1[0] - Src2[0]);
>-    SadValue[1] += abs(Src1[1] - Src2[1]);
>-    SadValue[2] += abs(Src1[2] - Src2[2]);
>-    SadValue[3] += abs(Src1[3] - Src2[3]);
>-    SadValue[4] += abs(Src1[4] - Src2[4]);
>-    SadValue[5] += abs(Src1[5] - Src2[5]);
>-    SadValue[6] += abs(Src1[6] - Src2[6]);
>-    SadValue[7] += abs(Src1[7] - Src2[7]);
>-
>-    Src1 += ppi->PlaneStride;
>-    Src2 += ppi->PlaneStride;
>-  }
>-
>-  for ( i = 0; i < 4; i++ ){
>-    SadValue2[0] += abs(Src1[0] - Src2[0]);
>-    SadValue2[1] += abs(Src1[1] - Src2[1]);
>-    SadValue2[2] += abs(Src1[2] - Src2[2]);
>-    SadValue2[3] += abs(Src1[3] - Src2[3]);
>-    SadValue2[4] += abs(Src1[4] - Src2[4]);
>-    SadValue2[5] += abs(Src1[5] - Src2[5]);
>-    SadValue2[6] += abs(Src1[6] - Src2[6]);
>-    SadValue2[7] += abs(Src1[7] - Src2[7]);
>-
>-    Src1 += ppi->PlaneStride;
>-    Src2 += ppi->PlaneStride;
>-  }
>-
>-  for ( i = 0; i < 8; i++ ){
>-    if ( SadValue[i] > MaxSad )
>-      MaxSad = SadValue[i];
>-    if ( SadValue2[i] > MaxSad )
>-      MaxSad = SadValue2[i];
>-  }
>-
>-  return MaxSad;
>-}
>-
>-
> static int RowSadScan( PP_INSTANCE *ppi,
>                        unsigned char * YuvPtr1,
>                        unsigned char * YuvPtr2,
>@@ -475,7 +423,7 @@
>     for ( i = 0; i < ppi->PlaneHFragments; i ++ ){
>       if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
>         /* Calculate the SAD score for the block row */
>-        GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2);
>+        GrpSad = dsp_static_row_sad8(LocalYuvPtr1,LocalYuvPtr2);
> 
>         /* Now test the group SAD score */
>         if ( GrpSad > LocalGrpLowSadThresh ){
>@@ -532,7 +480,7 @@
>     /* Skip if block already marked to be coded. */
>     if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){
>       /* Calculate the SAD score for the block column */
>-      MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 );
>+      MaxSad = dsp_static_col_sad8x8(LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride );
> 
>       /* Now test the group SAD score */
>       if ( MaxSad > LocalGrpLowSadThresh ){
>@@ -758,7 +706,7 @@
>       if (*DispFragPtr == CANDIDATE_BLOCK){
> 
>         /* Clear down entries in changed locals array */
>-        memset(ChLocalsPtr,0,8);
>+        SET8_0(ChLocalsPtr);
> 
>         for ( j = 0; j < HFRAGPIXELS; j++ ){
>           /* Take a local copy of the measured difference. */
>@@ -777,10 +725,10 @@
>       }else{
>         /* If we are breaking out here mark all pixels as changed. */
>         if ( *DispFragPtr > BLOCK_NOT_CODED ){
>-          memset(bits_map_ptr,1,8);
>-          memset(ChLocalsPtr,8,8);
>+          SET8_1(bits_map_ptr);
>+          SET8_8(ChLocalsPtr);
>         }else{
>-          memset(ChLocalsPtr,0,8);
>+          SET8_0(ChLocalsPtr);
>         }
>       }
> 
>@@ -816,7 +764,7 @@
>     /* Test for break out conditions to save time. */
>     if (*DispFragPtr == CANDIDATE_BLOCK){
>       /* Clear down entries in changed locals array */
>-      memset(ChLocalsPtr,0,8);
>+      SET8_0(ChLocalsPtr);
> 
>       for ( j = 0; j < HFRAGPIXELS; j++ ){
>         /* Take a local copy of the measured difference. */
>@@ -839,10 +787,10 @@
>     }else{
>       /* If we are breaking out here mark all pixels as changed. */
>       if ( *DispFragPtr > BLOCK_NOT_CODED ){
>-        memset(bits_map_ptr,1,8);
>-        memset(ChLocalsPtr,8,8);
>+        SET8_1(bits_map_ptr);
>+        SET8_8(ChLocalsPtr);
>       }else{
>-        memset(ChLocalsPtr,0,8);
>+        SET8_0(ChLocalsPtr);
>       }
>     }
> 
>@@ -876,7 +824,7 @@
>       /* Test for break out conditions to save time. */
>       if (*DispFragPtr == CANDIDATE_BLOCK){
>         /* Clear down entries in changed locals array */
>-        memset(ChLocalsPtr,0,8);
>+        SET8_0(ChLocalsPtr);
>         for ( j = 0; j < HFRAGPIXELS; j++ ){
>           /* Take a local copy of the measured difference. */
>           Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j];
>@@ -899,10 +847,10 @@
>       }else{
>         /* If we are breaking out here mark all pixels as changed. */
>         if ( *DispFragPtr > BLOCK_NOT_CODED ){
>-          memset(bits_map_ptr,1,8);
>-          memset(ChLocalsPtr,8,8);
>+          SET8_1(bits_map_ptr);
>+          SET8_8(ChLocalsPtr);
>         }else{
>-          memset(ChLocalsPtr,0,8);
>+          SET8_0(ChLocalsPtr);
>         }
>       }
> 
>@@ -935,7 +883,7 @@
>     /* Test for break out conditions to save time. */
>     if (*DispFragPtr == CANDIDATE_BLOCK){
>       /* Clear down entries in changed locals array */
>-      memset(ChLocalsPtr,0,8);
>+      SET8_0(ChLocalsPtr);
> 
>       for ( j = 0; j < HFRAGPIXELS; j++ ){
>         /* Take a local copy of the measured difference. */
>@@ -959,10 +907,10 @@
>     }else{
>       /* If we are breaking out here mark all pixels as changed.*/
>       if ( *DispFragPtr > BLOCK_NOT_CODED ) {
>-          memset(bits_map_ptr,1,8);
>-          memset(ChLocalsPtr,8,8);
>+          SET8_1(bits_map_ptr);
>+          SET8_8(ChLocalsPtr);
>         }else{
>-          memset(ChLocalsPtr,0,8);
>+          SET8_0(ChLocalsPtr);
>         }
>     }
>     /* If we have a lot of changed pixels for this fragment on this
>@@ -1071,7 +1019,7 @@
>         }
>       }else{
>         if ( *DispFragPtr > BLOCK_NOT_CODED )
>-          memset(ChLocalsPtr,0,8);
>+          SET8_0(ChLocalsPtr);
> 
>         /* Step pointers */
>         ChLocalsPtr += HFRAGPIXELS;
>@@ -1133,7 +1081,7 @@
>         }
>       }else{
>         if ( *DispFragPtr > BLOCK_NOT_CODED )
>-          memset(ChLocalsPtr,0,8);
>+          SET8_0(ChLocalsPtr);
> 
>         /* Step pointers */
>         ChLocalsPtr += HFRAGPIXELS;
>@@ -2126,10 +2074,12 @@
>     /* Fast break out test for obvious yes and no cases in this row of
>        blocks */
>     if ( i < ppi->PlaneVFragments ){
>+      dsp_static_save_fpu ();
>       UpdatedOrCandidateBlocks =
>         RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
>-      if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) )
>-        UpdatedOrCandidateBlocks = 1;
>+      UpdatedOrCandidateBlocks |=
>+        ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 );
>+      dsp_static_restore_fpu ();
>     }else{
>       /* Make sure we still call other functions if RowSadScan() disabled */
>       UpdatedOrCandidateBlocks = 1;
>diff -Naur libtheora-1.0alpha3/lib/toplevel.c libtheora-1.0alpha3.mmx/lib/toplevel.c
>--- libtheora-1.0alpha3/lib/toplevel.c	2004-03-18 03:00:30.000000000 +0100
>+++ libtheora-1.0alpha3.mmx/lib/toplevel.c	2004-10-06 17:48:22.611370944 +0200
>@@ -787,6 +787,8 @@
> 
>   CP_INSTANCE *cpi;
> 
>+  dsp_static_init ();
>+
>   memset(th, 0, sizeof(*th));
>   th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
> 
>@@ -1446,6 +1448,8 @@
>   PB_INSTANCE *pbi;
>   codec_setup_info *ci;
> 
>+  dsp_static_init ();
>+
>   ci=(codec_setup_info *)c->codec_setup;
>   th->internal_decode=pbi=_ogg_calloc(1,sizeof(*pbi));
>

Actions: View | Diff

Attachments on bug 68549: 42417