/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $

 ********************************************************************/
17 |
|
18 |
#include <stdlib.h>
#include "dsp.h"

/* The constant 128 replicated in each of the four 16-bit lanes of a
   64-bit quadword; loaded with movq in sub8x8_128__mmx to re-center
   unsigned pixel data.  aligned(8) keeps the movq load aligned; `used`
   stops the compiler discarding it (it is referenced only from asm). */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128w = 0x0080008000800080LL;

/* M(a): spell a global symbol name the way the target assembler expects.
   a.out-style targets (MinGW, Cygwin, OS/2, non-ELF OpenBSD) prefix C
   symbols with an underscore; ELF targets do not. */
#if defined(__MINGW32__) || defined(__CYGWIN__) || \
    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
# define M(a) "_" #a
#else
# define M(a) #a
#endif

/* Scalar reference helpers (apparently unused in this file; presumably
   kept for parity with the portable dsp implementations -- TODO confirm). */
#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
33 |
|
34 |
/* DCT input stage: FiltPtr - ReconPtr for an 8x8 block.
 * Reads 8 rows of 8 unsigned bytes from each source (advancing by the
 * respective per-row strides), widens to 16 bits, and writes the 64
 * signed 16-bit differences consecutively to DctInputPtr.
 * The unrolled loop body (.rept 8) processes one full row per pass. */
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
                  ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0; used to zero-extend bytes */

    ".rept 8 \n\t"
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr), low 4 pixels */
    " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr), low 4 pixels */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr), high 4 pixels */
    " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr), high 4 pixels */
    /* start calculation */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
    " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
    " movq %%mm0, (%2) \n\t" /* write answer out */
    " movq %%mm2, 8(%2) \n\t" /* write answer out */
    /* Increment pointers: 16 bytes = 8 output words per row */
    " add $16, %2 \n\t"
    " add %3, %0 \n\t"
    " add %4, %1 \n\t"
    ".endr \n\t"

    : "+r" (FiltPtr),
      "+r" (ReconPtr),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
72 |
|
73 |
/* DCT input stage for intra blocks: FiltPtr - 128 for an 8x8 block.
 * Widens 8 rows of 8 unsigned bytes to 16 bits, subtracts the constant
 * 128 (loaded from V128w) to re-center around zero, and writes the 64
 * signed 16-bit results consecutively to DctInputPtr. */
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
                      ogg_uint32_t PixelsPerLine)
{
  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0; used to zero-extend bytes */
    " movq "M(V128w)", %%mm1 \n\t" /* mm1 = four words of 0x0080 (128) */

    ".rept 8 \n\t"
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr), low 4 pixels */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr), high 4 pixels */
    /* start calculation */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
    " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
    " movq %%mm0, (%1) \n\t" /* write answer out */
    " movq %%mm2, 8(%1) \n\t" /* write answer out */
    /* Increment pointers: 16 bytes = 8 output words per row */
    " add $16, %1 \n\t"
    " add %2, %0 \n\t"
    ".endr \n\t"

    : "+r" (FiltPtr),
      "+r" (DctInputPtr)
    : "r" (PixelsPerLine)
    : "memory"
  );
}
104 |
|
105 |
/* DCT input stage for half-pel prediction: FiltPtr minus the average of
 * two reconstruction blocks.  The two reference rows are widened to 16
 * bits, added, and halved with a logical right shift (truncating, i.e.
 * floor((a+b)/2)), then subtracted from the widened source row.  Writes
 * 64 signed 16-bit values consecutively to DctInputPtr. */
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
                      unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
                      ogg_uint32_t PixelsPerLine,
                      ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0; used to zero-extend bytes */

    ".rept 8 \n\t"
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
    " movq (%2), %%mm4 \n\t" /* mm4 = ReconPtr2 */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
    " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr), low 4 pixels */
    " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1), low 4 pixels */
    " punpcklbw %%mm7, %%mm4 \n\t" /* mm4 = INT16(ReconPtr2), low 4 pixels */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr), high 4 pixels */
    " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1), high 4 pixels */
    " punpckhbw %%mm7, %%mm5 \n\t" /* mm5 = INT16(ReconPtr2), high 4 pixels */
    /* average ReconPtr1 and ReconPtr2 */
    " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
    " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
    " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
    " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
    " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
    " movq %%mm0, (%3) \n\t" /* write answer out */
    " movq %%mm2, 8(%3) \n\t" /* write answer out */
    /* Increment pointers: 16 bytes = 8 output words per row */
    " add $16, %3 \n\t"
    " add %4, %0 \n\t"
    " add %5, %1 \n\t"
    " add %5, %2 \n\t"
    ".endr \n\t"

    : "+r" (FiltPtr),
      "+r" (ReconPtr1),
      "+r" (ReconPtr2),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
154 |
|
155 |
/* Row SAD of one 8-pixel row: computes the per-byte absolute differences
 * of Src1 vs Src2, sums the low four and the high four pixels separately,
 * and returns the LARGER of the two 4-pixel sums (low 16 bits only).
 * The max fits in 16 bits since 4*255 = 1020, hence the final mask. */
static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t"

    /* |A - B| per byte via saturating subtraction in both directions:
       one of the two results is zero, the other the absolute difference */
    " movq %%mm0, %%mm2 \n\t"
    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */

    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* unpack low four bytes to higher precision */
    " punpckhbw %%mm7, %%mm1 \n\t" /* unpack high four bytes to higher precision */

    /* fold each register's 4 words down into its low word */
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"
    " psrlq $32, %%mm2 \n\t" /* fold and add */
    " psrlq $32, %%mm3 \n\t"
    " paddw %%mm2, %%mm0 \n\t"
    " paddw %%mm3, %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"
    " psrlq $16, %%mm2 \n\t"
    " psrlq $16, %%mm3 \n\t"
    " paddw %%mm2, %%mm0 \n\t"
    " paddw %%mm3, %%mm1 \n\t"

    /* unsigned word max via saturating subtract then add */
    " psubusw %%mm0, %%mm1 \n\t"
    " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
    " movd %%mm1, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word sum */

    : "=m" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    :
    : "memory"
  );
  return MaxSad;
}
203 |
|
204 |
/* Column SAD of an 8x8 block: per-column absolute differences are
 * accumulated separately for the top four rows (mm4/mm5) and the bottom
 * four rows (mm6/mm7); the return value is the largest single-column
 * 4-row sum over all eight columns and both halves (low 16 bits only).
 * Uses edi as the hand-managed loop counter (declared in clobbers). */
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
                              ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
    " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
    " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
    " mov $4, %%edi \n\t" /* 4 rows: top half of the block */
    "1: \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t" /* take 8 bytes */

    " movq %%mm0, %%mm2 \n\t"
    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
    " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
    " add %3, %1 \n\t" /* Inc pointer into the new data */
    " add %3, %2 \n\t" /* Inc pointer into the new data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    " mov $4, %%edi \n\t" /* 4 rows: bottom half of the block */
    "2: \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t" /* take 8 bytes */

    " movq %%mm0, %%mm2 \n\t"
    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
    " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
    " add %3, %1 \n\t" /* Inc pointer into the new data */
    " add %3, %2 \n\t" /* Inc pointer into the new data */

    " dec %%edi \n\t"
    " jnz 2b \n\t"

    /* unsigned per-word max via saturating subtract then add */
    " psubusw %%mm6, %%mm7 \n\t"
    " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
    " psubusw %%mm4, %%mm5 \n\t"
    " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
    " psubusw %%mm5, %%mm7 \n\t"
    " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
    /* horizontal max across the four word lanes of mm7 */
    " movq %%mm7, %%mm6 \n\t"
    " psrlq $32, %%mm6 \n\t"
    " psubusw %%mm6, %%mm7 \n\t"
    " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm6, mm7) */
    " movq %%mm7, %%mm6 \n\t"
    " psrlq $16, %%mm6 \n\t"
    " psubusw %%mm6, %%mm7 \n\t"
    " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm6, mm7) */
    " movd %%mm7, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word result */

    : "=r" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    : "r" (stride)
    : "memory", "edi"
  );

  return MaxSad;
}
285 |
|
286 |
/* Full 8x8 sum of absolute differences between two pixel blocks with
 * independent row strides.  The total fits in 16 bits (64*255 = 16320),
 * so word accumulation cannot overflow and the final fold masks to the
 * low 16 bits. */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                          unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    " .balign 16 \n\t"
    " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
    ".rept 8 \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"

    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
    " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " add %3, %1 \n\t" /* Inc pointer into the new data */
    " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
    " add %4, %2 \n\t" /* Inc pointer into ref data */
    ".endr \n\t"

    /* fold the four word lanes of mm7 into its low word */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $16, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movd %%mm7, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word total */

    : "=m" (DiffVal),
      "+r" (ptr1),
      "+r" (ptr2)
    : "r" (stride1),
      "r" (stride2)
    : "memory"
  );

  return DiffVal;
}
332 |
|
333 |
/* 8x8 SAD with an early-out threshold hint.
 * The MMX implementation has no cheap partial-sum early-termination path,
 * so `thres` is intentionally ignored and the full SAD is always computed;
 * callers only require that the return value be >= thres when the true SAD
 * is, which an exact SAD trivially satisfies. */
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                unsigned char *ptr2, ogg_uint32_t stride2,
                                ogg_uint32_t thres)
{
  (void)thres; /* unused by design -- silences unused-parameter warnings */
  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
}
339 |
|
340 |
/* 8x8 SAD of the source block against the byte-wise average of two
 * reference blocks (half-pel prediction).  The average is computed
 * without unpacking via the identity floor((a+b)/2) = (a & b) + ((a ^ b) >> 1),
 * where the xor term is masked with 0xfe per byte before the shift so no
 * bits leak between lanes.  `thres` is accepted for interface parity but
 * ignored (no early-out in this version).  edi is the loop counter. */
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                    unsigned char *RefDataPtr1,
                                    unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                    ogg_uint32_t thres)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
    " paddb %%mm5, %%mm5 \n\t"

    " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */

    " movq (%2), %%mm2 \n\t"
    " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
    " movq %%mm2, %%mm1 \n\t"
    " pand %%mm3, %%mm1 \n\t" /* mm1 = a & b */
    " pxor %%mm2, %%mm3 \n\t" /* mm3 = a ^ b */
    " pand %%mm5, %%mm3 \n\t" /* clear each byte's low bit before shifting */
    " psrlq $1, %%mm3 \n\t" /* mm3 = (a ^ b) >> 1, per byte */
    " paddb %%mm3, %%mm1 \n\t" /* mm1 = floor((a + b) / 2), per byte */

    " movq %%mm0, %%mm2 \n\t"

    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
    " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " add %4, %1 \n\t" /* Inc pointer into the new data */
    " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
    " add %5, %2 \n\t" /* Inc pointer into ref data */
    " add %5, %3 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold the four word lanes of mm7 into its low word */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $16, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movd %%mm7, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word total */

    : "=m" (DiffVal),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  return DiffVal;
}
406 |
|
407 |
/* Intra-block mismatch metric: scaled population variance of an 8x8
 * block.  Accumulates the pixel sum (mm5, words) and the sum of squared
 * pixels (mm7, dwords via pmaddwd) over 8 rows, then returns
 * 64*XXSum - XSum^2 (= 64^2 times the population variance).
 * The movsx of the 16-bit fold is safe: the pixel sum is at most
 * 64*255 = 16320, which fits in a signed 16-bit word. */
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm5, %%mm5 \n\t" /* mm5 = running pixel sum (words) */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 = running squared sum (dwords) */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq %%mm0, %%mm2 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* low 4 pixels to words */
    " punpckhbw %%mm6, %%mm2 \n\t" /* high 4 pixels to words */

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t" /* pairwise x*x sums -> 2 dwords */
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %3, %2 \n\t" /* Inc pointer into src data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold word lanes of mm5 into its low word -> XSum */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movsx %%di, %%edi \n\t" /* take just the low 16 bits, sign-extended */
    " movl %%edi, %0 \n\t"

    /* fold the two dword lanes of mm7 -> XXSum */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"

    : "=r" (XSum),
      "=r" (XXSum),
      "+r" (DataPtr)
    : "r" (Stride)
    : "edi", "memory"
  );

  /* Compute population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum ) );
}
465 |
|
466 |
/* Inter-block mismatch metric: scaled population variance of the 8x8
 * difference SrcData - RefDataPtr.  Accumulates the signed difference
 * sum (mm5, words) and the sum of squared differences (mm7, dwords via
 * pmaddwd), then returns 64*XXSum - XSum^2.  XSum may be negative; the
 * movsx sign-extension and the modulo-2^32 arithmetic of the final
 * expression keep the result correct (|XSum| <= 16320 fits 16 bits). */
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm5, %%mm5 \n\t" /* mm5 = running difference sum (words) */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 = running squared sum (dwords) */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq (%3), %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* low 4 src pixels to words */
    " punpcklbw %%mm6, %%mm1 \n\t" /* low 4 ref pixels to words */
    " punpckhbw %%mm6, %%mm2 \n\t" /* high 4 src pixels to words */
    " punpckhbw %%mm6, %%mm3 \n\t" /* high 4 ref pixels to words */

    " psubsw %%mm1, %%mm0 \n\t" /* src - ref, low half */
    " psubsw %%mm3, %%mm2 \n\t" /* src - ref, high half */

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t" /* pairwise d*d sums -> 2 dwords */
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %4, %2 \n\t" /* Inc pointer into src data */
    " add %5, %3 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold word lanes of mm5 into its low word -> XSum */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movsx %%di, %%edi \n\t" /* low 16 bits, sign-extended (sum may be < 0) */
    " movl %%edi, %0 \n\t"

    /* fold the two dword lanes of mm7 -> XXSum */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum ));
}
535 |
|
536 |
/* Inter-block mismatch metric against a half-pel prediction: scaled
 * population variance of SrcData minus the byte-wise average of two
 * reference blocks.  The average uses the same and/xor trick as
 * sad8x8_xy2_thres__mmx: floor((a+b)/2) = (a & b) + (((a ^ b) & 0xfe) >> 1).
 * Accumulates the signed difference sum (mm5) and the squared-difference
 * sum (mm7, via pmaddwd) over 8 rows and returns 64*XXSum - XSum^2. */
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                    unsigned char *RefDataPtr1,
                                    unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
    " paddb %%mm4, %%mm4 \n\t"
    " pxor %%mm5, %%mm5 \n\t" /* mm5 = running difference sum (words) */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 = running squared sum (dwords) */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */

    " movq (%3), %%mm2 \n\t"
    " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
    " movq %%mm2, %%mm1 \n\t"
    " pand %%mm3, %%mm1 \n\t" /* mm1 = a & b */
    " pxor %%mm2, %%mm3 \n\t" /* mm3 = a ^ b */
    " pand %%mm4, %%mm3 \n\t" /* clear each byte's low bit before shifting */
    " psrlq $1, %%mm3 \n\t" /* mm3 = (a ^ b) >> 1, per byte */
    " paddb %%mm3, %%mm1 \n\t" /* mm1 = floor((a + b) / 2), per byte */

    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* low 4 src pixels to words */
    " punpcklbw %%mm6, %%mm1 \n\t" /* low 4 averaged ref pixels to words */
    " punpckhbw %%mm6, %%mm2 \n\t" /* high 4 src pixels to words */
    " punpckhbw %%mm6, %%mm3 \n\t" /* high 4 averaged ref pixels to words */

    " psubsw %%mm1, %%mm0 \n\t" /* src - avg(ref1, ref2), low half */
    " psubsw %%mm3, %%mm2 \n\t" /* src - avg(ref1, ref2), high half */

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t" /* pairwise d*d sums -> 2 dwords */
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %5, %2 \n\t" /* Inc pointer into src data */
    " add %6, %3 \n\t" /* Inc pointer into ref data */
    " add %6, %4 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold word lanes of mm5 into its low word -> XSum */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movsx %%di, %%edi \n\t" /* low 16 bits, sign-extended (sum may be < 0) */
    " movl %%edi, %0 \n\t"

    /* fold the two dword lanes of mm7 -> XXSum */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum ));
}
619 |
|
620 |
/* Exit MMX state: emms clears the x87 tag word so the FPU registers
 * (which MMX aliases) are usable for floating point again.  Installed
 * in the dispatch table so callers can invoke it after a run of the
 * MMX routines above. */
static void restore_fpu (void)
{
  __asm__ __volatile__ (
    " emms \n\t"
  );
}
626 |
|
627 |
/* Install the MMX implementations into the DspFunctions dispatch table,
 * overriding whichever entries were set before.  NOTE(review): no CPU
 * feature check is done here -- presumably the caller has already
 * verified MMX support; confirm at the call site. */
void dsp_i386_mmx_init(DspFunctions *funcs)
{
  funcs->restore_fpu = restore_fpu;
  funcs->sub8x8 = sub8x8__mmx;
  funcs->sub8x8_128 = sub8x8_128__mmx;
  funcs->sub8x8avg2 = sub8x8avg2__mmx;
  funcs->row_sad8 = row_sad8__mmx;
  funcs->col_sad8x8 = col_sad8x8__mmx;
  funcs->sad8x8 = sad8x8__mmx;
  funcs->sad8x8_thres = sad8x8_thres__mmx;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
  funcs->intra8x8_err = intra8x8_err__mmx;
  funcs->inter8x8_err = inter8x8_err__mmx;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
}
642 |
|