--- libtheora-1.0alpha3/lib/blockmap.c	2003-12-03 09:59:39.000000000 +0100
+++ libtheora-1.0alpha3/lib/blockmap.c	2004-10-06 17:48:22.202433112 +0200
@@ -21,7 +21,7 @@
                             ogg_uint32_t FirstSB,
                             ogg_uint32_t FirstFrag, ogg_uint32_t HFrags,
                             ogg_uint32_t VFrags ){
-  ogg_uint32_t i, j;
+  ogg_uint32_t i, j = 0;
   ogg_uint32_t xpos;
   ogg_uint32_t ypos;
   ogg_uint32_t SBrow, SBcol;
--- libtheora-1.0alpha3/lib/cpu.c	1970-01-01 01:00:00.000000000 +0100
+++ libtheora-1.0alpha3/lib/cpu.c	2004-10-06 17:48:22.203432960 +0200
@@ -0,0 +1,107 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $
+
+ ********************************************************************/
+
+#include "cpu.h"
+
+ogg_uint32_t cpu_flags = 0;
+
+#if 1
+static ogg_uint32_t cpu_get_flags (void)
+{
+  ogg_uint32_t eax, ebx, ecx, edx;
+  ogg_uint32_t flags;
+
+#define cpuid(op,eax,ebx,ecx,edx)          \
+  asm volatile ("pushl %%ebx   \n\t"       \
+                "cpuid         \n\t"       \
+                "movl %%ebx,%1 \n\t"       \
+                "popl %%ebx"               \
+                : "=a" (eax),              \
+                  "=r" (ebx),              \
+                  "=c" (ecx),              \
+                  "=d" (edx)               \
+                : "a" (op)                 \
+                : "cc")
+
+  asm volatile ("pushfl              \n\t"
+                "pushfl              \n\t"
+                "popl %0             \n\t"
+                "movl %0,%1          \n\t"
+                "xorl $0x200000,%0   \n\t"
+                "pushl %0            \n\t"
+                "popfl               \n\t"
+                "pushfl              \n\t"
+                "popl %0             \n\t"
+                "popfl"
+                : "=r" (eax),
+                  "=r" (ebx)
+                :
+                : "cc");
+
+  if (eax == ebx)              /* no cpuid */
+    return 0;
+
+  cpuid(0, eax, ebx, ecx, edx);
+
+  if (ebx == 0x756e6547 &&
+      edx == 0x49656e69 &&
+      ecx == 0x6c65746e) {
+    /* intel */
+
+  inteltest:
+    cpuid(1, eax, ebx, ecx, edx);
+    if ((edx & 0x00800000) == 0)
+      return 0;
+    flags = CPU_X86_MMX;
+    if (edx & 0x02000000)
+      flags |= CPU_X86_MMXEXT | CPU_X86_SSE;
+    if (edx & 0x04000000)
+      flags |= CPU_X86_SSE2;
+    return flags;
+  } else if (ebx == 0x68747541 &&
+             edx == 0x69746e65 &&
+             ecx == 0x444d4163) {
+    /* AMD */
+    cpuid(0x80000000, eax, ebx, ecx, edx);
+    if ((unsigned)eax < 0x80000001)
+      goto inteltest;
+    cpuid(0x80000001, eax, ebx, ecx, edx);
+    if ((edx & 0x00800000) == 0)
+      return 0;
+    flags = CPU_X86_MMX;
+    if (edx & 0x80000000)
+      flags |= CPU_X86_3DNOW;
+    if (edx & 0x00400000)
+      flags |= CPU_X86_MMXEXT;
+    return flags;
+  }
+  else {
+    /* implement me */
+  }
+
+  return flags;
+}
+#else
+static ogg_uint32_t cpu_get_flags (void) {
+  return 0;
+}
+#endif
+
+void cpu_init ()
+{
+  cpu_flags = cpu_get_flags();
+}
--- libtheora-1.0alpha3/lib/cpu.h	1970-01-01 01:00:00.000000000 +0100
+++ libtheora-1.0alpha3/lib/cpu.h	2004-10-06 17:48:22.243426880 +0200
@@ -0,0 +1,28 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.
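A minimal sketch (editor's illustration, not part of the patch): the magic numbers tested in cpu_get_flags() above are the CPUID leaf-0 vendor string and the standard feature bits (leaf 1 EDX bit 23 = MMX, bit 25 = SSE, bit 26 = SSE2; AMD extended leaf 0x80000001 EDX bit 31 = 3DNow!, bit 22 = extended MMX). On the little-endian x86 targets this code runs on, the vendor dwords decode as follows:

/* hypothetical helper, for illustration only */
#include <stdio.h>
#include <string.h>

static void print_vendor (unsigned ebx, unsigned edx, unsigned ecx)
{
  char v[13];
  memcpy (v + 0, &ebx, 4);   /* 0x756e6547 -> "Genu" */
  memcpy (v + 4, &edx, 4);   /* 0x49656e69 -> "ineI" */
  memcpy (v + 8, &ecx, 4);   /* 0x6c65746e -> "ntel" */
  v[12] = '\0';
  printf ("%s\n", v);
}

int main (void)
{
  print_vendor (0x756e6547, 0x49656e69, 0x6c65746e);  /* GenuineIntel */
  print_vendor (0x68747541, 0x69746e65, 0x444d4163);  /* AuthenticAMD */
  return 0;
}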
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ + + ********************************************************************/ + +#include "encoder_internal.h" + +extern ogg_uint32_t cpu_flags; + +#define CPU_X86_MMX (1<<0) +#define CPU_X86_3DNOW (1<<1) +#define CPU_X86_MMXEXT (1<<2) +#define CPU_X86_SSE (1<<3) +#define CPU_X86_SSE2 (1<<4) + +void cpu_init () ; --- libtheora-1.0alpha3/lib/dct.c 2003-12-03 09:59:39.000000000 +0100 +++ libtheora-1.0alpha3/lib/dct.c 2004-10-06 17:48:22.244426728 +0200 @@ -16,6 +16,7 @@ ********************************************************************/ #include "encoder_internal.h" +#include "cpu.h" static ogg_int32_t xC1S7 = 64277; static ogg_int32_t xC2S6 = 60547; @@ -28,7 +29,7 @@ #define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31) #define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) ) -void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){ +static void fdct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){ int loop; ogg_int32_t is07, is12, is34, is56; @@ -251,3 +252,12 @@ op ++; } } + +void dsp_dct_init (DspFunctions *funcs) +{ + funcs->fdct_short = fdct_short__c; + if (cpu_flags & CPU_X86_MMX) { + dsp_i386_mmx_fdct_init(&dsp_funcs); + } +} + --- libtheora-1.0alpha3/lib/dct_decode.c 2004-03-18 18:10:00.000000000 +0100 +++ libtheora-1.0alpha3/lib/dct_decode.c 2004-10-06 17:48:22.284420648 +0200 @@ -18,6 +18,7 @@ #include #include #include "encoder_internal.h" +#include "dsp.h" #define GOLDEN_FRAME_THRESH_Q 50 @@ -112,22 +113,6 @@ SetupBoundingValueArray_Generic(pbi, FLimit); } -void CopyBlock(unsigned char *src, - unsigned char *dest, - unsigned int srcstride){ - unsigned char *s = src; - unsigned char *d = dest; - unsigned int stride = srcstride; - - int j; - for ( j = 0; j < 8; j++ ){ - ((ogg_uint32_t*)d)[0] = ((ogg_uint32_t*)s)[0]; - ((ogg_uint32_t*)d)[1] = ((ogg_uint32_t*)s)[1]; - s+=stride; - d+=stride; - } -} - static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){ ogg_uint32_t ReconPixelsPerLine; ogg_int32_t ReconPixelIndex; @@ -160,8 +145,8 @@ ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber]; /* Get the pixel index for the first pixel in the fragment. */ - ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]), - (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine ); + dsp_static_recon_intra8x8 ((unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]), + (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine); } @@ -237,10 +222,9 @@ /* Reconstruct the pixel data using the last frame reconstruction and change data when the motion vector is (0,0), the recon is based on the lastframe without loop filtering---- for testing */ - ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], + dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex], &pbi->LastFrameRecon[ReconPixelIndex], - pbi->ReconDataBuffer, ReconPixelsPerLine ); - + pbi->ReconDataBuffer, ReconPixelsPerLine); }else if ( ModeUsesMC[pbi->CodingMode] ) { /* The mode uses a motion vector. */ /* Get vector from list */ @@ -287,29 +271,30 @@ if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) { /* Reconstruct the pixel dats from the reference frame and change data (no half pixel in this case as the two references were the same. 
*/ - ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], + dsp_static_recon_inter8x8 ( + &pbi->ThisFrameRecon[ReconPixelIndex], LastFrameRecPtr, pbi->ReconDataBuffer, - ReconPixelsPerLine ); + ReconPixelsPerLine); }else{ /* Fractional pixel reconstruction. */ /* Note that we only use two pixels per reconstruction even for the diagonal. */ - ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex], + dsp_static_recon_inter8x8_half(&pbi->ThisFrameRecon[ReconPixelIndex], LastFrameRecPtr, LastFrameRecPtr2, - pbi->ReconDataBuffer, ReconPixelsPerLine ); + pbi->ReconDataBuffer, ReconPixelsPerLine); } } else if ( pbi->CodingMode == CODE_USING_GOLDEN ){ /* Golden frame with motion vector */ /* Reconstruct the pixel data using the golden frame reconstruction and change data */ - ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], + dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex], &pbi->GoldenFrame[ ReconPixelIndex ], - pbi->ReconDataBuffer, ReconPixelsPerLine ); + pbi->ReconDataBuffer, ReconPixelsPerLine); } else { /* Simple Intra coding */ /* Get the pixel index for the first pixel in the fragment. */ - ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], - pbi->ReconDataBuffer, ReconPixelsPerLine ); + dsp_static_recon_intra8x8 (&pbi->ThisFrameRecon[ReconPixelIndex], + pbi->ReconDataBuffer, ReconPixelsPerLine); } } @@ -464,7 +449,7 @@ SrcPtr = &SrcReconPtr[ PixelIndex ]; DestPtr = &DestReconPtr[ PixelIndex ]; - CopyBlock(SrcPtr, DestPtr, PlaneLineStep); + dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); } } @@ -476,7 +461,7 @@ SrcPtr = &SrcReconPtr[ PixelIndex ]; DestPtr = &DestReconPtr[ PixelIndex ]; - CopyBlock(SrcPtr, DestPtr, PlaneLineStep); + dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); } } @@ -505,7 +490,7 @@ SrcPtr = &SrcReconPtr[ PixelIndex ]; DestPtr = &DestReconPtr[ PixelIndex ]; - CopyBlock(SrcPtr, DestPtr, PlaneLineStep); + dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); } } @@ -517,7 +502,7 @@ SrcPtr = &SrcReconPtr[ PixelIndex ]; DestPtr = &DestReconPtr[ PixelIndex ]; - CopyBlock(SrcPtr, DestPtr, PlaneLineStep); + dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); } } --- libtheora-1.0alpha3/lib/dct_encode.c 2003-06-10 03:31:33.000000000 +0200 +++ libtheora-1.0alpha3/lib/dct_encode.c 2004-10-06 17:48:22.285420496 +0200 @@ -17,110 +17,10 @@ #include #include "encoder_internal.h" +#include "dsp.h" static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 }; -static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr, - ogg_int16_t *DctInputPtr, unsigned char *old_ptr1, - unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine, - ogg_uint32_t ReconPixelsPerLine ) { - int i; - - /* For each block row */ - for ( i=0; ipb.recon_pixel_index_table[FragIndex]]; } - Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1, - PixelsPerLine, ReconPixelsPerLine ); + dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr, + PixelsPerLine, ReconPixelsPerLine); + dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine); } else if ( cpi->pb.CodingMode==CODE_INTRA ) { - Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine); - + dsp_static_sub8x8_128(FiltPtr, DctInputPtr, PixelsPerLine); + dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine); } /* Proceed to encode the data into the encode buffer if the encoder is enabled. */ /* Perform a 2D DCT transform on the data. */ - fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes ); + dsp_static_fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes ); /* Quantize that transform data. 
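Editor's sketch, assuming the usual libtheora reconstruction semantics (the real bodies live in reconstruct.c and are installed by dsp_recon_init(), which is not shown in this excerpt): the recon hooks called above add the prediction back to the decoded change values and clamp to 0..255; recon_intra8x8 re-adds the +128 bias that sub8x8_128 removes on the encoder side. A scalar model of the intra case:

#include <ogg/ogg.h>

static unsigned char clamp255 (int v)
{
  return (unsigned char) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* model of the recon_intra8x8(ReconPtr, ChangePtr, LineStep) hook */
static void recon_intra8x8_model (unsigned char *ReconPtr,
                                  const ogg_int16_t *ChangePtr,
                                  ogg_uint32_t LineStep)
{
  int row, col;
  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++)
      ReconPtr[col] = clamp255 (ChangePtr[col] + 128);
    ReconPtr  += LineStep;
    ChangePtr += 8;
  }
}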
*/ quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] ); --- libtheora-1.0alpha3/lib/decode.c 2003-12-06 19:06:20.000000000 +0100 +++ libtheora-1.0alpha3/lib/decode.c 2004-10-06 17:48:22.324414568 +0200 @@ -796,6 +796,8 @@ /* Make a not of the number of coded blocks this frame */ pbi->CodedBlocksThisFrame = pbi->CodedBlockIndex; + dsp_static_save_fpu(); + /* Decode the modes data */ DecodeModes( pbi, pbi->YSBRows, pbi->YSBCols); @@ -808,6 +810,7 @@ /* Reconstruct and display the frame */ ReconRefFrames(pbi); + dsp_static_restore_fpu(); } --- libtheora-1.0alpha3/lib/dsp.c 1970-01-01 01:00:00.000000000 +0100 +++ libtheora-1.0alpha3/lib/dsp.c 2004-10-06 17:48:22.363408640 +0200 @@ -0,0 +1,416 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ + + ********************************************************************/ + +#include +#include "cpu.h" +#include "encoder_internal.h" + +#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2) +#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b))) +#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b)))) + +DspFunctions dsp_funcs; + +static void sub8x8__c (unsigned char *FiltPtr, unsigned char *ReconPtr, + ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, + ogg_uint32_t ReconPixelsPerLine) { + int i; + + /* For each block row */ + for (i=8; i; i--) { + DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]); + DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]); + DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]); + DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]); + DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]); + DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]); + DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]); + DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]); + + /* Start next row */ + FiltPtr += PixelsPerLine; + ReconPtr += ReconPixelsPerLine; + DctInputPtr += 8; + } +} + +static void sub8x8_128__c (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, + ogg_uint32_t PixelsPerLine) { + int i; + /* For each block row */ + for (i=8; i; i--) { + /* INTRA mode so code raw image data */ + /* We convert the data to 8 bit signed (by subtracting 128) as + this reduces the internal precision requirments in the DCT + transform. 
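Editor's note (a minimal sketch, not part of the patch): the save_fpu/restore_fpu hooks added around the frame decode above exist because the MMX registers alias the x87 floating-point stack, so EMMS must run before any other code touches the FPU again; in a plain C build both hooks stay no-ops. The pattern, assuming GCC on x86:

typedef struct {
  void (*save_fpu) (void);
  void (*restore_fpu) (void);
} FpuHooks;

static void fpu_nop  (void) { /* C-only table: nothing to save or restore */ }
static void fpu_emms (void) { __asm__ __volatile__ ("emms"); }

/* dsp_init() fills both slots with the no-op; dsp_i386_mmx_init() later
   replaces restore_fpu with the EMMS version. */
static FpuHooks hooks = { fpu_nop, fpu_nop };

static void decode_frame_model (void)
{
  hooks.save_fpu ();
  /* ... MMX-accelerated mode decoding and reconstruction would run here ... */
  hooks.restore_fpu ();
}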
*/ + DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128); + DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128); + DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128); + DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128); + DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128); + DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128); + DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128); + DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128); + + /* Start next row */ + FiltPtr += PixelsPerLine; + DctInputPtr += 8; + } +} + +static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1, + unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, + ogg_uint32_t PixelsPerLine, + ogg_uint32_t ReconPixelsPerLine) +{ + int i; + + /* For each block row */ + for (i=8; i; i--) { + DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0])); + DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1])); + DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2])); + DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3])); + DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4])); + DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5])); + DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6])); + DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7])); + + /* Start next row */ + FiltPtr += PixelsPerLine; + ReconPtr1 += ReconPixelsPerLine; + ReconPtr2 += ReconPixelsPerLine; + DctInputPtr += 8; + } +} + +static ogg_uint32_t row_sad8__c (unsigned char *Src1, unsigned char *Src2) +{ + ogg_uint32_t SadValue; + ogg_uint32_t SadValue1; + + SadValue = DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + + DSP_OP_ABS_DIFF (Src1[1], Src2[1]) + + DSP_OP_ABS_DIFF (Src1[2], Src2[2]) + + DSP_OP_ABS_DIFF (Src1[3], Src2[3]); + + SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + + DSP_OP_ABS_DIFF (Src1[5], Src2[5]) + + DSP_OP_ABS_DIFF (Src1[6], Src2[6]) + + DSP_OP_ABS_DIFF (Src1[7], Src2[7]); + + SadValue = ( SadValue > SadValue1 ) ? 
SadValue : SadValue1; + + return SadValue; +} + +static ogg_uint32_t col_sad8x8__c (unsigned char *Src1, unsigned char *Src2, + ogg_uint32_t stride) +{ + ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0}; + ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0}; + ogg_uint32_t MaxSad = 0; + ogg_uint32_t i; + + for ( i = 0; i < 4; i++ ){ + SadValue[0] += abs(Src1[0] - Src2[0]); + SadValue[1] += abs(Src1[1] - Src2[1]); + SadValue[2] += abs(Src1[2] - Src2[2]); + SadValue[3] += abs(Src1[3] - Src2[3]); + SadValue[4] += abs(Src1[4] - Src2[4]); + SadValue[5] += abs(Src1[5] - Src2[5]); + SadValue[6] += abs(Src1[6] - Src2[6]); + SadValue[7] += abs(Src1[7] - Src2[7]); + + Src1 += stride; + Src2 += stride; + } + + for ( i = 0; i < 4; i++ ){ + SadValue2[0] += abs(Src1[0] - Src2[0]); + SadValue2[1] += abs(Src1[1] - Src2[1]); + SadValue2[2] += abs(Src1[2] - Src2[2]); + SadValue2[3] += abs(Src1[3] - Src2[3]); + SadValue2[4] += abs(Src1[4] - Src2[4]); + SadValue2[5] += abs(Src1[5] - Src2[5]); + SadValue2[6] += abs(Src1[6] - Src2[6]); + SadValue2[7] += abs(Src1[7] - Src2[7]); + + Src1 += stride; + Src2 += stride; + } + + for ( i = 0; i < 8; i++ ){ + if ( SadValue[i] > MaxSad ) + MaxSad = SadValue[i]; + if ( SadValue2[i] > MaxSad ) + MaxSad = SadValue2[i]; + } + + return MaxSad; +} + +static ogg_uint32_t sad8x8__c (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2) +{ + ogg_uint32_t i; + ogg_uint32_t sad = 0; + + for (i=8; i; i--) { + sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]); + sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]); + sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]); + sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]); + sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]); + sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]); + sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]); + sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]); + + /* Step to next row of block. */ + ptr1 += stride1; + ptr2 += stride2; + } + + return sad; +} + +static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2, + ogg_uint32_t thres) +{ + ogg_uint32_t i; + ogg_uint32_t sad = 0; + + for (i=8; i; i--) { + sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]); + sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]); + sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]); + sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]); + sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]); + sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]); + sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]); + sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]); + + if (sad > thres ) + break; + + /* Step to next row of block. 
*/ + ptr1 += stride1; + ptr2 += stride2; + } + + return sad; +} + +static ogg_uint32_t sad8x8_xy2_thres__c (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride, + ogg_uint32_t thres) +{ + ogg_uint32_t i; + ogg_uint32_t sad = 0; + + for (i=8; i; i--) { + sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); + sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); + sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); + sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); + sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); + sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); + sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); + sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); + + if ( sad > thres ) + break; + + /* Step to next row of block. */ + SrcData += SrcStride; + RefDataPtr1 += RefStride; + RefDataPtr2 += RefStride; + } + + return sad; +} + +static ogg_uint32_t intra8x8_err__c (unsigned char *DataPtr, ogg_uint32_t Stride) +{ + ogg_uint32_t i; + ogg_uint32_t XSum=0; + ogg_uint32_t XXSum=0; + + for (i=8; i; i--) { + /* Examine alternate pixel locations. */ + XSum += DataPtr[0]; + XXSum += DataPtr[0]*DataPtr[0]; + XSum += DataPtr[1]; + XXSum += DataPtr[1]*DataPtr[1]; + XSum += DataPtr[2]; + XXSum += DataPtr[2]*DataPtr[2]; + XSum += DataPtr[3]; + XXSum += DataPtr[3]*DataPtr[3]; + XSum += DataPtr[4]; + XXSum += DataPtr[4]*DataPtr[4]; + XSum += DataPtr[5]; + XXSum += DataPtr[5]*DataPtr[5]; + XSum += DataPtr[6]; + XXSum += DataPtr[6]*DataPtr[6]; + XSum += DataPtr[7]; + XXSum += DataPtr[7]*DataPtr[7]; + + /* Step to next row of block. */ + DataPtr += Stride; + } + + /* Compute population variance as mis-match metric. */ + return (( (XXSum<<6) - XSum*XSum ) ); +} + +static ogg_uint32_t inter8x8_err__c (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr, ogg_uint32_t RefStride) +{ + ogg_uint32_t i; + ogg_uint32_t XSum=0; + ogg_uint32_t XXSum=0; + ogg_int32_t DiffVal; + + for (i=8; i; i--) { + DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + /* Step to next row of block. */ + SrcData += SrcStride; + RefDataPtr += RefStride; + } + + /* Compute and return population variance as mis-match metric. 
*/ + return (( (XXSum<<6) - XSum*XSum )); +} + +static ogg_uint32_t inter8x8_err_xy2__c (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride) +{ + ogg_uint32_t i; + ogg_uint32_t XSum=0; + ogg_uint32_t XXSum=0; + ogg_int32_t DiffVal; + + for (i=8; i; i--) { + DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); + XSum += DiffVal; + XXSum += DiffVal*DiffVal; + + /* Step to next row of block. */ + SrcData += SrcStride; + RefDataPtr1 += RefStride; + RefDataPtr2 += RefStride; + } + + /* Compute and return population variance as mis-match metric. */ + return (( (XXSum<<6) - XSum*XSum )); +} + +static void nop (void) { /* NOP */ } + +void dsp_init(DspFunctions *funcs) +{ + funcs->save_fpu = nop; + funcs->restore_fpu = nop; + funcs->sub8x8 = sub8x8__c; + funcs->sub8x8_128 = sub8x8_128__c; + funcs->sub8x8avg2 = sub8x8avg2__c; + funcs->row_sad8 = row_sad8__c; + funcs->col_sad8x8 = col_sad8x8__c; + funcs->sad8x8 = sad8x8__c; + funcs->sad8x8_thres = sad8x8_thres__c; + funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c; + funcs->intra8x8_err = intra8x8_err__c; + funcs->inter8x8_err = inter8x8_err__c; + funcs->inter8x8_err_xy2 = inter8x8_err_xy2__c; +} + +void dsp_static_init(void) +{ + cpu_init (); + dsp_init (&dsp_funcs); + dsp_recon_init (&dsp_funcs); + dsp_dct_init (&dsp_funcs); + if (cpu_flags & CPU_X86_MMX) { + dsp_i386_mmx_init(&dsp_funcs); + } + if (cpu_flags & CPU_X86_MMXEXT) { + dsp_i386_mmxext_init(&dsp_funcs); + } +} + --- libtheora-1.0alpha3/lib/dsp.h 1970-01-01 01:00:00.000000000 +0100 +++ libtheora-1.0alpha3/lib/dsp.h 2004-10-06 17:48:22.364408488 +0200 @@ -0,0 +1,154 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
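Usage sketch (editor's assumption; the actual call site for dsp_static_init() is outside this excerpt): dsp_funcs starts out empty, so dsp_static_init() has to run once, before any dsp_static_* macro is used, to install the C defaults and then overlay whatever MMX/MMXEXT variants the detected CPU supports. After that, every call is an indirect call through the table:

#include "dsp.h"

ogg_uint32_t example_block_sad (unsigned char *cur, unsigned char *ref)
{
  /* normally done once at encoder/decoder startup */
  dsp_static_init ();                         /* cpu_init() + C/MMX/MMXEXT setup */
  return dsp_static_sad8x8 (cur, 8, ref, 8);  /* dispatches via dsp_funcs.sad8x8 */
}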
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ + + ********************************************************************/ + +#ifndef DSP_H +#define DSP_H + +#include + +typedef struct +{ + void (*save_fpu) (void); + void (*restore_fpu) (void); + + void (*sub8x8) (unsigned char *FiltPtr, unsigned char *ReconPtr, + ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, + ogg_uint32_t ReconPixelsPerLine); + + void (*sub8x8_128) (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, + ogg_uint32_t PixelsPerLine); + + void (*sub8x8avg2) (unsigned char *FiltPtr, unsigned char *ReconPtr1, + unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, + ogg_uint32_t PixelsPerLine, + ogg_uint32_t ReconPixelsPerLine); + + void (*copy8x8) (unsigned char *src, unsigned char *dest, + ogg_uint32_t stride); + + void (*recon_intra8x8) (unsigned char *ReconPtr, ogg_int16_t *ChangePtr, + ogg_uint32_t LineStep); + + void (*recon_inter8x8) (unsigned char *ReconPtr, unsigned char *RefPtr, + ogg_int16_t *ChangePtr, ogg_uint32_t LineStep); + + void (*recon_inter8x8_half) (unsigned char *ReconPtr, unsigned char *RefPtr1, + unsigned char *RefPtr2, ogg_int16_t *ChangePtr, + ogg_uint32_t LineStep); + + void (*fdct_short) (ogg_int16_t *InputData, ogg_int16_t *OutputData); + + ogg_uint32_t (*row_sad8) (unsigned char *Src1, unsigned char *Src2); + + ogg_uint32_t (*col_sad8x8) (unsigned char *Src1, unsigned char *Src2, + ogg_uint32_t stride); + + ogg_uint32_t (*sad8x8) (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2); + + ogg_uint32_t (*sad8x8_thres) (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2, + ogg_uint32_t thres); + + ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride, + ogg_uint32_t thres); + + ogg_uint32_t (*intra8x8_err) (unsigned char *DataPtr, ogg_uint32_t Stride); + + ogg_uint32_t (*inter8x8_err) (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr, ogg_uint32_t RefStride); + + ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride); +} DspFunctions; + +extern DspFunctions dsp_funcs; + +extern void dsp_recon_init (DspFunctions *funcs); + +void dsp_init(DspFunctions *funcs); +void dsp_static_init(void); + +#define dsp_save_fpu(funcs) (funcs.save_fpu ()) +#define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs) + +#define dsp_restore_fpu(funcs) (funcs.restore_fpu ()) +#define dsp_static_restore_fpu() dsp_restore_fpu(dsp_funcs) + +#define dsp_sub8x8(funcs,a1,a2,a3,a4,a5) (funcs.sub8x8 (a1,a2,a3,a4,a5)) +#define dsp_static_sub8x8(a1,a2,a3,a4,a5) dsp_sub8x8(dsp_funcs,a1,a2,a3,a4,a5) + +#define dsp_sub8x8_128(funcs,a1,a2,a3) (funcs.sub8x8_128 (a1,a2,a3)) +#define dsp_static_sub8x8_128(a1,a2,a3) dsp_sub8x8_128(dsp_funcs,a1,a2,a3) + +#define dsp_sub8x8avg2(funcs,a1,a2,a3,a4,a5,a6) (funcs.sub8x8avg2 (a1,a2,a3,a4,a5,a6)) +#define dsp_static_sub8x8avg2(a1,a2,a3,a4,a5,a6) dsp_sub8x8avg2(dsp_funcs,a1,a2,a3,a4,a5,a6) + +#define dsp_copy8x8(funcs,ptr1,ptr2,str1) (funcs.copy8x8 (ptr1,ptr2,str1)) +#define dsp_static_copy8x8(ptr1,ptr2,str1) dsp_copy8x8(dsp_funcs,ptr1,ptr2,str1) + +#define 
dsp_recon_intra8x8(funcs,ptr1,ptr2,str1) (funcs.recon_intra8x8 (ptr1,ptr2,str1)) +#define dsp_static_recon_intra8x8(ptr1,ptr2,str1) dsp_recon_intra8x8(dsp_funcs,ptr1,ptr2,str1) + +#define dsp_recon_inter8x8(funcs,ptr1,ptr2,ptr3,str1) \ + (funcs.recon_inter8x8 (ptr1,ptr2,ptr3,str1)) +#define dsp_static_recon_inter8x8(ptr1,ptr2,ptr3,str1) \ + dsp_recon_inter8x8(dsp_funcs,ptr1,ptr2,ptr3,str1) + +#define dsp_recon_inter8x8_half(funcs,ptr1,ptr2,ptr3,ptr4,str1) \ + (funcs.recon_inter8x8_half (ptr1,ptr2,ptr3,ptr4,str1)) +#define dsp_static_recon_inter8x8_half(ptr1,ptr2,ptr3,ptr4,str1) \ + dsp_recon_inter8x8_half(dsp_funcs,ptr1,ptr2,ptr3,ptr4,str1) + +#define dsp_fdct_short(funcs,in,out) (funcs.fdct_short (in,out)) +#define dsp_static_fdct_short(in,out) dsp_fdct_short(dsp_funcs,in,out) + +#define dsp_row_sad8(funcs,ptr1,ptr2) (funcs.row_sad8 (ptr1,ptr2)) +#define dsp_static_row_sad8(ptr1,ptr2) dsp_row_sad8(dsp_funcs,ptr1,ptr2) + +#define dsp_col_sad8x8(funcs,ptr1,ptr2,str1) (funcs.col_sad8x8 (ptr1,ptr2,str1)) +#define dsp_static_col_sad8x8(ptr1,ptr2,str1) dsp_col_sad8x8(dsp_funcs,ptr1,ptr2,str1) + +#define dsp_sad8x8(funcs,ptr1,str1,ptr2,str2) (funcs.sad8x8 (ptr1,str1,ptr2,str2)) +#define dsp_static_sad8x8(ptr1,str1,ptr2,str2) dsp_sad8x8(dsp_funcs,ptr1,str1,ptr2,str2) + +#define dsp_sad8x8_thres(funcs,ptr1,str1,ptr2,str2,t) (funcs.sad8x8_thres (ptr1,str1,ptr2,str2,t)) +#define dsp_static_sad8x8_thres(ptr1,str1,ptr2,str2,t) dsp_sad8x8_thres(dsp_funcs,ptr1,str1,ptr2,str2,t) + +#define dsp_sad8x8_xy2_thres(funcs,ptr1,str1,ptr2,ptr3,str2,t) \ + (funcs.sad8x8_xy2_thres (ptr1,str1,ptr2,ptr3,str2,t)) +#define dsp_static_sad8x8_xy2_thres(ptr1,str1,ptr2,ptr3,str2,t) \ + dsp_sad8x8_xy2_thres(dsp_funcs,ptr1,str1,ptr2,ptr3,str2,t) + +#define dsp_intra8x8_err(funcs,ptr1,str1) (funcs.intra8x8_err (ptr1,str1)) +#define dsp_static_intra8x8_err(ptr1,str1) dsp_intra8x8_err(dsp_funcs,ptr1,str1) + +#define dsp_inter8x8_err(funcs,ptr1,str1,ptr2,str2) \ + (funcs.inter8x8_err (ptr1,str1,ptr2,str2)) +#define dsp_static_inter8x8_err(ptr1,str1,ptr2,str2) \ + dsp_inter8x8_err(dsp_funcs,ptr1,str1,ptr2,str2) + +#define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \ + (funcs.inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2)) +#define dsp_static_inter8x8_err_xy2(ptr1,str1,ptr2,ptr3,str2) \ + dsp_inter8x8_err_xy2(dsp_funcs,ptr1,str1,ptr2,ptr3,str2) + + +#endif /* DSP_H */ --- libtheora-1.0alpha3/lib/encode.c 2004-03-18 15:25:25.000000000 +0100 +++ libtheora-1.0alpha3/lib/encode.c 2004-10-06 17:48:22.401402864 +0200 @@ -531,8 +531,7 @@ static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi, ogg_int32_t BlockIndex ) { - ogg_uint32_t i; - ogg_uint32_t ErrorVal = 0; + ogg_uint32_t ErrorVal; unsigned char * SrcDataPtr = &cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]]; @@ -550,21 +549,8 @@ RecStride = cpi->pb.UVStride; } + ErrorVal = dsp_static_sad8x8 (SrcDataPtr, SrcStride, RecDataPtr, RecStride); - /* Decide on standard or MMX implementation */ - for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) { - ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) ); - ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) ); - ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) ); - ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) ); - ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) ); - ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) ); - ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) ); - ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) ); - /* 
Step to next row of block. */ - SrcDataPtr += SrcStride; - RecDataPtr += RecStride; - } return ErrorVal; } @@ -933,9 +919,13 @@ /* Zero Decoder EOB run count */ cpi->pb.EOB_Run = 0; + dsp_static_save_fpu (); + /* Encode any fragments coded using DCT. */ coded_pixels += QuadCodeDisplayFragments (cpi); + dsp_static_restore_fpu (); + return coded_pixels; } --- libtheora-1.0alpha3/lib/encoder_internal.h 2004-03-09 03:02:56.000000000 +0100 +++ libtheora-1.0alpha3/lib/encoder_internal.h 2004-10-06 17:48:22.436397544 +0200 @@ -24,6 +24,7 @@ #include #include "huffman.h" +#include "dsp.h" #ifndef LIBOGG2 #define theora_read(x,y,z) ( *z = oggpackB_read(x,y) ) @@ -689,23 +690,9 @@ ogg_int16_t *QuantMatrix, ogg_int16_t * OutputData ); -extern void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr, - ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ); - -extern void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr, - unsigned char * RefPtr, ogg_int16_t * ChangePtr, - ogg_uint32_t LineStep ) ; - -extern void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr, - unsigned char * RefPtr1, - unsigned char * RefPtr2, - ogg_int16_t * ChangePtr, - ogg_uint32_t LineStep ) ; +extern void dsp_recon_init (DspFunctions *funcs); extern void SetupLoopFilter(PB_INSTANCE *pbi); -extern void CopyBlock(unsigned char *src, - unsigned char *dest, - unsigned int srcstride); extern void LoopFilter(PB_INSTANCE *pbi); extern void ReconRefFrames (PB_INSTANCE *pbi); extern void ExpandToken( Q_LIST_ENTRY * ExpandedBlock, --- libtheora-1.0alpha3/lib/i386/dsp_mmx.c 1970-01-01 01:00:00.000000000 +0100 +++ libtheora-1.0alpha3/lib/i386/dsp_mmx.c 2004-10-06 17:48:22.472392072 +0200 @@ -0,0 +1,642 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
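Editor's reference model: the unrolled loop removed from GetBlockReconErrorSlow() above computed exactly this 8x8 sum of absolute differences, which is what dsp_static_sad8x8() now returns, so the substitution is behaviour-preserving:

#include <stdlib.h>

static unsigned int sad8x8_model (const unsigned char *src, unsigned int src_stride,
                                  const unsigned char *rec, unsigned int rec_stride)
{
  unsigned int row, col, sad = 0;
  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++)
      sad += (unsigned int) abs ((int) src[col] - (int) rec[col]);
    src += src_stride;   /* step to the next row of the block */
    rec += rec_stride;
  }
  return sad;
}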
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ + + ********************************************************************/ + +#include +#include "dsp.h" + +static const __attribute__ ((aligned(8),used)) ogg_int64_t V128w = 0x0080008000800080LL; + +#if defined(__MINGW32__) || defined(__CYGWIN__) || \ + defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) +# define M(a) "_" #a +#else +# define M(a) #a +#endif + +#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2) +#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b))) +#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b)))) + +static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr, + ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, + ogg_uint32_t ReconPixelsPerLine) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm7, %%mm7 \n\t" + + ".rept 8 \n\t" + " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ + " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */ + " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ + " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */ + /* convert from UINT8 to INT16 */ + " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ + " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */ + " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ + " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */ + /* start calculation */ + " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */ + " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */ + " movq %%mm0, (%2) \n\t" /* write answer out */ + " movq %%mm2, 8(%2) \n\t" /* write answer out */ + /* Increment pointers */ + " add $16, %2 \n\t" + " add %3, %0 \n\t" + " add %4, %1 \n\t" + ".endr \n\t" + + : "+r" (FiltPtr), + "+r" (ReconPtr), + "+r" (DctInputPtr) + : "m" (PixelsPerLine), + "m" (ReconPixelsPerLine) + : "memory" + ); +} + +static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, + ogg_uint32_t PixelsPerLine) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm7, %%mm7 \n\t" + " movq "M(V128w)", %%mm1 \n\t" + + ".rept 8 \n\t" + " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ + " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ + /* convert from UINT8 to INT16 */ + " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ + " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ + /* start calculation */ + " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */ + " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */ + " movq %%mm0, (%1) \n\t" /* write answer out */ + " movq %%mm2, 8(%1) \n\t" /* write answer out */ + /* Increment pointers */ + " add $16, %1 \n\t" + " add %2, %0 \n\t" + ".endr \n\t" + + : "+r" (FiltPtr), + "+r" (DctInputPtr) + : "r" (PixelsPerLine) + : "memory" + ); +} + +static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1, + unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, + ogg_uint32_t PixelsPerLine, + ogg_uint32_t ReconPixelsPerLine) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm7, %%mm7 \n\t" + + ".rept 8 \n\t" + " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ + " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */ + " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */ + " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ + " movq %%mm1, %%mm3 \n\t" /* dup to prepare 
for up conversion */ + " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */ + /* convert from UINT8 to INT16 */ + " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ + " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */ + " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */ + " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ + " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */ + " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */ + /* average ReconPtr1 and ReconPtr2 */ + " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ + " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ + " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ + " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ + " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ + " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ + " movq %%mm0, (%3) \n\t" /* write answer out */ + " movq %%mm2, 8(%3) \n\t" /* write answer out */ + /* Increment pointers */ + " add $16, %3 \n\t" + " add %4, %0 \n\t" + " add %5, %1 \n\t" + " add %5, %2 \n\t" + ".endr \n\t" + + : "+r" (FiltPtr), + "+r" (ReconPtr1), + "+r" (ReconPtr2), + "+r" (DctInputPtr) + : "m" (PixelsPerLine), + "m" (ReconPixelsPerLine) + : "memory" + ); +} + +static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2) +{ + ogg_uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ + " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */ + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */ + " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher precision */ + + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + " psrlq $32, %%mm2 \n\t" /* fold and add */ + " psrlq $32, %%mm3 \n\t" + " paddw %%mm2, %%mm0 \n\t" + " paddw %%mm3, %%mm1 \n\t" + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + " psrlq $16, %%mm2 \n\t" + " psrlq $16, %%mm3 \n\t" + " paddw %%mm2, %%mm0 \n\t" + " paddw %%mm3, %%mm1 \n\t" + + " psubusw %%mm0, %%mm1 \n\t" + " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */ + " movd %%mm1, %0 \n\t" + " andl $0xffff, %0 \n\t" + + : "=m" (MaxSad), + "+r" (Src1), + "+r" (Src2) + : + : "memory" + ); + return MaxSad; +} + +static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2, + ogg_uint32_t stride) +{ + ogg_uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */ + " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */ + " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */ + " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */ + " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */ + " mov $4, %%edi \n\t" /* 4 rows */ + "1: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm4 \n\t" /* accumulate 
difference... */ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " mov $4, %%edi \n\t" /* 4 rows */ + "2: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 2b \n\t" + + " psubusw %%mm6, %%mm7 \n\t" + " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */ + " psubusw %%mm4, %%mm5 \n\t" + " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */ + " psubusw %%mm5, %%mm7 \n\t" + " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ + " movq %%mm7, %%mm6 \n\t" + " psrlq $32, %%mm6 \n\t" + " psubusw %%mm6, %%mm7 \n\t" + " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ + " movq %%mm7, %%mm6 \n\t" + " psrlq $16, %%mm6 \n\t" + " psubusw %%mm6, %%mm7 \n\t" + " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ + " movd %%mm7, %0 \n\t" + " andl $0xffff, %0 \n\t" + + : "=r" (MaxSad), + "+r" (Src1), + "+r" (Src2) + : "r" (stride) + : "memory", "edi" + ); + + return MaxSad; +} + +static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2) +{ + ogg_uint32_t DiffVal; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + ".rept 8 \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " movq %%mm0, %%mm2 \n\t" + + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... 
*/ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + ".endr \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movq %%mm7, %%mm0 \n\t" + " psrlq $16, %%mm7 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movd %%mm7, %0 \n\t" + " andl $0xffff, %0 \n\t" + + : "=m" (DiffVal), + "+r" (ptr1), + "+r" (ptr2) + : "r" (stride1), + "r" (stride2) + : "memory" + ); + + return DiffVal; +} + +static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2, + ogg_uint32_t thres) +{ + return sad8x8__mmx (ptr1, stride1, ptr2, stride2); +} + +static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride, + ogg_uint32_t thres) +{ + ogg_uint32_t DiffVal; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */ + " paddb %%mm5, %%mm5 \n\t" + + " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + " mov $8, %%edi \n\t" /* 8 rows */ + "1: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + + " movq (%2), %%mm2 \n\t" + " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */ + " movq %%mm2, %%mm1 \n\t" + " pand %%mm3, %%mm1 \n\t" + " pxor %%mm2, %%mm3 \n\t" + " pand %%mm5, %%mm3 \n\t" + " psrlq $1, %%mm3 \n\t" + " paddb %%mm3, %%mm1 \n\t" + + " movq %%mm0, %%mm2 \n\t" + + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " add %4, %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... 
*/ + " add %5, %2 \n\t" /* Inc pointer into ref data */ + " add %5, %3 \n\t" /* Inc pointer into ref data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movq %%mm7, %%mm0 \n\t" + " psrlq $16, %%mm7 \n\t" + " paddw %%mm0, %%mm7 \n\t" + " movd %%mm7, %0 \n\t" + " andl $0xffff, %0 \n\t" + + : "=m" (DiffVal), + "+r" (SrcData), + "+r" (RefDataPtr1), + "+r" (RefDataPtr2) + : "m" (SrcStride), + "m" (RefStride) + : "edi", "memory" + ); + + return DiffVal; +} + +static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride) +{ + ogg_uint32_t XSum; + ogg_uint32_t XXSum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + " movq %%mm0, %%mm2 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %3, %2 \n\t" /* Inc pointer into src data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + + : "=r" (XSum), + "=r" (XXSum), + "+r" (DataPtr) + : "r" (Stride) + : "edi", "memory" + ); + + /* Compute population variance as mis-match metric. */ + return (( (XXSum<<6) - XSum*XSum ) ); +} + +static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr, ogg_uint32_t RefStride) +{ + ogg_uint32_t XSum; + ogg_uint32_t XXSum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + " movq (%3), %%mm1 \n\t" + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpcklbw %%mm6, %%mm1 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + " punpckhbw %%mm6, %%mm3 \n\t" + + " psubsw %%mm1, %%mm0 \n\t" + " psubsw %%mm3, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %4, %2 \n\t" /* Inc pointer into src data */ + " add %5, %3 \n\t" /* Inc pointer into ref data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + + : "=m" (XSum), + "=m" (XXSum), + "+r" (SrcData), + "+r" (RefDataPtr) + : "m" (SrcStride), + "m" (RefStride) + : "edi", "memory" + ); + + /* Compute and return population variance as mis-match metric. 
*/ + return (( (XXSum<<6) - XSum*XSum )); +} + +static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride) +{ + ogg_uint32_t XSum; + ogg_uint32_t XXSum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */ + " paddb %%mm4, %%mm4 \n\t" + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + + " movq (%3), %%mm2 \n\t" + " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */ + " movq %%mm2, %%mm1 \n\t" + " pand %%mm3, %%mm1 \n\t" + " pxor %%mm2, %%mm3 \n\t" + " pand %%mm4, %%mm3 \n\t" + " psrlq $1, %%mm3 \n\t" + " paddb %%mm3, %%mm1 \n\t" + + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpcklbw %%mm6, %%mm1 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + " punpckhbw %%mm6, %%mm3 \n\t" + + " psubsw %%mm1, %%mm0 \n\t" + " psubsw %%mm3, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %5, %2 \n\t" /* Inc pointer into src data */ + " add %6, %3 \n\t" /* Inc pointer into ref data */ + " add %6, %4 \n\t" /* Inc pointer into ref data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + + : "=m" (XSum), + "=m" (XXSum), + "+r" (SrcData), + "+r" (RefDataPtr1), + "+r" (RefDataPtr2) + : "m" (SrcStride), + "m" (RefStride) + : "edi", "memory" + ); + + /* Compute and return population variance as mis-match metric. */ + return (( (XXSum<<6) - XSum*XSum )); +} + +static void restore_fpu (void) +{ + __asm__ __volatile__ ( + " emms \n\t" + ); +} + +void dsp_i386_mmx_init(DspFunctions *funcs) +{ + funcs->restore_fpu = restore_fpu; + funcs->sub8x8 = sub8x8__mmx; + funcs->sub8x8_128 = sub8x8_128__mmx; + funcs->sub8x8avg2 = sub8x8avg2__mmx; + funcs->row_sad8 = row_sad8__mmx; + funcs->col_sad8x8 = col_sad8x8__mmx; + funcs->sad8x8 = sad8x8__mmx; + funcs->sad8x8_thres = sad8x8_thres__mmx; + funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx; + funcs->intra8x8_err = intra8x8_err__mmx; + funcs->inter8x8_err = inter8x8_err__mmx; + funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx; +} + --- libtheora-1.0alpha3/lib/i386/dsp_mmxext.c 1970-01-01 01:00:00.000000000 +0100 +++ libtheora-1.0alpha3/lib/i386/dsp_mmxext.c 2004-10-06 17:48:22.474391768 +0200 @@ -0,0 +1,316 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ + + ********************************************************************/ + +#include +#include "dsp.h" + +static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2) +{ + ogg_uint32_t DiffVal; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + + ".rept 7 \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " psadbw %%mm1, %%mm0 \n\t" + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + ".endr \n\t" + + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " psadbw %%mm1, %%mm0 \n\t" + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " movd %%mm7, %0 \n\t" + + : "=r" (DiffVal), + "+r" (ptr1), + "+r" (ptr2) + : "r" (stride1), + "r" (stride2) + : "memory" + ); + + return DiffVal; +} + +static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1, + unsigned char *ptr2, ogg_uint32_t stride2, + ogg_uint32_t thres) +{ + ogg_uint32_t DiffVal; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + + ".rept 8 \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " psadbw %%mm1, %%mm0 \n\t" + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ + " add %4, %2 \n\t" /* Inc pointer into ref data */ + ".endr \n\t" + + " movd %%mm7, %0 \n\t" + + : "=r" (DiffVal), + "+r" (ptr1), + "+r" (ptr2) + : "r" (stride1), + "r" (stride2) + : "memory" + ); + + return DiffVal; +} + +static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride, + ogg_uint32_t thres) +{ + ogg_uint32_t DiffVal; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ + ".rept 8 \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" + " movq (%3), %%mm2 \n\t" + " pavgb %%mm2, %%mm1 \n\t" + " psadbw %%mm1, %%mm0 \n\t" + + " add %4, %1 \n\t" /* Inc pointer into the new data */ + " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... 
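Editor's sketch: the MMXEXT SAD routines above rely on PSADBW, which collapses the psubusb/por/punpck/paddw sequence of the plain MMX versions into a single instruction per 8-byte row (plus one accumulate). Per row it produces the value modelled below, left in the low 16 bits of the destination register:

static unsigned int psadbw_model (const unsigned char *a, const unsigned char *b)
{
  unsigned int i, sum = 0;
  for (i = 0; i < 8; i++)
    sum += (a[i] > b[i]) ? (unsigned int) (a[i] - b[i])
                         : (unsigned int) (b[i] - a[i]);
  return sum;
}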
*/ + " add %5, %2 \n\t" /* Inc pointer into ref data */ + " add %5, %3 \n\t" /* Inc pointer into ref data */ + ".endr \n\t" + + " movd %%mm7, %0 \n\t" + : "=m" (DiffVal), + "+r" (SrcData), + "+r" (RefDataPtr1), + "+r" (RefDataPtr2) + : "m" (SrcStride), + "m" (RefStride) + : "memory" + ); + + return DiffVal; +} + +static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2) +{ + ogg_uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " movd (%1), %%mm0 \n\t" + " movd (%2), %%mm1 \n\t" + " psadbw %%mm0, %%mm1 \n\t" + " movd 4(%1), %%mm2 \n\t" + " movd 4(%2), %%mm3 \n\t" + " psadbw %%mm2, %%mm3 \n\t" + + " pmaxsw %%mm1, %%mm3 \n\t" + " movd %%mm3, %0 \n\t" + " andl $0xffff, %0 \n\t" + + : "=m" (MaxSad), + "+r" (Src1), + "+r" (Src2) + : + : "memory" + ); + + return MaxSad; +} + +static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2, + ogg_uint32_t stride) +{ + ogg_uint32_t MaxSad; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */ + " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */ + " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */ + " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */ + " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */ + " mov $4, %%edi \n\t" /* 4 rows */ + "1: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " mov $4, %%edi \n\t" /* 4 rows */ + "2: \n\t" + " movq (%1), %%mm0 \n\t" /* take 8 bytes */ + " movq (%2), %%mm1 \n\t" /* take 8 bytes */ + + " movq %%mm0, %%mm2 \n\t" + " psubusb %%mm1, %%mm0 \n\t" /* A - B */ + " psubusb %%mm2, %%mm1 \n\t" /* B - A */ + " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ + " movq %%mm0, %%mm1 \n\t" + + " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ + " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */ + " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ + " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... 
*/ + " add %3, %1 \n\t" /* Inc pointer into the new data */ + " add %3, %2 \n\t" /* Inc pointer into the new data */ + + " dec %%edi \n\t" + " jnz 2b \n\t" + + " pmaxsw %%mm6, %%mm7 \n\t" + " pmaxsw %%mm4, %%mm5 \n\t" + " pmaxsw %%mm5, %%mm7 \n\t" + " movq %%mm7, %%mm6 \n\t" + " psrlq $32, %%mm6 \n\t" + " pmaxsw %%mm6, %%mm7 \n\t" + " movq %%mm7, %%mm6 \n\t" + " psrlq $16, %%mm6 \n\t" + " pmaxsw %%mm6, %%mm7 \n\t" + " movd %%mm7, %0 \n\t" + " andl $0xffff, %0 \n\t" + + : "=r" (MaxSad), + "+r" (Src1), + "+r" (Src2) + : "r" (stride) + : "memory", "edi" + ); + + return MaxSad; +} + +static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride, + unsigned char *RefDataPtr1, + unsigned char *RefDataPtr2, ogg_uint32_t RefStride) +{ + ogg_uint32_t XSum; + ogg_uint32_t XXSum; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm4, %%mm4 \n\t" + " pxor %%mm5, %%mm5 \n\t" + " pxor %%mm6, %%mm6 \n\t" + " pxor %%mm7, %%mm7 \n\t" + " mov $8, %%edi \n\t" + "1: \n\t" + " movq (%2), %%mm0 \n\t" /* take 8 bytes */ + + " movq (%3), %%mm2 \n\t" + " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */ + " pavgb %%mm2, %%mm1 \n\t" + + " movq %%mm0, %%mm2 \n\t" + " movq %%mm1, %%mm3 \n\t" + + " punpcklbw %%mm6, %%mm0 \n\t" + " punpcklbw %%mm4, %%mm1 \n\t" + " punpckhbw %%mm6, %%mm2 \n\t" + " punpckhbw %%mm4, %%mm3 \n\t" + + " psubsw %%mm1, %%mm0 \n\t" + " psubsw %%mm3, %%mm2 \n\t" + + " paddw %%mm0, %%mm5 \n\t" + " paddw %%mm2, %%mm5 \n\t" + + " pmaddwd %%mm0, %%mm0 \n\t" + " pmaddwd %%mm2, %%mm2 \n\t" + + " paddd %%mm0, %%mm7 \n\t" + " paddd %%mm2, %%mm7 \n\t" + + " add %5, %2 \n\t" /* Inc pointer into src data */ + " add %6, %3 \n\t" /* Inc pointer into ref data */ + " add %6, %4 \n\t" /* Inc pointer into ref data */ + + " dec %%edi \n\t" + " jnz 1b \n\t" + + " movq %%mm5, %%mm0 \n\t" + " psrlq $32, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movq %%mm5, %%mm0 \n\t" + " psrlq $16, %%mm5 \n\t" + " paddw %%mm0, %%mm5 \n\t" + " movd %%mm5, %%edi \n\t" + " movsx %%di, %%edi \n\t" + " movl %%edi, %0 \n\t" + + " movq %%mm7, %%mm0 \n\t" + " psrlq $32, %%mm7 \n\t" + " paddd %%mm0, %%mm7 \n\t" + " movd %%mm7, %1 \n\t" + + : "=m" (XSum), + "=m" (XXSum), + "+r" (SrcData), + "+r" (RefDataPtr1), + "+r" (RefDataPtr2) + : "m" (SrcStride), + "m" (RefStride) + : "edi", "memory" + ); + + /* Compute and return population variance as mis-match metric. */ + return (( (XXSum<<6) - XSum*XSum )); +} + +void dsp_i386_mmxext_init(DspFunctions *funcs) +{ + funcs->row_sad8 = row_sad8__mmxext; + funcs->col_sad8x8 = col_sad8x8__mmxext; + funcs->sad8x8 = sad8x8__mmxext; + funcs->sad8x8_thres = sad8x8_thres__mmxext; + funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext; + funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext; +} + --- libtheora-1.0alpha3/lib/i386/fdct_mmx.c 1970-01-01 01:00:00.000000000 +0100 +++ libtheora-1.0alpha3/lib/i386/fdct_mmx.c 2004-10-06 17:48:22.509386448 +0200 @@ -0,0 +1,340 @@ +;//========================================================================== +;// +;// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY +;// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +;// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR +;// PURPOSE. +;// +;// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved. 
+;// +;//-------------------------------------------------------------------------- + +#include +#include "dsp.h" + +static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; +static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; +static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; +static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL; +static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL; +static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL; +static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL; + +#if defined(__MINGW32__) || defined(__CYGWIN__) || \ + defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) +# define M(a) "_" #a +#else +# define M(a) #a +#endif + +/*********************************************************************** + * File: fdct_m.asm + * + * Description: + * This function perform 2-D Forward DCT on a 8x8 block + * + * + * Input: Pointers to input source data buffer and destination + * buffer. + * + * Note: none + * + * Special Notes: We try to do the truncation right to match the result + * of the c version. + * + ************************************************************************/ + +/* execute stage 1 of forward DCT */ +#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \ + " movq " #ip0 ", %%mm0 \n\t" \ + " movq " #ip1 ", %%mm1 \n\t" \ + " movq " #ip3 ", %%mm2 \n\t" \ + " movq " #ip5 ", %%mm3 \n\t" \ + " movq %%mm0, %%mm4 \n\t" \ + " movq %%mm1, %%mm5 \n\t" \ + " movq %%mm2, %%mm6 \n\t" \ + " movq %%mm3, %%mm7 \n\t" \ + \ + " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \ + " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \ + " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \ + " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \ + " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \ + " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \ + \ + " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \ + \ + " paddsw %%mm2, %%mm2 \n\t" \ + \ + " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \ + \ + " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \ + " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \ + " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \ + " paddsw %%mm3, %%mm3 \n\t" \ + " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \ + \ + " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \ + /* ------------------------------------------------------------------- */ \ + " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \ + " paddsw %%mm7, %%mm7 \n\t" \ + " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \ + /* ------------------------------------------------------------------- */ \ + " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \ + " paddsw %%mm3, %%mm3 \n\t" \ + \ + " movq %%mm2, %%mm0 \n\t" /* make a copy */ \ + " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \ + \ + " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \ + " paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \ + " psrlw $15, %%mm2 \n\t" \ + " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \ + \ + " movq %%mm3, %%mm2 \n\t" \ + " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \ + \ + " movq %%mm3, %%mm0 
\n\t" \ + " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \ + \ + " psrlw $15, %%mm2 \n\t" \ + " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \ + " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \ + \ + " movq %%mm3," #ip0 " \n\t" \ + /* ------------------------------------------------------------------- */ \ + " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \ + " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \ + \ + " movq " #temp ", %%mm2 \n\t" \ + " movq %%mm2, %%mm0 \n\t" \ + \ + " psrlw $15, %%mm2 \n\t" /* mm3 = xC2S6 * irot_input_y */ \ + " paddw %%mm0, %%mm3 \n\t" \ + \ + " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ + " movq %%mm5, %%mm0 \n\t" \ + \ + " movq %%mm5, %%mm2 \n\t" \ + " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \ + \ + " psrlw $15, %%mm2 \n\t" \ + " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \ + \ + " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \ + " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \ + \ + " movq %%mm5, %%mm0 \n\t" \ + " movq %%mm5, %%mm2 \n\t" \ + \ + " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " movq " #temp ", %%mm3 \n\t" \ + " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \ + \ + " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ + " movq %%mm3, %%mm2 \n\t" \ + \ + " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ + " psubsw %%mm5, %%mm3 \n\t" \ + \ + " movq %%mm3," #ip6 " \n\t" \ + /* ------------------------------------------------------------------- */ \ + " movq "M(xC4S4)", %%mm0 \n\t" \ + " movq %%mm1, %%mm2 \n\t" \ + " movq %%mm1, %%mm3 \n\t" \ + \ + " pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \ + " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \ + \ + " movq %%mm7, %%mm2 \n\t" \ + " movq %%mm7, %%mm3 \n\t" \ + \ + " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \ + " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \ + /* ------------------------------------------------------------------- */ \ + " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \ + " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \ + \ + " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \ + " paddsw %%mm6, %%mm6 \n\t" \ + " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \ + \ + " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \ + " paddsw %%mm1, %%mm1 \n\t" \ + " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \ + /* ------------------------------------------------------------------- */ \ + " movq "M(xC1S7)", %%mm7 \n\t" \ + " movq %%mm1, %%mm2 \n\t" \ + \ + " movq %%mm1, %%mm3 \n\t" \ + " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \ + \ + " movq "M(xC7S1)", %%mm7 \n\t" \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \ + " paddw %%mm2, %%mm1 \n\t" /* Trucated */ \ + \ + " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \ + " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ + \ + " movq %%mm0, %%mm5 \n\t" \ + " movq %%mm0, %%mm2 
\n\t" \ + \ + " movq "M(xC1S7)", %%mm7 \n\t" \ + " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \ + \ + " movq "M(xC7S1)", %%mm7 \n\t" \ + " psrlw $15, %%mm2 \n\t" \ + \ + " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \ + " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \ + \ + " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \ + " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ + \ + " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \ + " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \ + \ + " movq %%mm1," #ip1 " \n\t" \ + " movq %%mm3," #ip7 " \n\t" \ + /* ------------------------------------------------------------------- */ \ + " movq "M(xC3S5)", %%mm0 \n\t" \ + " movq "M(xC5S3)", %%mm1 \n\t" \ + \ + " movq %%mm6, %%mm5 \n\t" \ + " movq %%mm6, %%mm7 \n\t" \ + \ + " movq %%mm4, %%mm2 \n\t" \ + " movq %%mm4, %%mm3 \n\t" \ + \ + " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \ + " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \ + \ + " psrlw $15, %%mm2 \n\t" \ + " psrlw $15, %%mm5 \n\t" \ + \ + " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \ + " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \ + \ + " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \ + " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \ + \ + " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \ + " movq %%mm4," #ip3 " \n\t" \ + \ + " movq %%mm3, %%mm4 \n\t" \ + " movq %%mm7, %%mm6 \n\t" \ + \ + " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \ + " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \ + \ + " paddw %%mm2, %%mm4 \n\t" \ + " paddw %%mm5, %%mm6 \n\t" \ + \ + " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \ + " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \ + \ + " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \ + " movq %%mm3," #ip5 " \n\t" + +#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \ + op0,op1,op2,op3,op4,op5,op6,op7) \ + " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \ + " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \ + " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \ + " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \ + " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \ + " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \ + " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \ + " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \ + " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \ + /* Transpose 2x8 block */ \ + " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \ + " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \ + " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \ + " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \ + " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \ + " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \ + " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \ + " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \ + " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \ + " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \ + " movq %%mm4," #op4 " \n\t" \ + " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \ + " movq %%mm5," #op5 " \n\t" \ + " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \ + " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \ + " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \ + " movq " #op1 ", %%mm5 \n\t" 
/* mm5 = b3 b2 b1 b0 */ \ + " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \ + " movq %%mm6," #op7 " \n\t" \ + " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \ + " movq %%mm1," #op6 " \n\t" \ + " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \ + " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \ + " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \ + " movq %%mm0, %%mm1 \n\t" /* mm1 = b1 a1 b0 a0 */ \ + " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \ + " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \ + " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \ + " movq %%mm0," #op0 " \n\t" \ + " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \ + " movq %%mm1," #op1 " \n\t" \ + " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \ + " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \ + " movq %%mm4," #op3 " \n\t" \ + " movq %%mm2," #op2 " \n\t" + + +static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData) +{ + ogg_int64_t __attribute__((aligned(8))) align_tmp[16]; + ogg_int16_t *const temp= (int16_t*)align_tmp; + + __asm__ __volatile__ ( + " .balign 16 \n\t" + /* + * Input data is an 8x8 block. To make processing of the data more efficent + * we will transpose the block of data to two 4x8 blocks??? + */ + Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0), + (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1)) + Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2)) + + Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0), + 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1)) + Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) + + Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), + 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1)) + Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2)) + + Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), + 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1)) + Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) + + " emms \n\t" + + : "+r" (InputData), + "+r" (OutputData) + : "r" (temp) + : "memory" + ); +} + +void dsp_i386_mmx_fdct_init(DspFunctions *funcs) +{ + funcs->fdct_short = fdct_short__mmx; +} --- libtheora-1.0alpha3/lib/i386/recon_mmx.c 1970-01-01 01:00:00.000000000 +0100 +++ libtheora-1.0alpha3/lib/i386/recon_mmx.c 2004-10-06 17:48:22.510386296 +0200 @@ -0,0 +1,185 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $ + + ********************************************************************/ + +#include "encoder_internal.h" + +static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL; + +#if defined(__MINGW32__) || defined(__CYGWIN__) || \ + defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) +# define M(a) "_" #a +#else +# define M(a) #a +#endif + +static void copy8x8__mmx (unsigned char *src, + unsigned char *dest, + unsigned int stride) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " lea (%2, %2, 2), %%edi \n\t" + + " movq (%1), %%mm0 \n\t" + " movq (%1, %2), %%mm1 \n\t" + " movq (%1, %2, 2), %%mm2 \n\t" + " movq (%1, %%edi), %%mm3 \n\t" + + " lea (%1, %2, 4), %1 \n\t" + + " movq %%mm0, (%0) \n\t" + " movq %%mm1, (%0, %2) \n\t" + " movq %%mm2, (%0, %2, 2) \n\t" + " movq %%mm3, (%0, %%edi) \n\t" + + " lea (%0, %2, 4), %0 \n\t" + + " movq (%1), %%mm0 \n\t" + " movq (%1, %2), %%mm1 \n\t" + " movq (%1, %2, 2), %%mm2 \n\t" + " movq (%1, %%edi), %%mm3 \n\t" + + " movq %%mm0, (%0) \n\t" + " movq %%mm1, (%0, %2) \n\t" + " movq %%mm2, (%0, %2, 2) \n\t" + " movq %%mm3, (%0, %%edi) \n\t" + : "+a" (dest) + : "c" (src), + "d" (stride) + : "memory", "edi" + ); +} + +static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr, + ogg_uint32_t LineStep) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " movq "M(V128)", %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */ + + " lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */ + "1: \n\t" + " movq (%1), %%mm2 \n\t" /* First four input values */ + + " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */ + " por %%mm0, %%mm0 \n\t" + " pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */ + " lea 16(%1), %1 \n\t" /* Step source buffer */ + " cmp %%edi, %1 \n\t" /* are we done */ + + " movq %%mm2, (%0) \n\t" /* store results */ + + " lea (%0, %2), %0 \n\t" /* Step output buffer */ + " jc 1b \n\t" /* Loop back if we are not done */ + : "+r" (ReconPtr) + : "r" (ChangePtr), + "r" (LineStep) + : "memory", "edi" + ); +} + +static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr, + ogg_int16_t *ChangePtr, ogg_uint32_t LineStep) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm0, %%mm0 \n\t" + " lea 128(%1), %%edi \n\t" + + "1: \n\t" + " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */ + + " movq (%1), %%mm4 \n\t" /* first 4 changes */ + " movq %%mm2, %%mm3 \n\t" + " movq 8(%1), %%mm5 \n\t" /* last 4 changes */ + " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */ + " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */ + " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 16-bit #s */ + " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */ + " add %3, %2 \n\t" /* next row of reference pixels */ + " packuswb %%mm3, %%mm2 \n\t" /* pack result to unsigned 8-bit values */ + " lea 16(%1), %1 \n\t" /* next row of changes */ + " cmp %%edi, %1 \n\t" /* are we done? 
*/ + + " movq %%mm2, (%0) \n\t" /* store result */ + + " lea (%0, %3), %0 \n\t" /* next row of output */ + " jc 1b \n\t" + : "+r" (ReconPtr) + : "r" (ChangePtr), + "r" (RefPtr), + "r" (LineStep) + : "memory", "edi" + ); +} + +static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1, + unsigned char *RefPtr2, ogg_int16_t *ChangePtr, + ogg_uint32_t LineStep) +{ + __asm__ __volatile__ ( + " .balign 16 \n\t" + + " pxor %%mm0, %%mm0 \n\t" + " lea 128(%1), %%edi \n\t" + + "1: \n\t" + " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */ + " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8 reference pixels */ + + " movq %%mm2, %%mm3 \n\t" + " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as positive 16-bit #s */ + " movq %%mm4, %%mm5 \n\t" + " movq (%1), %%mm6 \n\t" /* first 4 changes */ + " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as positive 16-bit #s */ + " movq 8(%1), %%mm7 \n\t" /* last 4 changes */ + " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as positive 16-bit #s */ + " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as positive 16-bit #s */ + " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 + ref2) */ + " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 + ref2) */ + " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 + ref2)/2 */ + " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 + ref2)/2 */ + " paddw %%mm6, %%mm2 \n\t" /* add changes to start */ + " paddw %%mm7, %%mm3 \n\t" /* add changes to end */ + " lea 16(%1), %1 \n\t" /* next row of changes */ + " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to unsigned 8-bit */ + " add %4, %2 \n\t" /* next row of reference pixels */ + " add %4, %3 \n\t" /* next row of reference pixels */ + " movq %%mm2, (%0) \n\t" /* store result */ + " add %4, %0 \n\t" /* next row of output */ + " cmp %%edi, %1 \n\t" /* are we done? */ + " jc 1b \n\t" + : "+r" (ReconPtr) + : "r" (ChangePtr), + "r" (RefPtr1), + "r" (RefPtr2), + "m" (LineStep) + : "memory", "edi" + ); +} + +void dsp_i386_mmx_recon_init(DspFunctions *funcs) +{ + funcs->copy8x8 = copy8x8__mmx; + funcs->recon_intra8x8 = recon_intra8x8__mmx; + funcs->recon_inter8x8 = recon_inter8x8__mmx; + funcs->recon_inter8x8_half = recon_inter8x8_half__mmx; +} + --- libtheora-1.0alpha3/lib/Makefile.am 2003-06-15 02:56:42.000000000 +0200 +++ libtheora-1.0alpha3/lib/Makefile.am 2004-10-06 17:48:22.510386296 +0200 @@ -6,7 +6,8 @@ libtheora_la_SOURCES = encode.c hufftables.h quant_lookup.h \ encoder_internal.h idct.c reconstruct.c block_inline.h \ - encoder_lookup.h mcomp.c scan.c blockmap.c misc_common.c \ + encoder_lookup.h cpu.c dsp.h dsp.c i386/dsp_mmx.c i386/dsp_mmxext.c \ + i386/recon_mmx.c i386/fdct_mmx.c mcomp.c scan.c blockmap.c misc_common.c \ dct.c frarray.c pb.c dct_decode.c frinit.c pp.c dct_encode.c \ huffman.c pp.h toplevel.c decode.c huffman.h quant.c \ comment.c toplevel_lookup.h mcomp.h --- libtheora-1.0alpha3/lib/mcomp.c 2003-12-03 09:59:41.000000000 +0100 +++ libtheora-1.0alpha3/lib/mcomp.c 2004-10-06 17:48:22.543381280 +0200 @@ -17,6 +17,7 @@ #include #include +#include "dsp.h" #include "encoder_internal.h" /* Initialises motion compentsation. 
*/ @@ -100,161 +101,22 @@ unsigned char * RefDataPtr1, unsigned char * RefDataPtr2, ogg_uint32_t PixelsPerLine ) { - ogg_uint32_t i; - ogg_int32_t XSum=0; - ogg_int32_t XXSum=0; ogg_int32_t DiffVal; - ogg_int32_t AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2)); + ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2); + ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA; /* Mode of interpolation chosen based upon on the offset of the second reference pointer */ - if ( AbsRefOffset == 0 ) { - for ( i=0; i BestSoFar )break; - - /* Step to next row of block. */ - NewDataPtr += PixelsPerLine; - RefDataPtr += STRIDE_EXTRA+PixelsPerLine; - } - return DiffVal; } @@ -265,118 +127,60 @@ ogg_uint32_t ErrorSoFar, ogg_uint32_t BestSoFar ) { - ogg_uint32_t i; ogg_uint32_t DiffVal = ErrorSoFar; ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2); ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA; if ( RefOffset == 0 ) { /* Simple case as for non 0.5 pixel */ - DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine, - ErrorSoFar); + DiffVal += dsp_static_sad8x8 (SrcData, PixelsPerLine, + RefDataPtr1, RefPixelsPerLine); } else { - for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) { - DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] + - (int)RefDataPtr2[0]) / 2) ); - DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] + - (int)RefDataPtr2[1]) / 2) ); - DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] + - (int)RefDataPtr2[2]) / 2) ); - DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] + - (int)RefDataPtr2[3]) / 2) ); - DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] + - (int)RefDataPtr2[4]) / 2) ); - DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] + - (int)RefDataPtr2[5]) / 2) ); - DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] + - (int)RefDataPtr2[6]) / 2) ); - DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] + - (int)RefDataPtr2[7]) / 2) ); - - if ( DiffVal > BestSoFar ) break; - - /* Step to next row of block. */ - SrcData += PixelsPerLine; - RefDataPtr1 += RefPixelsPerLine; - RefDataPtr2 += RefPixelsPerLine; - } + DiffVal += dsp_static_sad8x8_xy2_thres (SrcData, PixelsPerLine, + RefDataPtr1, + RefDataPtr2, RefPixelsPerLine, BestSoFar); } return DiffVal; } -static ogg_uint32_t GetIntraError (unsigned char * DataPtr, - ogg_uint32_t PixelsPerLine ) { - ogg_uint32_t i; - ogg_uint32_t XSum=0; - ogg_uint32_t XXSum=0; - unsigned char *DiffPtr; - - /* Loop expanded out for speed. 
*/ - DiffPtr = DataPtr; - - for ( i=0; ipb.display_fragments[LocalFragIndex] ) IntraError += - GetIntraError(&cpi-> + dsp_static_intra8x8_err (&cpi-> ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], - PixelsPerLine ); - + PixelsPerLine); LocalFragIndex++; if ( cpi->pb.display_fragments[LocalFragIndex] ) IntraError += - GetIntraError(&cpi-> + dsp_static_intra8x8_err (&cpi-> ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], - PixelsPerLine ); + PixelsPerLine); LocalFragIndex = FragIndex + cpi->pb.HFragments; if ( cpi->pb.display_fragments[LocalFragIndex] ) IntraError += - GetIntraError(&cpi-> + dsp_static_intra8x8_err (&cpi-> ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], - PixelsPerLine ); + PixelsPerLine); LocalFragIndex++; if ( cpi->pb.display_fragments[LocalFragIndex] ) IntraError += - GetIntraError(&cpi-> + dsp_static_intra8x8_err (&cpi-> ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], - PixelsPerLine ); + PixelsPerLine); + + dsp_static_restore_fpu (); return IntraError; } @@ -400,6 +204,8 @@ unsigned char * SrcPtr1; unsigned char * RefPtr1; + dsp_static_save_fpu (); + /* Work out pixel offset into source buffer. */ PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex]; @@ -462,6 +268,9 @@ InterError += GetInterErr( SrcPtr1, RefPtr1, &RefPtr1[RefPtr2Offset], PixelsPerLine ); } + + dsp_static_restore_fpu (); + return InterError; } @@ -496,6 +305,8 @@ unsigned char * RefDataPtr1; unsigned char * RefDataPtr2; + dsp_static_save_fpu (); + /* Note which of the four blocks in the macro block are to be included in the search. */ MBlockDispFrags[0] = @@ -518,20 +329,20 @@ /* Check the 0,0 candidate. */ if ( MBlockDispFrags[0] ) { - Error = GetSumAbsDiffs( SrcPtr[0], RefPtr, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, RefPtr, + PixelsPerLine + STRIDE_EXTRA); } if ( MBlockDispFrags[1] ) { - Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, RefPtr + 8, + PixelsPerLine + STRIDE_EXTRA); } if ( MBlockDispFrags[2] ) { - Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset, + PixelsPerLine + STRIDE_EXTRA); } if ( MBlockDispFrags[3] ) { - Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8, + PixelsPerLine + STRIDE_EXTRA); } /* Set starting values to results of 0, 0 vector. 
*/ @@ -554,24 +365,23 @@ /* Get the score for the current offset */ if ( MBlockDispFrags[0] ) { - Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr, + PixelsPerLine + STRIDE_EXTRA); } if ( MBlockDispFrags[1] && (Error < MinError) ) { - Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8, - PixelsPerLine, Error, MinError ); + Error += dsp_static_sad8x8_thres (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8, + PixelsPerLine + STRIDE_EXTRA, MinError); } if ( MBlockDispFrags[2] && (Error < MinError) ) { - Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset, - PixelsPerLine, Error, MinError ); + Error += dsp_static_sad8x8_thres (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset, + PixelsPerLine + STRIDE_EXTRA, MinError); } if ( MBlockDispFrags[3] && (Error < MinError) ) { - Error = GetNextSumAbsDiffs( SrcPtr[3], - CandidateBlockPtr + RefRow2Offset + 8, - PixelsPerLine, Error, MinError ); + Error += dsp_static_sad8x8_thres (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8, + PixelsPerLine + STRIDE_EXTRA, MinError); } if ( Error < MinError ) { @@ -652,6 +462,8 @@ InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr, FragIndex, MV->x, MV->y, PixelsPerLine ); + dsp_static_restore_fpu (); + /* Return score of best matching block. */ return InterMVError; } @@ -684,6 +496,8 @@ unsigned char * RefDataPtr1; unsigned char * RefDataPtr2; + dsp_static_save_fpu (); + /* Note which of the four blocks in the macro block are to be included in the search. */ MBlockDispFrags[0] = cpi-> @@ -717,20 +531,20 @@ /* Summ errors for each block. */ if ( MBlockDispFrags[0] ) { - Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr, + PixelsPerLine + STRIDE_EXTRA); } if ( MBlockDispFrags[1] ){ - Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8, + PixelsPerLine + STRIDE_EXTRA); } if ( MBlockDispFrags[2] ){ - Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset, + PixelsPerLine + STRIDE_EXTRA); } if ( MBlockDispFrags[3] ){ - Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8, - PixelsPerLine, Error); + Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8, + PixelsPerLine + STRIDE_EXTRA); } /* Was this the best so far */ @@ -808,6 +622,8 @@ InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr, FragIndex, MV->x, MV->y, PixelsPerLine ); + dsp_static_restore_fpu (); + /* Return score of best matching block. */ return InterMVError; } @@ -850,8 +666,8 @@ for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){ /* Get the block error score. */ - Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr, - PixelsPerLine, 0); + Error = dsp_static_sad8x8 (SrcPtr, PixelsPerLine, CandidateBlockPtr, + PixelsPerLine + STRIDE_EXTRA); /* Was this the best so far */ if ( Error < MinError ) { @@ -911,6 +727,8 @@ MOTION_VECTOR *MV ) { ogg_uint32_t InterMVError; + dsp_static_save_fpu (); + /* For the moment the 4MV mode is only deemd to be valid if all four Y blocks are to be updated */ /* This May be adapted later. 
*/ @@ -941,6 +759,8 @@ InterMVError = HUGE_ERROR; } + dsp_static_restore_fpu (); + /* Return score of best matching block. */ return InterMVError; } --- libtheora-1.0alpha3/lib/pp.c 2003-12-03 09:59:41.000000000 +0100 +++ libtheora-1.0alpha3/lib/pp.c 2004-10-06 17:48:22.545380976 +0200 @@ -19,6 +19,7 @@ #include #include "encoder_internal.h" #include "pp.h" +#include "dsp.h" #define MAX(a, b) ((a>b)?a:b) #define MIN(a, b) ((aPostProcessingLevel){ case 8: /* on a slow machine, use a simpler and faster deblocking filter */ @@ -947,5 +948,6 @@ DeringFrame(pbi, pbi->PostProcessBuffer, pbi->PostProcessBuffer); break; } + dsp_static_restore_fpu (); } --- libtheora-1.0alpha3/lib/reconstruct.c 2003-12-03 09:59:41.000000000 +0100 +++ libtheora-1.0alpha3/lib/reconstruct.c 2004-10-06 17:48:22.574376568 +0200 @@ -16,12 +16,28 @@ ********************************************************************/ #include "encoder_internal.h" +#include "dsp.h" +#include "cpu.h" -void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr, - ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) { +static void copy8x8__c (unsigned char *src, + unsigned char *dest, + unsigned int stride) +{ + int j; + for ( j = 0; j < 8; j++ ){ + ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0]; + ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1]; + src+=stride; + dest+=stride; + } +} + +static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr, + ogg_uint32_t LineStep) +{ ogg_uint32_t i; - for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){ + for (i = 8; i; i--){ /* Convert the data back to 8 bit unsigned */ /* Saturate the output to unsigend 8 bit values */ ReconPtr[0] = clamp255( ChangePtr[0] + 128 ); @@ -34,17 +50,16 @@ ReconPtr[7] = clamp255( ChangePtr[7] + 128 ); ReconPtr += LineStep; - ChangePtr += BLOCK_HEIGHT_WIDTH; + ChangePtr += 8; } - } -void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr, - unsigned char * RefPtr, ogg_int16_t * ChangePtr, - ogg_uint32_t LineStep ) { +static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr, + ogg_int16_t *ChangePtr, ogg_uint32_t LineStep) +{ ogg_uint32_t i; - for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) { + for (i = 8; i; i--){ ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]); ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]); ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]); @@ -54,19 +69,19 @@ ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]); ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]); - ChangePtr += BLOCK_HEIGHT_WIDTH; + ChangePtr += 8; ReconPtr += LineStep; RefPtr += LineStep; } - } -void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr, - unsigned char * RefPtr1, unsigned char * RefPtr2, - ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) { +static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1, + unsigned char *RefPtr2, ogg_int16_t *ChangePtr, + ogg_uint32_t LineStep) +{ ogg_uint32_t i; - for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){ + for (i = 8; i; i--){ ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] ); ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] ); ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] ); @@ -76,10 +91,20 @@ ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] ); ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] ); - ChangePtr += BLOCK_HEIGHT_WIDTH; + ChangePtr += 8; ReconPtr += LineStep; RefPtr1 += LineStep; RefPtr2 += 
LineStep; } +} +void dsp_recon_init (DspFunctions *funcs) +{ + funcs->copy8x8 = copy8x8__c; + funcs->recon_intra8x8 = recon_intra8x8__c; + funcs->recon_inter8x8 = recon_inter8x8__c; + funcs->recon_inter8x8_half = recon_inter8x8_half__c; + if (cpu_flags & CPU_X86_MMX) { + dsp_i386_mmx_recon_init(&dsp_funcs); + } } --- libtheora-1.0alpha3/lib/scan.c 2003-12-03 09:59:41.000000000 +0100 +++ libtheora-1.0alpha3/lib/scan.c 2004-10-06 17:48:22.609371248 +0200 @@ -19,9 +19,20 @@ #include #include #include "encoder_internal.h" +#include "dsp.h" #define MAX_SEARCH_LINE_LEN 7 +#define SET8_0(ptr) \ + ((ogg_uint32_t *)ptr)[0] = 0x00000000; \ + ((ogg_uint32_t *)ptr)[1] = 0x00000000; +#define SET8_1(ptr) \ + ((ogg_uint32_t *)ptr)[0] = 0x01010101; \ + ((ogg_uint32_t *)ptr)[1] = 0x01010101; +#define SET8_8(ptr) \ + ((ogg_uint32_t *)ptr)[0] = 0x08080808; \ + ((ogg_uint32_t *)ptr)[1] = 0x08080808; + static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = { 0, 0, 0, 0, 2, 4, 12, 24 }; @@ -384,69 +395,6 @@ ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4)); } -static ogg_uint32_t ScalarRowSAD( unsigned char * Src1, - unsigned char * Src2 ){ - ogg_uint32_t SadValue; - ogg_uint32_t SadValue1; - - SadValue = abs( Src1[0] - Src2[0] ) + abs( Src1[1] - Src2[1] ) + - abs( Src1[2] - Src2[2] ) + abs( Src1[3] - Src2[3] ); - - SadValue1 = abs( Src1[4] - Src2[4] ) + abs( Src1[5] - Src2[5] ) + - abs( Src1[6] - Src2[6] ) + abs( Src1[7] - Src2[7] ); - - SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1; - - return SadValue; -} - -static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi, - unsigned char * Src1, - unsigned char * Src2 ){ - ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0}; - ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0}; - ogg_uint32_t MaxSad = 0; - ogg_uint32_t i; - - for ( i = 0; i < 4; i++ ){ - SadValue[0] += abs(Src1[0] - Src2[0]); - SadValue[1] += abs(Src1[1] - Src2[1]); - SadValue[2] += abs(Src1[2] - Src2[2]); - SadValue[3] += abs(Src1[3] - Src2[3]); - SadValue[4] += abs(Src1[4] - Src2[4]); - SadValue[5] += abs(Src1[5] - Src2[5]); - SadValue[6] += abs(Src1[6] - Src2[6]); - SadValue[7] += abs(Src1[7] - Src2[7]); - - Src1 += ppi->PlaneStride; - Src2 += ppi->PlaneStride; - } - - for ( i = 0; i < 4; i++ ){ - SadValue2[0] += abs(Src1[0] - Src2[0]); - SadValue2[1] += abs(Src1[1] - Src2[1]); - SadValue2[2] += abs(Src1[2] - Src2[2]); - SadValue2[3] += abs(Src1[3] - Src2[3]); - SadValue2[4] += abs(Src1[4] - Src2[4]); - SadValue2[5] += abs(Src1[5] - Src2[5]); - SadValue2[6] += abs(Src1[6] - Src2[6]); - SadValue2[7] += abs(Src1[7] - Src2[7]); - - Src1 += ppi->PlaneStride; - Src2 += ppi->PlaneStride; - } - - for ( i = 0; i < 8; i++ ){ - if ( SadValue[i] > MaxSad ) - MaxSad = SadValue[i]; - if ( SadValue2[i] > MaxSad ) - MaxSad = SadValue2[i]; - } - - return MaxSad; -} - - static int RowSadScan( PP_INSTANCE *ppi, unsigned char * YuvPtr1, unsigned char * YuvPtr2, @@ -475,7 +423,7 @@ for ( i = 0; i < ppi->PlaneHFragments; i ++ ){ if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){ /* Calculate the SAD score for the block row */ - GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2); + GrpSad = dsp_static_row_sad8(LocalYuvPtr1,LocalYuvPtr2); /* Now test the group SAD score */ if ( GrpSad > LocalGrpLowSadThresh ){ @@ -532,7 +480,7 @@ /* Skip if block already marked to be coded. 
*/ if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){ /* Calculate the SAD score for the block column */ - MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 ); + MaxSad = dsp_static_col_sad8x8(LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride ); /* Now test the group SAD score */ if ( MaxSad > LocalGrpLowSadThresh ){ @@ -758,7 +706,7 @@ if (*DispFragPtr == CANDIDATE_BLOCK){ /* Clear down entries in changed locals array */ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); for ( j = 0; j < HFRAGPIXELS; j++ ){ /* Take a local copy of the measured difference. */ @@ -777,10 +725,10 @@ }else{ /* If we are breaking out here mark all pixels as changed. */ if ( *DispFragPtr > BLOCK_NOT_CODED ){ - memset(bits_map_ptr,1,8); - memset(ChLocalsPtr,8,8); + SET8_1(bits_map_ptr); + SET8_8(ChLocalsPtr); }else{ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); } } @@ -816,7 +764,7 @@ /* Test for break out conditions to save time. */ if (*DispFragPtr == CANDIDATE_BLOCK){ /* Clear down entries in changed locals array */ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); for ( j = 0; j < HFRAGPIXELS; j++ ){ /* Take a local copy of the measured difference. */ @@ -839,10 +787,10 @@ }else{ /* If we are breaking out here mark all pixels as changed. */ if ( *DispFragPtr > BLOCK_NOT_CODED ){ - memset(bits_map_ptr,1,8); - memset(ChLocalsPtr,8,8); + SET8_1(bits_map_ptr); + SET8_8(ChLocalsPtr); }else{ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); } } @@ -876,7 +824,7 @@ /* Test for break out conditions to save time. */ if (*DispFragPtr == CANDIDATE_BLOCK){ /* Clear down entries in changed locals array */ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); for ( j = 0; j < HFRAGPIXELS; j++ ){ /* Take a local copy of the measured difference. */ Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j]; @@ -899,10 +847,10 @@ }else{ /* If we are breaking out here mark all pixels as changed. */ if ( *DispFragPtr > BLOCK_NOT_CODED ){ - memset(bits_map_ptr,1,8); - memset(ChLocalsPtr,8,8); + SET8_1(bits_map_ptr); + SET8_8(ChLocalsPtr); }else{ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); } } @@ -935,7 +883,7 @@ /* Test for break out conditions to save time. */ if (*DispFragPtr == CANDIDATE_BLOCK){ /* Clear down entries in changed locals array */ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); for ( j = 0; j < HFRAGPIXELS; j++ ){ /* Take a local copy of the measured difference. 
*/ @@ -959,10 +907,10 @@ }else{ /* If we are breaking out here mark all pixels as changed.*/ if ( *DispFragPtr > BLOCK_NOT_CODED ) { - memset(bits_map_ptr,1,8); - memset(ChLocalsPtr,8,8); + SET8_1(bits_map_ptr); + SET8_8(ChLocalsPtr); }else{ - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); } } /* If we have a lot of changed pixels for this fragment on this @@ -1071,7 +1019,7 @@ } }else{ if ( *DispFragPtr > BLOCK_NOT_CODED ) - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); /* Step pointers */ ChLocalsPtr += HFRAGPIXELS; @@ -1133,7 +1081,7 @@ } }else{ if ( *DispFragPtr > BLOCK_NOT_CODED ) - memset(ChLocalsPtr,0,8); + SET8_0(ChLocalsPtr); /* Step pointers */ ChLocalsPtr += HFRAGPIXELS; @@ -2126,10 +2074,12 @@ /* Fast break out test for obvious yes and no cases in this row of blocks */ if ( i < ppi->PlaneVFragments ){ + dsp_static_save_fpu (); UpdatedOrCandidateBlocks = RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ); - if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) ) - UpdatedOrCandidateBlocks = 1; + UpdatedOrCandidateBlocks |= + ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ); + dsp_static_restore_fpu (); }else{ /* Make sure we still call other functions if RowSadScan() disabled */ UpdatedOrCandidateBlocks = 1; --- libtheora-1.0alpha3/lib/toplevel.c 2004-03-18 03:00:30.000000000 +0100 +++ libtheora-1.0alpha3/lib/toplevel.c 2004-10-06 17:48:22.611370944 +0200 @@ -787,6 +787,8 @@ CP_INSTANCE *cpi; + dsp_static_init (); + memset(th, 0, sizeof(*th)); th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi)); @@ -1446,6 +1448,8 @@ PB_INSTANCE *pbi; codec_setup_info *ci; + dsp_static_init (); + ci=(codec_setup_info *)c->codec_setup; th->internal_decode=pbi=_ogg_calloc(1,sizeof(*pbi));
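Editorial note (not part of the patch): the change set above routes the hot 8x8 block primitives (copy, reconstruction, SAD, forward DCT) through a function table that is filled with C fallbacks and then overridden with the MMX/MMXEXT routines when cpu_flags reports support. The dsp.h/dsp.c files that declare that table are added by the patch but are not reproduced in this excerpt, so the sketch below is an assumption reconstructed from the init functions (dsp_recon_init, dsp_i386_mmxext_init, dsp_i386_mmx_fdct_init) and call sites (dsp_static_sad8x8 and friends) that are visible here. The C routine sad8x8__c_ref spells out exactly what the psadbw-based sad8x8__mmxext computes, which makes it easy to cross-check the assembly on random blocks; DspFunctionsSketch, sad8x8__c_ref and the main() harness are illustrative names, not identifiers from the library.

/* Hedged sketch, assuming the shape of the dispatch table; compile with
 * any C compiler, no libtheora headers required. */

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int ogg_uint32_t;   /* stand-in for the ogg typedef used above */

/* Assumed layout: one pointer per hand-optimised primitive.  In the library
 * the table is first filled with C fallbacks, then dsp_i386_mmx*_init()
 * overwrites entries when cpu_flags allows it. */
typedef struct {
  ogg_uint32_t (*sad8x8)(unsigned char *ptr1, ogg_uint32_t stride1,
                         unsigned char *ptr2, ogg_uint32_t stride2);
  /* ... further members such as copy8x8, recon_intra8x8, fdct_short ... */
} DspFunctionsSketch;

/* Plain-C reference for the MMXEXT SAD: sum of absolute byte differences
 * over an 8x8 block, each operand stepping by its own row stride. */
static ogg_uint32_t sad8x8__c_ref (unsigned char *ptr1, ogg_uint32_t stride1,
                                   unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t sad = 0;
  int i, j;
  for (i = 0; i < 8; i++) {
    for (j = 0; j < 8; j++)
      sad += abs((int)ptr1[j] - (int)ptr2[j]);
    ptr1 += stride1;   /* next row of the source block */
    ptr2 += stride2;   /* next row of the reference block */
  }
  return sad;
}

int main (void)
{
  unsigned char a[8 * 8], b[8 * 8];
  int i;
  DspFunctionsSketch funcs;

  for (i = 0; i < 64; i++) {
    a[i] = (unsigned char)(i * 3);
    b[i] = (unsigned char)(255 - i);
  }

  /* The real library selects the entry via cpu_flags at dsp_static_init()
   * time; here only the C fallback is installed. */
  funcs.sad8x8 = sad8x8__c_ref;

  printf("sad8x8 = %u\n", funcs.sad8x8(a, 8, b, 8));
  return 0;
}

The same harness can be extended to compare sad8x8__c_ref against the MMXEXT implementation on an x86 build, which is a cheap way to validate the inline-assembly constraints before wiring a new routine into the table.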