Go to:
Gentoo Home
Documentation
Forums
Lists
Bugs
Planet
Store
Wiki
Get Gentoo!
Gentoo's Bugzilla – Attachment 42417 Details for
Bug 68549
media-libs/libtheora massive speed difference
Home
|
New
–
[Ex]
|
Browse
|
Search
|
Privacy Policy
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
libtheora mmx patch
libtheora-1.0alpha3-mmx.patch (text/plain), 141.58 KB, created by
Zaheer Abbas Merali (RETIRED)
on 2004-10-22 07:33:11 UTC
(
hide
)
Description:
libtheora mmx patch
Filename:
MIME Type:
Creator:
Zaheer Abbas Merali (RETIRED)
Created:
2004-10-22 07:33:11 UTC
Size:
141.58 KB
patch
obsolete
>diff -Naur libtheora-1.0alpha3/lib/blockmap.c libtheora-1.0alpha3.mmx/lib/blockmap.c >--- libtheora-1.0alpha3/lib/blockmap.c 2003-12-03 09:59:39.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/blockmap.c 2004-10-06 17:48:22.202433112 +0200 >@@ -21,7 +21,7 @@ > ogg_uint32_t FirstSB, > ogg_uint32_t FirstFrag, ogg_uint32_t HFrags, > ogg_uint32_t VFrags ){ >- ogg_uint32_t i, j; >+ ogg_uint32_t i, j = 0; > ogg_uint32_t xpos; > ogg_uint32_t ypos; > ogg_uint32_t SBrow, SBcol; >diff -Naur libtheora-1.0alpha3/lib/cpu.c libtheora-1.0alpha3.mmx/lib/cpu.c >--- libtheora-1.0alpha3/lib/cpu.c 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/cpu.c 2004-10-06 17:48:22.203432960 +0200 >@@ -0,0 +1,107 @@ >+/******************************************************************** >+ * * >+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * >+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * >+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * >+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* >+ * * >+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * >+ * by the Xiph.Org Foundation http://www.xiph.org/ * >+ * * >+ ******************************************************************** >+ >+ function: >+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ >+ >+ ********************************************************************/ >+ >+#include "cpu.h" >+ >+ogg_uint32_t cpu_flags = 0; >+ >+#if 1 >+static ogg_uint32_t cpu_get_flags (void) >+{ >+ ogg_uint32_t eax, ebx, ecx, edx; >+ ogg_uint32_t flags; >+ >+#define cpuid(op,eax,ebx,ecx,edx) \ >+ asm volatile ("pushl %%ebx \n\t" \ >+ "cpuid \n\t" \ >+ "movl %%ebx,%1 \n\t" \ >+ "popl %%ebx" \ >+ : "=a" (eax), \ >+ "=r" (ebx), \ >+ "=c" (ecx), \ >+ "=d" (edx) \ >+ : "a" (op) \ >+ : "cc") >+ >+ asm volatile ("pushfl \n\t" >+ "pushfl \n\t" >+ "popl %0 \n\t" >+ "movl %0,%1 \n\t" >+ "xorl $0x200000,%0 \n\t" >+ "pushl %0 \n\t" >+ "popfl \n\t" >+ "pushfl \n\t" >+ "popl %0 \n\t" >+ "popfl" >+ : "=r" (eax), >+ "=r" (ebx) >+ : >+ : "cc"); >+ >+ if (eax == ebx) /* no cpuid */ >+ return 0; >+ >+ cpuid(0, eax, ebx, ecx, edx); >+ >+ if (ebx == 0x756e6547 && >+ edx == 0x49656e69 && >+ ecx == 0x6c65746e) { >+ /* intel */ >+ >+ inteltest: >+ cpuid(1, eax, ebx, ecx, edx); >+ if ((edx & 0x00800000) == 0) >+ return 0; >+ flags = CPU_X86_MMX; >+ if (edx & 0x02000000) >+ flags |= CPU_X86_MMXEXT | CPU_X86_SSE; >+ if (edx & 0x04000000) >+ flags |= CPU_X86_SSE2; >+ return flags; >+ } else if (ebx == 0x68747541 && >+ edx == 0x69746e65 && >+ ecx == 0x444d4163) { >+ /* AMD */ >+ cpuid(0x80000000, eax, ebx, ecx, edx); >+ if ((unsigned)eax < 0x80000001) >+ goto inteltest; >+ cpuid(0x80000001, eax, ebx, ecx, edx); >+ if ((edx & 0x00800000) == 0) >+ return 0; >+ flags = CPU_X86_MMX; >+ if (edx & 0x80000000) >+ flags |= CPU_X86_3DNOW; >+ if (edx & 0x00400000) >+ flags |= CPU_X86_MMXEXT; >+ return flags; >+ } >+ else { >+ /* implement me */ >+ } >+ >+ return flags; >+} >+#else >+static ogg_uint32_t cpu_get_flags (void) { >+ return 
0; >+} >+#endif >+ >+void cpu_init () >+{ >+ cpu_flags = cpu_get_flags(); >+} >diff -Naur libtheora-1.0alpha3/lib/cpu.h libtheora-1.0alpha3.mmx/lib/cpu.h >--- libtheora-1.0alpha3/lib/cpu.h 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/cpu.h 2004-10-06 17:48:22.243426880 +0200 >@@ -0,0 +1,28 @@ >+/******************************************************************** >+ * * >+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * >+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * >+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * >+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * >+ * * >+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * >+ * by the Xiph.Org Foundation http://www.xiph.org/ * >+ * * >+ ******************************************************************** >+ >+ function: >+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ >+ >+ ********************************************************************/ >+ >+#include "encoder_internal.h" >+ >+extern ogg_uint32_t cpu_flags; >+ >+#define CPU_X86_MMX (1<<0) >+#define CPU_X86_3DNOW (1<<1) >+#define CPU_X86_MMXEXT (1<<2) >+#define CPU_X86_SSE (1<<3) >+#define CPU_X86_SSE2 (1<<4) >+ >+void cpu_init () ; >diff -Naur libtheora-1.0alpha3/lib/dct.c libtheora-1.0alpha3.mmx/lib/dct.c >--- libtheora-1.0alpha3/lib/dct.c 2003-12-03 09:59:39.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/dct.c 2004-10-06 17:48:22.244426728 +0200 >@@ -16,6 +16,7 @@ > ********************************************************************/ > > #include "encoder_internal.h" >+#include "cpu.h" > > static ogg_int32_t xC1S7 = 64277; > static ogg_int32_t xC2S6 = 60547; >@@ -28,7 +29,7 @@ > #define SIGNBITDUPPED(X) ((signed )(((X) & 0x80000000)) >> 31) > #define DOROUND(X) ( (SIGNBITDUPPED(X) & (0xffff)) + (X) ) > >-void fdct_short ( ogg_int16_t * InputData, ogg_int16_t * OutputData ){ >+static void fdct_short__c ( ogg_int16_t * InputData, 
ogg_int16_t * OutputData ){ > int loop; > > ogg_int32_t is07, is12, is34, is56; >@@ -251,3 +252,12 @@ > op ++; > } > } >+ >+void dsp_dct_init (DspFunctions *funcs) >+{ >+ funcs->fdct_short = fdct_short__c; >+ if (cpu_flags & CPU_X86_MMX) { >+ dsp_i386_mmx_fdct_init(&dsp_funcs); >+ } >+} >+ >diff -Naur libtheora-1.0alpha3/lib/dct_decode.c libtheora-1.0alpha3.mmx/lib/dct_decode.c >--- libtheora-1.0alpha3/lib/dct_decode.c 2004-03-18 18:10:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/dct_decode.c 2004-10-06 17:48:22.284420648 +0200 >@@ -18,6 +18,7 @@ > #include <stdlib.h> > #include <string.h> > #include "encoder_internal.h" >+#include "dsp.h" > > > #define GOLDEN_FRAME_THRESH_Q 50 >@@ -112,22 +113,6 @@ > SetupBoundingValueArray_Generic(pbi, FLimit); > } > >-void CopyBlock(unsigned char *src, >- unsigned char *dest, >- unsigned int srcstride){ >- unsigned char *s = src; >- unsigned char *d = dest; >- unsigned int stride = srcstride; >- >- int j; >- for ( j = 0; j < 8; j++ ){ >- ((ogg_uint32_t*)d)[0] = ((ogg_uint32_t*)s)[0]; >- ((ogg_uint32_t*)d)[1] = ((ogg_uint32_t*)s)[1]; >- s+=stride; >- d+=stride; >- } >-} >- > static void ExpandKFBlock ( PB_INSTANCE *pbi, ogg_int32_t FragmentNumber ){ > ogg_uint32_t ReconPixelsPerLine; > ogg_int32_t ReconPixelIndex; >@@ -160,8 +145,8 @@ > ReconPixelIndex = pbi->recon_pixel_index_table[FragmentNumber]; > > /* Get the pixel index for the first pixel in the fragment. 
*/ >- ReconIntra( pbi, (unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]), >- (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine ); >+ dsp_static_recon_intra8x8 ((unsigned char *)(&pbi->ThisFrameRecon[ReconPixelIndex]), >+ (ogg_uint16_t *)pbi->ReconDataBuffer, ReconPixelsPerLine); > > } > >@@ -237,10 +222,9 @@ > /* Reconstruct the pixel data using the last frame reconstruction > and change data when the motion vector is (0,0), the recon is > based on the lastframe without loop filtering---- for testing */ >- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], >+ dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex], > &pbi->LastFrameRecon[ReconPixelIndex], >- pbi->ReconDataBuffer, ReconPixelsPerLine ); >- >+ pbi->ReconDataBuffer, ReconPixelsPerLine); > }else if ( ModeUsesMC[pbi->CodingMode] ) { > /* The mode uses a motion vector. */ > /* Get vector from list */ >@@ -287,29 +271,30 @@ > if ( (int)(LastFrameRecPtr - LastFrameRecPtr2) == 0 ) { > /* Reconstruct the pixel dats from the reference frame and change data > (no half pixel in this case as the two references were the same. */ >- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], >+ dsp_static_recon_inter8x8 ( >+ &pbi->ThisFrameRecon[ReconPixelIndex], > LastFrameRecPtr, pbi->ReconDataBuffer, >- ReconPixelsPerLine ); >+ ReconPixelsPerLine); > }else{ > /* Fractional pixel reconstruction. */ > /* Note that we only use two pixels per reconstruction even for > the diagonal. 
*/ >- ReconInterHalfPixel2( pbi,&pbi->ThisFrameRecon[ReconPixelIndex], >+ dsp_static_recon_inter8x8_half(&pbi->ThisFrameRecon[ReconPixelIndex], > LastFrameRecPtr, LastFrameRecPtr2, >- pbi->ReconDataBuffer, ReconPixelsPerLine ); >+ pbi->ReconDataBuffer, ReconPixelsPerLine); > } > } else if ( pbi->CodingMode == CODE_USING_GOLDEN ){ > /* Golden frame with motion vector */ > /* Reconstruct the pixel data using the golden frame > reconstruction and change data */ >- ReconInter( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], >+ dsp_static_recon_inter8x8 (&pbi->ThisFrameRecon[ReconPixelIndex], > &pbi->GoldenFrame[ ReconPixelIndex ], >- pbi->ReconDataBuffer, ReconPixelsPerLine ); >+ pbi->ReconDataBuffer, ReconPixelsPerLine); > } else { > /* Simple Intra coding */ > /* Get the pixel index for the first pixel in the fragment. */ >- ReconIntra( pbi, &pbi->ThisFrameRecon[ReconPixelIndex], >- pbi->ReconDataBuffer, ReconPixelsPerLine ); >+ dsp_static_recon_intra8x8 (&pbi->ThisFrameRecon[ReconPixelIndex], >+ pbi->ReconDataBuffer, ReconPixelsPerLine); > } > } > >@@ -464,7 +449,7 @@ > SrcPtr = &SrcReconPtr[ PixelIndex ]; > DestPtr = &DestReconPtr[ PixelIndex ]; > >- CopyBlock(SrcPtr, DestPtr, PlaneLineStep); >+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); > } > } > >@@ -476,7 +461,7 @@ > SrcPtr = &SrcReconPtr[ PixelIndex ]; > DestPtr = &DestReconPtr[ PixelIndex ]; > >- CopyBlock(SrcPtr, DestPtr, PlaneLineStep); >+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); > > } > } >@@ -505,7 +490,7 @@ > SrcPtr = &SrcReconPtr[ PixelIndex ]; > DestPtr = &DestReconPtr[ PixelIndex ]; > >- CopyBlock(SrcPtr, DestPtr, PlaneLineStep); >+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); > } > } > >@@ -517,7 +502,7 @@ > SrcPtr = &SrcReconPtr[ PixelIndex ]; > DestPtr = &DestReconPtr[ PixelIndex ]; > >- CopyBlock(SrcPtr, DestPtr, PlaneLineStep); >+ dsp_static_copy8x8 (SrcPtr, DestPtr, PlaneLineStep); > > } > } >diff -Naur libtheora-1.0alpha3/lib/dct_encode.c 
libtheora-1.0alpha3.mmx/lib/dct_encode.c >--- libtheora-1.0alpha3/lib/dct_encode.c 2003-06-10 03:31:33.000000000 +0200 >+++ libtheora-1.0alpha3.mmx/lib/dct_encode.c 2004-10-06 17:48:22.285420496 +0200 >@@ -17,110 +17,10 @@ > > #include <stdlib.h> > #include "encoder_internal.h" >+#include "dsp.h" > > static int ModeUsesMC[MAX_MODES] = { 0, 0, 1, 1, 1, 0, 1, 1 }; > >-static void Sub8 (unsigned char *FiltPtr, unsigned char *ReconPtr, >- ogg_int16_t *DctInputPtr, unsigned char *old_ptr1, >- unsigned char *new_ptr1, ogg_uint32_t PixelsPerLine, >- ogg_uint32_t ReconPixelsPerLine ) { >- int i; >- >- /* For each block row */ >- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){ >- DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - ((int)ReconPtr[0]) ); >- DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - ((int)ReconPtr[1]) ); >- DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - ((int)ReconPtr[2]) ); >- DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - ((int)ReconPtr[3]) ); >- DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - ((int)ReconPtr[4]) ); >- DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - ((int)ReconPtr[5]) ); >- DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - ((int)ReconPtr[6]) ); >- DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - ((int)ReconPtr[7]) ); >- >- /* Update the screen canvas in one step*/ >- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0]; >- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1]; >- >- /* Start next row */ >- new_ptr1 += PixelsPerLine; >- old_ptr1 += PixelsPerLine; >- FiltPtr += PixelsPerLine; >- ReconPtr += ReconPixelsPerLine; >- DctInputPtr += BLOCK_HEIGHT_WIDTH; >- } >-} >- >-static void Sub8_128 (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, >- unsigned char *old_ptr1, unsigned char *new_ptr1, >- ogg_uint32_t PixelsPerLine ) { >- int i; >- /* For each block row */ >- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ){ >- /* INTRA mode so code raw image data */ >- /* We convert the data to 8 bit signed (by subtracting 
128) as >- this reduces the internal precision requirments in the DCT >- transform. */ >- DctInputPtr[0] = (ogg_int16_t)((int)(FiltPtr[0]) - 128); >- DctInputPtr[1] = (ogg_int16_t)((int)(FiltPtr[1]) - 128); >- DctInputPtr[2] = (ogg_int16_t)((int)(FiltPtr[2]) - 128); >- DctInputPtr[3] = (ogg_int16_t)((int)(FiltPtr[3]) - 128); >- DctInputPtr[4] = (ogg_int16_t)((int)(FiltPtr[4]) - 128); >- DctInputPtr[5] = (ogg_int16_t)((int)(FiltPtr[5]) - 128); >- DctInputPtr[6] = (ogg_int16_t)((int)(FiltPtr[6]) - 128); >- DctInputPtr[7] = (ogg_int16_t)((int)(FiltPtr[7]) - 128); >- >- /* Update the screen canvas in one step */ >- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0]; >- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1]; >- >- /* Start next row */ >- new_ptr1 += PixelsPerLine; >- old_ptr1 += PixelsPerLine; >- FiltPtr += PixelsPerLine; >- DctInputPtr += BLOCK_HEIGHT_WIDTH; >- } >-} >- >-static void Sub8Av2 (unsigned char *FiltPtr, unsigned char *ReconPtr1, >- unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, >- unsigned char *old_ptr1, unsigned char *new_ptr1, >- ogg_uint32_t PixelsPerLine, >- ogg_uint32_t ReconPixelsPerLine ) { >- int i; >- >- /* For each block row */ >- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) { >- DctInputPtr[0] = (ogg_int16_t) >- ((int)(FiltPtr[0]) - (((int)ReconPtr1[0] + (int)ReconPtr2[0]) / 2) ); >- DctInputPtr[1] = (ogg_int16_t) >- ((int)(FiltPtr[1]) - (((int)ReconPtr1[1] + (int)ReconPtr2[1]) / 2) ); >- DctInputPtr[2] = (ogg_int16_t) >- ((int)(FiltPtr[2]) - (((int)ReconPtr1[2] + (int)ReconPtr2[2]) / 2) ); >- DctInputPtr[3] = (ogg_int16_t) >- ((int)(FiltPtr[3]) - (((int)ReconPtr1[3] + (int)ReconPtr2[3]) / 2) ); >- DctInputPtr[4] = (ogg_int16_t) >- ((int)(FiltPtr[4]) - (((int)ReconPtr1[4] + (int)ReconPtr2[4]) / 2) ); >- DctInputPtr[5] = (ogg_int16_t) >- ((int)(FiltPtr[5]) - (((int)ReconPtr1[5] + (int)ReconPtr2[5]) / 2) ); >- DctInputPtr[6] = (ogg_int16_t) >- ((int)(FiltPtr[6]) - (((int)ReconPtr1[6] + (int)ReconPtr2[6]) / 2) ); >- 
DctInputPtr[7] = (ogg_int16_t) >- ((int)(FiltPtr[7]) - (((int)ReconPtr1[7] + (int)ReconPtr2[7]) / 2) ); >- >- /* Update the screen canvas in one step */ >- ((ogg_uint32_t*)old_ptr1)[0] = ((ogg_uint32_t*)new_ptr1)[0]; >- ((ogg_uint32_t*)old_ptr1)[1] = ((ogg_uint32_t*)new_ptr1)[1]; >- >- /* Start next row */ >- new_ptr1 += PixelsPerLine; >- old_ptr1 += PixelsPerLine; >- FiltPtr += PixelsPerLine; >- ReconPtr1 += ReconPixelsPerLine; >- ReconPtr2 += ReconPixelsPerLine; >- DctInputPtr += BLOCK_HEIGHT_WIDTH; >- } >-} >- > static unsigned char TokenizeDctValue (ogg_int16_t DataValue, > ogg_uint32_t * TokenListPtr ){ > unsigned char tokens_added = 0; >@@ -452,13 +352,15 @@ > > /* Is the MV offset exactly pixel alligned */ > if ( AbsRefOffset == 0 ){ >- Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1, >- PixelsPerLine, ReconPixelsPerLine ); >+ dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr, >+ PixelsPerLine, ReconPixelsPerLine); >+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine); > } else { > /* Fractional pixel MVs. 
*/ > /* Note that we only use two pixel values even for the diagonal */ >- Sub8Av2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, old_ptr1, >- new_ptr1, PixelsPerLine, ReconPixelsPerLine ); >+ dsp_static_sub8x8avg2(FiltPtr, ReconPtr1,ReconPtr2,DctInputPtr, >+ PixelsPerLine, ReconPixelsPerLine); >+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine); > } > } > >@@ -534,17 +436,18 @@ > pb.GoldenFrame[cpi->pb.recon_pixel_index_table[FragIndex]]; > } > >- Sub8( FiltPtr, ReconPtr1, DctInputPtr, old_ptr1, new_ptr1, >- PixelsPerLine, ReconPixelsPerLine ); >+ dsp_static_sub8x8( FiltPtr, ReconPtr1, DctInputPtr, >+ PixelsPerLine, ReconPixelsPerLine); >+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine); > } else if ( cpi->pb.CodingMode==CODE_INTRA ) { >- Sub8_128(FiltPtr, DctInputPtr, old_ptr1, new_ptr1, PixelsPerLine); >- >+ dsp_static_sub8x8_128(FiltPtr, DctInputPtr, PixelsPerLine); >+ dsp_static_copy8x8 (new_ptr1, old_ptr1, PixelsPerLine); > } > > /* Proceed to encode the data into the encode buffer if the encoder > is enabled. */ > /* Perform a 2D DCT transform on the data. */ >- fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes ); >+ dsp_static_fdct_short( cpi->DCTDataBuffer, cpi->DCT_codes ); > > /* Quantize that transform data. 
*/ > quantize ( &cpi->pb, cpi->DCT_codes, cpi->pb.QFragData[FragIndex] ); >diff -Naur libtheora-1.0alpha3/lib/decode.c libtheora-1.0alpha3.mmx/lib/decode.c >--- libtheora-1.0alpha3/lib/decode.c 2003-12-06 19:06:20.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/decode.c 2004-10-06 17:48:22.324414568 +0200 >@@ -796,6 +796,8 @@ > /* Make a not of the number of coded blocks this frame */ > pbi->CodedBlocksThisFrame = pbi->CodedBlockIndex; > >+ dsp_static_save_fpu(); >+ > /* Decode the modes data */ > DecodeModes( pbi, pbi->YSBRows, pbi->YSBCols); > >@@ -808,6 +810,7 @@ > /* Reconstruct and display the frame */ > ReconRefFrames(pbi); > >+ dsp_static_restore_fpu(); > } > > >diff -Naur libtheora-1.0alpha3/lib/dsp.c libtheora-1.0alpha3.mmx/lib/dsp.c >--- libtheora-1.0alpha3/lib/dsp.c 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/dsp.c 2004-10-06 17:48:22.363408640 +0200 >@@ -0,0 +1,416 @@ >+/******************************************************************** >+ * * >+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * >+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * >+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * >+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* >+ * * >+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * >+ * by the Xiph.Org Foundation http://www.xiph.org/ * >+ * * >+ ******************************************************************** >+ >+ function: >+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ >+ >+ ********************************************************************/ >+ >+#include <stdlib.h> >+#include "cpu.h" >+#include "encoder_internal.h" >+ >+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2) >+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b))) >+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b)))) >+ >+DspFunctions dsp_funcs; >+ >+static void sub8x8__c (unsigned char *FiltPtr, unsigned char *ReconPtr, >+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, >+ ogg_uint32_t ReconPixelsPerLine) { >+ int i; >+ >+ /* For each block row */ >+ for (i=8; i; i--) { >+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]); >+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]); >+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]); >+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]); >+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]); >+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]); >+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]); >+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]); >+ >+ /* Start next row */ >+ FiltPtr += PixelsPerLine; >+ ReconPtr += ReconPixelsPerLine; >+ DctInputPtr += 8; >+ } >+} >+ >+static void sub8x8_128__c (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, >+ ogg_uint32_t PixelsPerLine) { >+ int i; >+ /* For each block row */ >+ for (i=8; i; i--) { >+ /* INTRA mode so code raw image data */ >+ /* We convert the data to 8 bit signed (by subtracting 128) as >+ this reduces the internal precision requirments in the DCT >+ transform. 
*/ >+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128); >+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128); >+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128); >+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128); >+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128); >+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128); >+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128); >+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128); >+ >+ /* Start next row */ >+ FiltPtr += PixelsPerLine; >+ DctInputPtr += 8; >+ } >+} >+ >+static void sub8x8avg2__c (unsigned char *FiltPtr, unsigned char *ReconPtr1, >+ unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, >+ ogg_uint32_t PixelsPerLine, >+ ogg_uint32_t ReconPixelsPerLine) >+{ >+ int i; >+ >+ /* For each block row */ >+ for (i=8; i; i--) { >+ DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], DSP_OP_AVG (ReconPtr1[0], ReconPtr2[0])); >+ DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], DSP_OP_AVG (ReconPtr1[1], ReconPtr2[1])); >+ DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], DSP_OP_AVG (ReconPtr1[2], ReconPtr2[2])); >+ DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], DSP_OP_AVG (ReconPtr1[3], ReconPtr2[3])); >+ DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], DSP_OP_AVG (ReconPtr1[4], ReconPtr2[4])); >+ DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], DSP_OP_AVG (ReconPtr1[5], ReconPtr2[5])); >+ DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], DSP_OP_AVG (ReconPtr1[6], ReconPtr2[6])); >+ DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], DSP_OP_AVG (ReconPtr1[7], ReconPtr2[7])); >+ >+ /* Start next row */ >+ FiltPtr += PixelsPerLine; >+ ReconPtr1 += ReconPixelsPerLine; >+ ReconPtr2 += ReconPixelsPerLine; >+ DctInputPtr += 8; >+ } >+} >+ >+static ogg_uint32_t row_sad8__c (unsigned char *Src1, unsigned char *Src2) >+{ >+ ogg_uint32_t SadValue; >+ ogg_uint32_t SadValue1; >+ >+ SadValue 
= DSP_OP_ABS_DIFF (Src1[0], Src2[0]) + >+ DSP_OP_ABS_DIFF (Src1[1], Src2[1]) + >+ DSP_OP_ABS_DIFF (Src1[2], Src2[2]) + >+ DSP_OP_ABS_DIFF (Src1[3], Src2[3]); >+ >+ SadValue1 = DSP_OP_ABS_DIFF (Src1[4], Src2[4]) + >+ DSP_OP_ABS_DIFF (Src1[5], Src2[5]) + >+ DSP_OP_ABS_DIFF (Src1[6], Src2[6]) + >+ DSP_OP_ABS_DIFF (Src1[7], Src2[7]); >+ >+ SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1; >+ >+ return SadValue; >+} >+ >+static ogg_uint32_t col_sad8x8__c (unsigned char *Src1, unsigned char *Src2, >+ ogg_uint32_t stride) >+{ >+ ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0}; >+ ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0}; >+ ogg_uint32_t MaxSad = 0; >+ ogg_uint32_t i; >+ >+ for ( i = 0; i < 4; i++ ){ >+ SadValue[0] += abs(Src1[0] - Src2[0]); >+ SadValue[1] += abs(Src1[1] - Src2[1]); >+ SadValue[2] += abs(Src1[2] - Src2[2]); >+ SadValue[3] += abs(Src1[3] - Src2[3]); >+ SadValue[4] += abs(Src1[4] - Src2[4]); >+ SadValue[5] += abs(Src1[5] - Src2[5]); >+ SadValue[6] += abs(Src1[6] - Src2[6]); >+ SadValue[7] += abs(Src1[7] - Src2[7]); >+ >+ Src1 += stride; >+ Src2 += stride; >+ } >+ >+ for ( i = 0; i < 4; i++ ){ >+ SadValue2[0] += abs(Src1[0] - Src2[0]); >+ SadValue2[1] += abs(Src1[1] - Src2[1]); >+ SadValue2[2] += abs(Src1[2] - Src2[2]); >+ SadValue2[3] += abs(Src1[3] - Src2[3]); >+ SadValue2[4] += abs(Src1[4] - Src2[4]); >+ SadValue2[5] += abs(Src1[5] - Src2[5]); >+ SadValue2[6] += abs(Src1[6] - Src2[6]); >+ SadValue2[7] += abs(Src1[7] - Src2[7]); >+ >+ Src1 += stride; >+ Src2 += stride; >+ } >+ >+ for ( i = 0; i < 8; i++ ){ >+ if ( SadValue[i] > MaxSad ) >+ MaxSad = SadValue[i]; >+ if ( SadValue2[i] > MaxSad ) >+ MaxSad = SadValue2[i]; >+ } >+ >+ return MaxSad; >+} >+ >+static ogg_uint32_t sad8x8__c (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2) >+{ >+ ogg_uint32_t i; >+ ogg_uint32_t sad = 0; >+ >+ for (i=8; i; i--) { >+ sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]); >+ sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]); >+ sad += 
DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]); >+ sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]); >+ sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]); >+ sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]); >+ sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]); >+ sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]); >+ >+ /* Step to next row of block. */ >+ ptr1 += stride1; >+ ptr2 += stride2; >+ } >+ >+ return sad; >+} >+ >+static ogg_uint32_t sad8x8_thres__c (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2, >+ ogg_uint32_t thres) >+{ >+ ogg_uint32_t i; >+ ogg_uint32_t sad = 0; >+ >+ for (i=8; i; i--) { >+ sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]); >+ sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]); >+ sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]); >+ sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]); >+ sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]); >+ sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]); >+ sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]); >+ sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]); >+ >+ if (sad > thres ) >+ break; >+ >+ /* Step to next row of block. 
*/ >+ ptr1 += stride1; >+ ptr2 += stride2; >+ } >+ >+ return sad; >+} >+ >+static ogg_uint32_t sad8x8_xy2_thres__c (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride, >+ ogg_uint32_t thres) >+{ >+ ogg_uint32_t i; >+ ogg_uint32_t sad = 0; >+ >+ for (i=8; i; i--) { >+ sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); >+ sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); >+ sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); >+ sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); >+ sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); >+ sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); >+ sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); >+ sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); >+ >+ if ( sad > thres ) >+ break; >+ >+ /* Step to next row of block. */ >+ SrcData += SrcStride; >+ RefDataPtr1 += RefStride; >+ RefDataPtr2 += RefStride; >+ } >+ >+ return sad; >+} >+ >+static ogg_uint32_t intra8x8_err__c (unsigned char *DataPtr, ogg_uint32_t Stride) >+{ >+ ogg_uint32_t i; >+ ogg_uint32_t XSum=0; >+ ogg_uint32_t XXSum=0; >+ >+ for (i=8; i; i--) { >+ /* Examine alternate pixel locations. */ >+ XSum += DataPtr[0]; >+ XXSum += DataPtr[0]*DataPtr[0]; >+ XSum += DataPtr[1]; >+ XXSum += DataPtr[1]*DataPtr[1]; >+ XSum += DataPtr[2]; >+ XXSum += DataPtr[2]*DataPtr[2]; >+ XSum += DataPtr[3]; >+ XXSum += DataPtr[3]*DataPtr[3]; >+ XSum += DataPtr[4]; >+ XXSum += DataPtr[4]*DataPtr[4]; >+ XSum += DataPtr[5]; >+ XXSum += DataPtr[5]*DataPtr[5]; >+ XSum += DataPtr[6]; >+ XXSum += DataPtr[6]*DataPtr[6]; >+ XSum += DataPtr[7]; >+ XXSum += DataPtr[7]*DataPtr[7]; >+ >+ /* Step to next row of block. 
*/ >+ DataPtr += Stride; >+ } >+ >+ /* Compute population variance as mis-match metric. */ >+ return (( (XXSum<<6) - XSum*XSum ) ); >+} >+ >+static ogg_uint32_t inter8x8_err__c (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr, ogg_uint32_t RefStride) >+{ >+ ogg_uint32_t i; >+ ogg_uint32_t XSum=0; >+ ogg_uint32_t XXSum=0; >+ ogg_int32_t DiffVal; >+ >+ for (i=8; i; i--) { >+ DiffVal = DSP_OP_DIFF (SrcData[0], RefDataPtr[0]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF (SrcData[1], RefDataPtr[1]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF (SrcData[2], RefDataPtr[2]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF (SrcData[3], RefDataPtr[3]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF (SrcData[4], RefDataPtr[4]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF (SrcData[5], RefDataPtr[5]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF (SrcData[6], RefDataPtr[6]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF (SrcData[7], RefDataPtr[7]); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ /* Step to next row of block. */ >+ SrcData += SrcStride; >+ RefDataPtr += RefStride; >+ } >+ >+ /* Compute and return population variance as mis-match metric. 
*/ >+ return (( (XXSum<<6) - XSum*XSum )); >+} >+ >+static ogg_uint32_t inter8x8_err_xy2__c (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride) >+{ >+ ogg_uint32_t i; >+ ogg_uint32_t XSum=0; >+ ogg_uint32_t XXSum=0; >+ ogg_int32_t DiffVal; >+ >+ for (i=8; i; i--) { >+ DiffVal = DSP_OP_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ DiffVal = DSP_OP_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7])); >+ XSum += DiffVal; >+ XXSum += DiffVal*DiffVal; >+ >+ /* Step to next row of block. */ >+ SrcData += SrcStride; >+ RefDataPtr1 += RefStride; >+ RefDataPtr2 += RefStride; >+ } >+ >+ /* Compute and return population variance as mis-match metric. 
*/ >+ return (( (XXSum<<6) - XSum*XSum )); >+} >+ >+static void nop (void) { /* NOP */ } >+ >+void dsp_init(DspFunctions *funcs) >+{ >+ funcs->save_fpu = nop; >+ funcs->restore_fpu = nop; >+ funcs->sub8x8 = sub8x8__c; >+ funcs->sub8x8_128 = sub8x8_128__c; >+ funcs->sub8x8avg2 = sub8x8avg2__c; >+ funcs->row_sad8 = row_sad8__c; >+ funcs->col_sad8x8 = col_sad8x8__c; >+ funcs->sad8x8 = sad8x8__c; >+ funcs->sad8x8_thres = sad8x8_thres__c; >+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c; >+ funcs->intra8x8_err = intra8x8_err__c; >+ funcs->inter8x8_err = inter8x8_err__c; >+ funcs->inter8x8_err_xy2 = inter8x8_err_xy2__c; >+} >+ >+void dsp_static_init(void) >+{ >+ cpu_init (); >+ dsp_init (&dsp_funcs); >+ dsp_recon_init (&dsp_funcs); >+ dsp_dct_init (&dsp_funcs); >+ if (cpu_flags & CPU_X86_MMX) { >+ dsp_i386_mmx_init(&dsp_funcs); >+ } >+ if (cpu_flags & CPU_X86_MMXEXT) { >+ dsp_i386_mmxext_init(&dsp_funcs); >+ } >+} >+ >diff -Naur libtheora-1.0alpha3/lib/dsp.h libtheora-1.0alpha3.mmx/lib/dsp.h >--- libtheora-1.0alpha3/lib/dsp.h 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/dsp.h 2004-10-06 17:48:22.364408488 +0200 >@@ -0,0 +1,154 @@ >+/******************************************************************** >+ * * >+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * >+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * >+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * >+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* >+ * * >+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * >+ * by the Xiph.Org Foundation http://www.xiph.org/ * >+ * * >+ ******************************************************************** >+ >+ function: >+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ >+ >+ ********************************************************************/ >+ >+#ifndef DSP_H >+#define DSP_H >+ >+#include <theora/theora.h> >+ >+typedef struct >+{ >+ void (*save_fpu) (void); >+ void (*restore_fpu) (void); >+ >+ void (*sub8x8) (unsigned char *FiltPtr, unsigned char *ReconPtr, >+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, >+ ogg_uint32_t ReconPixelsPerLine); >+ >+ void (*sub8x8_128) (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, >+ ogg_uint32_t PixelsPerLine); >+ >+ void (*sub8x8avg2) (unsigned char *FiltPtr, unsigned char *ReconPtr1, >+ unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, >+ ogg_uint32_t PixelsPerLine, >+ ogg_uint32_t ReconPixelsPerLine); >+ >+ void (*copy8x8) (unsigned char *src, unsigned char *dest, >+ ogg_uint32_t stride); >+ >+ void (*recon_intra8x8) (unsigned char *ReconPtr, ogg_int16_t *ChangePtr, >+ ogg_uint32_t LineStep); >+ >+ void (*recon_inter8x8) (unsigned char *ReconPtr, unsigned char *RefPtr, >+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep); >+ >+ void (*recon_inter8x8_half) (unsigned char *ReconPtr, unsigned char *RefPtr1, >+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr, >+ ogg_uint32_t LineStep); >+ >+ void (*fdct_short) (ogg_int16_t *InputData, ogg_int16_t *OutputData); >+ >+ ogg_uint32_t (*row_sad8) (unsigned char *Src1, unsigned char *Src2); >+ >+ ogg_uint32_t (*col_sad8x8) (unsigned char *Src1, unsigned char *Src2, >+ ogg_uint32_t stride); >+ >+ ogg_uint32_t (*sad8x8) (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2); >+ >+ ogg_uint32_t (*sad8x8_thres) (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2, >+ ogg_uint32_t thres); >+ >+ 
ogg_uint32_t (*sad8x8_xy2_thres)(unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride, >+ ogg_uint32_t thres); >+ >+ ogg_uint32_t (*intra8x8_err) (unsigned char *DataPtr, ogg_uint32_t Stride); >+ >+ ogg_uint32_t (*inter8x8_err) (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr, ogg_uint32_t RefStride); >+ >+ ogg_uint32_t (*inter8x8_err_xy2)(unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride); >+} DspFunctions; >+ >+extern DspFunctions dsp_funcs; >+ >+extern void dsp_recon_init (DspFunctions *funcs); >+ >+void dsp_init(DspFunctions *funcs); >+void dsp_static_init(void); >+ >+#define dsp_save_fpu(funcs) (funcs.save_fpu ()) >+#define dsp_static_save_fpu() dsp_save_fpu(dsp_funcs) >+ >+#define dsp_restore_fpu(funcs) (funcs.restore_fpu ()) >+#define dsp_static_restore_fpu() dsp_restore_fpu(dsp_funcs) >+ >+#define dsp_sub8x8(funcs,a1,a2,a3,a4,a5) (funcs.sub8x8 (a1,a2,a3,a4,a5)) >+#define dsp_static_sub8x8(a1,a2,a3,a4,a5) dsp_sub8x8(dsp_funcs,a1,a2,a3,a4,a5) >+ >+#define dsp_sub8x8_128(funcs,a1,a2,a3) (funcs.sub8x8_128 (a1,a2,a3)) >+#define dsp_static_sub8x8_128(a1,a2,a3) dsp_sub8x8_128(dsp_funcs,a1,a2,a3) >+ >+#define dsp_sub8x8avg2(funcs,a1,a2,a3,a4,a5,a6) (funcs.sub8x8avg2 (a1,a2,a3,a4,a5,a6)) >+#define dsp_static_sub8x8avg2(a1,a2,a3,a4,a5,a6) dsp_sub8x8avg2(dsp_funcs,a1,a2,a3,a4,a5,a6) >+ >+#define dsp_copy8x8(funcs,ptr1,ptr2,str1) (funcs.copy8x8 (ptr1,ptr2,str1)) >+#define dsp_static_copy8x8(ptr1,ptr2,str1) dsp_copy8x8(dsp_funcs,ptr1,ptr2,str1) >+ >+#define dsp_recon_intra8x8(funcs,ptr1,ptr2,str1) (funcs.recon_intra8x8 (ptr1,ptr2,str1)) >+#define dsp_static_recon_intra8x8(ptr1,ptr2,str1) dsp_recon_intra8x8(dsp_funcs,ptr1,ptr2,str1) >+ >+#define dsp_recon_inter8x8(funcs,ptr1,ptr2,ptr3,str1) \ >+ (funcs.recon_inter8x8 (ptr1,ptr2,ptr3,str1)) >+#define 
dsp_static_recon_inter8x8(ptr1,ptr2,ptr3,str1) \ >+ dsp_recon_inter8x8(dsp_funcs,ptr1,ptr2,ptr3,str1) >+ >+#define dsp_recon_inter8x8_half(funcs,ptr1,ptr2,ptr3,ptr4,str1) \ >+ (funcs.recon_inter8x8_half (ptr1,ptr2,ptr3,ptr4,str1)) >+#define dsp_static_recon_inter8x8_half(ptr1,ptr2,ptr3,ptr4,str1) \ >+ dsp_recon_inter8x8_half(dsp_funcs,ptr1,ptr2,ptr3,ptr4,str1) >+ >+#define dsp_fdct_short(funcs,in,out) (funcs.fdct_short (in,out)) >+#define dsp_static_fdct_short(in,out) dsp_fdct_short(dsp_funcs,in,out) >+ >+#define dsp_row_sad8(funcs,ptr1,ptr2) (funcs.row_sad8 (ptr1,ptr2)) >+#define dsp_static_row_sad8(ptr1,ptr2) dsp_row_sad8(dsp_funcs,ptr1,ptr2) >+ >+#define dsp_col_sad8x8(funcs,ptr1,ptr2,str1) (funcs.col_sad8x8 (ptr1,ptr2,str1)) >+#define dsp_static_col_sad8x8(ptr1,ptr2,str1) dsp_col_sad8x8(dsp_funcs,ptr1,ptr2,str1) >+ >+#define dsp_sad8x8(funcs,ptr1,str1,ptr2,str2) (funcs.sad8x8 (ptr1,str1,ptr2,str2)) >+#define dsp_static_sad8x8(ptr1,str1,ptr2,str2) dsp_sad8x8(dsp_funcs,ptr1,str1,ptr2,str2) >+ >+#define dsp_sad8x8_thres(funcs,ptr1,str1,ptr2,str2,t) (funcs.sad8x8_thres (ptr1,str1,ptr2,str2,t)) >+#define dsp_static_sad8x8_thres(ptr1,str1,ptr2,str2,t) dsp_sad8x8_thres(dsp_funcs,ptr1,str1,ptr2,str2,t) >+ >+#define dsp_sad8x8_xy2_thres(funcs,ptr1,str1,ptr2,ptr3,str2,t) \ >+ (funcs.sad8x8_xy2_thres (ptr1,str1,ptr2,ptr3,str2,t)) >+#define dsp_static_sad8x8_xy2_thres(ptr1,str1,ptr2,ptr3,str2,t) \ >+ dsp_sad8x8_xy2_thres(dsp_funcs,ptr1,str1,ptr2,ptr3,str2,t) >+ >+#define dsp_intra8x8_err(funcs,ptr1,str1) (funcs.intra8x8_err (ptr1,str1)) >+#define dsp_static_intra8x8_err(ptr1,str1) dsp_intra8x8_err(dsp_funcs,ptr1,str1) >+ >+#define dsp_inter8x8_err(funcs,ptr1,str1,ptr2,str2) \ >+ (funcs.inter8x8_err (ptr1,str1,ptr2,str2)) >+#define dsp_static_inter8x8_err(ptr1,str1,ptr2,str2) \ >+ dsp_inter8x8_err(dsp_funcs,ptr1,str1,ptr2,str2) >+ >+#define dsp_inter8x8_err_xy2(funcs,ptr1,str1,ptr2,ptr3,str2) \ >+ (funcs.inter8x8_err_xy2 (ptr1,str1,ptr2,ptr3,str2)) >+#define 
dsp_static_inter8x8_err_xy2(ptr1,str1,ptr2,ptr3,str2) \ >+ dsp_inter8x8_err_xy2(dsp_funcs,ptr1,str1,ptr2,ptr3,str2) >+ >+ >+#endif /* DSP_H */ >diff -Naur libtheora-1.0alpha3/lib/encode.c libtheora-1.0alpha3.mmx/lib/encode.c >--- libtheora-1.0alpha3/lib/encode.c 2004-03-18 15:25:25.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/encode.c 2004-10-06 17:48:22.401402864 +0200 >@@ -531,8 +531,7 @@ > > static ogg_uint32_t GetBlockReconErrorSlow( CP_INSTANCE *cpi, > ogg_int32_t BlockIndex ) { >- ogg_uint32_t i; >- ogg_uint32_t ErrorVal = 0; >+ ogg_uint32_t ErrorVal; > > unsigned char * SrcDataPtr = > &cpi->ConvDestBuffer[cpi->pb.pixel_index_table[BlockIndex]]; >@@ -550,21 +549,8 @@ > RecStride = cpi->pb.UVStride; > } > >+ ErrorVal = dsp_static_sad8x8 (SrcDataPtr, SrcStride, RecDataPtr, RecStride); > >- /* Decide on standard or MMX implementation */ >- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) { >- ErrorVal += abs( ((int)SrcDataPtr[0]) - ((int)RecDataPtr[0]) ); >- ErrorVal += abs( ((int)SrcDataPtr[1]) - ((int)RecDataPtr[1]) ); >- ErrorVal += abs( ((int)SrcDataPtr[2]) - ((int)RecDataPtr[2]) ); >- ErrorVal += abs( ((int)SrcDataPtr[3]) - ((int)RecDataPtr[3]) ); >- ErrorVal += abs( ((int)SrcDataPtr[4]) - ((int)RecDataPtr[4]) ); >- ErrorVal += abs( ((int)SrcDataPtr[5]) - ((int)RecDataPtr[5]) ); >- ErrorVal += abs( ((int)SrcDataPtr[6]) - ((int)RecDataPtr[6]) ); >- ErrorVal += abs( ((int)SrcDataPtr[7]) - ((int)RecDataPtr[7]) ); >- /* Step to next row of block. */ >- SrcDataPtr += SrcStride; >- RecDataPtr += RecStride; >- } > return ErrorVal; > } > >@@ -933,9 +919,13 @@ > /* Zero Decoder EOB run count */ > cpi->pb.EOB_Run = 0; > >+ dsp_static_save_fpu (); >+ > /* Encode any fragments coded using DCT. 
*/ > coded_pixels += QuadCodeDisplayFragments (cpi); > >+ dsp_static_restore_fpu (); >+ > return coded_pixels; > > } >diff -Naur libtheora-1.0alpha3/lib/encoder_internal.h libtheora-1.0alpha3.mmx/lib/encoder_internal.h >--- libtheora-1.0alpha3/lib/encoder_internal.h 2004-03-09 03:02:56.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/encoder_internal.h 2004-10-06 17:48:22.436397544 +0200 >@@ -24,6 +24,7 @@ > > #include <theora/theora.h> > #include "huffman.h" >+#include "dsp.h" > > #ifndef LIBOGG2 > #define theora_read(x,y,z) ( *z = oggpackB_read(x,y) ) >@@ -689,23 +690,9 @@ > ogg_int16_t *QuantMatrix, > ogg_int16_t * OutputData ); > >-extern void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr, >- ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ); >- >-extern void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr, >- unsigned char * RefPtr, ogg_int16_t * ChangePtr, >- ogg_uint32_t LineStep ) ; >- >-extern void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr, >- unsigned char * RefPtr1, >- unsigned char * RefPtr2, >- ogg_int16_t * ChangePtr, >- ogg_uint32_t LineStep ) ; >+extern void dsp_recon_init (DspFunctions *funcs); > > extern void SetupLoopFilter(PB_INSTANCE *pbi); >-extern void CopyBlock(unsigned char *src, >- unsigned char *dest, >- unsigned int srcstride); > extern void LoopFilter(PB_INSTANCE *pbi); > extern void ReconRefFrames (PB_INSTANCE *pbi); > extern void ExpandToken( Q_LIST_ENTRY * ExpandedBlock, >diff -Naur libtheora-1.0alpha3/lib/i386/dsp_mmx.c libtheora-1.0alpha3.mmx/lib/i386/dsp_mmx.c >--- libtheora-1.0alpha3/lib/i386/dsp_mmx.c 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/i386/dsp_mmx.c 2004-10-06 17:48:22.472392072 +0200 >@@ -0,0 +1,642 @@ >+/******************************************************************** >+ * * >+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. 
* >+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * >+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * >+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * >+ * * >+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * >+ * by the Xiph.Org Foundation http://www.xiph.org/ * >+ * * >+ ******************************************************************** >+ >+ function: >+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ >+ >+ ********************************************************************/ >+ >+#include <stdlib.h> >+#include "dsp.h" >+ >+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128w = 0x0080008000800080LL; >+ >+#if defined(__MINGW32__) || defined(__CYGWIN__) || \ >+ defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) >+# define M(a) "_" #a >+#else >+# define M(a) #a >+#endif >+ >+#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2) >+#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b))) >+#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b)))) >+ >+static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr, >+ ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine, >+ ogg_uint32_t ReconPixelsPerLine) >+{ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm7, %%mm7 \n\t" >+ >+ ".rept 8 \n\t" >+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ >+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */ >+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ >+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */ >+ /* convert from UINT8 to INT16 */ >+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ >+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr) */ >+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ >+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr) */ >+ /* start calculation */ >+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */ >+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */ 
>+ " movq %%mm0, (%2) \n\t" /* write answer out */ >+ " movq %%mm2, 8(%2) \n\t" /* write answer out */ >+ /* Increment pointers */ >+ " add $16, %2 \n\t" >+ " add %3, %0 \n\t" >+ " add %4, %1 \n\t" >+ ".endr \n\t" >+ >+ : "+r" (FiltPtr), >+ "+r" (ReconPtr), >+ "+r" (DctInputPtr) >+ : "m" (PixelsPerLine), >+ "m" (ReconPixelsPerLine) >+ : "memory" >+ ); >+} >+ >+static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr, >+ ogg_uint32_t PixelsPerLine) >+{ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm7, %%mm7 \n\t" >+ " movq "M(V128w)", %%mm1 \n\t" >+ >+ ".rept 8 \n\t" >+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ >+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ >+ /* convert from UINT8 to INT16 */ >+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ >+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ >+ /* start calculation */ >+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */ >+ " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */ >+ " movq %%mm0, (%1) \n\t" /* write answer out */ >+ " movq %%mm2, 8(%1) \n\t" /* write answer out */ >+ /* Increment pointers */ >+ " add $16, %1 \n\t" >+ " add %2, %0 \n\t" >+ ".endr \n\t" >+ >+ : "+r" (FiltPtr), >+ "+r" (DctInputPtr) >+ : "r" (PixelsPerLine) >+ : "memory" >+ ); >+} >+ >+static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1, >+ unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr, >+ ogg_uint32_t PixelsPerLine, >+ ogg_uint32_t ReconPixelsPerLine) >+{ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm7, %%mm7 \n\t" >+ >+ ".rept 8 \n\t" >+ " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */ >+ " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */ >+ " movq (%2), %%mm4 \n\t" /* mm1 = ReconPtr2 */ >+ " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */ >+ " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */ >+ " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */ >+ /* convert from 
UINT8 to INT16 */ >+ " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr) */ >+ " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1) */ >+ " punpcklbw %%mm7, %%mm4 \n\t" /* mm1 = INT16(ReconPtr2) */ >+ " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr) */ >+ " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1) */ >+ " punpckhbw %%mm7, %%mm5 \n\t" /* mm3 = INT16(ReconPtr2) */ >+ /* average ReconPtr1 and ReconPtr2 */ >+ " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */ >+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */ >+ " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */ >+ " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */ >+ " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ >+ " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */ >+ " movq %%mm0, (%3) \n\t" /* write answer out */ >+ " movq %%mm2, 8(%3) \n\t" /* write answer out */ >+ /* Increment pointers */ >+ " add $16, %3 \n\t" >+ " add %4, %0 \n\t" >+ " add %5, %1 \n\t" >+ " add %5, %2 \n\t" >+ ".endr \n\t" >+ >+ : "+r" (FiltPtr), >+ "+r" (ReconPtr1), >+ "+r" (ReconPtr2), >+ "+r" (DctInputPtr) >+ : "m" (PixelsPerLine), >+ "m" (ReconPixelsPerLine) >+ : "memory" >+ ); >+} >+ >+static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2) >+{ >+ ogg_uint32_t MaxSad; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ >+ " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */ >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ >+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ >+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ >+ >+ " movq %%mm0, %%mm1 \n\t" >+ >+ " punpcklbw %%mm6, %%mm0 \n\t" /* ; unpack low four bytes to higher precision */ >+ " punpckhbw %%mm7, %%mm1 \n\t" /* ; unpack high four bytes to higher 
precision */ >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " movq %%mm1, %%mm3 \n\t" >+ " psrlq $32, %%mm2 \n\t" /* fold and add */ >+ " psrlq $32, %%mm3 \n\t" >+ " paddw %%mm2, %%mm0 \n\t" >+ " paddw %%mm3, %%mm1 \n\t" >+ " movq %%mm0, %%mm2 \n\t" >+ " movq %%mm1, %%mm3 \n\t" >+ " psrlq $16, %%mm2 \n\t" >+ " psrlq $16, %%mm3 \n\t" >+ " paddw %%mm2, %%mm0 \n\t" >+ " paddw %%mm3, %%mm1 \n\t" >+ >+ " psubusw %%mm0, %%mm1 \n\t" >+ " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */ >+ " movd %%mm1, %0 \n\t" >+ " andl $0xffff, %0 \n\t" >+ >+ : "=m" (MaxSad), >+ "+r" (Src1), >+ "+r" (Src2) >+ : >+ : "memory" >+ ); >+ return MaxSad; >+} >+ >+static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2, >+ ogg_uint32_t stride) >+{ >+ ogg_uint32_t MaxSad; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */ >+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */ >+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */ >+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */ >+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */ >+ " mov $4, %%edi \n\t" /* 4 rows */ >+ "1: \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */ >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ >+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ >+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ >+ " movq %%mm0, %%mm1 \n\t" >+ >+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ >+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */ >+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ >+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... 
*/ >+ " add %3, %1 \n\t" /* Inc pointer into the new data */ >+ " add %3, %2 \n\t" /* Inc pointer into the new data */ >+ >+ " dec %%edi \n\t" >+ " jnz 1b \n\t" >+ >+ " mov $4, %%edi \n\t" /* 4 rows */ >+ "2: \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */ >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ >+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ >+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ >+ " movq %%mm0, %%mm1 \n\t" >+ >+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ >+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */ >+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ >+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ >+ " add %3, %1 \n\t" /* Inc pointer into the new data */ >+ " add %3, %2 \n\t" /* Inc pointer into the new data */ >+ >+ " dec %%edi \n\t" >+ " jnz 2b \n\t" >+ >+ " psubusw %%mm6, %%mm7 \n\t" >+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */ >+ " psubusw %%mm4, %%mm5 \n\t" >+ " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */ >+ " psubusw %%mm5, %%mm7 \n\t" >+ " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ >+ " movq %%mm7, %%mm6 \n\t" >+ " psrlq $32, %%mm6 \n\t" >+ " psubusw %%mm6, %%mm7 \n\t" >+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ >+ " movq %%mm7, %%mm6 \n\t" >+ " psrlq $16, %%mm6 \n\t" >+ " psubusw %%mm6, %%mm7 \n\t" >+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */ >+ " movd %%mm7, %0 \n\t" >+ " andl $0xffff, %0 \n\t" >+ >+ : "=r" (MaxSad), >+ "+r" (Src1), >+ "+r" (Src2) >+ : "r" (stride) >+ : "memory", "edi" >+ ); >+ >+ return MaxSad; >+} >+ >+static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2) >+{ >+ ogg_uint32_t DiffVal; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ >+ " pxor %%mm7, %%mm7 \n\t" 
/* mm7 contains the result */ >+ ".rept 8 \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" >+ " movq %%mm0, %%mm2 \n\t" >+ >+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ >+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ >+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ >+ " movq %%mm0, %%mm1 \n\t" >+ >+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ >+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ >+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ >+ " add %3, %1 \n\t" /* Inc pointer into the new data */ >+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ >+ " add %4, %2 \n\t" /* Inc pointer into ref data */ >+ ".endr \n\t" >+ >+ " movq %%mm7, %%mm0 \n\t" >+ " psrlq $32, %%mm7 \n\t" >+ " paddw %%mm0, %%mm7 \n\t" >+ " movq %%mm7, %%mm0 \n\t" >+ " psrlq $16, %%mm7 \n\t" >+ " paddw %%mm0, %%mm7 \n\t" >+ " movd %%mm7, %0 \n\t" >+ " andl $0xffff, %0 \n\t" >+ >+ : "=m" (DiffVal), >+ "+r" (ptr1), >+ "+r" (ptr2) >+ : "r" (stride1), >+ "r" (stride2) >+ : "memory" >+ ); >+ >+ return DiffVal; >+} >+ >+static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2, >+ ogg_uint32_t thres) >+{ >+ return sad8x8__mmx (ptr1, stride1, ptr2, stride2); >+} >+ >+static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride, >+ ogg_uint32_t thres) >+{ >+ ogg_uint32_t DiffVal; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */ >+ " paddb %%mm5, %%mm5 \n\t" >+ >+ " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */ >+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ >+ " mov $8, %%edi \n\t" /* 8 rows */ >+ "1: \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ >+ " movq (%2), %%mm2 \n\t" >+ " movq (%3), %%mm3 \n\t" 
/* take average of mm2 and mm3 */ >+ " movq %%mm2, %%mm1 \n\t" >+ " pand %%mm3, %%mm1 \n\t" >+ " pxor %%mm2, %%mm3 \n\t" >+ " pand %%mm5, %%mm3 \n\t" >+ " psrlq $1, %%mm3 \n\t" >+ " paddb %%mm3, %%mm1 \n\t" >+ >+ " movq %%mm0, %%mm2 \n\t" >+ >+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ >+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ >+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ >+ " movq %%mm0, %%mm1 \n\t" >+ >+ " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */ >+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ >+ " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */ >+ " add %4, %1 \n\t" /* Inc pointer into the new data */ >+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */ >+ " add %5, %2 \n\t" /* Inc pointer into ref data */ >+ " add %5, %3 \n\t" /* Inc pointer into ref data */ >+ >+ " dec %%edi \n\t" >+ " jnz 1b \n\t" >+ >+ " movq %%mm7, %%mm0 \n\t" >+ " psrlq $32, %%mm7 \n\t" >+ " paddw %%mm0, %%mm7 \n\t" >+ " movq %%mm7, %%mm0 \n\t" >+ " psrlq $16, %%mm7 \n\t" >+ " paddw %%mm0, %%mm7 \n\t" >+ " movd %%mm7, %0 \n\t" >+ " andl $0xffff, %0 \n\t" >+ >+ : "=m" (DiffVal), >+ "+r" (SrcData), >+ "+r" (RefDataPtr1), >+ "+r" (RefDataPtr2) >+ : "m" (SrcStride), >+ "m" (RefStride) >+ : "edi", "memory" >+ ); >+ >+ return DiffVal; >+} >+ >+static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride) >+{ >+ ogg_uint32_t XSum; >+ ogg_uint32_t XXSum; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm5, %%mm5 \n\t" >+ " pxor %%mm6, %%mm6 \n\t" >+ " pxor %%mm7, %%mm7 \n\t" >+ " mov $8, %%edi \n\t" >+ "1: \n\t" >+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */ >+ " movq %%mm0, %%mm2 \n\t" >+ >+ " punpcklbw %%mm6, %%mm0 \n\t" >+ " punpckhbw %%mm6, %%mm2 \n\t" >+ >+ " paddw %%mm0, %%mm5 \n\t" >+ " paddw %%mm2, %%mm5 \n\t" >+ >+ " pmaddwd %%mm0, %%mm0 \n\t" >+ " pmaddwd %%mm2, %%mm2 \n\t" >+ >+ " paddd %%mm0, %%mm7 \n\t" >+ " paddd %%mm2, %%mm7 \n\t" >+ >+ " 
add %3, %2 \n\t" /* Inc pointer into src data */ >+ >+ " dec %%edi \n\t" >+ " jnz 1b \n\t" >+ >+ " movq %%mm5, %%mm0 \n\t" >+ " psrlq $32, %%mm5 \n\t" >+ " paddw %%mm0, %%mm5 \n\t" >+ " movq %%mm5, %%mm0 \n\t" >+ " psrlq $16, %%mm5 \n\t" >+ " paddw %%mm0, %%mm5 \n\t" >+ " movd %%mm5, %%edi \n\t" >+ " movsx %%di, %%edi \n\t" >+ " movl %%edi, %0 \n\t" >+ >+ " movq %%mm7, %%mm0 \n\t" >+ " psrlq $32, %%mm7 \n\t" >+ " paddd %%mm0, %%mm7 \n\t" >+ " movd %%mm7, %1 \n\t" >+ >+ : "=r" (XSum), >+ "=r" (XXSum), >+ "+r" (DataPtr) >+ : "r" (Stride) >+ : "edi", "memory" >+ ); >+ >+ /* Compute population variance as mis-match metric. */ >+ return (( (XXSum<<6) - XSum*XSum ) ); >+} >+ >+static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr, ogg_uint32_t RefStride) >+{ >+ ogg_uint32_t XSum; >+ ogg_uint32_t XXSum; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm5, %%mm5 \n\t" >+ " pxor %%mm6, %%mm6 \n\t" >+ " pxor %%mm7, %%mm7 \n\t" >+ " mov $8, %%edi \n\t" >+ "1: \n\t" >+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%3), %%mm1 \n\t" >+ " movq %%mm0, %%mm2 \n\t" >+ " movq %%mm1, %%mm3 \n\t" >+ >+ " punpcklbw %%mm6, %%mm0 \n\t" >+ " punpcklbw %%mm6, %%mm1 \n\t" >+ " punpckhbw %%mm6, %%mm2 \n\t" >+ " punpckhbw %%mm6, %%mm3 \n\t" >+ >+ " psubsw %%mm1, %%mm0 \n\t" >+ " psubsw %%mm3, %%mm2 \n\t" >+ >+ " paddw %%mm0, %%mm5 \n\t" >+ " paddw %%mm2, %%mm5 \n\t" >+ >+ " pmaddwd %%mm0, %%mm0 \n\t" >+ " pmaddwd %%mm2, %%mm2 \n\t" >+ >+ " paddd %%mm0, %%mm7 \n\t" >+ " paddd %%mm2, %%mm7 \n\t" >+ >+ " add %4, %2 \n\t" /* Inc pointer into src data */ >+ " add %5, %3 \n\t" /* Inc pointer into ref data */ >+ >+ " dec %%edi \n\t" >+ " jnz 1b \n\t" >+ >+ " movq %%mm5, %%mm0 \n\t" >+ " psrlq $32, %%mm5 \n\t" >+ " paddw %%mm0, %%mm5 \n\t" >+ " movq %%mm5, %%mm0 \n\t" >+ " psrlq $16, %%mm5 \n\t" >+ " paddw %%mm0, %%mm5 \n\t" >+ " movd %%mm5, %%edi \n\t" >+ " movsx %%di, %%edi \n\t" >+ " movl %%edi, %0 \n\t" >+ >+ 
" movq %%mm7, %%mm0 \n\t" >+ " psrlq $32, %%mm7 \n\t" >+ " paddd %%mm0, %%mm7 \n\t" >+ " movd %%mm7, %1 \n\t" >+ >+ : "=m" (XSum), >+ "=m" (XXSum), >+ "+r" (SrcData), >+ "+r" (RefDataPtr) >+ : "m" (SrcStride), >+ "m" (RefStride) >+ : "edi", "memory" >+ ); >+ >+ /* Compute and return population variance as mis-match metric. */ >+ return (( (XXSum<<6) - XSum*XSum )); >+} >+ >+static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride) >+{ >+ ogg_uint32_t XSum; >+ ogg_uint32_t XXSum; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */ >+ " paddb %%mm4, %%mm4 \n\t" >+ " pxor %%mm5, %%mm5 \n\t" >+ " pxor %%mm6, %%mm6 \n\t" >+ " pxor %%mm7, %%mm7 \n\t" >+ " mov $8, %%edi \n\t" >+ "1: \n\t" >+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */ >+ >+ " movq (%3), %%mm2 \n\t" >+ " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */ >+ " movq %%mm2, %%mm1 \n\t" >+ " pand %%mm3, %%mm1 \n\t" >+ " pxor %%mm2, %%mm3 \n\t" >+ " pand %%mm4, %%mm3 \n\t" >+ " psrlq $1, %%mm3 \n\t" >+ " paddb %%mm3, %%mm1 \n\t" >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " movq %%mm1, %%mm3 \n\t" >+ >+ " punpcklbw %%mm6, %%mm0 \n\t" >+ " punpcklbw %%mm6, %%mm1 \n\t" >+ " punpckhbw %%mm6, %%mm2 \n\t" >+ " punpckhbw %%mm6, %%mm3 \n\t" >+ >+ " psubsw %%mm1, %%mm0 \n\t" >+ " psubsw %%mm3, %%mm2 \n\t" >+ >+ " paddw %%mm0, %%mm5 \n\t" >+ " paddw %%mm2, %%mm5 \n\t" >+ >+ " pmaddwd %%mm0, %%mm0 \n\t" >+ " pmaddwd %%mm2, %%mm2 \n\t" >+ >+ " paddd %%mm0, %%mm7 \n\t" >+ " paddd %%mm2, %%mm7 \n\t" >+ >+ " add %5, %2 \n\t" /* Inc pointer into src data */ >+ " add %6, %3 \n\t" /* Inc pointer into ref data */ >+ " add %6, %4 \n\t" /* Inc pointer into ref data */ >+ >+ " dec %%edi \n\t" >+ " jnz 1b \n\t" >+ >+ " movq %%mm5, %%mm0 \n\t" >+ " psrlq $32, %%mm5 \n\t" >+ " paddw %%mm0, %%mm5 \n\t" >+ " movq %%mm5, %%mm0 \n\t" >+ " psrlq $16, %%mm5 \n\t" 
>+ " paddw %%mm0, %%mm5 \n\t" >+ " movd %%mm5, %%edi \n\t" >+ " movsx %%di, %%edi \n\t" >+ " movl %%edi, %0 \n\t" >+ >+ " movq %%mm7, %%mm0 \n\t" >+ " psrlq $32, %%mm7 \n\t" >+ " paddd %%mm0, %%mm7 \n\t" >+ " movd %%mm7, %1 \n\t" >+ >+ : "=m" (XSum), >+ "=m" (XXSum), >+ "+r" (SrcData), >+ "+r" (RefDataPtr1), >+ "+r" (RefDataPtr2) >+ : "m" (SrcStride), >+ "m" (RefStride) >+ : "edi", "memory" >+ ); >+ >+ /* Compute and return population variance as mis-match metric. */ >+ return (( (XXSum<<6) - XSum*XSum )); >+} >+ >+static void restore_fpu (void) >+{ >+ __asm__ __volatile__ ( >+ " emms \n\t" >+ ); >+} >+ >+void dsp_i386_mmx_init(DspFunctions *funcs) >+{ >+ funcs->restore_fpu = restore_fpu; >+ funcs->sub8x8 = sub8x8__mmx; >+ funcs->sub8x8_128 = sub8x8_128__mmx; >+ funcs->sub8x8avg2 = sub8x8avg2__mmx; >+ funcs->row_sad8 = row_sad8__mmx; >+ funcs->col_sad8x8 = col_sad8x8__mmx; >+ funcs->sad8x8 = sad8x8__mmx; >+ funcs->sad8x8_thres = sad8x8_thres__mmx; >+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx; >+ funcs->intra8x8_err = intra8x8_err__mmx; >+ funcs->inter8x8_err = inter8x8_err__mmx; >+ funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx; >+} >+ >diff -Naur libtheora-1.0alpha3/lib/i386/dsp_mmxext.c libtheora-1.0alpha3.mmx/lib/i386/dsp_mmxext.c >--- libtheora-1.0alpha3/lib/i386/dsp_mmxext.c 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/i386/dsp_mmxext.c 2004-10-06 17:48:22.474391768 +0200 >@@ -0,0 +1,316 @@ >+/******************************************************************** >+ * * >+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * >+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * >+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * >+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. 
* >+ * * >+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * >+ * by the Xiph.Org Foundation http://www.xiph.org/ * >+ * * >+ ******************************************************************** >+ >+ function: >+ last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $ >+ >+ ********************************************************************/ >+ >+#include <stdlib.h> >+#include "dsp.h" >+ >+static ogg_uint32_t sad8x8__mmxext (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2) >+{ >+ ogg_uint32_t DiffVal; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ >+ >+ ".rept 7 \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" >+ " psadbw %%mm1, %%mm0 \n\t" >+ " add %3, %1 \n\t" /* Inc pointer into the new data */ >+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ >+ " add %4, %2 \n\t" /* Inc pointer into ref data */ >+ ".endr \n\t" >+ >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" >+ " psadbw %%mm1, %%mm0 \n\t" >+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ >+ " movd %%mm7, %0 \n\t" >+ >+ : "=r" (DiffVal), >+ "+r" (ptr1), >+ "+r" (ptr2) >+ : "r" (stride1), >+ "r" (stride2) >+ : "memory" >+ ); >+ >+ return DiffVal; >+} >+ >+static ogg_uint32_t sad8x8_thres__mmxext (unsigned char *ptr1, ogg_uint32_t stride1, >+ unsigned char *ptr2, ogg_uint32_t stride2, >+ ogg_uint32_t thres) >+{ >+ ogg_uint32_t DiffVal; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ >+ >+ ".rept 8 \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" >+ " psadbw %%mm1, %%mm0 \n\t" >+ " add %3, %1 \n\t" /* Inc pointer into the new data */ >+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... 
*/ >+ " add %4, %2 \n\t" /* Inc pointer into ref data */ >+ ".endr \n\t" >+ >+ " movd %%mm7, %0 \n\t" >+ >+ : "=r" (DiffVal), >+ "+r" (ptr1), >+ "+r" (ptr2) >+ : "r" (stride1), >+ "r" (stride2) >+ : "memory" >+ ); >+ >+ return DiffVal; >+} >+ >+static ogg_uint32_t sad8x8_xy2_thres__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride, >+ ogg_uint32_t thres) >+{ >+ ogg_uint32_t DiffVal; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */ >+ ".rept 8 \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" >+ " movq (%3), %%mm2 \n\t" >+ " pavgb %%mm2, %%mm1 \n\t" >+ " psadbw %%mm1, %%mm0 \n\t" >+ >+ " add %4, %1 \n\t" /* Inc pointer into the new data */ >+ " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */ >+ " add %5, %2 \n\t" /* Inc pointer into ref data */ >+ " add %5, %3 \n\t" /* Inc pointer into ref data */ >+ ".endr \n\t" >+ >+ " movd %%mm7, %0 \n\t" >+ : "=m" (DiffVal), >+ "+r" (SrcData), >+ "+r" (RefDataPtr1), >+ "+r" (RefDataPtr2) >+ : "m" (SrcStride), >+ "m" (RefStride) >+ : "memory" >+ ); >+ >+ return DiffVal; >+} >+ >+static ogg_uint32_t row_sad8__mmxext (unsigned char *Src1, unsigned char *Src2) >+{ >+ ogg_uint32_t MaxSad; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " movd (%1), %%mm0 \n\t" >+ " movd (%2), %%mm1 \n\t" >+ " psadbw %%mm0, %%mm1 \n\t" >+ " movd 4(%1), %%mm2 \n\t" >+ " movd 4(%2), %%mm3 \n\t" >+ " psadbw %%mm2, %%mm3 \n\t" >+ >+ " pmaxsw %%mm1, %%mm3 \n\t" >+ " movd %%mm3, %0 \n\t" >+ " andl $0xffff, %0 \n\t" >+ >+ : "=m" (MaxSad), >+ "+r" (Src1), >+ "+r" (Src2) >+ : >+ : "memory" >+ ); >+ >+ return MaxSad; >+} >+ >+static ogg_uint32_t col_sad8x8__mmxext (unsigned char *Src1, unsigned char *Src2, >+ ogg_uint32_t stride) >+{ >+ ogg_uint32_t MaxSad; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack 
*/ >+ " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */ >+ " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */ >+ " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */ >+ " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */ >+ " mov $4, %%edi \n\t" /* 4 rows */ >+ "1: \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */ >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ >+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ >+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ >+ " movq %%mm0, %%mm1 \n\t" >+ >+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ >+ " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */ >+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ >+ " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */ >+ " add %3, %1 \n\t" /* Inc pointer into the new data */ >+ " add %3, %2 \n\t" /* Inc pointer into the new data */ >+ >+ " dec %%edi \n\t" >+ " jnz 1b \n\t" >+ >+ " mov $4, %%edi \n\t" /* 4 rows */ >+ "2: \n\t" >+ " movq (%1), %%mm0 \n\t" /* take 8 bytes */ >+ " movq (%2), %%mm1 \n\t" /* take 8 bytes */ >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " psubusb %%mm1, %%mm0 \n\t" /* A - B */ >+ " psubusb %%mm2, %%mm1 \n\t" /* B - A */ >+ " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */ >+ " movq %%mm0, %%mm1 \n\t" >+ >+ " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */ >+ " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */ >+ " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */ >+ " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... 
*/ >+ " add %3, %1 \n\t" /* Inc pointer into the new data */ >+ " add %3, %2 \n\t" /* Inc pointer into the new data */ >+ >+ " dec %%edi \n\t" >+ " jnz 2b \n\t" >+ >+ " pmaxsw %%mm6, %%mm7 \n\t" >+ " pmaxsw %%mm4, %%mm5 \n\t" >+ " pmaxsw %%mm5, %%mm7 \n\t" >+ " movq %%mm7, %%mm6 \n\t" >+ " psrlq $32, %%mm6 \n\t" >+ " pmaxsw %%mm6, %%mm7 \n\t" >+ " movq %%mm7, %%mm6 \n\t" >+ " psrlq $16, %%mm6 \n\t" >+ " pmaxsw %%mm6, %%mm7 \n\t" >+ " movd %%mm7, %0 \n\t" >+ " andl $0xffff, %0 \n\t" >+ >+ : "=r" (MaxSad), >+ "+r" (Src1), >+ "+r" (Src2) >+ : "r" (stride) >+ : "memory", "edi" >+ ); >+ >+ return MaxSad; >+} >+ >+static ogg_uint32_t inter8x8_err_xy2__mmxext (unsigned char *SrcData, ogg_uint32_t SrcStride, >+ unsigned char *RefDataPtr1, >+ unsigned char *RefDataPtr2, ogg_uint32_t RefStride) >+{ >+ ogg_uint32_t XSum; >+ ogg_uint32_t XXSum; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm4, %%mm4 \n\t" >+ " pxor %%mm5, %%mm5 \n\t" >+ " pxor %%mm6, %%mm6 \n\t" >+ " pxor %%mm7, %%mm7 \n\t" >+ " mov $8, %%edi \n\t" >+ "1: \n\t" >+ " movq (%2), %%mm0 \n\t" /* take 8 bytes */ >+ >+ " movq (%3), %%mm2 \n\t" >+ " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */ >+ " pavgb %%mm2, %%mm1 \n\t" >+ >+ " movq %%mm0, %%mm2 \n\t" >+ " movq %%mm1, %%mm3 \n\t" >+ >+ " punpcklbw %%mm6, %%mm0 \n\t" >+ " punpcklbw %%mm4, %%mm1 \n\t" >+ " punpckhbw %%mm6, %%mm2 \n\t" >+ " punpckhbw %%mm4, %%mm3 \n\t" >+ >+ " psubsw %%mm1, %%mm0 \n\t" >+ " psubsw %%mm3, %%mm2 \n\t" >+ >+ " paddw %%mm0, %%mm5 \n\t" >+ " paddw %%mm2, %%mm5 \n\t" >+ >+ " pmaddwd %%mm0, %%mm0 \n\t" >+ " pmaddwd %%mm2, %%mm2 \n\t" >+ >+ " paddd %%mm0, %%mm7 \n\t" >+ " paddd %%mm2, %%mm7 \n\t" >+ >+ " add %5, %2 \n\t" /* Inc pointer into src data */ >+ " add %6, %3 \n\t" /* Inc pointer into ref data */ >+ " add %6, %4 \n\t" /* Inc pointer into ref data */ >+ >+ " dec %%edi \n\t" >+ " jnz 1b \n\t" >+ >+ " movq %%mm5, %%mm0 \n\t" >+ " psrlq $32, %%mm5 \n\t" >+ " paddw %%mm0, %%mm5 \n\t" >+ " movq %%mm5, 
%%mm0 \n\t" >+ " psrlq $16, %%mm5 \n\t" >+ " paddw %%mm0, %%mm5 \n\t" >+ " movd %%mm5, %%edi \n\t" >+ " movsx %%di, %%edi \n\t" >+ " movl %%edi, %0 \n\t" >+ >+ " movq %%mm7, %%mm0 \n\t" >+ " psrlq $32, %%mm7 \n\t" >+ " paddd %%mm0, %%mm7 \n\t" >+ " movd %%mm7, %1 \n\t" >+ >+ : "=m" (XSum), >+ "=m" (XXSum), >+ "+r" (SrcData), >+ "+r" (RefDataPtr1), >+ "+r" (RefDataPtr2) >+ : "m" (SrcStride), >+ "m" (RefStride) >+ : "edi", "memory" >+ ); >+ >+ /* Compute and return population variance as mis-match metric. */ >+ return (( (XXSum<<6) - XSum*XSum )); >+} >+ >+void dsp_i386_mmxext_init(DspFunctions *funcs) >+{ >+ funcs->row_sad8 = row_sad8__mmxext; >+ funcs->col_sad8x8 = col_sad8x8__mmxext; >+ funcs->sad8x8 = sad8x8__mmxext; >+ funcs->sad8x8_thres = sad8x8_thres__mmxext; >+ funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext; >+ funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmxext; >+} >+ >diff -Naur libtheora-1.0alpha3/lib/i386/fdct_mmx.c libtheora-1.0alpha3.mmx/lib/i386/fdct_mmx.c >--- libtheora-1.0alpha3/lib/i386/fdct_mmx.c 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/i386/fdct_mmx.c 2004-10-06 17:48:22.509386448 +0200 >@@ -0,0 +1,340 @@ >+;//========================================================================== >+;// >+;// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY >+;// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE >+;// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR >+;// PURPOSE. >+;// >+;// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved. 
>+;// >+;//-------------------------------------------------------------------------- >+ >+#include <theora/theora.h> >+#include "dsp.h" >+/* cos(k*pi/16) scaled by 65536, repeated in all four 16-bit words of each quadword; named xCkS(8-k) because cos(k*pi/16) == sin((8-k)*pi/16). */ >+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL; >+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL; >+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL; >+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL; >+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL; >+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL; >+static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL; >+/* M(sym): symbol name as a string for use inside asm templates; gets a leading underscore on the targets tested below. */ >+#if defined(__MINGW32__) || defined(__CYGWIN__) || \ >+ defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) >+# define M(a) "_" #a >+#else >+# define M(a) #a >+#endif >+ >+/*********************************************************************** >+ * File: fdct_m.asm >+ * >+ * Description: >+ * This function performs a 2-D forward DCT on an 8x8 block >+ * >+ * >+ * Input: Pointers to input source data buffer and destination >+ * buffer. >+ * >+ * Note: none >+ * >+ * Special Notes: We try to do the truncation right to match the result >+ * of the c version. 
>+ * >+ ************************************************************************/ >+ >+/* execute stage 1 of forward DCT */ >+#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp) \ >+ " movq " #ip0 ", %%mm0 \n\t" \ >+ " movq " #ip1 ", %%mm1 \n\t" \ >+ " movq " #ip3 ", %%mm2 \n\t" \ >+ " movq " #ip5 ", %%mm3 \n\t" \ >+ " movq %%mm0, %%mm4 \n\t" \ >+ " movq %%mm1, %%mm5 \n\t" \ >+ " movq %%mm2, %%mm6 \n\t" \ >+ " movq %%mm3, %%mm7 \n\t" \ >+ \ >+ " paddsw " #ip7 ", %%mm0 \n\t" /* mm0 = ip0 + ip7 = is07 */ \ >+ " paddsw " #ip2 ", %%mm1 \n\t" /* mm1 = ip1 + ip2 = is12 */ \ >+ " paddsw " #ip4 ", %%mm2 \n\t" /* mm2 = ip3 + ip4 = is34 */ \ >+ " paddsw " #ip6 ", %%mm3 \n\t" /* mm3 = ip5 + ip6 = is56 */ \ >+ " psubsw " #ip7 ", %%mm4 \n\t" /* mm4 = ip0 - ip7 = id07 */ \ >+ " psubsw " #ip2 ", %%mm5 \n\t" /* mm5 = ip1 - ip2 = id12 */ \ >+ \ >+ " psubsw %%mm2, %%mm0 \n\t" /* mm0 = is07 - is34 */ \ >+ \ >+ " paddsw %%mm2, %%mm2 \n\t" \ >+ \ >+ " psubsw " #ip4 ", %%mm6 \n\t" /* mm6 = ip3 - ip4 = id34 */ \ >+ \ >+ " paddsw %%mm0, %%mm2 \n\t" /* mm2 = is07 + is34 = is0734 */ \ >+ " psubsw %%mm3, %%mm1 \n\t" /* mm1 = is12 - is56 */ \ >+ " movq %%mm0," #temp " \n\t" /* Save is07 - is34 to free mm0; */ \ >+ " paddsw %%mm3, %%mm3 \n\t" \ >+ " paddsw %%mm1, %%mm3 \n\t" /* mm3 = is12 + 1s56 = is1256 */ \ >+ \ >+ " psubsw " #ip6 ", %%mm7 \n\t" /* mm7 = ip5 - ip6 = id56 */ \ >+ /* ------------------------------------------------------------------- */ \ >+ " psubsw %%mm7, %%mm5 \n\t" /* mm5 = id12 - id56 */ \ >+ " paddsw %%mm7, %%mm7 \n\t" \ >+ " paddsw %%mm5, %%mm7 \n\t" /* mm7 = id12 + id56 */ \ >+ /* ------------------------------------------------------------------- */ \ >+ " psubsw %%mm3, %%mm2 \n\t" /* mm2 = is0734 - is1256 */ \ >+ " paddsw %%mm3, %%mm3 \n\t" \ >+ \ >+ " movq %%mm2, %%mm0 \n\t" /* make a copy */ \ >+ " paddsw %%mm2, %%mm3 \n\t" /* mm3 = is0734 + is1256 */ \ >+ \ >+ " pmulhw "M(xC4S4)", %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \ >+ " 
paddw %%mm2, %%mm0 \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ " paddw %%mm2, %%mm0 \n\t" /* Truncate mm0, now it is op[4] */ \ >+ \ >+ " movq %%mm3, %%mm2 \n\t" \ >+ " movq %%mm0," #ip4 " \n\t" /* save ip4, now mm0,mm2 are free */ \ >+ \ >+ " movq %%mm3, %%mm0 \n\t" \ >+ " pmulhw "M(xC4S4)", %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \ >+ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ " paddw %%mm0, %%mm3 \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) */ \ >+ " paddw %%mm2, %%mm3 \n\t" /* Truncate mm3, now it is op[0] */ \ >+ \ >+ " movq %%mm3," #ip0 " \n\t" \ >+ /* ------------------------------------------------------------------- */ \ >+ " movq " #temp ", %%mm3 \n\t" /* mm3 = irot_input_y */ \ >+ " pmulhw "M(xC2S6)", %%mm3 \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \ >+ \ >+ " movq " #temp ", %%mm2 \n\t" \ >+ " movq %%mm2, %%mm0 \n\t" \ >+ \ >+ " psrlw $15, %%mm2 \n\t" /* mm3 = xC2S6 * irot_input_y */ \ >+ " paddw %%mm0, %%mm3 \n\t" \ >+ \ >+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ >+ " movq %%mm5, %%mm0 \n\t" \ >+ \ >+ " movq %%mm5, %%mm2 \n\t" \ >+ " pmulhw "M(xC6S2)", %%mm0 \n\t" /* mm0 = xC6S2 * irot_input_x */ \ >+ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \ >+ \ >+ " paddsw %%mm0, %%mm3 \n\t" /* ip[2] */ \ >+ " movq %%mm3," #ip2 " \n\t" /* Save ip2 */ \ >+ \ >+ " movq %%mm5, %%mm0 \n\t" \ >+ " movq %%mm5, %%mm2 \n\t" \ >+ \ >+ " pmulhw "M(xC2S6)", %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ \ >+ " movq " #temp ", %%mm3 \n\t" \ >+ " paddw %%mm0, %%mm5 \n\t" /* mm5 = xC2S6 * irot_input_x */ \ >+ \ >+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ >+ " movq %%mm3, %%mm2 \n\t" \ >+ \ >+ " pmulhw "M(xC6S2)", %%mm3 \n\t" /* mm3 = xC6S2 * irot_input_y */ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ \ >+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ >+ " psubsw %%mm5, %%mm3 \n\t" \ >+ \ >+ " movq %%mm3," #ip6 " \n\t" \ >+ /* 
------------------------------------------------------------------- */ \ >+ " movq "M(xC4S4)", %%mm0 \n\t" \ >+ " movq %%mm1, %%mm2 \n\t" \ >+ " movq %%mm1, %%mm3 \n\t" \ >+ \ >+ " pmulhw %%mm0, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ \ >+ " paddw %%mm3, %%mm1 \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \ >+ " paddw %%mm2, %%mm1 \n\t" /* Truncate mm1, now it is icommon_product1 */ \ >+ \ >+ " movq %%mm7, %%mm2 \n\t" \ >+ " movq %%mm7, %%mm3 \n\t" \ >+ \ >+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ \ >+ " paddw %%mm3, %%mm7 \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \ >+ " paddw %%mm2, %%mm7 \n\t" /* Truncate mm7, now it is icommon_product2 */ \ >+ /* ------------------------------------------------------------------- */ \ >+ " pxor %%mm0, %%mm0 \n\t" /* Clear mm0 */ \ >+ " psubsw %%mm6, %%mm0 \n\t" /* mm0 = - id34 */ \ >+ \ >+ " psubsw %%mm7, %%mm0 \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \ >+ " paddsw %%mm6, %%mm6 \n\t" \ >+ " paddsw %%mm0, %%mm6 \n\t" /* mm6 = id34 - icommon_product2 */ \ >+ \ >+ " psubsw %%mm1, %%mm4 \n\t" /* mm4 = id07 - icommon_product1 */ \ >+ " paddsw %%mm1, %%mm1 \n\t" \ >+ " paddsw %%mm4, %%mm1 \n\t" /* mm1 = id07 + icommon_product1 */ \ >+ /* ------------------------------------------------------------------- */ \ >+ " movq "M(xC1S7)", %%mm7 \n\t" \ >+ " movq %%mm1, %%mm2 \n\t" \ >+ \ >+ " movq %%mm1, %%mm3 \n\t" \ >+ " pmulhw %%mm7, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \ >+ \ >+ " movq "M(xC7S1)", %%mm7 \n\t" \ >+ " psrlw $15, %%mm2 \n\t" \ >+ \ >+ " paddw %%mm3, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x */ \ >+ " paddw %%mm2, %%mm1 \n\t" /* Trucated */ \ >+ \ >+ " pmulhw %%mm7, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x */ \ >+ " paddw %%mm2, %%mm3 \n\t" /* Truncated */ \ >+ \ >+ " movq %%mm0, %%mm5 \n\t" \ >+ " movq %%mm0, %%mm2 \n\t" \ >+ \ >+ " movq "M(xC1S7)", %%mm7 \n\t" 
\ >+ " pmulhw %%mm7, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \ >+ \ >+ " movq "M(xC7S1)", %%mm7 \n\t" \ >+ " psrlw $15, %%mm2 \n\t" \ >+ \ >+ " paddw %%mm5, %%mm0 \n\t" /* mm0 = xC1S7 * irot_input_y */ \ >+ " paddw %%mm2, %%mm0 \n\t" /* Truncated */ \ >+ \ >+ " pmulhw %%mm7, %%mm5 \n\t" /* mm5 = xC7S1 * irot_input_y */ \ >+ " paddw %%mm2, %%mm5 \n\t" /* Truncated */ \ >+ \ >+ " psubsw %%mm5, %%mm1 \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \ >+ " paddsw %%mm0, %%mm3 \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \ >+ \ >+ " movq %%mm1," #ip1 " \n\t" \ >+ " movq %%mm3," #ip7 " \n\t" \ >+ /* ------------------------------------------------------------------- */ \ >+ " movq "M(xC3S5)", %%mm0 \n\t" \ >+ " movq "M(xC5S3)", %%mm1 \n\t" \ >+ \ >+ " movq %%mm6, %%mm5 \n\t" \ >+ " movq %%mm6, %%mm7 \n\t" \ >+ \ >+ " movq %%mm4, %%mm2 \n\t" \ >+ " movq %%mm4, %%mm3 \n\t" \ >+ \ >+ " pmulhw %%mm0, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \ >+ " pmulhw %%mm1, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \ >+ \ >+ " psrlw $15, %%mm2 \n\t" \ >+ " psrlw $15, %%mm5 \n\t" \ >+ \ >+ " paddw %%mm3, %%mm4 \n\t" /* mm4 = xC3S5 * irot_input_x */ \ >+ " paddw %%mm7, %%mm6 \n\t" /* mm6 = xC5S3 * irot_input_y */ \ >+ \ >+ " paddw %%mm2, %%mm4 \n\t" /* Truncated */ \ >+ " paddw %%mm5, %%mm6 \n\t" /* Truncated */ \ >+ \ >+ " psubsw %%mm6, %%mm4 \n\t" /* ip3 */ \ >+ " movq %%mm4," #ip3 " \n\t" \ >+ \ >+ " movq %%mm3, %%mm4 \n\t" \ >+ " movq %%mm7, %%mm6 \n\t" \ >+ \ >+ " pmulhw %%mm1, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \ >+ " pmulhw %%mm0, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \ >+ \ >+ " paddw %%mm2, %%mm4 \n\t" \ >+ " paddw %%mm5, %%mm6 \n\t" \ >+ \ >+ " paddw %%mm4, %%mm3 \n\t" /* mm3 = xC5S3 * irot_input_x */ \ >+ " paddw %%mm6, %%mm7 \n\t" /* mm7 = xC3S5 * irot_input_y */ \ >+ \ >+ " paddw %%mm7, %%mm3 \n\t" /* ip5 */ \ >+ " movq %%mm3," #ip5 
" \n\t" >+ >+#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7, \ >+ op0,op1,op2,op3,op4,op5,op6,op7) \ >+ " movq " #ip0 ", %%mm0 \n\t" /* mm0 = a0 a1 a2 a3 */ \ >+ " movq " #ip4 ", %%mm4 \n\t" /* mm4 = e4 e5 e6 e7 */ \ >+ " movq " #ip1 ", %%mm1 \n\t" /* mm1 = b0 b1 b2 b3 */ \ >+ " movq " #ip5 ", %%mm5 \n\t" /* mm5 = f4 f5 f6 f7 */ \ >+ " movq " #ip2 ", %%mm2 \n\t" /* mm2 = c0 c1 c2 c3 */ \ >+ " movq " #ip6 ", %%mm6 \n\t" /* mm6 = g4 g5 g6 g7 */ \ >+ " movq " #ip3 ", %%mm3 \n\t" /* mm3 = d0 d1 d2 d3 */ \ >+ " movq %%mm1," #op1 " \n\t" /* save b0 b1 b2 b3 */ \ >+ " movq " #ip7 ", %%mm7 \n\t" /* mm7 = h0 h1 h2 h3 */ \ >+ /* Transpose 2x8 block */ \ >+ " movq %%mm4, %%mm1 \n\t" /* mm1 = e3 e2 e1 e0 */ \ >+ " punpcklwd %%mm5, %%mm4 \n\t" /* mm4 = f1 e1 f0 e0 */ \ >+ " movq %%mm0," #op0 " \n\t" /* save a3 a2 a1 a0 */ \ >+ " punpckhwd %%mm5, %%mm1 \n\t" /* mm1 = f3 e3 f2 e2 */ \ >+ " movq %%mm6, %%mm0 \n\t" /* mm0 = g3 g2 g1 g0 */ \ >+ " punpcklwd %%mm7, %%mm6 \n\t" /* mm6 = h1 g1 h0 g0 */ \ >+ " movq %%mm4, %%mm5 \n\t" /* mm5 = f1 e1 f0 e0 */ \ >+ " punpckldq %%mm6, %%mm4 \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \ >+ " punpckhdq %%mm6, %%mm5 \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \ >+ " movq %%mm1, %%mm6 \n\t" /* mm6 = f3 e3 f2 e2 */ \ >+ " movq %%mm4," #op4 " \n\t" \ >+ " punpckhwd %%mm7, %%mm0 \n\t" /* mm0 = h3 g3 h2 g2 */ \ >+ " movq %%mm5," #op5 " \n\t" \ >+ " punpckhdq %%mm0, %%mm6 \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \ >+ " movq " #op0 ", %%mm4 \n\t" /* mm4 = a3 a2 a1 a0 */ \ >+ " punpckldq %%mm0, %%mm1 \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \ >+ " movq " #op1 ", %%mm5 \n\t" /* mm5 = b3 b2 b1 b0 */ \ >+ " movq %%mm4, %%mm0 \n\t" /* mm0 = a3 a2 a1 a0 */ \ >+ " movq %%mm6," #op7 " \n\t" \ >+ " punpcklwd %%mm5, %%mm0 \n\t" /* mm0 = b1 a1 b0 a0 */ \ >+ " movq %%mm1," #op6 " \n\t" \ >+ " punpckhwd %%mm5, %%mm4 \n\t" /* mm4 = b3 a3 b2 a2 */ \ >+ " movq %%mm2, %%mm5 \n\t" /* mm5 = c3 c2 c1 c0 */ \ >+ " punpcklwd %%mm3, %%mm2 \n\t" /* mm2 = d1 c1 d0 c0 */ \ >+ " movq %%mm0, %%mm1 
\n\t" /* mm1 = b1 a1 b0 a0 */ \ >+ " punpckldq %%mm2, %%mm0 \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \ >+ " punpckhdq %%mm2, %%mm1 \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \ >+ " movq %%mm4, %%mm2 \n\t" /* mm2 = b3 a3 b2 a2 */ \ >+ " movq %%mm0," #op0 " \n\t" \ >+ " punpckhwd %%mm3, %%mm5 \n\t" /* mm5 = d3 c3 d2 c2 */ \ >+ " movq %%mm1," #op1 " \n\t" \ >+ " punpckhdq %%mm5, %%mm4 \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \ >+ " punpckldq %%mm5, %%mm2 \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \ >+ " movq %%mm4," #op3 " \n\t" \ >+ " movq %%mm2," #op2 " \n\t" >+ >+ >+static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData) >+{ >+ ogg_int64_t __attribute__((aligned(8))) align_tmp[16]; >+ ogg_int16_t *const temp= (int16_t*)align_tmp; >+ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ /* >+ * Input data is an 8x8 block. To make processing of the data more efficent >+ * we will transpose the block of data to two 4x8 blocks??? >+ */ >+ Transpose_mmx ( (%0), 16(%0), 32(%0), 48(%0), 8(%0), 24(%0), 40(%0), 56(%0), >+ (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1)) >+ Fdct_mmx ( (%1), 16(%1), 32(%1), 48(%1), 8(%1), 24(%1), 40(%1), 56(%1), (%2)) >+ >+ Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0), >+ 64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1)) >+ Fdct_mmx (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) >+ >+ Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), >+ 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1)) >+ Fdct_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2)) >+ >+ Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), >+ 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1)) >+ Fdct_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2)) >+ >+ " emms \n\t" >+ >+ : "+r" (InputData), >+ "+r" (OutputData) >+ : "r" (temp) >+ : "memory" >+ ); >+} >+ >+void 
dsp_i386_mmx_fdct_init(DspFunctions *funcs) >+{ >+ funcs->fdct_short = fdct_short__mmx; >+} >diff -Naur libtheora-1.0alpha3/lib/i386/recon_mmx.c libtheora-1.0alpha3.mmx/lib/i386/recon_mmx.c >--- libtheora-1.0alpha3/lib/i386/recon_mmx.c 1970-01-01 01:00:00.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/i386/recon_mmx.c 2004-10-06 17:48:22.510386296 +0200 >@@ -0,0 +1,185 @@ >+/******************************************************************** >+ * * >+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * >+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * >+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * >+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * >+ * * >+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 * >+ * by the Xiph.Org Foundation http://www.xiph.org/ * >+ * * >+ ******************************************************************** >+ >+ function: >+ last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $ >+ >+ ********************************************************************/ >+ >+#include "encoder_internal.h" >+ >+static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL; >+ >+#if defined(__MINGW32__) || defined(__CYGWIN__) || \ >+ defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__)) >+# define M(a) "_" #a >+#else >+# define M(a) #a >+#endif >+ >+static void copy8x8__mmx (unsigned char *src, >+ unsigned char *dest, >+ unsigned int stride) >+{ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " lea (%2, %2, 2), %%edi \n\t" >+ >+ " movq (%1), %%mm0 \n\t" >+ " movq (%1, %2), %%mm1 \n\t" >+ " movq (%1, %2, 2), %%mm2 \n\t" >+ " movq (%1, %%edi), %%mm3 \n\t" >+ >+ " lea (%1, %2, 4), %1 \n\t" >+ >+ " movq %%mm0, (%0) \n\t" >+ " movq %%mm1, (%0, %2) \n\t" >+ " movq %%mm2, (%0, %2, 2) \n\t" >+ " movq %%mm3, (%0, %%edi) \n\t" >+ >+ " lea (%0, %2, 4), %0 \n\t" >+ >+ " movq (%1), %%mm0 \n\t" >+ " movq (%1, %2), %%mm1 \n\t" >+ " movq 
(%1, %2, 2), %%mm2 \n\t" >+ " movq (%1, %%edi), %%mm3 \n\t" >+ >+ " movq %%mm0, (%0) \n\t" >+ " movq %%mm1, (%0, %2) \n\t" >+ " movq %%mm2, (%0, %2, 2) \n\t" >+ " movq %%mm3, (%0, %%edi) \n\t" >+ : "+a" (dest), /* both pointers are advanced by the lea steps, */ >+ "+c" (src) /* so both must be read-write operands */ >+ : "d" (stride) >+ : "memory", "edi" >+ ); >+} >+/* Intra reconstruction: for each of 8 rows, saturate eight 16-bit values at ChangePtr to signed bytes, add 128 and store 8 bytes at ReconPtr; output rows are LineStep bytes apart. */ >+static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr, >+ ogg_uint32_t LineStep) >+{ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " movq "M(V128)", %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */ >+ >+ " lea 128(%1), %%edi \n\t" /* Endpoint in input buffer */ >+ "1: \n\t" >+ " movq (%1), %%mm2 \n\t" /* First four input values */ >+ >+ " packsswb 8(%1), %%mm2 \n\t" /* pack with next(high) four values */ >+ " por %%mm0, %%mm0 \n\t" /* mm0 | mm0: leaves mm0 unchanged */ >+ " pxor %%mm0, %%mm2 \n\t" /* Convert result to unsigned (same as add 128) */ >+ " lea 16(%1), %1 \n\t" /* Step source buffer */ >+ " cmp %%edi, %1 \n\t" /* are we done */ >+ >+ " movq %%mm2, (%0) \n\t" /* store results */ >+ >+ " lea (%0, %2), %0 \n\t" /* Step output buffer */ >+ " jc 1b \n\t" /* Loop back if we are not done */ >+ : "+r" (ReconPtr), >+ "+r" (ChangePtr) /* stepped by the loop, so it must be read-write */ >+ : "r" (LineStep) >+ : "memory", "edi" >+ ); >+} >+/* Inter reconstruction: for each of 8 rows, add eight 16-bit changes to eight reference pixels with signed saturation, pack to unsigned bytes and store at ReconPtr; RefPtr and ReconPtr both step by LineStep. */ >+static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr, >+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep) >+{ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm0, %%mm0 \n\t" >+ " lea 128(%1), %%edi \n\t" >+ >+ "1: \n\t" >+ " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */ >+ >+ " movq (%1), %%mm4 \n\t" /* first 4 changes */ >+ " movq %%mm2, %%mm3 \n\t" >+ " movq 8(%1), %%mm5 \n\t" /* last 4 changes */ >+ " punpcklbw %%mm0, %%mm2 \n\t" /* turn first 4 refs into positive 16-bit #s */ >+ " paddsw %%mm4, %%mm2 \n\t" /* add in first 4 changes */ >+ " punpckhbw %%mm0, %%mm3 \n\t" /* turn last 4 refs into positive 16-bit #s */ >+ " paddsw %%mm5, %%mm3 \n\t" /* add in last 4 changes */ >+ " add %3, %2 \n\t" /* next row of reference pixels */ >+ " packuswb %%mm3, 
%%mm2 \n\t" /* pack result to unsigned 8-bit values */ >+ " lea 16(%1), %1 \n\t" /* next row of changes */ >+ " cmp %%edi, %1 \n\t" /* are we done? */ >+ >+ " movq %%mm2, (%0) \n\t" /* store result */ >+ >+ " lea (%0, %3), %0 \n\t" /* next row of output */ >+ " jc 1b \n\t" >+ : "+r" (ReconPtr), >+ "+r" (ChangePtr), /* stepped by the loop, so it must be read-write */ >+ "+r" (RefPtr) /* stepped by the loop, so it must be read-write */ >+ : "r" (LineStep) >+ : "memory", "edi" >+ ); >+} >+/* Half-pixel inter reconstruction: for each of 8 rows, the prediction is (ref1 + ref2) >> 1 per pixel; the 16-bit changes are added and the result is packed to unsigned bytes and stored at ReconPtr. */ >+static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1, >+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr, >+ ogg_uint32_t LineStep) >+{ >+ __asm__ __volatile__ ( >+ " .balign 16 \n\t" >+ >+ " pxor %%mm0, %%mm0 \n\t" >+ " lea 128(%1), %%edi \n\t" >+ >+ "1: \n\t" >+ " movq (%2), %%mm2 \n\t" /* (+3 misaligned) 8 reference pixels */ >+ " movq (%3), %%mm4 \n\t" /* (+3 misaligned) 8 reference pixels */ >+ >+ " movq %%mm2, %%mm3 \n\t" >+ " punpcklbw %%mm0, %%mm2 \n\t" /* mm2 = start ref1 as positive 16-bit #s */ >+ " movq %%mm4, %%mm5 \n\t" >+ " movq (%1), %%mm6 \n\t" /* first 4 changes */ >+ " punpckhbw %%mm0, %%mm3 \n\t" /* mm3 = end ref1 as positive 16-bit #s */ >+ " movq 8(%1), %%mm7 \n\t" /* last 4 changes */ >+ " punpcklbw %%mm0, %%mm4 \n\t" /* mm4 = start ref2 as positive 16-bit #s */ >+ " punpckhbw %%mm0, %%mm5 \n\t" /* mm5 = end ref2 as positive 16-bit #s */ >+ " paddw %%mm4, %%mm2 \n\t" /* mm2 = start (ref1 + ref2) */ >+ " paddw %%mm5, %%mm3 \n\t" /* mm3 = end (ref1 + ref2) */ >+ " psrlw $1, %%mm2 \n\t" /* mm2 = start (ref1 + ref2)/2 */ >+ " psrlw $1, %%mm3 \n\t" /* mm3 = end (ref1 + ref2)/2 */ >+ " paddw %%mm6, %%mm2 \n\t" /* add changes to start */ >+ " paddw %%mm7, %%mm3 \n\t" /* add changes to end */ >+ " lea 16(%1), %1 \n\t" /* next row of changes */ >+ " packuswb %%mm3, %%mm2 \n\t" /* pack start|end to unsigned 8-bit */ >+ " add %4, %2 \n\t" /* next row of reference pixels */ >+ " add %4, %3 \n\t" /* next row of reference pixels */ >+ " movq %%mm2, (%0) \n\t" /* store result */ >+ " add %4, %0 \n\t" /* next row of output */ >+ " cmp %%edi, %1 
\n\t" /* are we done? */ >+ " jc 1b \n\t" >+ : "+r" (ReconPtr), >+ "+r" (ChangePtr), /* the three source pointers are stepped by the */ >+ "+r" (RefPtr1), /* loop, so they must be read-write operands */ >+ "+r" (RefPtr2) >+ : "m" (LineStep) >+ : "memory", "edi" >+ ); >+} >+/* Point the reconstruction entries of funcs at the MMX implementations in this file. */ >+void dsp_i386_mmx_recon_init(DspFunctions *funcs) >+{ >+ funcs->copy8x8 = copy8x8__mmx; >+ funcs->recon_intra8x8 = recon_intra8x8__mmx; >+ funcs->recon_inter8x8 = recon_inter8x8__mmx; >+ funcs->recon_inter8x8_half = recon_inter8x8_half__mmx; >+} >+ >diff -Naur libtheora-1.0alpha3/lib/Makefile.am libtheora-1.0alpha3.mmx/lib/Makefile.am >--- libtheora-1.0alpha3/lib/Makefile.am 2003-06-15 02:56:42.000000000 +0200 >+++ libtheora-1.0alpha3.mmx/lib/Makefile.am 2004-10-06 17:48:22.510386296 +0200 >@@ -6,7 +6,8 @@ > > libtheora_la_SOURCES = encode.c hufftables.h quant_lookup.h \ > encoder_internal.h idct.c reconstruct.c block_inline.h \ >- encoder_lookup.h mcomp.c scan.c blockmap.c misc_common.c \ >+ encoder_lookup.h cpu.c dsp.h dsp.c i386/dsp_mmx.c i386/dsp_mmxext.c \ >+ i386/recon_mmx.c i386/fdct_mmx.c mcomp.c scan.c blockmap.c misc_common.c \ > dct.c frarray.c pb.c dct_decode.c frinit.c pp.c dct_encode.c \ > huffman.c pp.h toplevel.c decode.c huffman.h quant.c \ > comment.c toplevel_lookup.h mcomp.h >diff -Naur libtheora-1.0alpha3/lib/mcomp.c libtheora-1.0alpha3.mmx/lib/mcomp.c >--- libtheora-1.0alpha3/lib/mcomp.c 2003-12-03 09:59:41.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/mcomp.c 2004-10-06 17:48:22.543381280 +0200 >@@ -17,6 +17,7 @@ > > #include <stdlib.h> > #include <stdio.h> >+#include "dsp.h" > #include "encoder_internal.h" > > /* Initialises motion compentsation. 
*/ >@@ -100,161 +101,22 @@ > unsigned char * RefDataPtr1, > unsigned char * RefDataPtr2, > ogg_uint32_t PixelsPerLine ) { >- ogg_uint32_t i; >- ogg_int32_t XSum=0; >- ogg_int32_t XXSum=0; > ogg_int32_t DiffVal; >- ogg_int32_t AbsRefOffset = abs((int)(RefDataPtr1 - RefDataPtr2)); >+ ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2); >+ ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA; > > /* Mode of interpolation chosen based upon on the offset of the > second reference pointer */ >- if ( AbsRefOffset == 0 ) { >- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) { >- DiffVal = ((int)NewDataPtr[0]) - (int)RefDataPtr1[0]; >- XSum += DiffVal; >- >- /* negative array indexes are strictly forbidden by ANSI C and C99 */ >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[1]) - (int)RefDataPtr1[1]; >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[2]) - (int)RefDataPtr1[2]; >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[3]) - (int)RefDataPtr1[3]; >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[4]) - (int)RefDataPtr1[4]; >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[5]) - (int)RefDataPtr1[5]; >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[6]) - (int)RefDataPtr1[6]; >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[7]) - (int)RefDataPtr1[7]; >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- /* Step to next row of block. 
*/ >- NewDataPtr += PixelsPerLine; >- RefDataPtr1 += STRIDE_EXTRA + PixelsPerLine; >- } >- >+ if ( RefOffset == 0 ) { >+ DiffVal = dsp_static_inter8x8_err (NewDataPtr, PixelsPerLine, >+ RefDataPtr1, RefPixelsPerLine); > }else{ >- >- /* Simple two reference interpolation */ >- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) { >- DiffVal = ((int)NewDataPtr[0]) - >- (((int)RefDataPtr1[0] + (int)RefDataPtr2[0]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[1]) - >- (((int)RefDataPtr1[1] + (int)RefDataPtr2[1]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[2]) - >- (((int)RefDataPtr1[2] + (int)RefDataPtr2[2]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[3]) - >- (((int)RefDataPtr1[3] + (int)RefDataPtr2[3]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[4]) - >- (((int)RefDataPtr1[4] + (int)RefDataPtr2[4]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[5]) - >- (((int)RefDataPtr1[5] + (int)RefDataPtr2[5]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[6]) - >- (((int)RefDataPtr1[6] + (int)RefDataPtr2[6]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- DiffVal = ((int)NewDataPtr[7]) - >- (((int)RefDataPtr1[7] + (int)RefDataPtr2[7]) / 2); >- XSum += DiffVal; >- XXSum += DiffVal*DiffVal; >- >- /* Step to next row of block. */ >- NewDataPtr += PixelsPerLine; >- RefDataPtr1 += STRIDE_EXTRA+PixelsPerLine; >- RefDataPtr2 += STRIDE_EXTRA+PixelsPerLine; >- } >+ DiffVal = dsp_static_inter8x8_err_xy2 (NewDataPtr, PixelsPerLine, >+ RefDataPtr1, >+ RefDataPtr2, RefPixelsPerLine); > } > > /* Compute and return population variance as mis-match metric. 
*/ >- return (( (XXSum<<6) - XSum*XSum )); >-} >- >-static ogg_uint32_t GetSumAbsDiffs (unsigned char * NewDataPtr, >- unsigned char * RefDataPtr, >- ogg_uint32_t PixelsPerLine, >- ogg_uint32_t ErrorSoFar) { >- ogg_uint32_t i; >- ogg_uint32_t DiffVal = ErrorSoFar; >- >- /* Decide on standard or MMX implementation */ >- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) { >- DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) ); >- DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) ); >- DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) ); >- DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) ); >- DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) ); >- DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) ); >- DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) ); >- DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) ); >- >- /* Step to next row of block. */ >- NewDataPtr += PixelsPerLine; >- RefDataPtr += STRIDE_EXTRA+PixelsPerLine; >- } >- >- return DiffVal; >-} >- >-static ogg_uint32_t GetNextSumAbsDiffs (unsigned char * NewDataPtr, >- unsigned char * RefDataPtr, >- ogg_uint32_t PixelsPerLine, >- ogg_uint32_t ErrorSoFar, >- ogg_uint32_t BestSoFar ) { >- ogg_uint32_t i; >- ogg_uint32_t DiffVal = ErrorSoFar; >- >- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) { >- DiffVal += abs( ((int)NewDataPtr[0]) - ((int)RefDataPtr[0]) ); >- DiffVal += abs( ((int)NewDataPtr[1]) - ((int)RefDataPtr[1]) ); >- DiffVal += abs( ((int)NewDataPtr[2]) - ((int)RefDataPtr[2]) ); >- DiffVal += abs( ((int)NewDataPtr[3]) - ((int)RefDataPtr[3]) ); >- DiffVal += abs( ((int)NewDataPtr[4]) - ((int)RefDataPtr[4]) ); >- DiffVal += abs( ((int)NewDataPtr[5]) - ((int)RefDataPtr[5]) ); >- DiffVal += abs( ((int)NewDataPtr[6]) - ((int)RefDataPtr[6]) ); >- DiffVal += abs( ((int)NewDataPtr[7]) - ((int)RefDataPtr[7]) ); >- >- if ( DiffVal > BestSoFar )break; >- >- /* Step to next row of block. 
*/ >- NewDataPtr += PixelsPerLine; >- RefDataPtr += STRIDE_EXTRA+PixelsPerLine; >- } >- > return DiffVal; > } > >@@ -265,118 +127,60 @@ > ogg_uint32_t ErrorSoFar, > ogg_uint32_t BestSoFar ) { > >- ogg_uint32_t i; > ogg_uint32_t DiffVal = ErrorSoFar; > ogg_int32_t RefOffset = (int)(RefDataPtr1 - RefDataPtr2); > ogg_uint32_t RefPixelsPerLine = PixelsPerLine + STRIDE_EXTRA; > > if ( RefOffset == 0 ) { > /* Simple case as for non 0.5 pixel */ >- DiffVal += GetSumAbsDiffs( SrcData, RefDataPtr1, PixelsPerLine, >- ErrorSoFar); >+ DiffVal += dsp_static_sad8x8 (SrcData, PixelsPerLine, >+ RefDataPtr1, RefPixelsPerLine); > } else { >- for ( i=0; i < BLOCK_HEIGHT_WIDTH; i++ ) { >- DiffVal += abs( ((int)SrcData[0]) - (((int)RefDataPtr1[0] + >- (int)RefDataPtr2[0]) / 2) ); >- DiffVal += abs( ((int)SrcData[1]) - (((int)RefDataPtr1[1] + >- (int)RefDataPtr2[1]) / 2) ); >- DiffVal += abs( ((int)SrcData[2]) - (((int)RefDataPtr1[2] + >- (int)RefDataPtr2[2]) / 2) ); >- DiffVal += abs( ((int)SrcData[3]) - (((int)RefDataPtr1[3] + >- (int)RefDataPtr2[3]) / 2) ); >- DiffVal += abs( ((int)SrcData[4]) - (((int)RefDataPtr1[4] + >- (int)RefDataPtr2[4]) / 2) ); >- DiffVal += abs( ((int)SrcData[5]) - (((int)RefDataPtr1[5] + >- (int)RefDataPtr2[5]) / 2) ); >- DiffVal += abs( ((int)SrcData[6]) - (((int)RefDataPtr1[6] + >- (int)RefDataPtr2[6]) / 2) ); >- DiffVal += abs( ((int)SrcData[7]) - (((int)RefDataPtr1[7] + >- (int)RefDataPtr2[7]) / 2) ); >- >- if ( DiffVal > BestSoFar ) break; >- >- /* Step to next row of block. 
*/ >- SrcData += PixelsPerLine; >- RefDataPtr1 += RefPixelsPerLine; >- RefDataPtr2 += RefPixelsPerLine; >- } >+ DiffVal += dsp_static_sad8x8_xy2_thres (SrcData, PixelsPerLine, >+ RefDataPtr1, >+ RefDataPtr2, RefPixelsPerLine, BestSoFar); > } > > return DiffVal; > } > >-static ogg_uint32_t GetIntraError (unsigned char * DataPtr, >- ogg_uint32_t PixelsPerLine ) { >- ogg_uint32_t i; >- ogg_uint32_t XSum=0; >- ogg_uint32_t XXSum=0; >- unsigned char *DiffPtr; >- >- /* Loop expanded out for speed. */ >- DiffPtr = DataPtr; >- >- for ( i=0; i<BLOCK_HEIGHT_WIDTH; i++ ) { >- >- /* Examine alternate pixel locations. */ >- XSum += DiffPtr[0]; >- XXSum += DiffPtr[0]*DiffPtr[0]; >- XSum += DiffPtr[1]; >- XXSum += DiffPtr[1]*DiffPtr[1]; >- XSum += DiffPtr[2]; >- XXSum += DiffPtr[2]*DiffPtr[2]; >- XSum += DiffPtr[3]; >- XXSum += DiffPtr[3]*DiffPtr[3]; >- XSum += DiffPtr[4]; >- XXSum += DiffPtr[4]*DiffPtr[4]; >- XSum += DiffPtr[5]; >- XXSum += DiffPtr[5]*DiffPtr[5]; >- XSum += DiffPtr[6]; >- XXSum += DiffPtr[6]*DiffPtr[6]; >- XSum += DiffPtr[7]; >- XXSum += DiffPtr[7]*DiffPtr[7]; >- >- /* Step to next row of block. */ >- DiffPtr += PixelsPerLine; >- } >- >- /* Compute population variance as mis-match metric. 
*/ >- return (( (XXSum<<6) - XSum*XSum ) ); >-} >- > ogg_uint32_t GetMBIntraError (CP_INSTANCE *cpi, ogg_uint32_t FragIndex, > ogg_uint32_t PixelsPerLine ) { > ogg_uint32_t LocalFragIndex = FragIndex; > ogg_uint32_t IntraError = 0; > >+ dsp_static_save_fpu (); >+ > /* Add together the intra errors for those blocks in the macro block > that are coded (Y only) */ > if ( cpi->pb.display_fragments[LocalFragIndex] ) > IntraError += >- GetIntraError(&cpi-> >+ dsp_static_intra8x8_err (&cpi-> > ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], >- PixelsPerLine ); >- >+ PixelsPerLine); > > LocalFragIndex++; > if ( cpi->pb.display_fragments[LocalFragIndex] ) > IntraError += >- GetIntraError(&cpi-> >+ dsp_static_intra8x8_err (&cpi-> > ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], >- PixelsPerLine ); >+ PixelsPerLine); > > LocalFragIndex = FragIndex + cpi->pb.HFragments; > if ( cpi->pb.display_fragments[LocalFragIndex] ) > IntraError += >- GetIntraError(&cpi-> >+ dsp_static_intra8x8_err (&cpi-> > ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], >- PixelsPerLine ); >+ PixelsPerLine); > > LocalFragIndex++; > if ( cpi->pb.display_fragments[LocalFragIndex] ) > IntraError += >- GetIntraError(&cpi-> >+ dsp_static_intra8x8_err (&cpi-> > ConvDestBuffer[cpi->pb.pixel_index_table[LocalFragIndex]], >- PixelsPerLine ); >+ PixelsPerLine); >+ >+ dsp_static_restore_fpu (); > > return IntraError; > } >@@ -400,6 +204,8 @@ > unsigned char * SrcPtr1; > unsigned char * RefPtr1; > >+ dsp_static_save_fpu (); >+ > /* Work out pixel offset into source buffer. 
*/ > PixelIndex = cpi->pb.pixel_index_table[LocalFragIndex]; > >@@ -462,6 +268,9 @@ > InterError += GetInterErr( SrcPtr1, RefPtr1, > &RefPtr1[RefPtr2Offset], PixelsPerLine ); > } >+ >+ dsp_static_restore_fpu (); >+ > return InterError; > } > >@@ -496,6 +305,8 @@ > unsigned char * RefDataPtr1; > unsigned char * RefDataPtr2; > >+ dsp_static_save_fpu (); >+ > /* Note which of the four blocks in the macro block are to be > included in the search. */ > MBlockDispFrags[0] = >@@ -518,20 +329,20 @@ > > /* Check the 0,0 candidate. */ > if ( MBlockDispFrags[0] ) { >- Error = GetSumAbsDiffs( SrcPtr[0], RefPtr, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, RefPtr, >+ PixelsPerLine + STRIDE_EXTRA); > } > if ( MBlockDispFrags[1] ) { >- Error = GetSumAbsDiffs( SrcPtr[1], RefPtr + 8, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, RefPtr + 8, >+ PixelsPerLine + STRIDE_EXTRA); > } > if ( MBlockDispFrags[2] ) { >- Error = GetSumAbsDiffs( SrcPtr[2], RefPtr + RefRow2Offset, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, RefPtr + RefRow2Offset, >+ PixelsPerLine + STRIDE_EXTRA); > } > if ( MBlockDispFrags[3] ) { >- Error = GetSumAbsDiffs( SrcPtr[3], RefPtr + RefRow2Offset + 8, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, RefPtr + RefRow2Offset + 8, >+ PixelsPerLine + STRIDE_EXTRA); > } > > /* Set starting values to results of 0, 0 vector. 
*/ >@@ -554,24 +365,23 @@ > > /* Get the score for the current offset */ > if ( MBlockDispFrags[0] ) { >- Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr, >+ PixelsPerLine + STRIDE_EXTRA); > } > > if ( MBlockDispFrags[1] && (Error < MinError) ) { >- Error = GetNextSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8, >- PixelsPerLine, Error, MinError ); >+ Error += dsp_static_sad8x8_thres (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8, >+ PixelsPerLine + STRIDE_EXTRA, MinError); > } > > if ( MBlockDispFrags[2] && (Error < MinError) ) { >- Error = GetNextSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset, >- PixelsPerLine, Error, MinError ); >+ Error += dsp_static_sad8x8_thres (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset, >+ PixelsPerLine + STRIDE_EXTRA, MinError); > } > > if ( MBlockDispFrags[3] && (Error < MinError) ) { >- Error = GetNextSumAbsDiffs( SrcPtr[3], >- CandidateBlockPtr + RefRow2Offset + 8, >- PixelsPerLine, Error, MinError ); >+ Error += dsp_static_sad8x8_thres (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8, >+ PixelsPerLine + STRIDE_EXTRA, MinError); > } > > if ( Error < MinError ) { >@@ -652,6 +462,8 @@ > InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr, > FragIndex, MV->x, MV->y, PixelsPerLine ); > >+ dsp_static_restore_fpu (); >+ > /* Return score of best matching block. */ > return InterMVError; > } >@@ -684,6 +496,8 @@ > unsigned char * RefDataPtr1; > unsigned char * RefDataPtr2; > >+ dsp_static_save_fpu (); >+ > /* Note which of the four blocks in the macro block are to be > included in the search. */ > MBlockDispFrags[0] = cpi-> >@@ -717,20 +531,20 @@ > > /* Summ errors for each block. 
*/ > if ( MBlockDispFrags[0] ) { >- Error = GetSumAbsDiffs( SrcPtr[0], CandidateBlockPtr, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[0], PixelsPerLine, CandidateBlockPtr, >+ PixelsPerLine + STRIDE_EXTRA); > } > if ( MBlockDispFrags[1] ){ >- Error = GetSumAbsDiffs( SrcPtr[1], CandidateBlockPtr + 8, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[1], PixelsPerLine, CandidateBlockPtr + 8, >+ PixelsPerLine + STRIDE_EXTRA); > } > if ( MBlockDispFrags[2] ){ >- Error = GetSumAbsDiffs( SrcPtr[2], CandidateBlockPtr + RefRow2Offset, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[2], PixelsPerLine, CandidateBlockPtr + RefRow2Offset, >+ PixelsPerLine + STRIDE_EXTRA); > } > if ( MBlockDispFrags[3] ){ >- Error = GetSumAbsDiffs( SrcPtr[3], CandidateBlockPtr + RefRow2Offset + 8, >- PixelsPerLine, Error); >+ Error += dsp_static_sad8x8 (SrcPtr[3], PixelsPerLine, CandidateBlockPtr + RefRow2Offset + 8, >+ PixelsPerLine + STRIDE_EXTRA); > } > > /* Was this the best so far */ >@@ -808,6 +622,8 @@ > InterMVError = GetMBInterError( cpi, cpi->ConvDestBuffer, RefFramePtr, > FragIndex, MV->x, MV->y, PixelsPerLine ); > >+ dsp_static_restore_fpu (); >+ > /* Return score of best matching block. */ > return InterMVError; > } >@@ -850,8 +666,8 @@ > > for ( j = 0; j < (ogg_int32_t)MAX_MV_EXTENT; j++ ){ > /* Get the block error score. */ >- Error = GetSumAbsDiffs( SrcPtr, CandidateBlockPtr, >- PixelsPerLine, 0); >+ Error = dsp_static_sad8x8 (SrcPtr, PixelsPerLine, CandidateBlockPtr, >+ PixelsPerLine + STRIDE_EXTRA); > > /* Was this the best so far */ > if ( Error < MinError ) { >@@ -911,6 +727,8 @@ > MOTION_VECTOR *MV ) { > ogg_uint32_t InterMVError; > >+ dsp_static_save_fpu (); >+ > /* For the moment the 4MV mode is only deemd to be valid if all four > Y blocks are to be updated */ > /* This May be adapted later. */ >@@ -941,6 +759,8 @@ > InterMVError = HUGE_ERROR; > } > >+ dsp_static_restore_fpu (); >+ > /* Return score of best matching block. 
*/ > return InterMVError; > } >diff -Naur libtheora-1.0alpha3/lib/pp.c libtheora-1.0alpha3.mmx/lib/pp.c >--- libtheora-1.0alpha3/lib/pp.c 2003-12-03 09:59:41.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/pp.c 2004-10-06 17:48:22.545380976 +0200 >@@ -19,6 +19,7 @@ > #include <string.h> > #include "encoder_internal.h" > #include "pp.h" >+#include "dsp.h" > > #define MAX(a, b) ((a>b)?a:b) > #define MIN(a, b) ((a<b)?a:b) >@@ -490,7 +491,7 @@ > > } else { > >- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength); >+ dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength); > > } > >@@ -529,7 +530,7 @@ > DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col, > LineLength,Quality,QuantScale); > }else{ >- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength); >+ dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength); > } > > ++Block; >@@ -565,7 +566,7 @@ > DeringBlockWeak(SrcPtr + 8 * col, DestPtr + 8 * col, > LineLength,Quality,QuantScale); > }else{ >- CopyBlock(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength); >+ dsp_static_copy8x8(SrcPtr + 8 * col, DestPtr + 8 * col, LineLength); > } > > ++Block; >@@ -913,7 +914,7 @@ > } > > void PostProcess(PB_INSTANCE *pbi){ >- >+ dsp_static_save_fpu (); > switch (pbi->PostProcessingLevel){ > case 8: > /* on a slow machine, use a simpler and faster deblocking filter */ >@@ -947,5 +948,6 @@ > DeringFrame(pbi, pbi->PostProcessBuffer, pbi->PostProcessBuffer); > break; > } >+ dsp_static_restore_fpu (); > } > >diff -Naur libtheora-1.0alpha3/lib/reconstruct.c libtheora-1.0alpha3.mmx/lib/reconstruct.c >--- libtheora-1.0alpha3/lib/reconstruct.c 2003-12-03 09:59:41.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/reconstruct.c 2004-10-06 17:48:22.574376568 +0200 >@@ -16,12 +16,28 @@ > ********************************************************************/ > > #include "encoder_internal.h" >+#include "dsp.h" >+#include "cpu.h" > >-void ReconIntra( PB_INSTANCE *pbi, unsigned char * ReconPtr, >- ogg_int16_t * 
ChangePtr, ogg_uint32_t LineStep ) { >+static void copy8x8__c (unsigned char *src, >+ unsigned char *dest, >+ unsigned int stride) >+{ >+ int j; >+ for ( j = 0; j < 8; j++ ){ >+ ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0]; >+ ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1]; >+ src+=stride; >+ dest+=stride; >+ } >+} >+ >+static void recon_intra8x8__c (unsigned char *ReconPtr, ogg_int16_t *ChangePtr, >+ ogg_uint32_t LineStep) >+{ > ogg_uint32_t i; > >- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++ ){ >+ for (i = 8; i; i--){ > /* Convert the data back to 8 bit unsigned */ > /* Saturate the output to unsigend 8 bit values */ > ReconPtr[0] = clamp255( ChangePtr[0] + 128 ); >@@ -34,17 +50,16 @@ > ReconPtr[7] = clamp255( ChangePtr[7] + 128 ); > > ReconPtr += LineStep; >- ChangePtr += BLOCK_HEIGHT_WIDTH; >+ ChangePtr += 8; > } >- > } > >-void ReconInter( PB_INSTANCE *pbi, unsigned char * ReconPtr, >- unsigned char * RefPtr, ogg_int16_t * ChangePtr, >- ogg_uint32_t LineStep ) { >+static void recon_inter8x8__c (unsigned char *ReconPtr, unsigned char *RefPtr, >+ ogg_int16_t *ChangePtr, ogg_uint32_t LineStep) >+{ > ogg_uint32_t i; > >- for ( i = 0; i < BLOCK_HEIGHT_WIDTH; i++) { >+ for (i = 8; i; i--){ > ReconPtr[0] = clamp255(RefPtr[0] + ChangePtr[0]); > ReconPtr[1] = clamp255(RefPtr[1] + ChangePtr[1]); > ReconPtr[2] = clamp255(RefPtr[2] + ChangePtr[2]); >@@ -54,19 +69,19 @@ > ReconPtr[6] = clamp255(RefPtr[6] + ChangePtr[6]); > ReconPtr[7] = clamp255(RefPtr[7] + ChangePtr[7]); > >- ChangePtr += BLOCK_HEIGHT_WIDTH; >+ ChangePtr += 8; > ReconPtr += LineStep; > RefPtr += LineStep; > } >- > } > >-void ReconInterHalfPixel2( PB_INSTANCE *pbi, unsigned char * ReconPtr, >- unsigned char * RefPtr1, unsigned char * RefPtr2, >- ogg_int16_t * ChangePtr, ogg_uint32_t LineStep ) { >+static void recon_inter8x8_half__c (unsigned char *ReconPtr, unsigned char *RefPtr1, >+ unsigned char *RefPtr2, ogg_int16_t *ChangePtr, >+ ogg_uint32_t LineStep) >+{ > ogg_uint32_t i; > >- for ( i = 0; i < 
BLOCK_HEIGHT_WIDTH; i++ ){ >+ for (i = 8; i; i--){ > ReconPtr[0] = clamp255((((int)RefPtr1[0] + (int)RefPtr2[0]) >> 1) + ChangePtr[0] ); > ReconPtr[1] = clamp255((((int)RefPtr1[1] + (int)RefPtr2[1]) >> 1) + ChangePtr[1] ); > ReconPtr[2] = clamp255((((int)RefPtr1[2] + (int)RefPtr2[2]) >> 1) + ChangePtr[2] ); >@@ -76,10 +91,20 @@ > ReconPtr[6] = clamp255((((int)RefPtr1[6] + (int)RefPtr2[6]) >> 1) + ChangePtr[6] ); > ReconPtr[7] = clamp255((((int)RefPtr1[7] + (int)RefPtr2[7]) >> 1) + ChangePtr[7] ); > >- ChangePtr += BLOCK_HEIGHT_WIDTH; >+ ChangePtr += 8; > ReconPtr += LineStep; > RefPtr1 += LineStep; > RefPtr2 += LineStep; > } >+} > >+void dsp_recon_init (DspFunctions *funcs) >+{ >+ funcs->copy8x8 = copy8x8__c; >+ funcs->recon_intra8x8 = recon_intra8x8__c; >+ funcs->recon_inter8x8 = recon_inter8x8__c; >+ funcs->recon_inter8x8_half = recon_inter8x8_half__c; >+ if (cpu_flags & CPU_X86_MMX) { >+ dsp_i386_mmx_recon_init(&dsp_funcs); >+ } > } >diff -Naur libtheora-1.0alpha3/lib/scan.c libtheora-1.0alpha3.mmx/lib/scan.c >--- libtheora-1.0alpha3/lib/scan.c 2003-12-03 09:59:41.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/scan.c 2004-10-06 17:48:22.609371248 +0200 >@@ -19,9 +19,20 @@ > #include <math.h> > #include <string.h> > #include "encoder_internal.h" >+#include "dsp.h" > > #define MAX_SEARCH_LINE_LEN 7 > >+#define SET8_0(ptr) \ >+ ((ogg_uint32_t *)ptr)[0] = 0x00000000; \ >+ ((ogg_uint32_t *)ptr)[1] = 0x00000000; >+#define SET8_1(ptr) \ >+ ((ogg_uint32_t *)ptr)[0] = 0x01010101; \ >+ ((ogg_uint32_t *)ptr)[1] = 0x01010101; >+#define SET8_8(ptr) \ >+ ((ogg_uint32_t *)ptr)[0] = 0x08080808; \ >+ ((ogg_uint32_t *)ptr)[1] = 0x08080808; >+ > static ogg_uint32_t LineLengthScores[ MAX_SEARCH_LINE_LEN + 1 ] = { > 0, 0, 0, 0, 2, 4, 12, 24 > }; >@@ -384,69 +395,6 @@ > ppi->KFIndicator = ((ppi->KFIndicator*100)/((ppi->ScanYPlaneFragments*3)/4)); > } > >-static ogg_uint32_t ScalarRowSAD( unsigned char * Src1, >- unsigned char * Src2 ){ >- ogg_uint32_t SadValue; >- ogg_uint32_t 
SadValue1; >- >- SadValue = abs( Src1[0] - Src2[0] ) + abs( Src1[1] - Src2[1] ) + >- abs( Src1[2] - Src2[2] ) + abs( Src1[3] - Src2[3] ); >- >- SadValue1 = abs( Src1[4] - Src2[4] ) + abs( Src1[5] - Src2[5] ) + >- abs( Src1[6] - Src2[6] ) + abs( Src1[7] - Src2[7] ); >- >- SadValue = ( SadValue > SadValue1 ) ? SadValue : SadValue1; >- >- return SadValue; >-} >- >-static ogg_uint32_t ScalarColSAD( PP_INSTANCE *ppi, >- unsigned char * Src1, >- unsigned char * Src2 ){ >- ogg_uint32_t SadValue[8] = {0,0,0,0,0,0,0,0}; >- ogg_uint32_t SadValue2[8] = {0,0,0,0,0,0,0,0}; >- ogg_uint32_t MaxSad = 0; >- ogg_uint32_t i; >- >- for ( i = 0; i < 4; i++ ){ >- SadValue[0] += abs(Src1[0] - Src2[0]); >- SadValue[1] += abs(Src1[1] - Src2[1]); >- SadValue[2] += abs(Src1[2] - Src2[2]); >- SadValue[3] += abs(Src1[3] - Src2[3]); >- SadValue[4] += abs(Src1[4] - Src2[4]); >- SadValue[5] += abs(Src1[5] - Src2[5]); >- SadValue[6] += abs(Src1[6] - Src2[6]); >- SadValue[7] += abs(Src1[7] - Src2[7]); >- >- Src1 += ppi->PlaneStride; >- Src2 += ppi->PlaneStride; >- } >- >- for ( i = 0; i < 4; i++ ){ >- SadValue2[0] += abs(Src1[0] - Src2[0]); >- SadValue2[1] += abs(Src1[1] - Src2[1]); >- SadValue2[2] += abs(Src1[2] - Src2[2]); >- SadValue2[3] += abs(Src1[3] - Src2[3]); >- SadValue2[4] += abs(Src1[4] - Src2[4]); >- SadValue2[5] += abs(Src1[5] - Src2[5]); >- SadValue2[6] += abs(Src1[6] - Src2[6]); >- SadValue2[7] += abs(Src1[7] - Src2[7]); >- >- Src1 += ppi->PlaneStride; >- Src2 += ppi->PlaneStride; >- } >- >- for ( i = 0; i < 8; i++ ){ >- if ( SadValue[i] > MaxSad ) >- MaxSad = SadValue[i]; >- if ( SadValue2[i] > MaxSad ) >- MaxSad = SadValue2[i]; >- } >- >- return MaxSad; >-} >- >- > static int RowSadScan( PP_INSTANCE *ppi, > unsigned char * YuvPtr1, > unsigned char * YuvPtr2, >@@ -475,7 +423,7 @@ > for ( i = 0; i < ppi->PlaneHFragments; i ++ ){ > if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){ > /* Calculate the SAD score for the block row */ >- GrpSad = ScalarRowSAD(LocalYuvPtr1,LocalYuvPtr2); >+ 
GrpSad = dsp_static_row_sad8(LocalYuvPtr1,LocalYuvPtr2); > > /* Now test the group SAD score */ > if ( GrpSad > LocalGrpLowSadThresh ){ >@@ -532,7 +480,7 @@ > /* Skip if block already marked to be coded. */ > if ( *LocalDispFragPtr <= BLOCK_NOT_CODED ){ > /* Calculate the SAD score for the block column */ >- MaxSad = ScalarColSAD( ppi, LocalYuvPtr1, LocalYuvPtr2 ); >+ MaxSad = dsp_static_col_sad8x8(LocalYuvPtr1, LocalYuvPtr2, ppi->PlaneStride ); > > /* Now test the group SAD score */ > if ( MaxSad > LocalGrpLowSadThresh ){ >@@ -758,7 +706,7 @@ > if (*DispFragPtr == CANDIDATE_BLOCK){ > > /* Clear down entries in changed locals array */ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > > for ( j = 0; j < HFRAGPIXELS; j++ ){ > /* Take a local copy of the measured difference. */ >@@ -777,10 +725,10 @@ > }else{ > /* If we are breaking out here mark all pixels as changed. */ > if ( *DispFragPtr > BLOCK_NOT_CODED ){ >- memset(bits_map_ptr,1,8); >- memset(ChLocalsPtr,8,8); >+ SET8_1(bits_map_ptr); >+ SET8_8(ChLocalsPtr); > }else{ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > } > } > >@@ -816,7 +764,7 @@ > /* Test for break out conditions to save time. */ > if (*DispFragPtr == CANDIDATE_BLOCK){ > /* Clear down entries in changed locals array */ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > > for ( j = 0; j < HFRAGPIXELS; j++ ){ > /* Take a local copy of the measured difference. */ >@@ -839,10 +787,10 @@ > }else{ > /* If we are breaking out here mark all pixels as changed. */ > if ( *DispFragPtr > BLOCK_NOT_CODED ){ >- memset(bits_map_ptr,1,8); >- memset(ChLocalsPtr,8,8); >+ SET8_1(bits_map_ptr); >+ SET8_8(ChLocalsPtr); > }else{ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > } > } > >@@ -876,7 +824,7 @@ > /* Test for break out conditions to save time. 
*/ > if (*DispFragPtr == CANDIDATE_BLOCK){ > /* Clear down entries in changed locals array */ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > for ( j = 0; j < HFRAGPIXELS; j++ ){ > /* Take a local copy of the measured difference. */ > Diff = (int)YuvPtr1[j] - (int)YuvPtr2[j]; >@@ -899,10 +847,10 @@ > }else{ > /* If we are breaking out here mark all pixels as changed. */ > if ( *DispFragPtr > BLOCK_NOT_CODED ){ >- memset(bits_map_ptr,1,8); >- memset(ChLocalsPtr,8,8); >+ SET8_1(bits_map_ptr); >+ SET8_8(ChLocalsPtr); > }else{ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > } > } > >@@ -935,7 +883,7 @@ > /* Test for break out conditions to save time. */ > if (*DispFragPtr == CANDIDATE_BLOCK){ > /* Clear down entries in changed locals array */ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > > for ( j = 0; j < HFRAGPIXELS; j++ ){ > /* Take a local copy of the measured difference. */ >@@ -959,10 +907,10 @@ > }else{ > /* If we are breaking out here mark all pixels as changed.*/ > if ( *DispFragPtr > BLOCK_NOT_CODED ) { >- memset(bits_map_ptr,1,8); >- memset(ChLocalsPtr,8,8); >+ SET8_1(bits_map_ptr); >+ SET8_8(ChLocalsPtr); > }else{ >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > } > } > /* If we have a lot of changed pixels for this fragment on this >@@ -1071,7 +1019,7 @@ > } > }else{ > if ( *DispFragPtr > BLOCK_NOT_CODED ) >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > > /* Step pointers */ > ChLocalsPtr += HFRAGPIXELS; >@@ -1133,7 +1081,7 @@ > } > }else{ > if ( *DispFragPtr > BLOCK_NOT_CODED ) >- memset(ChLocalsPtr,0,8); >+ SET8_0(ChLocalsPtr); > > /* Step pointers */ > ChLocalsPtr += HFRAGPIXELS; >@@ -2126,10 +2074,12 @@ > /* Fast break out test for obvious yes and no cases in this row of > blocks */ > if ( i < ppi->PlaneVFragments ){ >+ dsp_static_save_fpu (); > UpdatedOrCandidateBlocks = > RowSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ); >- if( ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ) ) >- 
UpdatedOrCandidateBlocks = 1; >+ UpdatedOrCandidateBlocks |= >+ ColSadScan( ppi, RawPlanePtr0, RawPlanePtr1, DispFragPtr0 ); >+ dsp_static_restore_fpu (); > }else{ > /* Make sure we still call other functions if RowSadScan() disabled */ > UpdatedOrCandidateBlocks = 1; >diff -Naur libtheora-1.0alpha3/lib/toplevel.c libtheora-1.0alpha3.mmx/lib/toplevel.c >--- libtheora-1.0alpha3/lib/toplevel.c 2004-03-18 03:00:30.000000000 +0100 >+++ libtheora-1.0alpha3.mmx/lib/toplevel.c 2004-10-06 17:48:22.611370944 +0200 >@@ -787,6 +787,8 @@ > > CP_INSTANCE *cpi; > >+ dsp_static_init (); >+ > memset(th, 0, sizeof(*th)); > th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi)); > >@@ -1446,6 +1448,8 @@ > PB_INSTANCE *pbi; > codec_setup_info *ci; > >+ dsp_static_init (); >+ > ci=(codec_setup_info *)c->codec_setup; > th->internal_decode=pbi=_ogg_calloc(1,sizeof(*pbi)); >
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 68549
: 42417