From 053dea12f27e6bb8acf6a103ef954da05419d3dc Mon Sep 17 00:00:00 2001 From: Aurelien Jacobs Date: Mon, 11 Oct 2004 02:19:29 +0000 Subject: adapting existing mmx/mmx2/sse/3dnow optimizations so they work on x86_64 patch by (Aurelien Jacobs ) Originally committed as revision 3578 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/bswap.h | 27 +- libavcodec/common.h | 16 +- libavcodec/i386/cputest.c | 16 +- libavcodec/i386/dsputil_mmx.c | 340 +++++----- libavcodec/i386/dsputil_mmx_avg.h | 352 +++++----- libavcodec/i386/dsputil_mmx_rnd.h | 160 ++--- libavcodec/i386/fdct_mmx.c | 6 +- libavcodec/i386/mmx.h | 6 + libavcodec/i386/motion_est_mmx.c | 121 ++-- libavcodec/i386/mpegvideo_mmx.c | 145 ++-- libavcodec/i386/mpegvideo_mmx_template.c | 43 +- libavcodec/libpostproc/postprocess.c | 13 +- libavcodec/libpostproc/postprocess_template.c | 926 +++++++++++++------------- libavcodec/msmpeg4.c | 2 +- 14 files changed, 1115 insertions(+), 1058 deletions(-) (limited to 'libavcodec') diff --git a/libavcodec/bswap.h b/libavcodec/bswap.h index 460f7abd40..eb1d87a551 100644 --- a/libavcodec/bswap.h +++ b/libavcodec/bswap.h @@ -10,17 +10,23 @@ #include #else -#ifdef ARCH_X86 -static inline unsigned short ByteSwap16(unsigned short x) +#ifdef ARCH_X86_64 +# define LEGACY_REGS "=Q" +#else +# define LEGACY_REGS "=q" +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) +static inline uint16_t ByteSwap16(uint16_t x) { __asm("xchgb %b0,%h0" : - "=q" (x) : + LEGACY_REGS (x) : "0" (x)); return x; } #define bswap_16(x) ByteSwap16(x) -static inline unsigned int ByteSwap32(unsigned int x) +static inline uint32_t ByteSwap32(uint32_t x) { #if __CPU__ > 386 __asm("bswap %0": @@ -29,21 +35,28 @@ static inline unsigned int ByteSwap32(unsigned int x) __asm("xchgb %b0,%h0\n" " rorl $16,%0\n" " xchgb %b0,%h0": - "=q" (x) : + LEGACY_REGS (x) : #endif "0" (x)); return x; } #define bswap_32(x) ByteSwap32(x) -static inline unsigned long long int ByteSwap64(unsigned long long int x) +static inline uint64_t ByteSwap64(uint64_t x) { +#ifdef ARCH_X86_64 + __asm("bswap %0": + "=r" (x) : + "0" (x)); + return x; +#else register union { __extension__ uint64_t __ll; uint32_t __l[2]; } __x; asm("xchgl %0,%1": "=r"(__x.__l[0]),"=r"(__x.__l[1]): - "0"(bswap_32((unsigned long)x)),"1"(bswap_32((unsigned long)(x>>32)))); + "0"(bswap_32((uint32_t)x)),"1"(bswap_32((uint32_t)(x>>32)))); return __x.__ll; +#endif } #define bswap_64(x) ByteSwap64(x) diff --git a/libavcodec/common.h b/libavcodec/common.h index 441d841d68..b87feb08e5 100644 --- a/libavcodec/common.h +++ b/libavcodec/common.h @@ -254,7 +254,7 @@ inline void dprintf(const char* fmt,...) {} extern const uint32_t inverse[256]; -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) # define FASTDIV(a,b) \ ({\ int ret,dmy;\ @@ -271,7 +271,7 @@ extern const uint32_t inverse[256]; # define FASTDIV(a,b) ((a)/(b)) #endif -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) // avoid +32 for shift optimization (gcc should do that ...) static inline int32_t NEG_SSR32( int32_t a, int8_t s){ asm ("sarl %1, %0\n\t" @@ -390,7 +390,7 @@ typedef struct RL_VLC_ELEM { #endif /* used to avoid missaligned exceptions on some archs (alpha, ...) 
*/ -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) # define unaligned32(a) (*(const uint32_t*)(a)) #else # ifdef __GNUC__ @@ -460,7 +460,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value) static inline void put_bits(PutBitContext *s, int n, unsigned int value) { # ifdef ALIGNED_BITSTREAM_WRITER -# ifdef ARCH_X86 +# if defined(ARCH_X86) || defined(ARCH_X86_64) asm volatile( "movl %0, %%ecx \n\t" "xorl %%eax, %%eax \n\t" @@ -491,7 +491,7 @@ static inline void put_bits(PutBitContext *s, int n, unsigned int value) s->index= index; # endif # else //ALIGNED_BITSTREAM_WRITER -# ifdef ARCH_X86 +# if defined(ARCH_X86) || defined(ARCH_X86_64) asm volatile( "movl $7, %%ecx \n\t" "andl %0, %%ecx \n\t" @@ -738,7 +738,7 @@ static inline int get_bits_count(GetBitContext *s){ name##_bit_count-= 32;\ }\ -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) # define SKIP_CACHE(name, gb, num)\ asm(\ "shldl %2, %1, %0 \n\t"\ @@ -1218,7 +1218,7 @@ static inline int ff_get_fourcc(const char *s){ #define MKBETAG(a,b,c,d) (d | (c << 8) | (b << 16) | (a << 24)) -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) #define MASK_ABS(mask, level)\ asm volatile(\ "cdq \n\t"\ @@ -1252,7 +1252,7 @@ if((y)<(x)){\ } #endif -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) static inline long long rdtsc(void) { long long l; diff --git a/libavcodec/i386/cputest.c b/libavcodec/i386/cputest.c index 34b813148c..9b2e6a2ce9 100644 --- a/libavcodec/i386/cputest.c +++ b/libavcodec/i386/cputest.c @@ -4,12 +4,20 @@ #include #include "../dsputil.h" +#ifdef ARCH_X86_64 +# define REG_b "rbx" +# define REG_S "rsi" +#else +# define REG_b "ebx" +# define REG_S "esi" +#endif + /* ebx saving is necessary for PIC. gcc seems unable to see it alone */ #define cpuid(index,eax,ebx,ecx,edx)\ __asm __volatile\ - ("movl %%ebx, %%esi\n\t"\ + ("mov %%"REG_b", %%"REG_S"\n\t"\ "cpuid\n\t"\ - "xchgl %%ebx, %%esi"\ + "xchg %%"REG_b", %%"REG_S\ : "=a" (eax), "=S" (ebx),\ "=c" (ecx), "=d" (edx)\ : "0" (index)); @@ -24,7 +32,7 @@ int mm_support(void) /* See if CPUID instruction is supported ... */ /* ... Get copies of EFLAGS into eax and ecx */ "pushf\n\t" - "popl %0\n\t" + "pop %0\n\t" "movl %0, %1\n\t" /* ... Toggle the ID bit in one copy and store */ @@ -35,7 +43,7 @@ int mm_support(void) /* ... 
Get the (hopefully modified) EFLAGS */ "pushf\n\t" - "popl %0\n\t" + "pop %0\n\t" : "=a" (eax), "=c" (ecx) : : "cc" diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c index 4d19e66f37..6071d04181 100644 --- a/libavcodec/i386/dsputil_mmx.c +++ b/libavcodec/i386/dsputil_mmx.c @@ -187,7 +187,7 @@ static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xF static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) { asm volatile( - "movl $-128, %%eax \n\t" + "mov $-128, %%"REG_a" \n\t" "pxor %%mm7, %%mm7 \n\t" ".balign 16 \n\t" "1: \n\t" @@ -199,16 +199,16 @@ static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) "punpckhbw %%mm7, %%mm1 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm3 \n\t" - "movq %%mm0, (%1, %%eax)\n\t" - "movq %%mm1, 8(%1, %%eax)\n\t" - "movq %%mm2, 16(%1, %%eax)\n\t" - "movq %%mm3, 24(%1, %%eax)\n\t" - "addl %3, %0 \n\t" - "addl $32, %%eax \n\t" + "movq %%mm0, (%1, %%"REG_a")\n\t" + "movq %%mm1, 8(%1, %%"REG_a")\n\t" + "movq %%mm2, 16(%1, %%"REG_a")\n\t" + "movq %%mm3, 24(%1, %%"REG_a")\n\t" + "add %3, %0 \n\t" + "add $32, %%"REG_a" \n\t" "js 1b \n\t" : "+r" (pixels) - : "r" (block+64), "r" (line_size), "r" (line_size*2) - : "%eax" + : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) + : "%"REG_a ); } @@ -216,7 +216,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint { asm volatile( "pxor %%mm7, %%mm7 \n\t" - "movl $-128, %%eax \n\t" + "mov $-128, %%"REG_a" \n\t" ".balign 16 \n\t" "1: \n\t" "movq (%0), %%mm0 \n\t" @@ -229,15 +229,15 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint "punpckhbw %%mm7, %%mm3 \n\t" "psubw %%mm2, %%mm0 \n\t" "psubw %%mm3, %%mm1 \n\t" - "movq %%mm0, (%2, %%eax)\n\t" - "movq %%mm1, 8(%2, %%eax)\n\t" - "addl %3, %0 \n\t" - "addl %3, %1 \n\t" - "addl $16, %%eax \n\t" + "movq %%mm0, (%2, %%"REG_a")\n\t" + "movq %%mm1, 8(%2, %%"REG_a")\n\t" + "add %3, %0 \n\t" + "add %3, %1 \n\t" + "add $16, %%"REG_a" \n\t" "jnz 1b \n\t" : "+r" (s1), "+r" (s2) - : "r" (block+64), "r" (stride) - : "%eax" + : "r" (block+64), "r" ((long)stride) + : "%"REG_a ); } #endif //CONFIG_ENCODERS @@ -268,7 +268,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size "movq %%mm2, (%0, %1)\n\t" "movq %%mm4, (%0, %1, 2)\n\t" "movq %%mm6, (%0, %2)\n\t" - ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) + ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p) :"memory"); pix += line_size*4; p += 32; @@ -293,7 +293,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size "movq %%mm2, (%0, %1)\n\t" "movq %%mm4, (%0, %1, 2)\n\t" "movq %%mm6, (%0, %2)\n\t" - ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p) + ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p) :"memory"); } @@ -359,59 +359,59 @@ void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" ".balign 8 \n\t" "1: \n\t" "movd (%1), %%mm0 \n\t" "movd (%1, %3), %%mm1 \n\t" "movd %%mm0, (%2) \n\t" "movd %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movd (%1), %%mm0 \n\t" "movd (%1, %3), %%mm1 \n\t" "movd %%mm0, (%2) \n\t" "movd %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl 
%%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) - : "r"(line_size) - : "%eax", "memory" + : "r"((long)line_size) + : "%"REG_a, "memory" ); } static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) - : "r"(line_size) - : "%eax", "memory" + : "r"((long)line_size) + : "%"REG_a, "memory" ); } static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" @@ -422,8 +422,8 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" @@ -432,13 +432,13 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) - : "r"(line_size) - : "%eax", "memory" + : "r"((long)line_size) + : "%"REG_a, "memory" ); } @@ -446,16 +446,16 @@ static void clear_blocks_mmx(DCTELEM *blocks) { __asm __volatile( "pxor %%mm7, %%mm7 \n\t" - "movl $-128*6, %%eax \n\t" + "mov $-128*6, %%"REG_a" \n\t" "1: \n\t" - "movq %%mm7, (%0, %%eax) \n\t" - "movq %%mm7, 8(%0, %%eax) \n\t" - "movq %%mm7, 16(%0, %%eax) \n\t" - "movq %%mm7, 24(%0, %%eax) \n\t" - "addl $32, %%eax \n\t" + "movq %%mm7, (%0, %%"REG_a") \n\t" + "movq %%mm7, 8(%0, %%"REG_a") \n\t" + "movq %%mm7, 16(%0, %%"REG_a") \n\t" + "movq %%mm7, 24(%0, %%"REG_a") \n\t" + "add $32, %%"REG_a" \n\t" " js 1b \n\t" - : : "r" (((int)blocks)+128*6) - : "%eax" + : : "r" (((uint8_t *)blocks)+128*6) + : "%"REG_a ); } @@ -463,7 +463,7 @@ static void clear_blocks_mmx(DCTELEM *blocks) static int pix_sum16_mmx(uint8_t * pix, int line_size){ const int h=16; int sum; - int index= -line_size*h; + long index= -line_size*h; __asm __volatile( "pxor %%mm7, %%mm7 \n\t" @@ -481,7 +481,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){ "paddw %%mm2, %%mm3 \n\t" "paddw %%mm1, %%mm3 \n\t" "paddw %%mm3, %%mm6 \n\t" - "addl %3, %1 \n\t" + "add %3, %1 \n\t" " js 1b \n\t" "movq %%mm6, %%mm5 \n\t" "psrlq $32, %%mm6 \n\t" @@ -492,7 +492,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){ "movd %%mm6, %0 \n\t" "andl $0xFFFF, %0 \n\t" : "=&r" (sum), "+r" (index) - : "r" (pix - index), "r" (line_size) + : "r" (pix - index), "r" ((long)line_size) ); return sum; @@ -500,7 +500,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){ #endif //CONFIG_ENCODERS static void 
add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ - int i=0; + long i=0; asm volatile( "1: \n\t" "movq (%1, %0), %%mm0 \n\t" @@ -511,11 +511,11 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ "movq 8(%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%2, %0) \n\t" - "addl $16, %0 \n\t" - "cmpl %3, %0 \n\t" + "add $16, %0 \n\t" + "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (i) - : "r"(src), "r"(dst), "r"(w-15) + : "r"(src), "r"(dst), "r"((long)w-15) ); for(; iavg_ ## postfix1 = avg_ ## postfix2; static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ - int i=0; + long i=0; assert(ABS(scale) < 256); scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; @@ -2863,8 +2863,8 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6 "paddd %%mm1, %%mm0 \n\t" "psrld $4, %%mm0 \n\t" "paddd %%mm0, %%mm7 \n\t" - "addl $16, %0 \n\t" - "cmpl $128, %0 \n\t" //FIXME optimize & bench + "add $16, %0 \n\t" + "cmp $128, %0 \n\t" //FIXME optimize & bench " jb 1b \n\t" "movq %%mm7, %%mm6 \n\t" "psrlq $32, %%mm7 \n\t" @@ -2879,7 +2879,7 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6 } static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ - int i=0; + long i=0; if(ABS(scale) < 256){ scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; @@ -2902,8 +2902,8 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ "paddw 8(%2, %0), %%mm1 \n\t" "movq %%mm0, (%2, %0) \n\t" "movq %%mm1, 8(%2, %0) \n\t" - "addl $16, %0 \n\t" - "cmpl $128, %0 \n\t" //FIXME optimize & bench + "add $16, %0 \n\t" + "cmp $128, %0 \n\t" //FIXME optimize & bench " jb 1b \n\t" : "+r" (i) diff --git a/libavcodec/i386/dsputil_mmx_avg.h b/libavcodec/i386/dsputil_mmx_avg.h index 0cbf1376ae..46d8ae5716 100644 --- a/libavcodec/i386/dsputil_mmx_avg.h +++ b/libavcodec/i386/dsputil_mmx_avg.h @@ -28,7 +28,7 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" @@ -36,21 +36,21 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_ PAVGB" 1(%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" PAVGB" 1(%1), %%mm0 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) @@ -60,34 +60,34 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int " jz 1f \n\t" "movd (%1), %%mm0 \n\t" "movd (%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $4, %2 \n\t" + "add %4, %1 \n\t" + "add $4, %2 \n\t" PAVGB" %%mm1, %%mm0 \n\t" "movd %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movd (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movd (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 4(%2), %%mm1 \n\t" "movd %%mm0, (%3) \n\t" 
- "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movd %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movd (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movd (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" 8(%2), %%mm0 \n\t" PAVGB" 12(%2), %%mm1 \n\t" "movd %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movd %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" - "addl $16, %2 \n\t" + "add %5, %3 \n\t" + "add $16, %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -95,7 +95,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); } @@ -107,34 +107,34 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int " jz 1f \n\t" "movq (%1), %%mm0 \n\t" "movq (%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $8, %2 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" PAVGB" %%mm1, %%mm0 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm1 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 24(%2), %%mm1 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" - "addl $32, %2 \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -142,7 +142,7 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); //the following should be used, though better not with gcc ... 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) @@ -158,20 +158,20 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src " jz 1f \n\t" "movq (%1), %%mm0 \n\t" "movq (%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $8, %2 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" "pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm1 \n\t" PAVGB" %%mm1, %%mm0 \n\t" "pxor %%mm6, %%mm0 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%2), %%mm2 \n\t" "movq 8(%2), %%mm3 \n\t" "pxor %%mm6, %%mm0 \n\t" @@ -183,13 +183,13 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src "pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm1 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq 16(%2), %%mm2 \n\t" "movq 24(%2), %%mm3 \n\t" "pxor %%mm6, %%mm0 \n\t" @@ -201,10 +201,10 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src "pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm1 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" - "addl $32, %2 \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -212,7 +212,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); //the following should be used, though better not with gcc ... 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) @@ -227,39 +227,39 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int " jz 1f \n\t" "movd (%1), %%mm0 \n\t" "movd (%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $4, %2 \n\t" + "add %4, %1 \n\t" + "add $4, %2 \n\t" PAVGB" %%mm1, %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t" "movd %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movd (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movd (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 4(%2), %%mm1 \n\t" PAVGB" (%3), %%mm0 \n\t" "movd %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" PAVGB" (%3), %%mm1 \n\t" "movd %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movd (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movd (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" 8(%2), %%mm0 \n\t" PAVGB" 12(%2), %%mm1 \n\t" PAVGB" (%3), %%mm0 \n\t" "movd %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" PAVGB" (%3), %%mm1 \n\t" "movd %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" - "addl $16, %2 \n\t" + "add %5, %3 \n\t" + "add $16, %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -267,7 +267,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); } @@ -279,39 +279,39 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int " jz 1f \n\t" "movq (%1), %%mm0 \n\t" "movq (%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $8, %2 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" PAVGB" %%mm1, %%mm0 \n\t" PAVGB" (%3), %%mm0 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm1 \n\t" PAVGB" (%3), %%mm0 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" PAVGB" (%3), %%mm1 \n\t" "movq %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 24(%2), %%mm1 \n\t" PAVGB" (%3), %%mm0 \n\t" "movq %%mm0, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" PAVGB" (%3), %%mm1 \n\t" "movq %%mm1, (%3) \n\t" - "addl %5, %3 \n\t" - "addl $32, %2 \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -319,7 +319,7 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); //the following should be used, though better not with gcc ... 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) @@ -330,7 +330,7 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" @@ -344,8 +344,8 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line "movq %%mm1, (%2, %3) \n\t" "movq %%mm2, 8(%2) \n\t" "movq %%mm3, 8(%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1), %%mm2 \n\t" @@ -354,17 +354,17 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line PAVGB" 1(%1, %3), %%mm1 \n\t" PAVGB" 9(%1), %%mm2 \n\t" PAVGB" 9(%1, %3), %%mm3 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm2, 8(%2) \n\t" "movq %%mm3, 8(%2, %3) \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) @@ -376,30 +376,30 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int "movq 8(%1), %%mm1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $16, %2 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm1 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 24(%2), %%mm1 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" - "addl $32, %2 \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -407,7 +407,7 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); //the following should be used, though better not with gcc ... 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) @@ -424,36 +424,36 @@ static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int "movq 8(%1), %%mm1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $16, %2 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" PAVGB" (%3), %%mm0 \n\t" PAVGB" 8(%3), %%mm1 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" 8(%2), %%mm1 \n\t" PAVGB" (%3), %%mm0 \n\t" PAVGB" 8(%3), %%mm1 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGB" 16(%2), %%mm0 \n\t" PAVGB" 24(%2), %%mm1 \n\t" PAVGB" (%3), %%mm0 \n\t" PAVGB" 8(%3), %%mm1 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" - "addl $32, %2 \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -461,7 +461,7 @@ static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); //the following should be used, though better not with gcc ... /* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) @@ -487,16 +487,16 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr PAVGB" %%mm3, %%mm1 \n\t" "pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $16, %2 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%2), %%mm2 \n\t" "movq 8(%2), %%mm3 \n\t" "pxor %%mm6, %%mm0 \n\t" @@ -509,10 +509,10 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr "pxor %%mm6, %%mm1 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq 16(%2), %%mm2 \n\t" "movq 24(%2), %%mm3 \n\t" "pxor %%mm6, %%mm0 \n\t" @@ -525,8 +525,8 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr "pxor %%mm6, %%mm1 \n\t" "movq %%mm0, (%3) \n\t" "movq %%mm1, 8(%3) \n\t" - "addl %5, %3 \n\t" - "addl $32, %2 \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -534,7 +534,7 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); //the following should be used, though better not with gcc ... 
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) @@ -547,13 +547,13 @@ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, in { MOVQ_BONE(mm6); __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm2 \n\t" "movq 1(%1), %%mm1 \n\t" "movq 1(%1, %3), %%mm3 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %1 \n\t" "psubusb %%mm6, %%mm0 \n\t" "psubusb %%mm6, %%mm2 \n\t" PAVGB" %%mm1, %%mm0 \n\t" @@ -564,50 +564,50 @@ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, in "movq 1(%1), %%mm1 \n\t" "movq (%1, %3), %%mm2 \n\t" "movq 1(%1, %3), %%mm3 \n\t" - "addl %%eax, %2 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" "psubusb %%mm6, %%mm0 \n\t" "psubusb %%mm6, %%mm2 \n\t" PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm3, %%mm2 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm2, (%2, %3) \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" - "subl %3, %2 \n\t" + "sub %3, %2 \n\t" "1: \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm2 \n\t" - "addl %%eax, %1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm2, %%mm1 \n\t" "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%eax) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "addl %%eax, %2 \n\t" - "addl %%eax, %1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm0, %%mm1 \n\t" "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%eax) \n\t" - "addl %%eax, %2 \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D" (block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } /* GL: this function does incorrect rounding if overflow */ @@ -615,39 +615,39 @@ static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, in { MOVQ_BONE(mm6); __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" - "subl %3, %2 \n\t" + "sub %3, %2 \n\t" "1: \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm2 \n\t" - "addl %%eax, %1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" "psubusb %%mm6, %%mm1 \n\t" PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm2, %%mm1 \n\t" "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%eax) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "addl %%eax, %2 \n\t" - "addl %%eax, %1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" "psubusb %%mm6, %%mm1 \n\t" PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm0, %%mm1 \n\t" "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%eax) \n\t" - "addl %%eax, %2 \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D" (block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } static void 
DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "1: \n\t" "movq (%2), %%mm0 \n\t" "movq (%2, %3), %%mm1 \n\t" @@ -655,27 +655,27 @@ static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_siz PAVGB" (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%2), %%mm0 \n\t" "movq (%2, %3), %%mm1 \n\t" PAVGB" (%1), %%mm0 \n\t" PAVGB" (%1, %3), %%mm1 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm2 \n\t" @@ -683,63 +683,63 @@ static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_ PAVGB" 1(%1, %3), %%mm2 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2, %3), %%mm2 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm2, (%2, %3) \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm2 \n\t" PAVGB" 1(%1), %%mm0 \n\t" PAVGB" 1(%1, %3), %%mm2 \n\t" - "addl %%eax, %2 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" PAVGB" (%2), %%mm0 \n\t" PAVGB" (%2, %3), %%mm2 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm2, (%2, %3) \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" - "subl %3, %2 \n\t" + "sub %3, %2 \n\t" "1: \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm2 \n\t" - "addl %%eax, %1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm2, %%mm1 \n\t" "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%eax), %%mm4 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" PAVGB" %%mm3, %%mm0 \n\t" PAVGB" %%mm4, %%mm1 \n\t" "movq %%mm0, (%2, %3) \n\t" - "movq %%mm1, (%2, %%eax) \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm0, %%mm1 \n\t" - "addl %%eax, %2 \n\t" - "addl %%eax, %1 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" "movq (%2, %3), %%mm3 \n\t" - "movq (%2, %%eax), %%mm4 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" PAVGB" %%mm3, %%mm2 \n\t" PAVGB" %%mm4, %%mm1 \n\t" "movq %%mm2, (%2, %3) \n\t" - "movq %%mm1, (%2, %%eax) \n\t" - "addl %%eax, %2 \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter @@ -747,17 +747,17 @@ static void 
DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line { MOVQ_BONE(mm6); __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" PAVGB" 1(%1), %%mm0 \n\t" ".balign 8 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm2 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" "movq (%1, %3), %%mm1 \n\t" "psubusb %%mm6, %%mm2 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%eax), %%mm2 \n\t" - "addl %%eax, %1 \n\t" + PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t" + "add %%"REG_a", %1 \n\t" PAVGB" %%mm1, %%mm0 \n\t" PAVGB" %%mm2, %%mm1 \n\t" PAVGB" (%2), %%mm0 \n\t" @@ -765,23 +765,23 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" PAVGB" 1(%1, %3), %%mm1 \n\t" - PAVGB" 1(%1, %%eax), %%mm0 \n\t" - "addl %%eax, %2 \n\t" - "addl %%eax, %1 \n\t" + PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t" + "add %%"REG_a", %2 \n\t" + "add %%"REG_a", %1 \n\t" PAVGB" %%mm1, %%mm2 \n\t" PAVGB" %%mm0, %%mm1 \n\t" PAVGB" (%2), %%mm2 \n\t" PAVGB" (%2, %3), %%mm1 \n\t" "movq %%mm2, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r" (line_size) - :"%eax", "memory"); + :"r" ((long)line_size) + :"%"REG_a, "memory"); } //FIXME the following could be optimized too ... diff --git a/libavcodec/i386/dsputil_mmx_rnd.h b/libavcodec/i386/dsputil_mmx_rnd.h index e22e240375..20ea1b59e6 100644 --- a/libavcodec/i386/dsputil_mmx_rnd.h +++ b/libavcodec/i386/dsputil_mmx_rnd.h @@ -27,7 +27,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line { MOVQ_BFE(mm6); __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" @@ -37,8 +37,8 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, (%2) \n\t" "movq %%mm5, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq 1(%1), %%mm1 \n\t" "movq (%1, %3), %%mm2 \n\t" @@ -46,13 +46,13 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, (%2) \n\t" "movq %%mm5, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r"(line_size) - :"eax", "memory"); + :"r"((long)line_size) + :REG_a, "memory"); } static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) @@ -63,37 +63,37 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int " jz 1f \n\t" "movq (%1), %%mm0 \n\t" "movq (%2), %%mm1 \n\t" - "addl %4, %1 \n\t" - "addl $8, %2 \n\t" + "add %4, %1 \n\t" + "add $8, %2 \n\t" PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) "movq %%mm4, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%2), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm2 \n\t" "movq 8(%2), %%mm3 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 
\n\t" "movq %%mm5, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" "movq 16(%2), %%mm1 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" "movq (%1), %%mm2 \n\t" "movq 24(%2), %%mm3 \n\t" - "addl %4, %1 \n\t" - "addl $32, %2 \n\t" + "add %4, %1 \n\t" + "add $32, %2 \n\t" PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq %%mm5, (%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -101,7 +101,7 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) :"memory"); } @@ -109,7 +109,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin { MOVQ_BFE(mm6); __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1), %%mm0 \n\t" @@ -126,8 +126,8 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, 8(%2) \n\t" "movq %%mm5, 8(%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq 1(%1), %%mm1 \n\t" "movq (%1, %3), %%mm2 \n\t" @@ -142,13 +142,13 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, 8(%2) \n\t" "movq %%mm5, 8(%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r"(line_size) - :"eax", "memory"); + :"r"((long)line_size) + :REG_a, "memory"); } static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) @@ -161,12 +161,12 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in "movq (%2), %%mm1 \n\t" "movq 8(%1), %%mm2 \n\t" "movq 8(%2), %%mm3 \n\t" - "addl %4, %1 \n\t" - "addl $16, %2 \n\t" + "add %4, %1 \n\t" + "add $16, %2 \n\t" PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, (%3) \n\t" "movq %%mm5, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "decl %0 \n\t" ".balign 8 \n\t" "1: \n\t" @@ -174,21 +174,21 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in "movq (%2), %%mm1 \n\t" "movq 8(%1), %%mm2 \n\t" "movq 8(%2), %%mm3 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, (%3) \n\t" "movq %%mm5, 8(%3) \n\t" - "addl %5, %3 \n\t" + "add %5, %3 \n\t" "movq (%1), %%mm0 \n\t" "movq 16(%2), %%mm1 \n\t" "movq 8(%1), %%mm2 \n\t" "movq 24(%2), %%mm3 \n\t" - "addl %4, %1 \n\t" + "add %4, %1 \n\t" PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, (%3) \n\t" "movq %%mm5, 8(%3) \n\t" - "addl %5, %3 \n\t" - "addl $32, %2 \n\t" + "add %5, %3 \n\t" + "add $32, %2 \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" #ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used @@ -196,7 +196,7 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in #else :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) #endif - :"S"(src1Stride), "D"(dstStride) + :"S"((long)src1Stride), "D"((long)dstStride) 
:"memory"); } @@ -204,29 +204,29 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line { MOVQ_BFE(mm6); __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax),%%mm2 \n\t" + "movq (%1, %%"REG_a"),%%mm2 \n\t" PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) "movq %%mm4, (%2) \n\t" "movq %%mm5, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax),%%mm0 \n\t" + "movq (%1, %%"REG_a"),%%mm0 \n\t" PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) "movq %%mm4, (%2) \n\t" "movq %%mm5, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r"(line_size) - :"eax", "memory"); + :"r"((long)line_size) + :REG_a, "memory"); } static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) @@ -244,12 +244,12 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "punpckhbw %%mm7, %%mm5 \n\t" "paddusw %%mm0, %%mm4 \n\t" "paddusw %%mm1, %%mm5 \n\t" - "xorl %%eax, %%eax \n\t" - "addl %3, %1 \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" + "add %3, %1 \n\t" ".balign 8 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq 1(%1, %%eax), %%mm2 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" @@ -265,11 +265,11 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "psrlw $2, %%mm4 \n\t" "psrlw $2, %%mm5 \n\t" "packuswb %%mm5, %%mm4 \n\t" - "movq %%mm4, (%2, %%eax) \n\t" - "addl %3, %%eax \n\t" + "movq %%mm4, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" - "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%eax), %%mm4 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"REG_a"), %%mm4 \n\t" "movq %%mm2, %%mm3 \n\t" "movq %%mm4, %%mm5 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" @@ -285,14 +285,14 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "psrlw $2, %%mm0 \n\t" "psrlw $2, %%mm1 \n\t" "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%eax) \n\t" - "addl %3, %%eax \n\t" + "movq %%mm0, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels) - :"D"(block), "r"(line_size) - :"eax", "memory"); + :"D"(block), "r"((long)line_size) + :REG_a, "memory"); } // avg_pixels @@ -456,12 +456,12 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line { MOVQ_BFE(mm6); __asm __volatile( - "lea (%3, %3), %%eax \n\t" + "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" ".balign 8 \n\t" "1: \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm2 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) "movq (%2), %%mm3 \n\t" PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) @@ -469,11 +469,11 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%eax), %%mm0 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" PAVGBP(%%mm1, 
%%mm2, %%mm4, %%mm0, %%mm1, %%mm5) "movq (%2), %%mm3 \n\t" PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) @@ -481,14 +481,14 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) "movq %%mm2, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" - "addl %%eax, %1 \n\t" - "addl %%eax, %2 \n\t" + "add %%"REG_a", %1 \n\t" + "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels), "+D"(block) - :"r"(line_size) - :"eax", "memory"); + :"r"((long)line_size) + :REG_a, "memory"); } // this routine is 'slightly' suboptimal but mostly unused @@ -507,12 +507,12 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "punpckhbw %%mm7, %%mm5 \n\t" "paddusw %%mm0, %%mm4 \n\t" "paddusw %%mm1, %%mm5 \n\t" - "xorl %%eax, %%eax \n\t" - "addl %3, %1 \n\t" + "xor %%"REG_a", %%"REG_a" \n\t" + "add %3, %1 \n\t" ".balign 8 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq 1(%1, %%eax), %%mm2 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" @@ -527,16 +527,16 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "paddusw %%mm1, %%mm5 \n\t" "psrlw $2, %%mm4 \n\t" "psrlw $2, %%mm5 \n\t" - "movq (%2, %%eax), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" "packuswb %%mm5, %%mm4 \n\t" "pcmpeqd %%mm2, %%mm2 \n\t" "paddb %%mm2, %%mm2 \n\t" PAVGB(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%eax) \n\t" - "addl %3, %%eax \n\t" + "movq %%mm5, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" - "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%eax), %%mm4 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"REG_a"), %%mm4 \n\t" "movq %%mm2, %%mm3 \n\t" "movq %%mm4, %%mm5 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" @@ -551,19 +551,19 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "paddusw %%mm5, %%mm1 \n\t" "psrlw $2, %%mm0 \n\t" "psrlw $2, %%mm1 \n\t" - "movq (%2, %%eax), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" "packuswb %%mm1, %%mm0 \n\t" "pcmpeqd %%mm2, %%mm2 \n\t" "paddb %%mm2, %%mm2 \n\t" PAVGB(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%eax) \n\t" - "addl %3, %%eax \n\t" + "movq %%mm1, (%2, %%"REG_a") \n\t" + "add %3, %%"REG_a" \n\t" "subl $2, %0 \n\t" "jnz 1b \n\t" :"+g"(h), "+S"(pixels) - :"D"(block), "r"(line_size) - :"eax", "memory"); + :"D"(block), "r"((long)line_size) + :REG_a, "memory"); } //FIXME optimize diff --git a/libavcodec/i386/fdct_mmx.c b/libavcodec/i386/fdct_mmx.c index 7f348329a2..aacbe57437 100644 --- a/libavcodec/i386/fdct_mmx.c +++ b/libavcodec/i386/fdct_mmx.c @@ -47,13 +47,13 @@ static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = { 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5 }; -static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; +static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; -static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; +static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; struct { - const long fdct_r_row_sse2[4] ATTR_ALIGN(16); + const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16); } fdct_r_row_sse2 ATTR_ALIGN(16)= {{ RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW diff --git a/libavcodec/i386/mmx.h b/libavcodec/i386/mmx.h index ad684bc5ac..f0ef1b79e0 100644 --- a/libavcodec/i386/mmx.h +++ b/libavcodec/i386/mmx.h @@ -5,6 +5,12 @@ #ifndef AVCODEC_I386MMX_H #define 
AVCODEC_I386MMX_H +#ifdef ARCH_X86_64 +# define REG_a "rax" +#else +# define REG_a "eax" +#endif + /* * The type of an value that fits in an MMX register (note that long * long constant values MUST be suffixed by LL and unsigned long long diff --git a/libavcodec/i386/motion_est_mmx.c b/libavcodec/i386/motion_est_mmx.c index 39246d9050..1b90f8e40f 100644 --- a/libavcodec/i386/motion_est_mmx.c +++ b/libavcodec/i386/motion_est_mmx.c @@ -20,6 +20,7 @@ * mostly by Michael Niedermayer */ #include "../dsputil.h" +#include "mmx.h" static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={ 0x0000000000000000ULL, @@ -31,19 +32,19 @@ static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x010101010101 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - int len= -(stride*h); + long len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq (%2, %%eax), %%mm2 \n\t" - "movq (%2, %%eax), %%mm4 \n\t" - "addl %3, %%eax \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm4 \n\t" + "add %3, %%"REG_a" \n\t" "psubusb %%mm0, %%mm2 \n\t" "psubusb %%mm4, %%mm0 \n\t" - "movq (%1, %%eax), %%mm1 \n\t" - "movq (%2, %%eax), %%mm3 \n\t" - "movq (%2, %%eax), %%mm5 \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "movq (%2, %%"REG_a"), %%mm5 \n\t" "psubusb %%mm1, %%mm3 \n\t" "psubusb %%mm5, %%mm1 \n\t" "por %%mm2, %%mm0 \n\t" @@ -58,116 +59,116 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) "paddw %%mm3, %%mm2 \n\t" "paddw %%mm2, %%mm0 \n\t" "paddw %%mm0, %%mm6 \n\t" - "addl %3, %%eax \n\t" + "add %3, %%"REG_a" \n\t" " js 1b \n\t" : "+a" (len) - : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) + : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride) ); } static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - int len= -(stride*h); + long len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq (%2, %%eax), %%mm2 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" "psadbw %%mm2, %%mm0 \n\t" - "addl %3, %%eax \n\t" - "movq (%1, %%eax), %%mm1 \n\t" - "movq (%2, %%eax), %%mm3 \n\t" + "add %3, %%"REG_a" \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" "psadbw %%mm1, %%mm3 \n\t" "paddw %%mm3, %%mm0 \n\t" "paddw %%mm0, %%mm6 \n\t" - "addl %3, %%eax \n\t" + "add %3, %%"REG_a" \n\t" " js 1b \n\t" : "+a" (len) - : "r" (blk1 - len), "r" (blk2 - len), "r" (stride) + : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride) ); } static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) { - int len= -(stride*h); + long len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq (%2, %%eax), %%mm2 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" "pavgb %%mm2, %%mm0 \n\t" - "movq (%3, %%eax), %%mm2 \n\t" + "movq (%3, %%"REG_a"), %%mm2 \n\t" "psadbw %%mm2, %%mm0 \n\t" - "addl %4, %%eax \n\t" - "movq (%1, %%eax), %%mm1 \n\t" - "movq (%2, %%eax), %%mm3 \n\t" + "add %4, %%"REG_a" \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" "pavgb %%mm1, %%mm3 \n\t" - "movq (%3, %%eax), %%mm1 \n\t" + "movq (%3, %%"REG_a"), %%mm1 \n\t" "psadbw %%mm1, %%mm3 \n\t" "paddw %%mm3, %%mm0 \n\t" "paddw %%mm0, %%mm6 \n\t" - "addl %4, %%eax \n\t" + "add %4, %%"REG_a" \n\t" " js 1b 
\n\t" : "+a" (len) - : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) + : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride) ); } static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { //FIXME reuse src - int len= -(stride*h); + long len= -(stride*h); asm volatile( ".balign 16 \n\t" "movq "MANGLE(bone)", %%mm5 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq (%2, %%eax), %%mm2 \n\t" - "movq 1(%1, %%eax), %%mm1 \n\t" - "movq 1(%2, %%eax), %%mm3 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm2 \n\t" + "movq 1(%1, %%"REG_a"), %%mm1 \n\t" + "movq 1(%2, %%"REG_a"), %%mm3 \n\t" "pavgb %%mm2, %%mm0 \n\t" "pavgb %%mm1, %%mm3 \n\t" "psubusb %%mm5, %%mm3 \n\t" "pavgb %%mm3, %%mm0 \n\t" - "movq (%3, %%eax), %%mm2 \n\t" + "movq (%3, %%"REG_a"), %%mm2 \n\t" "psadbw %%mm2, %%mm0 \n\t" - "addl %4, %%eax \n\t" - "movq (%1, %%eax), %%mm1 \n\t" - "movq (%2, %%eax), %%mm3 \n\t" - "movq 1(%1, %%eax), %%mm2 \n\t" - "movq 1(%2, %%eax), %%mm4 \n\t" + "add %4, %%"REG_a" \n\t" + "movq (%1, %%"REG_a"), %%mm1 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq 1(%2, %%"REG_a"), %%mm4 \n\t" "pavgb %%mm3, %%mm1 \n\t" "pavgb %%mm4, %%mm2 \n\t" "psubusb %%mm5, %%mm2 \n\t" "pavgb %%mm1, %%mm2 \n\t" - "movq (%3, %%eax), %%mm1 \n\t" + "movq (%3, %%"REG_a"), %%mm1 \n\t" "psadbw %%mm1, %%mm2 \n\t" "paddw %%mm2, %%mm0 \n\t" "paddw %%mm0, %%mm6 \n\t" - "addl %4, %%eax \n\t" + "add %4, %%"REG_a" \n\t" " js 1b \n\t" : "+a" (len) - : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" (stride) + : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride) ); } static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) { - int len= -(stride*h); + long len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq (%2, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm2 \n\t" - "movq (%2, %%eax), %%mm3 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm2 \n\t" + "movq (%2, %%"REG_a"), %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" "punpcklbw %%mm7, %%mm1 \n\t" "punpckhbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm3 \n\t" "paddw %%mm0, %%mm1 \n\t" "paddw %%mm2, %%mm3 \n\t" - "movq (%3, %%eax), %%mm4 \n\t" - "movq (%3, %%eax), %%mm2 \n\t" + "movq (%3, %%"REG_a"), %%mm4 \n\t" + "movq (%3, %%"REG_a"), %%mm2 \n\t" "paddw %%mm5, %%mm1 \n\t" "paddw %%mm5, %%mm3 \n\t" "psrlw $1, %%mm1 \n\t" @@ -181,21 +182,21 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int "punpckhbw %%mm7, %%mm1 \n\t" "paddw %%mm1, %%mm0 \n\t" "paddw %%mm0, %%mm6 \n\t" - "addl %4, %%eax \n\t" + "add %4, %%"REG_a" \n\t" " js 1b \n\t" : "+a" (len) - : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" (stride) + : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride) ); } static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - int len= -(stride*h); + long len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" - "movq (%1, %%eax), %%mm0 \n\t" - "movq (%2, %%eax), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm0 \n\t" + "movq (%2, %%"REG_a"), %%mm1 \n\t" "movq %%mm0, %%mm4 \n\t" "movq %%mm1, %%mm2 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" @@ -204,8 +205,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) "punpckhbw %%mm7, %%mm2 \n\t" "paddw %%mm1, %%mm0 \n\t" "paddw 
%%mm2, %%mm4 \n\t" - "movq 1(%1, %%eax), %%mm2 \n\t" - "movq 1(%2, %%eax), %%mm3 \n\t" + "movq 1(%1, %%"REG_a"), %%mm2 \n\t" + "movq 1(%2, %%"REG_a"), %%mm3 \n\t" "movq %%mm2, %%mm1 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm1 \n\t" @@ -216,8 +217,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) "punpckhbw %%mm7, %%mm4 \n\t" "paddw %%mm3, %%mm2 \n\t" "paddw %%mm4, %%mm1 \n\t" - "movq (%3, %%eax), %%mm3 \n\t" - "movq (%3, %%eax), %%mm4 \n\t" + "movq (%3, %%"REG_a"), %%mm3 \n\t" + "movq (%3, %%"REG_a"), %%mm4 \n\t" "paddw %%mm5, %%mm2 \n\t" "paddw %%mm5, %%mm1 \n\t" "psrlw $2, %%mm2 \n\t" @@ -231,10 +232,10 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) "punpckhbw %%mm7, %%mm2 \n\t" "paddw %%mm2, %%mm0 \n\t" "paddw %%mm0, %%mm6 \n\t" - "addl %4, %%eax \n\t" + "add %4, %%"REG_a" \n\t" " js 1b \n\t" : "+a" (len) - : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" (stride) + : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride) ); } diff --git a/libavcodec/i386/mpegvideo_mmx.c b/libavcodec/i386/mpegvideo_mmx.c index f19de73d61..70c81f6754 100644 --- a/libavcodec/i386/mpegvideo_mmx.c +++ b/libavcodec/i386/mpegvideo_mmx.c @@ -23,6 +23,7 @@ #include "../dsputil.h" #include "../mpegvideo.h" #include "../avcodec.h" +#include "mmx.h" extern uint8_t zigzag_direct_noperm[64]; extern uint16_t inv_zigzag_direct16[64]; @@ -34,7 +35,7 @@ static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x000 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int level, qmul, qadd, nCoeffs; + long level, qmul, qadd, nCoeffs; qmul = qscale << 1; @@ -97,7 +98,7 @@ asm volatile( "movq %%mm0, (%0, %3) \n\t" "movq %%mm1, 8(%0, %3) \n\t" - "addl $16, %3 \n\t" + "add $16, %3 \n\t" "jng 1b \n\t" ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) : "memory" @@ -109,7 +110,7 @@ asm volatile( static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int qmul, qadd, nCoeffs; + long qmul, qadd, nCoeffs; qmul = qscale << 1; qadd = (qscale - 1) | 1; @@ -160,7 +161,7 @@ asm volatile( "movq %%mm0, (%0, %3) \n\t" "movq %%mm1, 8(%0, %3) \n\t" - "addl $16, %3 \n\t" + "add $16, %3 \n\t" "jng 1b \n\t" ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) : "memory" @@ -200,7 +201,7 @@ asm volatile( static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int nCoeffs; + long nCoeffs; const uint16_t *quant_matrix; int block0; @@ -220,13 +221,13 @@ asm volatile( "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" - "movl %3, %%eax \n\t" + "mov %3, %%"REG_a" \n\t" ".balign 16\n\t" "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" @@ -241,8 +242,8 @@ asm volatile( "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? 
-1 : 0 + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 "psraw $3, %%mm0 \n\t" "psraw $3, %%mm1 \n\t" "psubw %%mm7, %%mm0 \n\t" @@ -255,13 +256,13 @@ asm volatile( "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" - "addl $16, %%eax \n\t" + "add $16, %%"REG_a" \n\t" "js 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) - : "%eax", "memory" + : "%"REG_a, "memory" ); block[0]= block0; } @@ -269,7 +270,7 @@ asm volatile( static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int nCoeffs; + long nCoeffs; const uint16_t *quant_matrix; assert(s->block_last_index[n]>=0); @@ -283,13 +284,13 @@ asm volatile( "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" - "movl %3, %%eax \n\t" + "mov %3, %%"REG_a" \n\t" ".balign 16\n\t" "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" @@ -308,8 +309,8 @@ asm volatile( "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 "psraw $4, %%mm0 \n\t" "psraw $4, %%mm1 \n\t" "psubw %%mm7, %%mm0 \n\t" @@ -322,20 +323,20 @@ asm volatile( "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" - "addl $16, %%eax \n\t" + "add $16, %%"REG_a" \n\t" "js 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) - : "%eax", "memory" + : "%"REG_a, "memory" ); } static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int nCoeffs; + long nCoeffs; const uint16_t *quant_matrix; int block0; @@ -355,13 +356,13 @@ asm volatile( "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" - "movl %3, %%eax \n\t" + "mov %3, %%"REG_a" \n\t" ".balign 16\n\t" "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" @@ -376,8 +377,8 @@ asm volatile( "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? 
-1 : 0 + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 "psraw $3, %%mm0 \n\t" "psraw $3, %%mm1 \n\t" "pxor %%mm2, %%mm0 \n\t" @@ -386,13 +387,13 @@ asm volatile( "psubw %%mm3, %%mm1 \n\t" "pandn %%mm0, %%mm4 \n\t" "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" - "addl $16, %%eax \n\t" + "add $16, %%"REG_a" \n\t" "jng 1b \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) - : "%eax", "memory" + : "%"REG_a, "memory" ); block[0]= block0; //Note, we dont do mismatch control for intra as errors cannot accumulate @@ -401,7 +402,7 @@ asm volatile( static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int nCoeffs; + long nCoeffs; const uint16_t *quant_matrix; assert(s->block_last_index[n]>=0); @@ -416,13 +417,13 @@ asm volatile( "movd %2, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" - "movl %3, %%eax \n\t" + "mov %3, %%"REG_a" \n\t" ".balign 16\n\t" "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" + "movq (%0, %%"REG_a"), %%mm0 \n\t" + "movq 8(%0, %%"REG_a"), %%mm1 \n\t" + "movq (%1, %%"REG_a"), %%mm4 \n\t" + "movq 8(%1, %%"REG_a"), %%mm5 \n\t" "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] "pxor %%mm2, %%mm2 \n\t" @@ -441,8 +442,8 @@ asm volatile( "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q "pxor %%mm4, %%mm4 \n\t" "pxor %%mm5, %%mm5 \n\t" // FIXME slow - "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 "psrlw $4, %%mm0 \n\t" "psrlw $4, %%mm1 \n\t" "pxor %%mm2, %%mm0 \n\t" @@ -453,10 +454,10 @@ asm volatile( "pandn %%mm1, %%mm5 \n\t" "pxor %%mm4, %%mm7 \n\t" "pxor %%mm5, %%mm7 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" + "movq %%mm4, (%0, %%"REG_a") \n\t" + "movq %%mm5, 8(%0, %%"REG_a") \n\t" - "addl $16, %%eax \n\t" + "add $16, %%"REG_a" \n\t" "jng 1b \n\t" "movd 124(%0, %3), %%mm0 \n\t" "movq %%mm7, %%mm6 \n\t" @@ -471,7 +472,7 @@ asm volatile( "movd %%mm0, 124(%0, %3) \n\t" ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) - : "%eax", "memory" + : "%"REG_a, "memory" ); } @@ -499,11 +500,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) "punpckhwd %%mm1, %%mm1 \n\t" "punpckhdq %%mm1, %%mm1 \n\t" "movq %%mm1, (%0, %2) \n\t" - "addl %1, %0 \n\t" - "cmpl %3, %0 \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) - : "r" (wrap), "r" (width), "r" (ptr + wrap*height) + : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) ); } else @@ -522,11 +523,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) "punpckhdq %%mm1, %%mm1 \n\t" "movq %%mm1, (%0, %2) \n\t" "movq %%mm1, 8(%0, %2) \n\t" - "addl %1, %0 \n\t" - "cmpl %3, %0 \n\t" + "add %1, %0 \n\t" + "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) - : "r" (wrap), "r" (width), "r" (ptr + wrap*height) + : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) ); } @@ -540,11 +541,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) "movq %%mm0, (%0, %2) \n\t" "movq %%mm0, (%0, %2, 2) \n\t" "movq %%mm0, (%0, %3) \n\t" - "addl $8, %0 \n\t" - "cmpl %4, %0 \n\t" + "add $8, %0 \n\t" + "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) - : "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w) + : "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w) ); ptr= last_line + (i + 1) * wrap - w; asm volatile( @@ -554,11 +555,11 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) "movq %%mm0, (%0, %2) \n\t" "movq %%mm0, (%0, %2, 2) \n\t" "movq %%mm0, (%0, %3) \n\t" - "addl $8, %0 \n\t" - "cmpl %4, %0 \n\t" + "add $8, %0 \n\t" + "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (ptr) - : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w) + : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w) ); } } @@ -607,10 +608,10 @@ static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ "movq %%mm2, 8(%1) \n\t" "movq %%mm5, 16(%1) \n\t" "movq %%mm3, 24(%1) \n\t" - "addl $16, %0 \n\t" - "addl $32, %1 \n\t" - "addl $16, %2 \n\t" - "cmpl %3, %0 \n\t" + "add $16, %0 \n\t" + "add $32, %1 \n\t" + "add $16, %2 \n\t" + "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (block), "+r" (sum), "+r" (offset) : "r"(block+64) @@ -661,10 +662,10 @@ static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ "movdqa %%xmm6, 16(%1) \n\t" "movdqa %%xmm5, 32(%1) \n\t" "movdqa %%xmm0, 48(%1) \n\t" - "addl $32, %0 \n\t" - "addl $64, %1 \n\t" - "addl $32, %2 \n\t" - "cmpl %3, %0 \n\t" + "add $32, %0 \n\t" + "add $64, %1 \n\t" + "add $32, %2 \n\t" + "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (block), "+r" (sum), "+r" (offset) : "r"(block+64) diff --git a/libavcodec/i386/mpegvideo_mmx_template.c b/libavcodec/i386/mpegvideo_mmx_template.c index 0ceddd7106..8520daab34 100644 --- a/libavcodec/i386/mpegvideo_mmx_template.c +++ 
b/libavcodec/i386/mpegvideo_mmx_template.c @@ -36,7 +36,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow) { - int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ... + long last_non_zero_p1; + int level=0, q; //=0 is cuz gcc says uninitalized ... const uint16_t *qmat, *bias; __align8 int16_t temp_block[64]; @@ -90,18 +91,18 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if(s->out_format == FMT_H263 && s->mpeg_quant==0){ asm volatile( - "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 + "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 SPREADW(%%mm3) "pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0 "movq (%2), %%mm5 \n\t" // qmat[0] "pxor %%mm6, %%mm6 \n\t" "psubw (%3), %%mm6 \n\t" // -bias[0] - "movl $-128, %%eax \n\t" + "mov $-128, %%"REG_a" \n\t" ".balign 16 \n\t" "1: \n\t" "pxor %%mm1, %%mm1 \n\t" // 0 - "movq (%1, %%eax), %%mm0 \n\t" // block[i] + "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) @@ -110,13 +111,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "por %%mm0, %%mm4 \n\t" "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - "movq %%mm0, (%5, %%eax) \n\t" + "movq %%mm0, (%5, %%"REG_a") \n\t" "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 - "movq (%4, %%eax), %%mm1 \n\t" - "movq %%mm7, (%1, %%eax) \n\t" // 0 + "movq (%4, %%"REG_a"), %%mm1 \n\t" + "movq %%mm7, (%1, %%"REG_a") \n\t" // 0 "pandn %%mm1, %%mm0 \n\t" PMAXW(%%mm0, %%mm3) - "addl $8, %%eax \n\t" + "add $8, %%"REG_a" \n\t" " js 1b \n\t" "movq %%mm3, %%mm0 \n\t" "psrlq $32, %%mm3 \n\t" @@ -124,8 +125,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "movq %%mm3, %%mm0 \n\t" "psrlq $16, %%mm3 \n\t" PMAXW(%%mm0, %%mm3) - "movd %%mm3, %%eax \n\t" - "movzbl %%al, %%eax \n\t" // last_non_zero_p1 + "movd %%mm3, %%"REG_a" \n\t" + "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) : "r" (block+64), "r" (qmat), "r" (bias), "r" (inv_zigzag_direct16+64), "r" (temp_block+64) @@ -142,32 +143,32 @@ static int RENAME(dct_quantize)(MpegEncContext *s, ); }else{ // FMT_H263 asm volatile( - "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 + "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 SPREADW(%%mm3) "pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0 - "movl $-128, %%eax \n\t" + "mov $-128, %%"REG_a" \n\t" ".balign 16 \n\t" "1: \n\t" "pxor %%mm1, %%mm1 \n\t" // 0 - "movq (%1, %%eax), %%mm0 \n\t" // block[i] + "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) - "movq (%3, %%eax), %%mm6 \n\t" // bias[0] + "movq (%3, %%"REG_a"), %%mm6 \n\t" // bias[0] "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] - "movq (%2, %%eax), %%mm5 \n\t" // qmat[i] + "movq (%2, %%"REG_a"), %%mm5 \n\t" // qmat[i] "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 "por %%mm0, %%mm4 \n\t" "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) - "movq %%mm0, (%5, %%eax) \n\t" + "movq %%mm0, (%5, %%"REG_a") \n\t" "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 
0xFF : 0x00 - "movq (%4, %%eax), %%mm1 \n\t" - "movq %%mm7, (%1, %%eax) \n\t" // 0 + "movq (%4, %%"REG_a"), %%mm1 \n\t" + "movq %%mm7, (%1, %%"REG_a") \n\t" // 0 "pandn %%mm1, %%mm0 \n\t" PMAXW(%%mm0, %%mm3) - "addl $8, %%eax \n\t" + "add $8, %%"REG_a" \n\t" " js 1b \n\t" "movq %%mm3, %%mm0 \n\t" "psrlq $32, %%mm3 \n\t" @@ -175,8 +176,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "movq %%mm3, %%mm0 \n\t" "psrlq $16, %%mm3 \n\t" PMAXW(%%mm0, %%mm3) - "movd %%mm3, %%eax \n\t" - "movzbl %%al, %%eax \n\t" // last_non_zero_p1 + "movd %%mm3, %%"REG_a" \n\t" + "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) : "r" (block+64), "r" (qmat+64), "r" (bias+64), "r" (inv_zigzag_direct16+64), "r" (temp_block+64) diff --git a/libavcodec/libpostproc/postprocess.c b/libavcodec/libpostproc/postprocess.c index 82bf628480..1713573d3d 100644 --- a/libavcodec/libpostproc/postprocess.c +++ b/libavcodec/libpostproc/postprocess.c @@ -119,7 +119,7 @@ try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks # define always_inline inline #endif -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) static uint64_t __attribute__((aligned(8))) attribute_used w05= 0x0005000500050005LL; static uint64_t __attribute__((aligned(8))) attribute_used w04= 0x0004000400040004LL; static uint64_t __attribute__((aligned(8))) attribute_used w20= 0x0020002000200020LL; @@ -172,7 +172,7 @@ static char *replaceTable[]= }; -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) static inline void prefetchnta(void *p) { asm volatile( "prefetchnta (%0)\n\t" @@ -597,7 +597,7 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC #endif //HAVE_ALTIVEC #endif //ARCH_POWERPC -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT) #define COMPILE_MMX @@ -616,13 +616,11 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC #undef HAVE_MMX2 #undef HAVE_3DNOW #undef HAVE_ALTIVEC -#undef ARCH_X86 #ifdef COMPILE_C #undef HAVE_MMX #undef HAVE_MMX2 #undef HAVE_3DNOW -#undef ARCH_X86 #define RENAME(a) a ## _C #include "postprocess_template.c" #endif @@ -643,7 +641,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC #define HAVE_MMX #undef HAVE_MMX2 #undef HAVE_3DNOW -#define ARCH_X86 #define RENAME(a) a ## _MMX #include "postprocess_template.c" #endif @@ -654,7 +651,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC #define HAVE_MMX #define HAVE_MMX2 #undef HAVE_3DNOW -#define ARCH_X86 #define RENAME(a) a ## _MMX2 #include "postprocess_template.c" #endif @@ -665,7 +661,6 @@ static always_inline void do_a_deblock_C(uint8_t *src, int step, int stride, PPC #define HAVE_MMX #undef HAVE_MMX2 #define HAVE_3DNOW -#define ARCH_X86 #define RENAME(a) a ## _3DNow #include "postprocess_template.c" #endif @@ -683,7 +678,7 @@ static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int // difference wouldnt be messureable here but its much better because // someone might exchange the cpu whithout restarting mplayer ;) #ifdef RUNTIME_CPUDETECT -#ifdef ARCH_X86 +#if defined(ARCH_X86) || defined(ARCH_X86_64) // ordered per speed fasterst first if(c->cpuCaps & PP_CPU_CAPS_MMX2) postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c); diff --git a/libavcodec/libpostproc/postprocess_template.c 
b/libavcodec/libpostproc/postprocess_template.c index fcfff4b704..a18242aad2 100644 --- a/libavcodec/libpostproc/postprocess_template.c +++ b/libavcodec/libpostproc/postprocess_template.c @@ -22,15 +22,37 @@ */ +#ifdef ARCH_X86_64 +# define REGa rax +# define REGc rcx +# define REGd rdx +# define REG_a "rax" +# define REG_c "rcx" +# define REG_d "rdx" +# define REG_SP "rsp" +# define ALIGN_MASK "$0xFFFFFFFFFFFFFFF8" +#else +# define REGa eax +# define REGc ecx +# define REGd edx +# define REG_a "eax" +# define REG_c "ecx" +# define REG_d "edx" +# define REG_SP "esp" +# define ALIGN_MASK "$0xFFFFFFF8" +#endif + + #undef PAVGB #undef PMINUB #undef PMAXUB #ifdef HAVE_MMX2 -#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" +#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" #elif defined (HAVE_3DNOW) -#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" +#define REAL_RPAVGB(a,b) "pavgusb " #a ", " #b " \n\t" #endif +#define PAVGB(a,b) REAL_PAVGB(a,b) #ifdef HAVE_MMX2 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" @@ -64,12 +86,12 @@ asm volatile( ); asm volatile( - "leal (%2, %3), %%eax \n\t" + "lea (%2, %3), %%"REG_a" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 "movq (%2), %%mm0 \n\t" - "movq (%%eax), %%mm1 \n\t" + "movq (%%"REG_a"), %%mm1 \n\t" "movq %%mm0, %%mm3 \n\t" "movq %%mm0, %%mm4 \n\t" PMAXUB(%%mm1, %%mm4) @@ -78,7 +100,7 @@ asm volatile( "paddb %%mm7, %%mm0 \n\t" "pcmpgtb %%mm6, %%mm0 \n\t" - "movq (%%eax,%3), %%mm2 \n\t" + "movq (%%"REG_a",%3), %%mm2 \n\t" PMAXUB(%%mm2, %%mm4) PMINUB(%%mm2, %%mm3, %%mm5) "psubb %%mm2, %%mm1 \n\t" @@ -86,7 +108,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%eax, %3, 2), %%mm1 \n\t" + "movq (%%"REG_a", %3, 2), %%mm1 \n\t" PMAXUB(%%mm1, %%mm4) PMINUB(%%mm1, %%mm3, %%mm5) "psubb %%mm1, %%mm2 \n\t" @@ -94,7 +116,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm2 \n\t" "paddb %%mm2, %%mm0 \n\t" - "leal (%%eax, %3, 4), %%eax \n\t" + "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" "movq (%2, %3, 4), %%mm2 \n\t" PMAXUB(%%mm2, %%mm4) @@ -104,7 +126,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%eax), %%mm1 \n\t" + "movq (%%"REG_a"), %%mm1 \n\t" PMAXUB(%%mm1, %%mm4) PMINUB(%%mm1, %%mm3, %%mm5) "psubb %%mm1, %%mm2 \n\t" @@ -112,7 +134,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm2 \n\t" "paddb %%mm2, %%mm0 \n\t" - "movq (%%eax, %3), %%mm2 \n\t" + "movq (%%"REG_a", %3), %%mm2 \n\t" PMAXUB(%%mm2, %%mm4) PMINUB(%%mm2, %%mm3, %%mm5) "psubb %%mm2, %%mm1 \n\t" @@ -120,7 +142,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%eax, %3, 2), %%mm1 \n\t" + "movq (%%"REG_a", %3, 2), %%mm1 \n\t" PMAXUB(%%mm1, %%mm4) PMINUB(%%mm1, %%mm3, %%mm5) "psubb %%mm1, %%mm2 \n\t" @@ -152,8 +174,8 @@ asm volatile( "movd %%mm4, %1 \n\t" : "=r" (numEq), "=r" (dcOk) - : "r" (src), "r" (stride), "m" (c->pQPb) - : "%eax" + : "r" (src), "r" ((long)stride), "m" (c->pQPb) + : "%"REG_a ); numEq= (-numEq) &0xFF; @@ -194,10 +216,10 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) "por %%mm2, %%mm6 \n\t"// First Line to Filter "movq (%0, %1, 8), %%mm5 \n\t" - "leal (%0, %1, 4), %%eax \n\t" - "leal (%0, %1, 8), %%ecx \n\t" - "subl %1, %%ecx \n\t" - "addl %1, %0 \n\t" // %0 points to line 1 not 0 + "lea (%0, %1, 4), %%"REG_a" \n\t" + "lea (%0, %1, 8), %%"REG_c" \n\t" + "sub %1, %%"REG_c" \n\t" + "add %1, %0 \n\t" // %0 points to line 1 not 0 "movq (%0, %1, 8), %%mm7 \n\t" "movq %%mm5, %%mm1 \n\t" "movq %%mm7, %%mm2 \n\t" @@ -225,7 
+247,7 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) "movq (%0, %1, 4), %%mm2 \n\t" // 1 "movq %%mm2, %%mm5 \n\t" // 1 - PAVGB((%%eax), %%mm2) // 11 /2 + PAVGB((%%REGa), %%mm2) // 11 /2 PAVGB((%0, %1, 2), %%mm2) // 211 /4 "movq %%mm2, %%mm3 \n\t" // 211 /4 "movq (%0), %%mm4 \n\t" // 1 @@ -237,15 +259,15 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) PAVGB(%%mm6, %%mm0) //1 1 /2 "movq %%mm4, %%mm3 \n\t" // 1 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 - PAVGB((%%eax,%1,2), %%mm5) // 11 /2 - PAVGB((%%eax), %%mm5) // 211 /4 + PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 + PAVGB((%%REGa), %%mm5) // 211 /4 PAVGB(%%mm5, %%mm3) // 2 2211 /8 PAVGB(%%mm0, %%mm3) //4242211 /16 "movq %%mm3, (%0,%1) \n\t" // X // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 PAVGB(%%mm4, %%mm6) //11 /2 - "movq (%%ecx), %%mm0 \n\t" // 1 - PAVGB((%%eax, %1, 2), %%mm0) // 11/2 + "movq (%%"REG_c"), %%mm0 \n\t" // 1 + PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 "movq %%mm0, %%mm3 \n\t" // 11/2 PAVGB(%%mm1, %%mm0) // 2 11/4 PAVGB(%%mm6, %%mm0) //222 11/8 @@ -253,17 +275,17 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) "movq (%0, %1, 2), %%mm2 \n\t" // 1 "movq %%mm0, (%0, %1, 2) \n\t" // X // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 - "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 - PAVGB((%%ecx), %%mm0) // 11 /2 + "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 + PAVGB((%%REGc), %%mm0) // 11 /2 PAVGB(%%mm0, %%mm6) //11 11 /4 PAVGB(%%mm1, %%mm4) // 11 /2 PAVGB(%%mm2, %%mm1) // 11 /2 PAVGB(%%mm1, %%mm6) //1122 11 /8 PAVGB(%%mm5, %%mm6) //112242211 /16 - "movq (%%eax), %%mm5 \n\t" // 1 - "movq %%mm6, (%%eax) \n\t" // X + "movq (%%"REG_a"), %%mm5 \n\t" // 1 + "movq %%mm6, (%%"REG_a") \n\t" // X // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 - "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 + "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 PAVGB(%%mm7, %%mm6) // 11 /2 PAVGB(%%mm4, %%mm6) // 11 11 /4 PAVGB(%%mm3, %%mm6) // 11 2211 /8 @@ -276,29 +298,29 @@ static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) PAVGB(%%mm7, %%mm1) // 11 2 /4 PAVGB(%%mm4, %%mm5) // 11 /2 PAVGB(%%mm5, %%mm0) // 11 11 /4 - "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 + "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 PAVGB(%%mm0, %%mm1) // 11224222 /16 - "movq %%mm1, (%%eax, %1, 2) \n\t" // X + "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 - PAVGB((%%ecx), %%mm2) // 112 4 /8 - "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 + PAVGB((%%REGc), %%mm2) // 112 4 /8 + "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 PAVGB(%%mm0, %%mm6) // 1 1 /2 PAVGB(%%mm7, %%mm6) // 1 12 /4 PAVGB(%%mm2, %%mm6) // 1122424 /4 - "movq %%mm6, (%%ecx) \n\t" // X + "movq %%mm6, (%%"REG_c") \n\t" // X // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 PAVGB(%%mm7, %%mm5) // 11 2 /4 PAVGB(%%mm7, %%mm5) // 11 6 /8 PAVGB(%%mm3, %%mm0) // 112 /4 PAVGB(%%mm0, %%mm5) // 112246 /16 - "movq %%mm5, (%%eax, %1, 4) \n\t" // X - "subl %1, %0 \n\t" + "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X + "sub %1, %0 \n\t" : - : "r" (src), "r" (stride), "m" (c->pQPb) - : "%eax", "%ecx" + : "r" (src), "r" ((long)stride), "m" (c->pQPb) + : "%"REG_a, "%"REG_c ); #else const int l1= stride; @@ -364,8 +386,8 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) asm volatile( "pxor %%mm7, %%mm7 \n\t" // 0 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ecx \n\t" + 
"leal (%0, %1), %%"REG_a" \n\t" + "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP @@ -375,7 +397,7 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... "movq (%0, %1, 4), %%mm2 \n\t" // line 4 - "movq (%%ecx), %%mm3 \n\t" // line 5 + "movq (%%"REG_c"), %%mm3 \n\t" // line 5 "movq %%mm2, %%mm4 \n\t" // line 4 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 @@ -393,32 +415,32 @@ static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) // "psubb %%mm6, %%mm2 \n\t" "movq %%mm2, (%0,%1, 4) \n\t" - "movq (%%ecx), %%mm2 \n\t" + "movq (%%"REG_c"), %%mm2 \n\t" // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 "psubb %%mm5, %%mm2 \n\t" // "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%ecx) \n\t" + "movq %%mm2, (%%"REG_c") \n\t" "paddb %%mm6, %%mm5 \n\t" "psrlw $2, %%mm5 \n\t" "pand "MANGLE(b3F)", %%mm5 \n\t" "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 - "movq (%%eax, %1, 2), %%mm2 \n\t" + "movq (%%"REG_a", %1, 2), %%mm2 \n\t" "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 "paddsb %%mm5, %%mm2 \n\t" "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%eax, %1, 2) \n\t" + "movq %%mm2, (%%"REG_a", %1, 2) \n\t" - "movq (%%ecx, %1), %%mm2 \n\t" + "movq (%%"REG_c", %1), %%mm2 \n\t" "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 "psubsb %%mm5, %%mm2 \n\t" "psubb %%mm6, %%mm2 \n\t" - "movq %%mm2, (%%ecx, %1) \n\t" + "movq %%mm2, (%%"REG_c", %1) \n\t" : - : "r" (src), "r" (stride) - : "%eax", "%ecx" + : "r" (src), "r" ((long)stride) + : "%"REG_a, "%"REG_c ); #else const int l1= stride; @@ -464,18 +486,18 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) asm volatile( "pxor %%mm7, %%mm7 \n\t" // 0 - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ecx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 - "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 + "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 "movq %%mm1, %%mm2 \n\t" // line 4 "psubusb %%mm0, %%mm1 \n\t" "psubusb %%mm2, %%mm0 \n\t" "por %%mm1, %%mm0 \n\t" // |l2 - l3| - "movq (%%ecx), %%mm3 \n\t" // line 5 - "movq (%%ecx, %1), %%mm4 \n\t" // line 6 + "movq (%%"REG_c"), %%mm3 \n\t" // line 5 + "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 "movq %%mm3, %%mm5 \n\t" // line 5 "psubusb %%mm4, %%mm3 \n\t" "psubusb %%mm5, %%mm4 \n\t" @@ -507,43 +529,43 @@ static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) "pxor %%mm2, %%mm0 \n\t" "movq %%mm0, (%0, %1, 4) \n\t" // line 4 - "movq (%%ecx), %%mm0 \n\t" // line 5 + "movq (%%"REG_c"), %%mm0 \n\t" // line 5 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 "paddusb %%mm3, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ecx) \n\t" // line 5 + "movq %%mm0, (%%"REG_c") \n\t" // line 5 PAVGB(%%mm7, %%mm1) // d/4 - "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 + "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? 
-l4-1 : l4 "psubusb %%mm1, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 + "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 - "movq (%%ecx, %1), %%mm0 \n\t" // line 6 + "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 "paddusb %%mm1, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ecx, %1) \n\t" // line 6 + "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 PAVGB(%%mm7, %%mm1) // d/8 - "movq (%%eax, %1), %%mm0 \n\t" // line 2 + "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 "psubusb %%mm1, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%eax, %1) \n\t" // line 2 + "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 - "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 + "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 "paddusb %%mm1, %%mm0 \n\t" "pxor %%mm2, %%mm0 \n\t" - "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 + "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 : - : "r" (src), "r" (stride), "m" (co->pQPb) - : "%eax", "%ecx" + : "r" (src), "r" ((long)stride), "m" (co->pQPb) + : "%"REG_a, "%"REG_c ); #else @@ -608,8 +630,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext #if 0 //sligtly more accurate and slightly slower "pxor %%mm7, %%mm7 \n\t" // 0 - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ecx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" // 0 1 2 3 4 5 6 7 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 @@ -622,8 +644,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 - "movq (%%eax), %%mm1 \n\t" // l1 - "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 + "movq (%%"REG_a"), %%mm1 \n\t" // l1 + "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 "movq %%mm1, %%mm4 \n\t" // l1 PAVGB(%%mm7, %%mm1) // ~l1/2 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 @@ -641,7 +663,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 - "movq (%%ecx), %%mm2 \n\t" // l5 + "movq (%%"REG_c"), %%mm2 \n\t" // l5 "movq %%mm3, %%mm5 \n\t" // l3 PAVGB(%%mm7, %%mm3) // ~l3/2 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 @@ -654,13 +676,13 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 - "movq (%%ecx, %1), %%mm6 \n\t" // l6 + "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 "movq %%mm6, %%mm5 \n\t" // l6 PAVGB(%%mm7, %%mm6) // ~l6/2 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 - "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 + "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 "movq %%mm2, %%mm4 \n\t" // l5 PAVGB(%%mm7, %%mm2) // ~l5/2 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 @@ -687,7 +709,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext "paddusb %%mm1, %%mm3 \n\t" // "paddusb "MANGLE(b01)", %%mm3 \n\t" - "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 + "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 "movq (%0, %1, 4), %%mm5 \n\t" //l4 "movq (%0, %1, 4), %%mm4 \n\t" //l4 "psubusb %%mm6, %%mm5 \n\t" @@ -701,7 +723,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext "psubusb 
"MANGLE(b01)", %%mm3 \n\t" PAVGB(%%mm7, %%mm3) - "movq (%%eax, %1, 2), %%mm0 \n\t" + "movq (%%"REG_a", %1, 2), %%mm0 \n\t" "movq (%0, %1, 4), %%mm2 \n\t" "pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm2 \n\t" @@ -709,36 +731,36 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext "paddb %%mm3, %%mm2 \n\t" "pxor %%mm6, %%mm0 \n\t" "pxor %%mm6, %%mm2 \n\t" - "movq %%mm0, (%%eax, %1, 2) \n\t" + "movq %%mm0, (%%"REG_a", %1, 2) \n\t" "movq %%mm2, (%0, %1, 4) \n\t" #endif - "leal (%0, %1), %%eax \n\t" + "lea (%0, %1), %%"REG_a" \n\t" "pcmpeqb %%mm6, %%mm6 \n\t" // -1 // 0 1 2 3 4 5 6 7 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 - "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 + "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 "movq (%0, %1, 4), %%mm0 \n\t" // l4 "pxor %%mm6, %%mm1 \n\t" // -l3-1 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 // mm1=-l3-1, mm0=128-q - "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 - "movq (%%eax, %1), %%mm3 \n\t" // l2 + "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 + "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 "pxor %%mm6, %%mm2 \n\t" // -l5-1 "movq %%mm2, %%mm5 \n\t" // -l5-1 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 - "leal (%%eax, %1, 4), %%ecx \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 - "movq (%%eax), %%mm2 \n\t" // l1 + "movq (%%"REG_a"), %%mm2 \n\t" // l1 "pxor %%mm6, %%mm2 \n\t" // -l1-1 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 PAVGB((%0), %%mm1) // (l0-l3+256)/2 @@ -748,8 +770,8 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 - PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 - "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 + PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 + "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 "pxor %%mm6, %%mm1 \n\t" // -l7-1 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 @@ -798,7 +820,7 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) "pand %%mm7, %%mm4 \n\t" - "movq (%%eax, %1, 2), %%mm0 \n\t" + "movq (%%"REG_a", %1, 2), %%mm0 \n\t" "movq (%0, %1, 4), %%mm2 \n\t" "pxor %%mm1, %%mm0 \n\t" "pxor %%mm1, %%mm2 \n\t" @@ -806,12 +828,12 @@ static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext "psubb %%mm4, %%mm2 \n\t" "pxor %%mm1, %%mm0 \n\t" "pxor %%mm1, %%mm2 \n\t" - "movq %%mm0, (%%eax, %1, 2) \n\t" + "movq %%mm0, (%%"REG_a", %1, 2) \n\t" "movq %%mm2, (%0, %1, 4) \n\t" : - : "r" (src), "r" (stride), "m" (c->pQPb) - : "%eax", "%ecx" + : "r" (src), "r" ((long)stride), "m" (c->pQPb) + : "%"REG_a, "%"REG_c ); /* @@ -882,8 +904,8 @@ src-=8; src+= stride*4; asm volatile( "pxor %%mm7, %%mm7 \n\t" - "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars - "andl $0xFFFFFFF8, %%ecx \n\t" // align + "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars + "and "ALIGN_MASK", %%"REG_c" \n\t" // align // 0 1 2 3 4 5 6 7 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 @@ -894,12 +916,12 @@ src-=8; "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 "movq (%0, %1), %%mm2 \n\t" - "leal 
(%0, %1, 2), %%eax \n\t" + "lea (%0, %1, 2), %%"REG_a" \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 - "movq (%%eax), %%mm4 \n\t" + "movq (%%"REG_a"), %%mm4 \n\t" "movq %%mm4, %%mm5 \n\t" "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 @@ -916,7 +938,7 @@ src-=8; "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - "movq (%%eax, %1), %%mm2 \n\t" + "movq (%%"REG_a", %1), %%mm2 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" // L3 "punpckhbw %%mm7, %%mm3 \n\t" // H3 @@ -925,24 +947,24 @@ src-=8; "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 - "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 - "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 + "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 + "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 - "movq (%%eax, %1, 2), %%mm0 \n\t" + "movq (%%"REG_a", %1, 2), %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" // L4 "punpckhbw %%mm7, %%mm1 \n\t" // H4 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 - "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 - "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 + "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 + "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 "paddw %%mm4, %%mm4 \n\t" // 2L2 "paddw %%mm5, %%mm5 \n\t" // 2H2 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 - "leal (%%eax, %1), %0 \n\t" + "lea (%%"REG_a", %1), %0 \n\t" "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 @@ -957,10 +979,10 @@ src-=8; "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 - "movq (%%eax, %1, 4), %%mm6 \n\t" + "movq (%%"REG_a", %1, 4), %%mm6 \n\t" "punpcklbw %%mm7, %%mm6 \n\t" // L6 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 - "movq (%%eax, %1, 4), %%mm6 \n\t" + "movq (%%"REG_a", %1, 4), %%mm6 \n\t" "punpckhbw %%mm7, %%mm6 \n\t" // H6 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 @@ -984,8 +1006,8 @@ src-=8; "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 - "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 - "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 + "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 + "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 #ifdef HAVE_MMX2 "movq %%mm7, %%mm6 \n\t" // 0 @@ -1063,8 +1085,8 @@ src-=8; "psrlw $6, %%mm4 \n\t" "psrlw $6, %%mm5 \n\t" - "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 - "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 + "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 + "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 "pxor %%mm2, %%mm2 \n\t" "pxor %%mm3, %%mm3 \n\t" @@ -1107,8 +1129,8 @@ src-=8; "movq %%mm0, (%0, %1) \n\t" : "+r" (src) - : "r" (stride), "m" (c->pQPb) - : "%eax", "%ecx" + : "r" ((long)stride), "m" (c->pQPb) + : "%"REG_a, "%"REG_c ); #else const int l1= stride; @@ -1171,20 +1193,20 @@ static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) "packuswb %%mm0, %%mm0 \n\t" "movq %%mm0, %3 \n\t" - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 
edx+4%1 #undef FIND_MIN_MAX #ifdef HAVE_MMX2 -#define FIND_MIN_MAX(addr)\ +#define REAL_FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ "pminub %%mm0, %%mm7 \n\t"\ "pmaxub %%mm0, %%mm6 \n\t" #else -#define FIND_MIN_MAX(addr)\ +#define REAL_FIND_MIN_MAX(addr)\ "movq " #addr ", %%mm0 \n\t"\ "movq %%mm7, %%mm1 \n\t"\ "psubusb %%mm0, %%mm6 \n\t"\ @@ -1192,14 +1214,15 @@ static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) "psubusb %%mm0, %%mm1 \n\t"\ "psubb %%mm1, %%mm7 \n\t" #endif +#define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) -FIND_MIN_MAX((%%eax)) -FIND_MIN_MAX((%%eax, %1)) -FIND_MIN_MAX((%%eax, %1, 2)) +FIND_MIN_MAX((%%REGa)) +FIND_MIN_MAX((%%REGa, %1)) +FIND_MIN_MAX((%%REGa, %1, 2)) FIND_MIN_MAX((%0, %1, 4)) -FIND_MIN_MAX((%%edx)) -FIND_MIN_MAX((%%edx, %1)) -FIND_MIN_MAX((%%edx, %1, 2)) +FIND_MIN_MAX((%%REGd)) +FIND_MIN_MAX((%%REGd, %1)) +FIND_MIN_MAX((%%REGd, %1, 2)) FIND_MIN_MAX((%0, %1, 8)) "movq %%mm7, %%mm4 \n\t" @@ -1252,13 +1275,13 @@ FIND_MIN_MAX((%0, %1, 8)) "movd %%mm6, %%ecx \n\t" "cmpb "MANGLE(deringThreshold)", %%cl \n\t" " jb 1f \n\t" - "leal -24(%%esp), %%ecx \n\t" - "andl $0xFFFFFFF8, %%ecx \n\t" + "lea -24(%%"REG_SP"), %%"REG_c" \n\t" + "and "ALIGN_MASK", %%"REG_c" \n\t" PAVGB(%%mm0, %%mm7) // a=(max + min)/2 "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" "punpcklbw %%mm7, %%mm7 \n\t" - "movq %%mm7, (%%ecx) \n\t" + "movq %%mm7, (%%"REG_c") \n\t" "movq (%0), %%mm0 \n\t" // L10 "movq %%mm0, %%mm1 \n\t" // L10 @@ -1283,13 +1306,13 @@ FIND_MIN_MAX((%0, %1, 8)) "paddb %%mm2, %%mm0 \n\t" "paddb %%mm3, %%mm0 \n\t" - "movq (%%eax), %%mm2 \n\t" // L11 + "movq (%%"REG_a"), %%mm2 \n\t" // L11 "movq %%mm2, %%mm3 \n\t" // L11 "movq %%mm2, %%mm4 \n\t" // L11 "psllq $8, %%mm3 \n\t" "psrlq $8, %%mm4 \n\t" - "movd -4(%%eax), %%mm5 \n\t" - "movd 8(%%eax), %%mm6 \n\t" + "movd -4(%%"REG_a"), %%mm5 \n\t" + "movd 8(%%"REG_a"), %%mm6 \n\t" "psrlq $24, %%mm5 \n\t" "psllq $56, %%mm6 \n\t" "por %%mm5, %%mm3 \n\t" // L01 @@ -1306,7 +1329,7 @@ FIND_MIN_MAX((%0, %1, 8)) "paddb %%mm4, %%mm2 \n\t" "paddb %%mm5, %%mm2 \n\t" // 0, 2, 3, 1 -#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ +#define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ "movq " #src ", " #sx " \n\t" /* src[0] */\ "movq " #sx ", " #lx " \n\t" /* src[0] */\ "movq " #sx ", " #t0 " \n\t" /* src[0] */\ @@ -1322,8 +1345,8 @@ FIND_MIN_MAX((%0, %1, 8)) PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ PAVGB(lx, pplx) \ - "movq " #lx ", 8(%%ecx) \n\t"\ - "movq (%%ecx), " #lx " \n\t"\ + "movq " #lx ", 8(%%"REG_c") \n\t"\ + "movq (%%"REG_c"), " #lx " \n\t"\ "psubusb " #lx ", " #t1 " \n\t"\ "psubusb " #lx ", " #t0 " \n\t"\ "psubusb " #lx ", " #sx " \n\t"\ @@ -1350,8 +1373,10 @@ FIND_MIN_MAX((%0, %1, 8)) "pandn " #dst ", " #ppsx " \n\t"\ "por " #pplx ", " #ppsx " \n\t"\ "movq " #ppsx ", " #dst " \n\t"\ - "movq 8(%%ecx), " #lx " \n\t" + "movq 8(%%"REG_c"), " #lx " \n\t" +#define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ + REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) /* 0000000 1111111 @@ -1368,18 +1393,18 @@ FIND_MIN_MAX((%0, %1, 8)) */ //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) -DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) -DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) -DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) -DERING_CORE((%0, %1, 4),(%%edx) 
,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) -DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) -DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) -DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) -DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) +DERING_CORE((%%REGa),(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) +DERING_CORE((%%REGa, %1),(%%REGa, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) +DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) +DERING_CORE((%0, %1, 4),(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) +DERING_CORE((%%REGd),(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) +DERING_CORE((%%REGd, %1), (%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) +DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) +DERING_CORE((%0, %1, 8),(%%REGd, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) "1: \n\t" - : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) - : "%eax", "%edx", "%ecx" + : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2) + : "%"REG_a, "%"REG_d, "%"REG_c ); #else int y; @@ -1526,27 +1551,27 @@ static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int strid #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= 4*stride; asm volatile( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%ecx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 "movq (%0), %%mm0 \n\t" - "movq (%%eax, %1), %%mm1 \n\t" + "movq (%%"REG_a", %1), %%mm1 \n\t" PAVGB(%%mm1, %%mm0) - "movq %%mm0, (%%eax) \n\t" + "movq %%mm0, (%%"REG_a") \n\t" "movq (%0, %1, 4), %%mm0 \n\t" PAVGB(%%mm0, %%mm1) - "movq %%mm1, (%%eax, %1, 2) \n\t" - "movq (%%ecx, %1), %%mm1 \n\t" + "movq %%mm1, (%%"REG_a", %1, 2) \n\t" + "movq (%%"REG_c", %1), %%mm1 \n\t" PAVGB(%%mm1, %%mm0) - "movq %%mm0, (%%ecx) \n\t" + "movq %%mm0, (%%"REG_c") \n\t" "movq (%0, %1, 8), %%mm0 \n\t" PAVGB(%%mm0, %%mm1) - "movq %%mm1, (%%ecx, %1, 2) \n\t" + "movq %%mm1, (%%"REG_c", %1, 2) \n\t" - : : "r" (src), "r" (stride) - : "%eax", "%ecx" + : : "r" (src), "r" ((long)stride) + : "%"REG_a, "%"REG_c ); #else int a, b, x; @@ -1579,15 +1604,15 @@ static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= stride*3; asm volatile( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" - "leal (%%edx, %1, 4), %%ecx \n\t" - "addl %1, %%ecx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" + "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" + "add %1, %%"REG_c" \n\t" "pxor %%mm7, %%mm7 \n\t" // 0 1 2 3 4 5 6 7 8 9 10 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx -#define DEINT_CUBIC(a,b,c,d,e)\ +#define REAL_DEINT_CUBIC(a,b,c,d,e)\ "movq " #a ", %%mm0 \n\t"\ "movq " #b ", %%mm1 \n\t"\ "movq " #d ", %%mm2 \n\t"\ @@ -1608,14 +1633,15 @@ static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ "packuswb %%mm3, %%mm1 \n\t"\ "movq %%mm1, " #c " \n\t" +#define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) -DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) -DEINT_CUBIC((%%eax, 
%1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) -DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) -DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) +DEINT_CUBIC((%0), (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4), (%%REGd, %1)) +DEINT_CUBIC((%%REGa, %1), (%0, %1, 4), (%%REGd), (%%REGd, %1), (%0, %1, 8)) +DEINT_CUBIC((%0, %1, 4), (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8), (%%REGc)) +DEINT_CUBIC((%%REGd, %1), (%0, %1, 8), (%%REGd, %1, 4), (%%REGc), (%%REGc, %1, 2)) - : : "r" (src), "r" (stride) - : "%eax", "%edx", "ecx" + : : "r" (src), "r" ((long)stride) + : "%"REG_a, "%"REG_d, "%"REG_c ); #else int x; @@ -1643,14 +1669,14 @@ static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= stride*4; asm volatile( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" "pxor %%mm7, %%mm7 \n\t" "movq (%2), %%mm0 \n\t" // 0 1 2 3 4 5 6 7 8 9 10 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx -#define DEINT_FF(a,b,c,d)\ +#define REAL_DEINT_FF(a,b,c,d)\ "movq " #a ", %%mm1 \n\t"\ "movq " #b ", %%mm2 \n\t"\ "movq " #c ", %%mm3 \n\t"\ @@ -1678,14 +1704,16 @@ static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp "packuswb %%mm4, %%mm1 \n\t"\ "movq %%mm1, " #b " \n\t"\ -DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) -DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) -DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) -DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) +#define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) + +DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) +DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4), (%%REGd) ) +DEINT_FF((%0, %1, 4), (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) +DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8), (%%REGd, %1, 4)) "movq %%mm0, (%2) \n\t" - : : "r" (src), "r" (stride), "r"(tmp) - : "%eax", "%edx" + : : "r" (src), "r" ((long)stride), "r"(tmp) + : "%"REG_a, "%"REG_d ); #else int x; @@ -1721,15 +1749,15 @@ static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= stride*4; asm volatile( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" "pxor %%mm7, %%mm7 \n\t" "movq (%2), %%mm0 \n\t" "movq (%3), %%mm1 \n\t" // 0 1 2 3 4 5 6 7 8 9 10 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx -#define DEINT_L5(t1,t2,a,b,c)\ +#define REAL_DEINT_L5(t1,t2,a,b,c)\ "movq " #a ", %%mm2 \n\t"\ "movq " #b ", %%mm3 \n\t"\ "movq " #c ", %%mm4 \n\t"\ @@ -1762,19 +1790,21 @@ static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp "packuswb %%mm5, %%mm2 \n\t"\ "movq %%mm2, " #a " \n\t"\ -DEINT_L5(%%mm0, %%mm1, (%0) , (%%eax) , (%%eax, %1) ) -DEINT_L5(%%mm1, %%mm0, (%%eax) , (%%eax, %1) , (%%eax, %1, 2)) -DEINT_L5(%%mm0, %%mm1, (%%eax, %1) , (%%eax, %1, 2), (%0, %1, 4) ) -DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4) , (%%edx) ) -DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%edx) , (%%edx, %1) ) -DEINT_L5(%%mm1, %%mm0, (%%edx) , (%%edx, %1) , (%%edx, %1, 2)) -DEINT_L5(%%mm0, %%mm1, (%%edx, %1) , (%%edx, %1, 2), (%0, %1, 8) ) -DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8) , (%%edx, %1, 4)) +#define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) + 
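
The REAL_DEINT_L5 / DEINT_L5 pair introduced above (like the REAL_PAVGB, REAL_FIND_MIN_MAX, REAL_MEDIAN and REAL_L2_DIFF_CORE wrappers elsewhere in this file) works around a C preprocessor rule: a macro argument that is used with the stringizing operator '#' is not macro-expanded first. The bodies of these macros stringize their operands into the asm text, and the operands now contain REGa/REGc/REGd, which must themselves expand to rax/eax and friends before being stringized, so the call is routed through a plain forwarding macro that forces that expansion. A minimal stand-alone illustration of the effect, with illustrative names that are not taken from this patch:

  #include <stdio.h>

  #define REGa rax                  /* as defined for x86_64 above */

  #define REAL_STR(x) #x            /* stringizes the argument as written    */
  #define STR(x)      REAL_STR(x)   /* expands the argument, then stringizes */

  int main(void)
  {
      puts(REAL_STR((%%REGa, %1)));  /* prints "(%%REGa, %1)" - unusable in asm */
      puts(STR((%%REGa, %1)));       /* prints "(%%rax, %1)"  - what the asm needs */
      return 0;
  }
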
+DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) +DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) +DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) ) +DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) +DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) +DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) +DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) +DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) "movq %%mm0, (%2) \n\t" "movq %%mm1, (%3) \n\t" - : : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2) - : "%eax", "%edx" + : : "r" (src), "r" ((long)stride), "r"(tmp), "r"(tmp2) + : "%"REG_a, "%"REG_d ); #else int x; @@ -1821,49 +1851,49 @@ static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uin #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) src+= 4*stride; asm volatile( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 "movq (%2), %%mm0 \n\t" // L0 - "movq (%%eax), %%mm1 \n\t" // L2 + "movq (%%"REG_a"), %%mm1 \n\t" // L2 PAVGB(%%mm1, %%mm0) // L0+L2 "movq (%0), %%mm2 \n\t" // L1 PAVGB(%%mm2, %%mm0) "movq %%mm0, (%0) \n\t" - "movq (%%eax, %1), %%mm0 \n\t" // L3 + "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 PAVGB(%%mm0, %%mm2) // L1+L3 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 - "movq %%mm2, (%%eax) \n\t" - "movq (%%eax, %1, 2), %%mm2 \n\t" // L4 + "movq %%mm2, (%%"REG_a") \n\t" + "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 PAVGB(%%mm2, %%mm1) // L2+L4 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 - "movq %%mm1, (%%eax, %1) \n\t" + "movq %%mm1, (%%"REG_a", %1) \n\t" "movq (%0, %1, 4), %%mm1 \n\t" // L5 PAVGB(%%mm1, %%mm0) // L3+L5 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 - "movq %%mm0, (%%eax, %1, 2) \n\t" - "movq (%%edx), %%mm0 \n\t" // L6 + "movq %%mm0, (%%"REG_a", %1, 2) \n\t" + "movq (%%"REG_d"), %%mm0 \n\t" // L6 PAVGB(%%mm0, %%mm2) // L4+L6 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 "movq %%mm2, (%0, %1, 4) \n\t" - "movq (%%edx, %1), %%mm2 \n\t" // L7 + "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 PAVGB(%%mm2, %%mm1) // L5+L7 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 - "movq %%mm1, (%%edx) \n\t" - "movq (%%edx, %1, 2), %%mm1 \n\t" // L8 + "movq %%mm1, (%%"REG_d") \n\t" + "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 PAVGB(%%mm1, %%mm0) // L6+L8 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 - "movq %%mm0, (%%edx, %1) \n\t" + "movq %%mm0, (%%"REG_d", %1) \n\t" "movq (%0, %1, 8), %%mm0 \n\t" // L9 PAVGB(%%mm0, %%mm2) // L7+L9 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 - "movq %%mm2, (%%edx, %1, 2) \n\t" + "movq %%mm2, (%%"REG_d", %1, 2) \n\t" "movq %%mm1, (%2) \n\t" - : : "r" (src), "r" (stride), "r" (tmp) - : "%eax", "%edx" + : : "r" (src), "r" ((long)stride), "r" (tmp) + : "%"REG_a, "%"REG_d ); #else int a, b, c, x; @@ -1923,62 +1953,62 @@ static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) src+= 4*stride; #ifdef HAVE_MMX2 asm volatile( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 "movq (%0), %%mm0 \n\t" // - "movq (%%eax, %1), %%mm2 \n\t" // - "movq (%%eax), %%mm1 \n\t" // + "movq (%%"REG_a", %1), %%mm2 \n\t" // + "movq (%%"REG_a"), %%mm1 \n\t" // "movq %%mm0, %%mm3 \n\t" "pmaxub %%mm1, %%mm0 \n\t" // "pminub 
%%mm3, %%mm1 \n\t" // "pmaxub %%mm2, %%mm1 \n\t" // "pminub %%mm1, %%mm0 \n\t" - "movq %%mm0, (%%eax) \n\t" + "movq %%mm0, (%%"REG_a") \n\t" "movq (%0, %1, 4), %%mm0 \n\t" // - "movq (%%eax, %1, 2), %%mm1 \n\t" // + "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // "movq %%mm2, %%mm3 \n\t" "pmaxub %%mm1, %%mm2 \n\t" // "pminub %%mm3, %%mm1 \n\t" // "pmaxub %%mm0, %%mm1 \n\t" // "pminub %%mm1, %%mm2 \n\t" - "movq %%mm2, (%%eax, %1, 2) \n\t" + "movq %%mm2, (%%"REG_a", %1, 2) \n\t" - "movq (%%edx), %%mm2 \n\t" // - "movq (%%edx, %1), %%mm1 \n\t" // + "movq (%%"REG_d"), %%mm2 \n\t" // + "movq (%%"REG_d", %1), %%mm1 \n\t" // "movq %%mm2, %%mm3 \n\t" "pmaxub %%mm0, %%mm2 \n\t" // "pminub %%mm3, %%mm0 \n\t" // "pmaxub %%mm1, %%mm0 \n\t" // "pminub %%mm0, %%mm2 \n\t" - "movq %%mm2, (%%edx) \n\t" + "movq %%mm2, (%%"REG_d") \n\t" - "movq (%%edx, %1, 2), %%mm2 \n\t" // + "movq (%%"REG_d", %1, 2), %%mm2 \n\t" // "movq (%0, %1, 8), %%mm0 \n\t" // "movq %%mm2, %%mm3 \n\t" "pmaxub %%mm0, %%mm2 \n\t" // "pminub %%mm3, %%mm0 \n\t" // "pmaxub %%mm1, %%mm0 \n\t" // "pminub %%mm0, %%mm2 \n\t" - "movq %%mm2, (%%edx, %1, 2) \n\t" + "movq %%mm2, (%%"REG_d", %1, 2) \n\t" - : : "r" (src), "r" (stride) - : "%eax", "%edx" + : : "r" (src), "r" ((long)stride) + : "%"REG_a, "%"REG_d ); #else // MMX without MMX2 asm volatile( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 "pxor %%mm7, %%mm7 \n\t" -#define MEDIAN(a,b,c)\ +#define REAL_MEDIAN(a,b,c)\ "movq " #a ", %%mm0 \n\t"\ "movq " #b ", %%mm2 \n\t"\ "movq " #c ", %%mm1 \n\t"\ @@ -2001,14 +2031,15 @@ static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) "pand %%mm2, %%mm0 \n\t"\ "pand %%mm1, %%mm0 \n\t"\ "movq %%mm0, " #b " \n\t" +#define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) -MEDIAN((%0), (%%eax), (%%eax, %1)) -MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) -MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) -MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) +MEDIAN((%0), (%%REGa), (%%REGa, %1)) +MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) +MEDIAN((%0, %1, 4), (%%REGd), (%%REGd, %1)) +MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) - : : "r" (src), "r" (stride) - : "%eax", "%edx" + : : "r" (src), "r" ((long)stride) + : "%"REG_a, "%"REG_d ); #endif // MMX #else @@ -2042,17 +2073,17 @@ MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) { asm( - "leal (%0, %1), %%eax \n\t" + "lea (%0, %1), %%"REG_a" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 "movq (%0), %%mm0 \n\t" // 12345678 - "movq (%%eax), %%mm1 \n\t" // abcdefgh + "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh "movq %%mm0, %%mm2 \n\t" // 12345678 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h - "movq (%%eax, %1), %%mm1 \n\t" - "movq (%%eax, %1, 2), %%mm3 \n\t" + "movq (%%"REG_a", %1), %%mm1 \n\t" + "movq (%%"REG_a", %1, 2), %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "punpcklbw %%mm3, %%mm1 \n\t" "punpckhbw %%mm3, %%mm4 \n\t" @@ -2079,16 +2110,16 @@ static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src "psrlq $32, %%mm1 \n\t" "movd %%mm1, 112(%3) \n\t" - "leal (%%eax, %1, 4), %%eax \n\t" + "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 - "movq (%%eax), %%mm1 \n\t" // abcdefgh + "movq (%%"REG_a"), 
%%mm1 \n\t" // abcdefgh "movq %%mm0, %%mm2 \n\t" // 12345678 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h - "movq (%%eax, %1), %%mm1 \n\t" - "movq (%%eax, %1, 2), %%mm3 \n\t" + "movq (%%"REG_a", %1), %%mm1 \n\t" + "movq (%%"REG_a", %1, 2), %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "punpcklbw %%mm3, %%mm1 \n\t" "punpckhbw %%mm3, %%mm4 \n\t" @@ -2116,8 +2147,8 @@ static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src "movd %%mm1, 116(%3) \n\t" - :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) - : "%eax" + :: "r" (src), "r" ((long)srcStride), "r" (dst1), "r" (dst2) + : "%"REG_a ); } @@ -2127,8 +2158,8 @@ static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) { asm( - "leal (%0, %1), %%eax \n\t" - "leal (%%eax, %1, 4), %%edx \n\t" + "lea (%0, %1), %%"REG_a" \n\t" + "lea (%%"REG_a",%1,4), %%"REG_d"\n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 "movq (%2), %%mm0 \n\t" // 12345678 @@ -2152,16 +2183,16 @@ static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) "movd %%mm0, (%0) \n\t" "psrlq $32, %%mm0 \n\t" - "movd %%mm0, (%%eax) \n\t" - "movd %%mm3, (%%eax, %1) \n\t" + "movd %%mm0, (%%"REG_a") \n\t" + "movd %%mm3, (%%"REG_a", %1) \n\t" "psrlq $32, %%mm3 \n\t" - "movd %%mm3, (%%eax, %1, 2) \n\t" + "movd %%mm3, (%%"REG_a", %1, 2) \n\t" "movd %%mm2, (%0, %1, 4) \n\t" "psrlq $32, %%mm2 \n\t" - "movd %%mm2, (%%edx) \n\t" - "movd %%mm1, (%%edx, %1) \n\t" + "movd %%mm2, (%%"REG_d") \n\t" + "movd %%mm1, (%%"REG_d", %1) \n\t" "psrlq $32, %%mm1 \n\t" - "movd %%mm1, (%%edx, %1, 2) \n\t" + "movd %%mm1, (%%"REG_d", %1, 2) \n\t" "movq 64(%2), %%mm0 \n\t" // 12345678 @@ -2185,23 +2216,23 @@ static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) "movd %%mm0, 4(%0) \n\t" "psrlq $32, %%mm0 \n\t" - "movd %%mm0, 4(%%eax) \n\t" - "movd %%mm3, 4(%%eax, %1) \n\t" + "movd %%mm0, 4(%%"REG_a") \n\t" + "movd %%mm3, 4(%%"REG_a", %1) \n\t" "psrlq $32, %%mm3 \n\t" - "movd %%mm3, 4(%%eax, %1, 2) \n\t" + "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" "movd %%mm2, 4(%0, %1, 4) \n\t" "psrlq $32, %%mm2 \n\t" - "movd %%mm2, 4(%%edx) \n\t" - "movd %%mm1, 4(%%edx, %1) \n\t" + "movd %%mm2, 4(%%"REG_d") \n\t" + "movd %%mm1, 4(%%"REG_d", %1) \n\t" "psrlq $32, %%mm1 \n\t" - "movd %%mm1, 4(%%edx, %1, 2) \n\t" + "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" - :: "r" (dst), "r" (dstStride), "r" (src) - : "%eax", "%edx" + :: "r" (dst), "r" ((long)dstStride), "r" (src) + : "%"REG_a, "%"REG_d ); } #endif -//static int test=0; +//static long test=0; #ifndef HAVE_ALTIVEC static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, @@ -2216,9 +2247,9 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, //#define L1_DIFF //u should change the thresholds too if u try that one #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) asm volatile( - "leal (%2, %2, 2), %%eax \n\t" // 3*stride - "leal (%2, %2, 4), %%edx \n\t" // 5*stride - "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride + "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride + "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride + "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride // 0 1 2 3 4 5 6 7 8 9 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 //FIXME reorder? 
@@ -2229,29 +2260,30 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| "movq (%0, %2, 2), %%mm2 \n\t" // L2 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| - "movq (%0, %%eax), %%mm3 \n\t" // L3 - "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| + "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 + "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| "movq (%0, %2, 4), %%mm4 \n\t" // L4 "paddw %%mm1, %%mm0 \n\t" "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| - "movq (%0, %%edx), %%mm5 \n\t" // L5 + "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 "paddw %%mm2, %%mm0 \n\t" - "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| - "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 + "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| + "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 "paddw %%mm3, %%mm0 \n\t" - "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| - "movq (%0, %%ecx), %%mm7 \n\t" // L7 + "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| + "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 "paddw %%mm4, %%mm0 \n\t" - "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| + "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| "paddw %%mm5, %%mm6 \n\t" "paddw %%mm7, %%mm6 \n\t" "paddw %%mm6, %%mm0 \n\t" -#elif defined (FAST_L2_DIFF) +#else +#if defined (FAST_L2_DIFF) "pcmpeqb %%mm7, %%mm7 \n\t" "movq "MANGLE(b80)", %%mm6 \n\t" "pxor %%mm0, %%mm0 \n\t" -#define L2_DIFF_CORE(a, b)\ +#define REAL_L2_DIFF_CORE(a, b)\ "movq " #a ", %%mm5 \n\t"\ "movq " #b ", %%mm2 \n\t"\ "pxor %%mm7, %%mm2 \n\t"\ @@ -2265,19 +2297,10 @@ static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, "psrld $14, %%mm5 \n\t"\ "paddd %%mm5, %%mm0 \n\t" -L2_DIFF_CORE((%0), (%1)) -L2_DIFF_CORE((%0, %2), (%1, %2)) -L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) -L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) -L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) -L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) -L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) -L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) - #else "pxor %%mm7, %%mm7 \n\t" "pxor %%mm0, %%mm0 \n\t" -#define L2_DIFF_CORE(a, b)\ +#define REAL_L2_DIFF_CORE(a, b)\ "movq " #a ", %%mm5 \n\t"\ "movq " #b ", %%mm2 \n\t"\ "movq %%mm5, %%mm1 \n\t"\ @@ -2293,14 +2316,18 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) "paddd %%mm1, %%mm5 \n\t"\ "paddd %%mm5, %%mm0 \n\t" +#endif + +#define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) + L2_DIFF_CORE((%0), (%1)) L2_DIFF_CORE((%0, %2), (%1, %2)) L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) -L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) +L2_DIFF_CORE((%0, %%REGa), (%1, %%REGa)) L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) -L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) -L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) -L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) +L2_DIFF_CORE((%0, %%REGd), (%1, %%REGd)) +L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) +L2_DIFF_CORE((%0, %%REGc), (%1, %%REGc)) #endif @@ -2309,94 +2336,94 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) "paddd %%mm0, %%mm4 \n\t" "movd %%mm4, %%ecx \n\t" "shll $2, %%ecx \n\t" - "movl %3, %%edx \n\t" - "addl -4(%%edx), %%ecx \n\t" - "addl 4(%%edx), %%ecx \n\t" - "addl -1024(%%edx), %%ecx \n\t" + "mov %3, %%"REG_d" \n\t" + "addl -4(%%"REG_d"), %%ecx \n\t" + "addl 4(%%"REG_d"), %%ecx \n\t" + "addl -1024(%%"REG_d"), %%ecx \n\t" "addl $4, %%ecx \n\t" - "addl 1024(%%edx), %%ecx \n\t" + "addl 1024(%%"REG_d"), %%ecx \n\t" "shrl $3, %%ecx \n\t" - "movl %%ecx, (%%edx) \n\t" + "movl %%ecx, (%%"REG_d") \n\t" -// "movl %3, %%ecx \n\t" -// "movl %%ecx, test \n\t" +// "mov %3, %%"REG_c" \n\t" +// "mov %%"REG_c", test \n\t" // "jmp 4f \n\t" - "cmpl 512(%%edx), %%ecx \n\t" + "cmpl 
512(%%"REG_d"), %%ecx \n\t" " jb 2f \n\t" - "cmpl 516(%%edx), %%ecx \n\t" + "cmpl 516(%%"REG_d"), %%ecx \n\t" " jb 1f \n\t" - "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride - "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride + "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride + "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride "movq (%0), %%mm0 \n\t" // L0 "movq (%0, %2), %%mm1 \n\t" // L1 "movq (%0, %2, 2), %%mm2 \n\t" // L2 - "movq (%0, %%eax), %%mm3 \n\t" // L3 + "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 "movq (%0, %2, 4), %%mm4 \n\t" // L4 - "movq (%0, %%edx), %%mm5 \n\t" // L5 - "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 - "movq (%0, %%ecx), %%mm7 \n\t" // L7 + "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 + "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 + "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 "movq %%mm0, (%1) \n\t" // L0 "movq %%mm1, (%1, %2) \n\t" // L1 "movq %%mm2, (%1, %2, 2) \n\t" // L2 - "movq %%mm3, (%1, %%eax) \n\t" // L3 + "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 "movq %%mm4, (%1, %2, 4) \n\t" // L4 - "movq %%mm5, (%1, %%edx) \n\t" // L5 - "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 - "movq %%mm7, (%1, %%ecx) \n\t" // L7 + "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 + "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 + "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 "jmp 4f \n\t" "1: \n\t" - "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride - "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride + "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride + "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride "movq (%0), %%mm0 \n\t" // L0 PAVGB((%1), %%mm0) // L0 "movq (%0, %2), %%mm1 \n\t" // L1 PAVGB((%1, %2), %%mm1) // L1 "movq (%0, %2, 2), %%mm2 \n\t" // L2 PAVGB((%1, %2, 2), %%mm2) // L2 - "movq (%0, %%eax), %%mm3 \n\t" // L3 - PAVGB((%1, %%eax), %%mm3) // L3 + "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 + PAVGB((%1, %%REGa), %%mm3) // L3 "movq (%0, %2, 4), %%mm4 \n\t" // L4 PAVGB((%1, %2, 4), %%mm4) // L4 - "movq (%0, %%edx), %%mm5 \n\t" // L5 - PAVGB((%1, %%edx), %%mm5) // L5 - "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 - PAVGB((%1, %%eax, 2), %%mm6) // L6 - "movq (%0, %%ecx), %%mm7 \n\t" // L7 - PAVGB((%1, %%ecx), %%mm7) // L7 + "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 + PAVGB((%1, %%REGd), %%mm5) // L5 + "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 + PAVGB((%1, %%REGa, 2), %%mm6) // L6 + "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 + PAVGB((%1, %%REGc), %%mm7) // L7 "movq %%mm0, (%1) \n\t" // R0 "movq %%mm1, (%1, %2) \n\t" // R1 "movq %%mm2, (%1, %2, 2) \n\t" // R2 - "movq %%mm3, (%1, %%eax) \n\t" // R3 + "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 "movq %%mm4, (%1, %2, 4) \n\t" // R4 - "movq %%mm5, (%1, %%edx) \n\t" // R5 - "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 - "movq %%mm7, (%1, %%ecx) \n\t" // R7 + "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 + "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 + "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 "movq %%mm0, (%0) \n\t" // L0 "movq %%mm1, (%0, %2) \n\t" // L1 "movq %%mm2, (%0, %2, 2) \n\t" // L2 - "movq %%mm3, (%0, %%eax) \n\t" // L3 + "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 "movq %%mm4, (%0, %2, 4) \n\t" // L4 - "movq %%mm5, (%0, %%edx) \n\t" // L5 - "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 - "movq %%mm7, (%0, %%ecx) \n\t" // L7 + "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 + "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 + "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 "jmp 4f \n\t" "2: \n\t" - "cmpl 508(%%edx), %%ecx \n\t" + "cmpl 508(%%"REG_d"), %%ecx \n\t" " jb 3f \n\t" - "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride - "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride + "lea (%%"REG_a", %2, 2), 
%%"REG_d" \n\t" // 5*stride + "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride "movq (%0), %%mm0 \n\t" // L0 "movq (%0, %2), %%mm1 \n\t" // L1 "movq (%0, %2, 2), %%mm2 \n\t" // L2 - "movq (%0, %%eax), %%mm3 \n\t" // L3 + "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 "movq (%1), %%mm4 \n\t" // R0 "movq (%1, %2), %%mm5 \n\t" // R1 "movq (%1, %2, 2), %%mm6 \n\t" // R2 - "movq (%1, %%eax), %%mm7 \n\t" // R3 + "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 PAVGB(%%mm4, %%mm0) PAVGB(%%mm5, %%mm1) PAVGB(%%mm6, %%mm2) @@ -2408,20 +2435,20 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) "movq %%mm0, (%1) \n\t" // R0 "movq %%mm1, (%1, %2) \n\t" // R1 "movq %%mm2, (%1, %2, 2) \n\t" // R2 - "movq %%mm3, (%1, %%eax) \n\t" // R3 + "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 "movq %%mm0, (%0) \n\t" // L0 "movq %%mm1, (%0, %2) \n\t" // L1 "movq %%mm2, (%0, %2, 2) \n\t" // L2 - "movq %%mm3, (%0, %%eax) \n\t" // L3 + "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 "movq (%0, %2, 4), %%mm0 \n\t" // L4 - "movq (%0, %%edx), %%mm1 \n\t" // L5 - "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 - "movq (%0, %%ecx), %%mm3 \n\t" // L7 + "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 + "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 + "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 "movq (%1, %2, 4), %%mm4 \n\t" // R4 - "movq (%1, %%edx), %%mm5 \n\t" // R5 - "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 - "movq (%1, %%ecx), %%mm7 \n\t" // R7 + "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 + "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 + "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 PAVGB(%%mm4, %%mm0) PAVGB(%%mm5, %%mm1) PAVGB(%%mm6, %%mm2) @@ -2431,26 +2458,26 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) PAVGB(%%mm6, %%mm2) PAVGB(%%mm7, %%mm3) "movq %%mm0, (%1, %2, 4) \n\t" // R4 - "movq %%mm1, (%1, %%edx) \n\t" // R5 - "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 - "movq %%mm3, (%1, %%ecx) \n\t" // R7 + "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 + "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 + "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 "movq %%mm0, (%0, %2, 4) \n\t" // L4 - "movq %%mm1, (%0, %%edx) \n\t" // L5 - "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 - "movq %%mm3, (%0, %%ecx) \n\t" // L7 + "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 + "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 + "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 "jmp 4f \n\t" "3: \n\t" - "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride - "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride + "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride + "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride "movq (%0), %%mm0 \n\t" // L0 "movq (%0, %2), %%mm1 \n\t" // L1 "movq (%0, %2, 2), %%mm2 \n\t" // L2 - "movq (%0, %%eax), %%mm3 \n\t" // L3 + "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 "movq (%1), %%mm4 \n\t" // R0 "movq (%1, %2), %%mm5 \n\t" // R1 "movq (%1, %2, 2), %%mm6 \n\t" // R2 - "movq (%1, %%eax), %%mm7 \n\t" // R3 + "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 PAVGB(%%mm4, %%mm0) PAVGB(%%mm5, %%mm1) PAVGB(%%mm6, %%mm2) @@ -2466,20 +2493,20 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) "movq %%mm0, (%1) \n\t" // R0 "movq %%mm1, (%1, %2) \n\t" // R1 "movq %%mm2, (%1, %2, 2) \n\t" // R2 - "movq %%mm3, (%1, %%eax) \n\t" // R3 + "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 "movq %%mm0, (%0) \n\t" // L0 "movq %%mm1, (%0, %2) \n\t" // L1 "movq %%mm2, (%0, %2, 2) \n\t" // L2 - "movq %%mm3, (%0, %%eax) \n\t" // L3 + "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 "movq (%0, %2, 4), %%mm0 \n\t" // L4 - "movq (%0, %%edx), %%mm1 \n\t" // L5 - "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 - "movq (%0, %%ecx), %%mm3 \n\t" // L7 + "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 + "movq (%0, 
%%"REG_a", 2), %%mm2 \n\t" // L6 + "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 "movq (%1, %2, 4), %%mm4 \n\t" // R4 - "movq (%1, %%edx), %%mm5 \n\t" // R5 - "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 - "movq (%1, %%ecx), %%mm7 \n\t" // R7 + "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 + "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 + "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 PAVGB(%%mm4, %%mm0) PAVGB(%%mm5, %%mm1) PAVGB(%%mm6, %%mm2) @@ -2493,18 +2520,18 @@ L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) PAVGB(%%mm6, %%mm2) PAVGB(%%mm7, %%mm3) "movq %%mm0, (%1, %2, 4) \n\t" // R4 - "movq %%mm1, (%1, %%edx) \n\t" // R5 - "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 - "movq %%mm3, (%1, %%ecx) \n\t" // R7 + "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 + "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 + "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 "movq %%mm0, (%0, %2, 4) \n\t" // L4 - "movq %%mm1, (%0, %%edx) \n\t" // L5 - "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 - "movq %%mm3, (%0, %%ecx) \n\t" // L7 + "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 + "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 + "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 "4: \n\t" - :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) - : "%eax", "%edx", "%ecx", "memory" + :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast) + : "%"REG_a, "%"REG_d, "%"REG_c, "memory" ); //printf("%d\n", test); #else @@ -2630,19 +2657,19 @@ asm volatile( ); asm volatile( - "leal (%2, %3), %%eax \n\t" + "lea (%2, %3), %%"REG_a" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 "movq (%2), %%mm0 \n\t" - "movq (%%eax), %%mm1 \n\t" + "movq (%%"REG_a"), %%mm1 \n\t" "movq %%mm1, %%mm3 \n\t" "movq %%mm1, %%mm4 \n\t" "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece "paddb %%mm7, %%mm0 \n\t" "pcmpgtb %%mm6, %%mm0 \n\t" - "movq (%%eax,%3), %%mm2 \n\t" + "movq (%%"REG_a",%3), %%mm2 \n\t" PMAXUB(%%mm2, %%mm4) PMINUB(%%mm2, %%mm3, %%mm5) "psubb %%mm2, %%mm1 \n\t" @@ -2650,7 +2677,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%eax, %3, 2), %%mm1 \n\t" + "movq (%%"REG_a", %3, 2), %%mm1 \n\t" PMAXUB(%%mm1, %%mm4) PMINUB(%%mm1, %%mm3, %%mm5) "psubb %%mm1, %%mm2 \n\t" @@ -2658,7 +2685,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm2 \n\t" "paddb %%mm2, %%mm0 \n\t" - "leal (%%eax, %3, 4), %%eax \n\t" + "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" "movq (%2, %3, 4), %%mm2 \n\t" PMAXUB(%%mm2, %%mm4) @@ -2668,7 +2695,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%eax), %%mm1 \n\t" + "movq (%%"REG_a"), %%mm1 \n\t" PMAXUB(%%mm1, %%mm4) PMINUB(%%mm1, %%mm3, %%mm5) "psubb %%mm1, %%mm2 \n\t" @@ -2676,7 +2703,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm2 \n\t" "paddb %%mm2, %%mm0 \n\t" - "movq (%%eax, %3), %%mm2 \n\t" + "movq (%%"REG_a", %3), %%mm2 \n\t" PMAXUB(%%mm2, %%mm4) PMINUB(%%mm2, %%mm3, %%mm5) "psubb %%mm2, %%mm1 \n\t" @@ -2684,7 +2711,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%eax, %3, 2), %%mm1 \n\t" + "movq (%%"REG_a", %3, 2), %%mm1 \n\t" PMAXUB(%%mm1, %%mm4) PMINUB(%%mm1, %%mm3, %%mm5) "psubb %%mm1, %%mm2 \n\t" @@ -2700,7 +2727,7 @@ asm volatile( "pcmpgtb %%mm6, %%mm1 \n\t" "paddb %%mm1, %%mm0 \n\t" - "movq (%%eax, %3, 4), %%mm1 \n\t" + "movq (%%"REG_a", %3, 4), %%mm1 \n\t" "psubb %%mm1, %%mm2 \n\t" "paddb %%mm7, %%mm2 \n\t" "pcmpgtb %%mm6, %%mm2 \n\t" @@ -2724,12 +2751,12 @@ asm volatile( "movq %%mm6, %0 \n\t" : "=m" (eq_mask), "=m" (dc_mask) - : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) - : "%eax" + : "r" 
(src), "r" ((long)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) + : "%"REG_a ); if(dc_mask & eq_mask){ - int offset= -8*step; + long offset= -8*step; int64_t *temp_sums= sums; asm volatile( @@ -2752,7 +2779,7 @@ asm volatile( // 0:QP 6:First "movq (%0, %1, 8), %%mm5 \n\t" - "addl %1, %0 \n\t" // %0 points to line 1 not 0 + "add %1, %0 \n\t" // %0 points to line 1 not 0 "movq (%0, %1, 8), %%mm7 \n\t" "movq %%mm5, %%mm1 \n\t" "movq %%mm7, %%mm2 \n\t" @@ -2782,7 +2809,7 @@ asm volatile( #define NEXT\ "movq (%0), %%mm2 \n\t"\ "movq (%0), %%mm3 \n\t"\ - "addl %1, %0 \n\t"\ + "add %1, %0 \n\t"\ "punpcklbw %%mm4, %%mm2 \n\t"\ "punpckhbw %%mm4, %%mm3 \n\t"\ "paddw %%mm2, %%mm0 \n\t"\ @@ -2791,7 +2818,7 @@ asm volatile( #define PREV\ "movq (%0), %%mm2 \n\t"\ "movq (%0), %%mm3 \n\t"\ - "addl %1, %0 \n\t"\ + "add %1, %0 \n\t"\ "punpcklbw %%mm4, %%mm2 \n\t"\ "punpckhbw %%mm4, %%mm3 \n\t"\ "psubw %%mm2, %%mm0 \n\t"\ @@ -2833,8 +2860,8 @@ asm volatile( "punpcklbw %%mm4, %%mm6 \n\t" NEXT //7 - "movl %4, %0 \n\t" - "addl %1, %0 \n\t" + "mov %4, %0 \n\t" + "add %1, %0 \n\t" PREV //0 "movq %%mm0, 80(%3) \n\t" "movq %%mm1, 88(%3) \n\t" @@ -2863,10 +2890,10 @@ asm volatile( "movq %%mm0, 144(%3) \n\t" "movq %%mm1, 152(%3) \n\t" - "movl %4, %0 \n\t" //FIXME + "mov %4, %0 \n\t" //FIXME : "+&r"(src) - : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src) + : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src) ); src+= step; // src points to begin of the 8x8 Block @@ -2898,12 +2925,12 @@ asm volatile( "pand %%mm5, %%mm4 \n\t" "por %%mm4, %%mm0 \n\t" "movq %%mm0, (%0, %3) \n\t" - "addl $16, %1 \n\t" - "addl %2, %0 \n\t" + "add $16, %1 \n\t" + "add %2, %0 \n\t" " js 1b \n\t" : "+r"(offset), "+r"(temp_sums) - : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask) + : "r" ((long)step), "r"(src - offset), "m"(dc_mask & eq_mask) ); }else src+= step; // src points to begin of the 8x8 Block @@ -2912,8 +2939,8 @@ asm volatile( uint8_t *temp_src= src; asm volatile( "pxor %%mm7, %%mm7 \n\t" - "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars - "andl $0xFFFFFFF8, %%ecx \n\t" // align + "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars + "and "ALIGN_MASK", %%"REG_c" \n\t" // align // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 @@ -2923,12 +2950,12 @@ asm volatile( "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 "movq (%0, %1), %%mm2 \n\t" - "leal (%0, %1, 2), %%eax \n\t" + "lea (%0, %1, 2), %%"REG_a" \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 - "movq (%%eax), %%mm4 \n\t" + "movq (%%"REG_a"), %%mm4 \n\t" "movq %%mm4, %%mm5 \n\t" "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 @@ -2945,7 +2972,7 @@ asm volatile( "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - "movq (%%eax, %1), %%mm2 \n\t" + "movq (%%"REG_a", %1), %%mm2 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" // L3 "punpckhbw %%mm7, %%mm3 \n\t" // H3 @@ -2954,24 +2981,24 @@ asm volatile( "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 - "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 - "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 + "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 + "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 - "movq (%%eax, %1, 2), 
%%mm0 \n\t" + "movq (%%"REG_a", %1, 2), %%mm0 \n\t" "movq %%mm0, %%mm1 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" // L4 "punpckhbw %%mm7, %%mm1 \n\t" // H4 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 - "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 - "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 + "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 + "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 "paddw %%mm4, %%mm4 \n\t" // 2L2 "paddw %%mm5, %%mm5 \n\t" // 2H2 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 - "leal (%%eax, %1), %0 \n\t" + "lea (%%"REG_a", %1), %0 \n\t" "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 @@ -2986,10 +3013,10 @@ asm volatile( "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 - "movq (%%eax, %1, 4), %%mm6 \n\t" + "movq (%%"REG_a", %1, 4), %%mm6 \n\t" "punpcklbw %%mm7, %%mm6 \n\t" // L6 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 - "movq (%%eax, %1, 4), %%mm6 \n\t" + "movq (%%"REG_a", %1, 4), %%mm6 \n\t" "punpckhbw %%mm7, %%mm6 \n\t" // H6 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 @@ -3013,8 +3040,8 @@ asm volatile( "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 - "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 - "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 + "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 + "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 #ifdef HAVE_MMX2 "movq %%mm7, %%mm6 \n\t" // 0 @@ -3092,8 +3119,8 @@ asm volatile( "psrlw $6, %%mm4 \n\t" "psrlw $6, %%mm5 \n\t" - "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 - "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 + "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 + "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 "pxor %%mm2, %%mm2 \n\t" "pxor %%mm3, %%mm3 \n\t" @@ -3138,8 +3165,8 @@ asm volatile( "movq %%mm0, (%0, %1) \n\t" : "+r" (temp_src) - : "r" (step), "m" (c->pQPb), "m"(eq_mask) - : "%eax", "%ecx" + : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask) + : "%"REG_a, "%"REG_c ); } /*if(step==16){ @@ -3169,13 +3196,13 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[] { #ifdef HAVE_MMX asm volatile( - "movq (%%eax), %%mm2 \n\t" // packedYOffset - "movq 8(%%eax), %%mm3 \n\t" // packedYScale - "leal (%2,%4), %%eax \n\t" - "leal (%3,%5), %%edx \n\t" + "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset + "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale + "lea (%2,%4), %%"REG_a" \n\t" + "lea (%3,%5), %%"REG_d" \n\t" "pxor %%mm4, %%mm4 \n\t" #ifdef HAVE_MMX2 -#define SCALED_CPY(src1, src2, dst1, dst2) \ +#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ "movq " #src1 ", %%mm0 \n\t"\ "movq " #src1 ", %%mm5 \n\t"\ "movq " #src2 ", %%mm1 \n\t"\ @@ -3198,7 +3225,7 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[] "movq %%mm1, " #dst2 " \n\t"\ #else //HAVE_MMX2 -#define SCALED_CPY(src1, src2, dst1, dst2) \ +#define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ "movq " #src1 ", %%mm0 \n\t"\ "movq " #src1 ", %%mm5 \n\t"\ "punpcklbw %%mm4, %%mm0 \n\t"\ @@ -3225,22 +3252,24 @@ static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[] "movq %%mm1, " #dst2 " \n\t"\ #endif //!HAVE_MMX2 +#define SCALED_CPY(src1, src2, dst1, dst2)\ + REAL_SCALED_CPY(src1, src2, dst1, dst2) SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) -SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) -SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 
4), (%%edx, %5, 4)) - "leal (%%eax,%4,4), %%eax \n\t" - "leal (%%edx,%5,4), %%edx \n\t" -SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) +SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) +SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) + "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" + "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" +SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) : "=&a" (packedOffsetAndScale) : "0" (packedOffsetAndScale), "r"(src), "r"(dst), - "r" (srcStride), - "r" (dstStride) - : "%edx" + "r" ((long)srcStride), + "r" ((long)dstStride) + : "%"REG_d ); #else for(i=0; i<8; i++) @@ -3252,27 +3281,30 @@ SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) { #ifdef HAVE_MMX asm volatile( - "leal (%0,%2), %%eax \n\t" - "leal (%1,%3), %%edx \n\t" + "lea (%0,%2), %%"REG_a" \n\t" + "lea (%1,%3), %%"REG_d" \n\t" -#define SIMPLE_CPY(src1, src2, dst1, dst2) \ +#define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ "movq " #src1 ", %%mm0 \n\t"\ "movq " #src2 ", %%mm1 \n\t"\ "movq %%mm0, " #dst1 " \n\t"\ "movq %%mm1, " #dst2 " \n\t"\ +#define SIMPLE_CPY(src1, src2, dst1, dst2)\ + REAL_SIMPLE_CPY(src1, src2, dst1, dst2) + SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) -SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) -SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) - "leal (%%eax,%2,4), %%eax \n\t" - "leal (%%edx,%3,4), %%edx \n\t" -SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) +SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) +SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) + "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" + "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" +SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) : : "r" (src), "r" (dst), - "r" (srcStride), - "r" (dstStride) - : "%eax", "%edx" + "r" ((long)srcStride), + "r" ((long)dstStride) + : "%"REG_a, "%"REG_d ); #else for(i=0; i<8; i++) @@ -3290,12 +3322,12 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride) #ifdef HAVE_MMX asm volatile( "movq (%0), %%mm0 \n\t" - "addl %1, %0 \n\t" + "add %1, %0 \n\t" "movq %%mm0, (%0) \n\t" "movq %%mm0, (%0, %1) \n\t" "movq %%mm0, (%0, %1, 2) \n\t" : "+r" (src) - : "r" (-stride) + : "r" ((long)-stride) ); #else int i; @@ -3447,22 +3479,22 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int */ asm( - "movl %4, %%eax \n\t" - "shrl $2, %%eax \n\t" - "andl $6, %%eax \n\t" - "addl %5, %%eax \n\t" - "movl %%eax, %%edx \n\t" - "imul %1, %%eax \n\t" - "imul %3, %%edx \n\t" - "prefetchnta 32(%%eax, %0) \n\t" - "prefetcht0 32(%%edx, %2) \n\t" - "addl %1, %%eax \n\t" - "addl %3, %%edx \n\t" - "prefetchnta 32(%%eax, %0) \n\t" - "prefetcht0 32(%%edx, %2) \n\t" - :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), - "m" (x), "m" (copyAhead) - : "%eax", "%edx" + "mov %4, %%"REG_a" \n\t" + "shr $2, %%"REG_a" \n\t" + "and $6, %%"REG_a" \n\t" + "add %5, %%"REG_a" \n\t" + "mov %%"REG_a", %%"REG_d" \n\t" + "imul %1, %%"REG_a" \n\t" + "imul %3, %%"REG_d" \n\t" + "prefetchnta 32(%%"REG_a", %0) \n\t" + "prefetcht0 32(%%"REG_d", %2) \n\t" + "add %1, %%"REG_a" \n\t" + "add %3, %%"REG_d" \n\t" + "prefetchnta 32(%%"REG_a", %0) \n\t" + "prefetcht0 32(%%"REG_d", %2) \n\t" + :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), + "m" ((long)x), "m" ((long)copyAhead) + : "%"REG_a, "%"REG_d ); #elif defined(HAVE_3DNOW) @@ -3593,22 
+3625,22 @@ static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int */ asm( - "movl %4, %%eax \n\t" - "shrl $2, %%eax \n\t" - "andl $6, %%eax \n\t" - "addl %5, %%eax \n\t" - "movl %%eax, %%edx \n\t" - "imul %1, %%eax \n\t" - "imul %3, %%edx \n\t" - "prefetchnta 32(%%eax, %0) \n\t" - "prefetcht0 32(%%edx, %2) \n\t" - "addl %1, %%eax \n\t" - "addl %3, %%edx \n\t" - "prefetchnta 32(%%eax, %0) \n\t" - "prefetcht0 32(%%edx, %2) \n\t" - :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), - "m" (x), "m" (copyAhead) - : "%eax", "%edx" + "mov %4, %%"REG_a" \n\t" + "shr $2, %%"REG_a" \n\t" + "and $6, %%"REG_a" \n\t" + "add %5, %%"REG_a" \n\t" + "mov %%"REG_a", %%"REG_d" \n\t" + "imul %1, %%"REG_a" \n\t" + "imul %3, %%"REG_d" \n\t" + "prefetchnta 32(%%"REG_a", %0) \n\t" + "prefetcht0 32(%%"REG_d", %2) \n\t" + "add %1, %%"REG_a" \n\t" + "add %3, %%"REG_d" \n\t" + "prefetchnta 32(%%"REG_a", %0) \n\t" + "prefetcht0 32(%%"REG_d", %2) \n\t" + :: "r" (srcBlock), "r" ((long)srcStride), "r" (dstBlock), "r" ((long)dstStride), + "m" ((long)x), "m" ((long)copyAhead) + : "%"REG_a, "%"REG_d ); #elif defined(HAVE_3DNOW) diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c index 7df276ca70..1941991ed7 100644 --- a/libavcodec/msmpeg4.c +++ b/libavcodec/msmpeg4.c @@ -716,7 +716,7 @@ static inline int msmpeg4_pred_dc(MpegEncContext * s, int n, necessitate to modify mpegvideo.c. The problem comes from the fact they decided to store the quantized DC (which would lead to problems if Q could vary !) */ -#if defined ARCH_X86 && !defined PIC +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && !defined PIC asm volatile( "movl %3, %%eax \n\t" "shrl $1, %%eax \n\t" -- cgit v1.2.3
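
The pattern applied throughout the hunks above can be condensed into a small standalone sketch. Nothing below is part of the commit: REG_a, copy_two_rows, and the __x86_64__ fallback are illustrative assumptions that mirror the names used in the diff. The idea is that a register-name macro selects the 32- or 64-bit scratch register, the suffix-less lea/mov/add mnemonics let the assembler infer the operand size from that register, and pointer-sized operands such as strides are passed as long so base+index addressing stays legal on both targets.

#include <stdint.h>

#if defined(ARCH_X86_64) || defined(__x86_64__)
#  define REG_a "rax"
#else
#  define REG_a "eax"
#endif

/* Copy two 8-byte rows of a block, keeping src+stride in REG_a.
 * The (long) cast on stride is what makes the addressing below work on
 * x86_64: without it gcc would hand the asm a 32-bit register and
 * "lea (%1, %2)" would mix operand sizes. */
static void copy_two_rows(uint8_t *dst, const uint8_t *src, int stride)
{
    __asm__ volatile(
        "lea (%1, %2), %%"REG_a"        \n\t" /* REG_a = src + stride */
        "movq (%1), %%mm0               \n\t" /* row 0                */
        "movq (%%"REG_a"), %%mm1        \n\t" /* row 1                */
        "movq %%mm0, (%0)               \n\t"
        "movq %%mm1, (%0, %2)           \n\t"
        "emms                           \n\t"
        :: "r" (dst), "r" (src), "r" ((long)stride)
        : "%"REG_a, "memory"
    );
}

On a 32-bit build REG_a expands to "eax" and the identical source assembles unchanged, which is why the diff can drop the explicit leal/addl/movl suffixes wherever a potentially 64-bit pointer register is involved.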