author     Michael Niedermayer <michaelni@gmx.at>    2012-02-01 02:08:23 +0100
committer  Michael Niedermayer <michaelni@gmx.at>    2012-02-01 02:36:09 +0100
commit     a369a6b85819890b21a87af3ce983ce533b7169b (patch)
tree       838f9821dc09bd99b59ce4a2d8123d5fd6868b91 /libavcodec/x86
parent     0a3a69e8d77146b53a1112c715a78e7d293883b1 (diff)
parent     52afc9716849e6fb6c2420674d790d374061c663 (diff)
Merge remote-tracking branch 'qatar/master'
* qatar/master: (29 commits)
  fate: add golomb-test
  golomb-test: K&R formatting cosmetics
  h264: Split h264-test off into a separate file - golomb-test.c.
  h264-test: cleanup: drop timer invocations, commented out code and other cruft
  h264-test: Remove unused DSP and AVCodec contexts and related init calls.
  adpcm: Add missing stdint.h #include to fix standalone header compilation.
  lavf: add functions for accessing the fourcc<->CodecID mapping tables.
  lavc: set AVCodecContext.codec in avcodec_get_context_defaults3().
  lavc: make avcodec_close() work properly on unopened codecs.
  lavc: add avcodec_is_open().
  lavf: rename AVInputFormat.value to raw_codec_id.
  lavf: remove the pointless value field from flv and iv8
  lavc/lavf: remove unnecessary symbols from the symbol version script.
  lavc: reorder AVCodec fields.
  lavf: reorder AVInput/OutputFormat fields.
  mp3dec: Fix a heap-buffer-overflow
  adpcmenc: remove some unneeded casts
  adpcmenc: use int16_t and uint8_t instead of short and unsigned char.
  adpcmenc: fix adpcm_ms extradata allocation
  adpcmenc: return proper AVERROR codes instead of -1
  ...

Conflicts:
  doc/APIchanges
  libavcodec/Makefile
  libavcodec/adpcmenc.c
  libavcodec/avcodec.h
  libavcodec/h264.c
  libavcodec/libavcodec.v
  libavcodec/mpc7.c
  libavcodec/mpegaudiodec.c
  libavcodec/options.c
  libavformat/Makefile
  libavformat/avformat.h
  libavformat/flvdec.c
  libavformat/libavformat.v

Merged-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--  libavcodec/x86/Makefile                                                  3
-rw-r--r--  libavcodec/x86/fmtconvert_mmx.c                                          2
-rw-r--r--  libavcodec/x86/rv40dsp.asm                                             207
-rw-r--r--  libavcodec/x86/rv40dsp_init.c (renamed from libavcodec/x86/rv40dsp.c)  23
4 files changed, 231 insertions, 4 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 282bc916bd..3b8ee56a49 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -29,8 +29,9 @@ MMX-OBJS-$(CONFIG_H264PRED)            += x86/h264_intrapred_init.o
 MMX-OBJS-$(CONFIG_RV30_DECODER)        += x86/rv34dsp_init.o
 YASM-OBJS-$(CONFIG_RV30_DECODER)       += x86/rv34dsp.o
 MMX-OBJS-$(CONFIG_RV40_DECODER)        += x86/rv34dsp_init.o \
+                                          x86/rv40dsp_init.o
+YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o \
                                           x86/rv40dsp.o
-YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp_yasm.o
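A note on the build wiring: by this Makefile's convention, C initialization files are listed under MMX-OBJS while yasm-assembled .asm files go under YASM-OBJS. Renaming rv40dsp.c to rv40dsp_init.c frees the x86/rv40dsp.o object name for the new assembly file, which is why both lists gain an entry here.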
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index a3d8f89816..ca0b29344a 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -110,9 +110,9 @@ static void float_interleave_sse(float *dst, const float **src,
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
+#if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
-#if HAVE_YASM
     if (mm_flags & AV_CPU_FLAG_MMX) {
         c->float_interleave = float_interleave_mmx;
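This hunk, like the matching one in rv40dsp_init.c below, moves the av_get_cpu_flags() call inside the #if HAVE_YASM guard so that mm_flags only exists when the yasm-built functions it gates are compiled in; otherwise a yasm-less build would warn about an unused variable (which rv40dsp.c previously silenced with av_unused). A minimal sketch of the resulting shape, trimmed to the first branch of the real function:

    void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
    {
    #if HAVE_YASM
        int mm_flags = av_get_cpu_flags();  /* queried only when asm is built */

        if (mm_flags & AV_CPU_FLAG_MMX)
            c->float_interleave = float_interleave_mmx;
    #endif
    }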
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
new file mode 100644
index 0000000000..bff3e7b96a
--- /dev/null
+++ b/libavcodec/x86/rv40dsp.asm
@@ -0,0 +1,207 @@
+;******************************************************************************
+;* MMX/SSE2-optimized functions for the RV40 decoder
+;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+
+align 16
+shift_round: times 8 dw 1 << (16 - 6)
+cextern pw_16
+
+SECTION .text
+
+; %1=1 if 5-bit weights, 0 if full FP0.14 weights; %2=dst %3=src1 %4=src2 %5=stride if sse2
+%macro RV40_WCORE 4-5
+ movh m4, [%3 + 0]
+ movh m5, [%4 + 0]
+%if %0 == 4
+%define OFFSET mmsize / 2
+%else
+ ; 8x8 block and sse2, stride was provided
+%define OFFSET %5
+%endif
+ movh m6, [%3 + OFFSET]
+ movh m7, [%4 + OFFSET]
+
+%if %1 == 0
+ ; 14bits weights
+ punpcklbw m4, m0
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ punpcklbw m7, m0
+
+ psllw m4, 7
+ psllw m5, 7
+ psllw m6, 7
+ psllw m7, 7
+ pmulhw m4, m3
+ pmulhw m5, m2
+ pmulhw m6, m3
+ pmulhw m7, m2
+
+ paddw m4, m5
+ paddw m6, m7
+%else
+ ; 5bits weights
+%if cpuflag(ssse3)
+ punpcklbw m4, m5
+ punpcklbw m6, m7
+
+ pmaddubsw m4, m3
+ pmaddubsw m6, m3
+%else
+ punpcklbw m4, m0
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ punpcklbw m7, m0
+
+ pmullw m4, m3
+ pmullw m5, m2
+ pmullw m6, m3
+ pmullw m7, m2
+ paddw m4, m5
+ paddw m6, m7
+%endif
+
+%endif
+
+ ; bias and shift down
+%if cpuflag(ssse3)
+ pmulhrsw m4, m1
+ pmulhrsw m6, m1
+%else
+ paddw m4, m1
+ paddw m6, m1
+ psrlw m4, 5
+ psrlw m6, 5
+%endif
+
+ packuswb m4, m6
+%if %0 == 5
+ ; Only called for 8x8 blocks and sse2
+ movh [%2 + 0], m4
+ movhps [%2 + %5], m4
+%else
+ mova [%2], m4
+%endif
+%endmacro
+
+
+%macro MAIN_LOOP 2
+%if mmsize == 8
+ RV40_WCORE %2, r0, r1, r2
+%if %1 == 16
+ RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
+%endif
+
+ ; Prepare for next loop
+ add r0, r5
+ add r1, r5
+ add r2, r5
+%else
+%ifidn %1, 8
+ RV40_WCORE %2, r0, r1, r2, r5
+ ; Advance to the next two lines
+ lea r0, [r0 + 2 * r5]
+ lea r1, [r1 + 2 * r5]
+ lea r2, [r2 + 2 * r5]
+%else
+ RV40_WCORE %2, r0, r1, r2
+ ; Advance to the next line
+ add r0, r5
+ add r1, r5
+ add r2, r5
+%endif
+%endif
+
+ dec r6
+%endmacro
+
+; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
+; %1=size %2=num of xmm regs
+%macro RV40_WEIGHT 2
+cglobal rv40_weight_func_%1, 6, 7, %2
+%if cpuflag(ssse3)
+ mova m1, [shift_round]
+%else
+ mova m1, [pw_16]
+%endif
+ pxor m0, m0
+ mov r6, r3
+ or r6, r4
+ ; The weights are fractions in FP0.14 notation, derived from the pts.
+ ; For timebases without rounding error (e.g. PAL), the fractions
+ ; can be simplified, and several operations can be avoided.
+ ; Therefore, check here whether both weights are multiples of 2^9,
+ ; in which case those simplifications apply.
+ and r6, 0x1FF
+ ; Set the loop counter
+%if mmsize == 8
+ mov r6, %1
+%else
+ mov r6, (%1 * %1) / mmsize
+%endif
+
+ ; Use result of test now
+ jz .loop_512
+ movd m2, r3
+ movd m3, r4
+ SPLATW m2, m2
+ SPLATW m3, m3
+
+.loop:
+ MAIN_LOOP %1, 0
+ jnz .loop
+ REP_RET
+
+ ; Weights are multiples of 512, which allows some shortcuts
+.loop_512:
+ sar r3, 9
+ sar r4, 9
+ movd m2, r3
+ movd m3, r4
+%if cpuflag(ssse3)
+ punpcklbw m3, m2
+ SPLATW m3, m3
+%else
+ SPLATW m2, m2
+ SPLATW m3, m3
+%endif
+.loop2:
+ MAIN_LOOP %1, 1
+ jnz .loop2
+ REP_RET
+
+%endmacro
+
+INIT_MMX mmx
+RV40_WEIGHT 8, 0
+RV40_WEIGHT 16, 0
+
+INIT_XMM sse2
+RV40_WEIGHT 8, 8
+RV40_WEIGHT 16, 8
+
+INIT_XMM ssse3
+RV40_WEIGHT 8, 8
+RV40_WEIGHT 16, 8
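In C terms, each rv40_weight_func computes a per-pixel weighted average of two prediction blocks. A rough reference model follows (the function name is hypothetical; the behavior is read off the macros above and matches the generic C weight function these routines replace). Note the weights are applied crosswise, w2 to src1 and w1 to src2, exactly as the asm pairs m3/m2 with src1/src2:

    #include <stdint.h>

    static void rv40_weight_ref(uint8_t *dst, const uint8_t *src1,
                                const uint8_t *src2, int w1, int w2,
                                int size, int stride)
    {
        for (int j = 0; j < size; j++) {
            for (int i = 0; i < size; i++)
                /* FP0.14 weights: the asm computes (w * x) >> 9 via
                 * psllw 7 + pmulhw, then rounds with (+ 16) >> 5. */
                dst[i] = (((w2 * src1[i]) >> 9) + ((w1 * src2[i]) >> 9) + 0x10) >> 5;
            dst  += stride;
            src1 += stride;
            src2 += stride;
        }
    }

When both weights are multiples of 512, (w * x) >> 9 equals (w >> 9) * x with no truncation, so the .loop_512 path pre-shifts the weights into 5 bits and multiplies directly with pmullw; on SSSE3 a single pmaddubsw handles both products at once, and pmulhrsw against shift_round (1 << 10) reproduces the same (+ 16) >> 5 rounding.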
diff --git a/libavcodec/x86/rv40dsp.c b/libavcodec/x86/rv40dsp_init.c
index 9f90ad8bb6..3d6c6f0fa0 100644
--- a/libavcodec/x86/rv40dsp.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -40,14 +40,25 @@ void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
 void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
 
+#define DECLARE_WEIGHT(opt) \
+void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                  int w1, int w2, int stride); \
+void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
+                                  int w1, int w2, int stride);
+DECLARE_WEIGHT(mmx)
+DECLARE_WEIGHT(sse2)
+DECLARE_WEIGHT(ssse3)
+
 void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
 {
-    av_unused int mm_flags = av_get_cpu_flags();
-
 #if HAVE_YASM
+    int mm_flags = av_get_cpu_flags();
+
     if (mm_flags & AV_CPU_FLAG_MMX) {
         c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
         c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
+        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx;
+        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx;
     }
     if (mm_flags & AV_CPU_FLAG_MMX2) {
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
@@ -56,5 +67,13 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
         c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE2) {
+        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2;
+        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2;
+    }
+    if (mm_flags & AV_CPU_FLAG_SSSE3) {
+        c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3;
+        c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3;
+    }
 #endif
 }
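Since the CPU-flag checks run from weakest to strongest extension and each later block overwrites the same rv40_weight_pixels_tab slots, the table ends up pointing at the fastest variant the host CPU supports: SSSE3 if available, otherwise SSE2, otherwise MMX.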