summary | refs | log | tree | commit | diff
path: root/libavfilter
diff options
context:
space:
mode:
author: Pascal Massimino <pascal.massimino@gmail.com> 2014-09-09 14:38:58 +0200
committer: Michael Niedermayer <michaelni@gmx.at> 2014-09-09 16:47:22 +0200
commit: e3fd6a3a4e3d28d8a50bb6ec3e19449bc4e0d3db (patch)
tree: 4706c3e78a5046f13e80142b1f12cbbc0756e2b6 /libavfilter
parent: 881f96c4c2ef0c0162f63a370cbfff3c1e1feb2a (diff)
av_filter/x86/idet: MMX/SSE2 implementation of 16bits filter_line()
Tested on http://ps-auxw.de/10bit-h264-sample/10bit-eldorado.mkv. MMX: ~30% faster decoding overall; SSE2: ~40% faster. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
Diffstat (limited to 'libavfilter')
-rw-r--r-- libavfilter/vf_idet.c 11
-rw-r--r-- libavfilter/vf_idet.h 7
-rw-r--r-- libavfilter/x86/vf_idet.asm 70
-rw-r--r-- libavfilter/x86/vf_idet_init.c 29
4 files changed, 103 insertions, 14 deletions
diff --git a/libavfilter/vf_idet.c b/libavfilter/vf_idet.c
index 4416228431..22ff494dfb 100644
--- a/libavfilter/vf_idet.c
+++ b/libavfilter/vf_idet.c
@@ -61,7 +61,7 @@ int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c,
return ret;
}
-static int filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
+int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w)
{
int x;
int ret=0;
@@ -169,8 +169,11 @@ static int filter_frame(AVFilterLink *link, AVFrame *picref)
if (!idet->csp)
idet->csp = av_pix_fmt_desc_get(link->format);
- if (idet->csp->comp[0].depth_minus1 / 8 == 1)
- idet->filter_line = (void*)filter_line_c_16bit;
+ if (idet->csp->comp[0].depth_minus1 / 8 == 1){
+ idet->filter_line = (ff_idet_filter_func)ff_idet_filter_line_c_16bit;
+ if (ARCH_X86)
+ ff_idet_init_x86(idet, 1);
+ }
filter(ctx);
@@ -245,7 +248,7 @@ static av_cold int init(AVFilterContext *ctx)
idet->filter_line = ff_idet_filter_line_c;
if (ARCH_X86)
- ff_idet_init_x86(idet);
+ ff_idet_init_x86(idet, 0);
return 0;
}
diff --git a/libavfilter/vf_idet.h b/libavfilter/vf_idet.h
index 05506901f2..c5799fb67d 100644
--- a/libavfilter/vf_idet.h
+++ b/libavfilter/vf_idet.h
@@ -24,6 +24,8 @@
#define HIST_SIZE 4
+typedef int (*ff_idet_filter_func)(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
+
typedef enum {
TFF,
BFF,
@@ -45,14 +47,15 @@ typedef struct {
AVFrame *cur;
AVFrame *next;
AVFrame *prev;
- int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w);
+ ff_idet_filter_func filter_line;
const AVPixFmtDescriptor *csp;
} IDETContext;
-void ff_idet_init_x86(IDETContext *idet);
+void ff_idet_init_x86(IDETContext *idet, int for_16b);
/* main fall-back for left-over */
int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w);
+int ff_idet_filter_line_c_16bit(const uint16_t *a, const uint16_t *b, const uint16_t *c, int w);
#endif
diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm
index 14b16c5779..4649cae030 100644
--- a/libavfilter/x86/vf_idet.asm
+++ b/libavfilter/x86/vf_idet.asm
@@ -25,8 +25,6 @@
SECTION_TEXT
-%if ARCH_X86_32
-
; Implementation that does 8-bytes at a time using single-word operations.
%macro IDET_FILTER_LINE 1
INIT_MMX %1
@@ -78,11 +76,79 @@ cglobal idet_filter_line, 4, 5, 0, a, b, c, width, index
RET
%endmacro
+%if ARCH_X86_32
IDET_FILTER_LINE mmxext
IDET_FILTER_LINE mmx
%endif
+;******************************************************************************
+; 16bit implementation that does 4/8-pixels at a time
+
+%macro PABS_DIFF_WD 3 ; a, b, junk , output=a: |a - b| per unsigned word, widened to dwords and folded (low half + high half); b and junk are both clobbered
+ psubusw %3, %2, %1 ; junk = b - a, saturating at 0
+ psubusw %1, %2 ; a = a - b, saturating at 0
+ por %1, %3 ; at most one of the two differences is non-zero, so OR yields |a - b|
+
+ mova %2, %1 ; second copy so the low and high words can be widened separately
+ punpcklwd %1, m_zero ; zero-extend the low words to dwords
+ punpckhwd %2, m_zero ; zero-extend the high words to dwords
+ paddd %1, %2 ; a = dword partial sums of |a - b|
+%endmacro
+
+%macro IDET_FILTER_LINE_16BIT 1 ; %1=increment (4 or 8 words); result in eax is the sum over the line of |a + c - 2*b| on unsigned 16-bit samples
+cglobal idet_filter_line_16bit, 4, 5, 8, a, b, c, width, index
+ xor indexq, indexq ; index counts 16-bit samples, hence the scale of 2 in the addresses below
+%define m_zero m1
+%define m_sum m0
+ pxor m_sum, m_sum ; dword accumulators start at zero
+ pxor m_zero, m_zero ; zero register used by PABS_DIFF_WD for word to dword widening
+
+.loop_16bit: ; the body runs before the first width test and consumes one full increment per pass, so width must be a positive multiple of the increment
+ movu m2, [bq + indexq * 2] ; B
+ movu m3, [aq + indexq * 2] ; A
+ mova m6, m2 ; second copy of B, consumed by the bc computation
+ psubusw m5, m2, m3 ; ba
+
+ movu m4, [cq + indexq * 2] ; C
+ add indexq, %1
+ psubusw m3, m2 ; ab
+ CMP indexd, widthd ; flags are consumed by the jl at the bottom; the MMX/SSE instructions in between do not modify EFLAGS
+
+ psubusw m6, m4 ; bc
+ psubusw m4, m2 ; cb
+
+ PABS_DIFF_WD m3, m6, m7 ; |ab - bc|
+ PABS_DIFF_WD m5, m4, m7 ; |ba - cb|
+ paddd m_sum, m3 ; the two terms together equal |A + C - 2*B| for every sample
+ paddd m_sum, m5
+ jl .loop_16bit ; loop while index < width (signed 32-bit compare done above)
+
+ mova m2, m_sum ; horizontal add: fold every dword lane into lane 0
+%if mmsize == 16
+ psrldq m2, 4 ; shift by one dword: lane 1 moves into lane 0
+ paddd m_sum, m2
+ psrldq m2, 4 ; original lane 2 now in lane 0
+ paddd m_sum, m2
+ psrldq m2, 4 ; original lane 3 now in lane 0
+ paddd m_sum, m2
+%else
+ psrlq m2, 32 ; 64-bit register: bring the upper dword down
+ paddd m_sum, m2
+%endif
+ movd eax, m_sum ; return value = lane 0 of the accumulator
+ RET
+%endmacro
+
+INIT_XMM sse2
+IDET_FILTER_LINE_16BIT 8 ; 8 words fill one 16-byte XMM register
+%if ARCH_X86_32
+INIT_MMX mmx
+IDET_FILTER_LINE_16BIT 4 ; 4 words fill one 8-byte MMX register
+%endif
+
+;******************************************************************************
; SSE2 8-bit implementation that does 16-bytes at a time:
+
INIT_XMM sse2
cglobal idet_filter_line, 4, 6, 7, a, b, c, width, index, total
xor indexq, indexq
diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c
index fb9ad832b0..1147ca8ba8 100644
--- a/libavfilter/x86/vf_idet_init.c
+++ b/libavfilter/x86/vf_idet_init.c
@@ -23,6 +23,8 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/vf_idet.h"
+#if HAVE_YASM
+
/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */
#define FUNC_MAIN_DECL(KIND, SPAN) \
int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
@@ -39,32 +41,47 @@ static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \
return sum; \
}
-#if HAVE_YASM
+
+#define FUNC_MAIN_DECL_16bit(KIND, SPAN) /* declares the asm kernel and defines a wrapper handling any w; SPAN = samples per asm iteration, must be a power of two for the mask below */ \
+int ff_idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
+ const uint16_t *c, int w); \
+static int idet_filter_line_16bit_##KIND(const uint16_t *a, const uint16_t *b, \
+ const uint16_t *c, int w) { \
+ int sum = 0; \
+ const int left_over = w & (SPAN - 1); /* tail samples that do not fill a whole SPAN */ \
+ w -= left_over; /* w is now a multiple of SPAN */ \
+ if (w > 0) /* the asm loop executes its body before testing the width, so it must not be entered with w == 0 */ \
+ sum += ff_idet_filter_line_16bit_##KIND(a, b, c, w); \
+ if (left_over > 0) /* remaining samples go through the C implementation */ \
+ sum += ff_idet_filter_line_c_16bit(a + w, b + w, c + w, left_over); \
+ return sum; \
+}
FUNC_MAIN_DECL(sse2, 16)
+FUNC_MAIN_DECL_16bit(sse2, 8)
#if ARCH_X86_32
FUNC_MAIN_DECL(mmx, 8)
FUNC_MAIN_DECL(mmxext, 8)
+FUNC_MAIN_DECL_16bit(mmx, 4)
#endif
#endif
-
-av_cold void ff_idet_init_x86(IDETContext *idet)
+av_cold void ff_idet_init_x86(IDETContext *idet, int for_16b) /* for_16b != 0 installs the 16-bit kernels, 0 the 8-bit ones; filter_line is left untouched when no matching CPU flag is set */
{
#if HAVE_YASM
const int cpu_flags = av_get_cpu_flags();
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
- idet->filter_line = idet_filter_line_mmx;
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmx; /* cast needed: the 16-bit wrappers take uint16_t pointers, ff_idet_filter_func takes uint8_t pointers */
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- idet->filter_line = idet_filter_line_mmxext;
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_mmx : idet_filter_line_mmxext; /* no MMXEXT 16-bit kernel is declared, so the MMX one is reused */
}
#endif // ARCH_x86_32
if (EXTERNAL_SSE2(cpu_flags)) {
- idet->filter_line = idet_filter_line_sse2;
+ idet->filter_line = for_16b ? (ff_idet_filter_func)idet_filter_line_16bit_sse2 : idet_filter_line_sse2; /* checked last, so SSE2 overrides the MMX choices above */
}
#endif // HAVE_YASM
}