diff options
author | James Almer <jamrial@gmail.com> | 2017-01-07 23:13:48 -0300 |
---|---|---|
committer | James Almer <jamrial@gmail.com> | 2017-01-12 22:53:04 -0300 |
commit | cf9ef839606dd50f779c395d8a277de143f7e5b2 (patch) | |
tree | 615bcdf1fc268c6ef0b3cc75273ca08aff8254bd /libavcodec/x86 | |
parent | 30c1f27299d3fc2b0c0858c003066cc5e36a28af (diff) |
huffyuvencdsp: move shared functions to a new lossless_videoencdsp context
Signed-off-by: James Almer <jamrial@gmail.com>
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/Makefile | 4 | ||||
-rw-r--r-- | libavcodec/x86/huffyuvencdsp.asm | 124 | ||||
-rw-r--r-- | libavcodec/x86/huffyuvencdsp_init.c | 54 | ||||
-rw-r--r-- | libavcodec/x86/lossless_videoencdsp.asm | 150 | ||||
-rw-r--r-- | libavcodec/x86/lossless_videoencdsp_init.c (renamed from libavcodec/x86/huffyuvencdsp_mmx.c) | 28 |
5 files changed, 216 insertions, 144 deletions
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 1db1137392..2f0354a2c8 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -20,8 +20,9 @@ OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp_init.o OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp_init.o +OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp_init.o OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o -OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o +OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_init.o OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o OBJS-$(CONFIG_LPC) += x86/lpc.o OBJS-$(CONFIG_ME_CMP) += x86/me_cmp_init.o @@ -114,6 +115,7 @@ YASM-OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp.o YASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o YASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o +YASM-OBJS-$(CONFIG_LLVIDENCDSP) += x86/lossless_videoencdsp.o YASM-OBJS-$(CONFIG_ME_CMP) += x86/me_cmp.o YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm index 78ad202249..1228aa8355 100644 --- a/libavcodec/x86/huffyuvencdsp.asm +++ b/libavcodec/x86/huffyuvencdsp.asm @@ -27,128 +27,8 @@ section .text -; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, -; intptr_t w); -%macro DIFF_BYTES_PROLOGUE 0 -%if ARCH_X86_32 -cglobal diff_bytes, 3,5,2, dst, src1, src2 -%define wq r4q - DECLARE_REG_TMP 3 - mov wq, r3mp -%else -cglobal diff_bytes, 4,5,2, dst, src1, src2, w - DECLARE_REG_TMP 4 -%endif ; ARCH_X86_32 -%define i t0q -%endmacro - -; label to jump to if w < regsize -%macro DIFF_BYTES_LOOP_PREP 1 - mov i, wq - and i, -2 * regsize - jz %1 - add dstq, i - add src1q, i - add src2q, i - neg i -%endmacro - -; mov type used for src1q, dstq, first reg, second reg -%macro 
DIFF_BYTES_LOOP_CORE 4 -%if mmsize != 16 - mov%1 %3, [src1q + i] - mov%1 %4, [src1q + i + regsize] - psubb %3, [src2q + i] - psubb %4, [src2q + i + regsize] - mov%2 [dstq + i], %3 - mov%2 [regsize + dstq + i], %4 -%else - ; SSE enforces alignment of psubb operand - mov%1 %3, [src1q + i] - movu %4, [src2q + i] - psubb %3, %4 - mov%2 [dstq + i], %3 - mov%1 %3, [src1q + i + regsize] - movu %4, [src2q + i + regsize] - psubb %3, %4 - mov%2 [regsize + dstq + i], %3 -%endif -%endmacro - -%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq - %define regsize mmsize -.loop_%1%2: - DIFF_BYTES_LOOP_CORE %1, %2, m0, m1 - add i, 2 * regsize - jl .loop_%1%2 -.skip_main_%1%2: - and wq, 2 * regsize - 1 - jz .end_%1%2 -%if mmsize > 16 - ; fall back to narrower xmm - %define regsize mmsize / 2 - DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa -.loop2_%1%2: - DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1 - add i, 2 * regsize - jl .loop2_%1%2 -.setup_loop_gpr_%1%2: - and wq, 2 * regsize - 1 - jz .end_%1%2 -%endif - add dstq, wq - add src1q, wq - add src2q, wq - neg wq -.loop_gpr_%1%2: - mov t0b, [src1q + wq] - sub t0b, [src2q + wq] - mov [dstq + wq], t0b - inc wq - jl .loop_gpr_%1%2 -.end_%1%2: - REP_RET -%endmacro - -%if ARCH_X86_32 -INIT_MMX mmx -DIFF_BYTES_PROLOGUE - %define regsize mmsize - DIFF_BYTES_LOOP_PREP .skip_main_aa - DIFF_BYTES_BODY a, a -%undef i -%endif - -INIT_XMM sse2 -DIFF_BYTES_PROLOGUE - %define regsize mmsize - DIFF_BYTES_LOOP_PREP .skip_main_aa - test dstq, regsize - 1 - jnz .loop_uu - test src1q, regsize - 1 - jnz .loop_ua - DIFF_BYTES_BODY a, a - DIFF_BYTES_BODY u, a - DIFF_BYTES_BODY u, u -%undef i - -%if HAVE_AVX2_EXTERNAL -INIT_YMM avx2 -DIFF_BYTES_PROLOGUE - %define regsize mmsize - ; Directly using unaligned SSE2 version is marginally faster than - ; branching based on arguments. 
- DIFF_BYTES_LOOP_PREP .skip_main_uu - test dstq, regsize - 1 - jnz .loop_uu - test src1q, regsize - 1 - jnz .loop_ua - DIFF_BYTES_BODY a, a - DIFF_BYTES_BODY u, a - DIFF_BYTES_BODY u, u -%undef i -%endif - +; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, +; unsigned mask, int w); %macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub movd m4, maskd SPLATW m4, m4 diff --git a/libavcodec/x86/huffyuvencdsp_init.c b/libavcodec/x86/huffyuvencdsp_init.c new file mode 100644 index 0000000000..f66bc8c4f0 --- /dev/null +++ b/libavcodec/x86/huffyuvencdsp_init.c @@ -0,0 +1,54 @@ +/* + * SIMD-optimized HuffYUV encoding functions + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * MMX optimization by Nick Kurshev <nickols_k@mail.ru> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/pixdesc.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/huffyuvencdsp.h" + +void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w); +void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w); +void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, + unsigned mask, int w, int *left, int *left_top); + +av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx) +{ + av_unused int cpu_flags = av_get_cpu_flags(); + const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt); + + if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) { + c->diff_int16 = ff_diff_int16_mmx; + } + + if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) { + c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + c->diff_int16 = ff_diff_int16_sse2; + } +} diff --git a/libavcodec/x86/lossless_videoencdsp.asm b/libavcodec/x86/lossless_videoencdsp.asm new file mode 100644 index 0000000000..63fd72174a --- /dev/null +++ b/libavcodec/x86/lossless_videoencdsp.asm @@ -0,0 +1,150 @@ +;************************************************************************ +;* SIMD-optimized lossless video encoding functions +;* Copyright (c) 2000, 2001 Fabrice Bellard +;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> +;* +;* MMX optimization by Nick Kurshev <nickols_k@mail.ru> +;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99@gmail.com> +;* +;* This file is part of FFmpeg. 
+;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +section .text + +; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, +; intptr_t w); +%macro DIFF_BYTES_PROLOGUE 0 +%if ARCH_X86_32 +cglobal diff_bytes, 3,5,2, dst, src1, src2 +%define wq r4q + DECLARE_REG_TMP 3 + mov wq, r3mp +%else +cglobal diff_bytes, 4,5,2, dst, src1, src2, w + DECLARE_REG_TMP 4 +%endif ; ARCH_X86_32 +%define i t0q +%endmacro + +; label to jump to if w < regsize +%macro DIFF_BYTES_LOOP_PREP 1 + mov i, wq + and i, -2 * regsize + jz %1 + add dstq, i + add src1q, i + add src2q, i + neg i +%endmacro + +; mov type used for src1q, dstq, first reg, second reg +%macro DIFF_BYTES_LOOP_CORE 4 +%if mmsize != 16 + mov%1 %3, [src1q + i] + mov%1 %4, [src1q + i + regsize] + psubb %3, [src2q + i] + psubb %4, [src2q + i + regsize] + mov%2 [dstq + i], %3 + mov%2 [regsize + dstq + i], %4 +%else + ; SSE enforces alignment of psubb operand + mov%1 %3, [src1q + i] + movu %4, [src2q + i] + psubb %3, %4 + mov%2 [dstq + i], %3 + mov%1 %3, [src1q + i + regsize] + movu %4, [src2q + i + regsize] + psubb %3, %4 + mov%2 [regsize + dstq + i], %3 +%endif +%endmacro + +%macro DIFF_BYTES_BODY 2 ; mov type 
used for src1q, for dstq + %define regsize mmsize +.loop_%1%2: + DIFF_BYTES_LOOP_CORE %1, %2, m0, m1 + add i, 2 * regsize + jl .loop_%1%2 +.skip_main_%1%2: + and wq, 2 * regsize - 1 + jz .end_%1%2 +%if mmsize > 16 + ; fall back to narrower xmm + %define regsize mmsize / 2 + DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa +.loop2_%1%2: + DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1 + add i, 2 * regsize + jl .loop2_%1%2 +.setup_loop_gpr_%1%2: + and wq, 2 * regsize - 1 + jz .end_%1%2 +%endif + add dstq, wq + add src1q, wq + add src2q, wq + neg wq +.loop_gpr_%1%2: + mov t0b, [src1q + wq] + sub t0b, [src2q + wq] + mov [dstq + wq], t0b + inc wq + jl .loop_gpr_%1%2 +.end_%1%2: + REP_RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmx +DIFF_BYTES_PROLOGUE + %define regsize mmsize + DIFF_BYTES_LOOP_PREP .skip_main_aa + DIFF_BYTES_BODY a, a +%undef i +%endif + +INIT_XMM sse2 +DIFF_BYTES_PROLOGUE + %define regsize mmsize + DIFF_BYTES_LOOP_PREP .skip_main_aa + test dstq, regsize - 1 + jnz .loop_uu + test src1q, regsize - 1 + jnz .loop_ua + DIFF_BYTES_BODY a, a + DIFF_BYTES_BODY u, a + DIFF_BYTES_BODY u, u +%undef i + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +DIFF_BYTES_PROLOGUE + %define regsize mmsize + ; Directly using unaligned SSE2 version is marginally faster than + ; branching based on arguments. 
+ DIFF_BYTES_LOOP_PREP .skip_main_uu + test dstq, regsize - 1 + jnz .loop_uu + test src1q, regsize - 1 + jnz .loop_ua + DIFF_BYTES_BODY a, a + DIFF_BYTES_BODY u, a + DIFF_BYTES_BODY u, u +%undef i +%endif diff --git a/libavcodec/x86/huffyuvencdsp_mmx.c b/libavcodec/x86/lossless_videoencdsp_init.c index 2402021823..fc728c9fd1 100644 --- a/libavcodec/x86/huffyuvencdsp_mmx.c +++ b/libavcodec/x86/lossless_videoencdsp_init.c @@ -1,5 +1,5 @@ /* - * SIMD-optimized HuffYUV encoding functions + * SIMD-optimized lossless video encoding functions * Copyright (c) 2000, 2001 Fabrice Bellard * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * @@ -24,10 +24,9 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/pixdesc.h" #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" -#include "libavcodec/huffyuvencdsp.h" +#include "libavcodec/lossless_videoencdsp.h" #include "libavcodec/mathops.h" void ff_diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, @@ -36,18 +35,12 @@ void ff_diff_bytes_sse2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); void ff_diff_bytes_avx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, intptr_t w); -void ff_diff_int16_mmx (uint16_t *dst, const uint16_t *src1, const uint16_t *src2, - unsigned mask, int w); -void ff_diff_int16_sse2(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, - unsigned mask, int w); -void ff_sub_hfyu_median_pred_int16_mmxext(uint16_t *dst, const uint16_t *src1, const uint16_t *src2, - unsigned mask, int w, int *left, int *left_top); #if HAVE_INLINE_ASM -static void sub_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, - const uint8_t *src2, intptr_t w, - int *left, int *left_top) +static void sub_median_pred_mmxext(uint8_t *dst, const uint8_t *src1, + const uint8_t *src2, intptr_t w, + int *left, int *left_top) { x86_reg i = 0; uint8_t l, lt; @@ -87,29 +80,22 @@ static void sub_hfyu_median_pred_mmxext(uint8_t *dst, 
const uint8_t *src1, #endif /* HAVE_INLINE_ASM */ -av_cold void ff_huffyuvencdsp_init_x86(HuffYUVEncDSPContext *c, AVCodecContext *avctx) +av_cold void ff_llvidencdsp_init_x86(LLVidEncDSPContext *c) { av_unused int cpu_flags = av_get_cpu_flags(); - const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(avctx->pix_fmt); if (ARCH_X86_32 && EXTERNAL_MMX(cpu_flags)) { c->diff_bytes = ff_diff_bytes_mmx; - c->diff_int16 = ff_diff_int16_mmx; } #if HAVE_INLINE_ASM if (INLINE_MMXEXT(cpu_flags)) { - c->sub_hfyu_median_pred = sub_hfyu_median_pred_mmxext; + c->sub_median_pred = sub_median_pred_mmxext; } #endif /* HAVE_INLINE_ASM */ - if (EXTERNAL_MMXEXT(cpu_flags) && pix_desc && pix_desc->comp[0].depth<16) { - c->sub_hfyu_median_pred_int16 = ff_sub_hfyu_median_pred_int16_mmxext; - } - if (EXTERNAL_SSE2(cpu_flags)) { c->diff_bytes = ff_diff_bytes_sse2; - c->diff_int16 = ff_diff_int16_sse2; } if (EXTERNAL_AVX2_FAST(cpu_flags)) { |