From e7078e842d93436edba1f30af1f9869d3913f7fe Mon Sep 17 00:00:00 2001
From: Anton Khirnov
Date: Fri, 24 Jul 2015 18:42:08 +0200
Subject: hevcdsp: add x86 SIMD for MC

---
 libavcodec/x86/Makefile       |   3 +-
 libavcodec/x86/hevc_mc.asm    | 851 ++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp_init.c | 241 +++++++++++-
 3 files changed, 1090 insertions(+), 5 deletions(-)
 create mode 100644 libavcodec/x86/hevc_mc.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 1cf47a8567..d5bd22f070 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -113,7 +113,8 @@ YASM-OBJS-$(CONFIG_VP8DSP)             += x86/vp8dsp.o             \
 YASM-OBJS-$(CONFIG_AAC_DECODER)        += x86/sbrdsp.o
 YASM-OBJS-$(CONFIG_APE_DECODER)        += x86/apedsp.o
 YASM-OBJS-$(CONFIG_DCA_DECODER)        += x86/dcadsp.o
-YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o
+YASM-OBJS-$(CONFIG_HEVC_DECODER)       += x86/hevc_deblock.o           \
+                                          x86/hevc_mc.o
 YASM-OBJS-$(CONFIG_PNG_DECODER)        += x86/pngdsp.o
 YASM-OBJS-$(CONFIG_PRORES_DECODER)     += x86/proresdsp.o
 YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
new file mode 100644
index 0000000000..0bb0e170ed
--- /dev/null
+++ b/libavcodec/x86/hevc_mc.asm
@@ -0,0 +1,851 @@
+;*****************************************************************************
+;* x86-optimized HEVC MC
+;* Copyright 2015 Anton Khirnov
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .rodata + +pw_1023: times 8 dw 1023 + +cextern hevc_qpel_coeffs +cextern hevc_qpel_coeffs8 + +cextern hevc_epel_coeffs +cextern hevc_epel_coeffs8 + +cextern pw_8 +cextern pw_16 +cextern pw_32 +cextern pw_64 + +SECTION .text + +; %1: width +; %2: bit depth +%macro COMMON_DEFS 2 + %assign blocksize 8 + %assign nb_blocks ((%1 + blocksize - 1) / blocksize) + %define last_block_truncated (blocksize * nb_blocks > %1) + %if %2 > 8 + %define LOAD_BLOCK movu + %define LOAD_HALFBLOCK movq + %assign pixelsize 2 + %else + %define LOAD_BLOCK movq + %define LOAD_HALFBLOCK movd + %assign pixelsize 1 + %endif + %define STORE_BLOCK mova + %define STORE_HALFBLOCK movq +%endmacro + +; %1: block index +%macro BLOCK_DEFS 1 + %if last_block_truncated && %1 == nb_blocks - 1 + %define block_truncated 1 + %define LOAD LOAD_HALFBLOCK + %define STORE STORE_HALFBLOCK + %else + %define block_truncated 0 + %define LOAD LOAD_BLOCK + %define STORE STORE_BLOCK + %endif +%endmacro + + +; hevc_get_pixels__(int16_t *dst, ptrdiff_t dststride, +; pixel *src, ptrdiff_t srcstride, +; int height, int mx, int my, int *mcbuffer) + +; %1: block width +; %2: bit depth +; %3: log2 of height unroll +%macro GET_PIXELS 3 +cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused + + %assign shift 14 - %2 + COMMON_DEFS %1, %2 + +%if pixelsize == 1 + pxor m0, m0 +%endif + + shr heightd, %3 + +.loop: + +%assign i 0 +%rep (1 << %3) + +%assign j 0 +%rep nb_blocks + + BLOCK_DEFS j + + LOAD m1, [srcq + j * pixelsize * blocksize] +%if pixelsize == 1 + punpcklbw m1, m0 +%endif + psllw m1, shift + STORE [dstq + j * 2 * blocksize], m1 + +%assign j (j + 1) +%endrep + + add dstq, dststrideq + add srcq, srcstrideq + +%assign i (i + 1) +%endrep + + dec heightd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +GET_PIXELS 4, 8, 1 +GET_PIXELS 8, 8, 1 +GET_PIXELS 12, 8, 3 +GET_PIXELS 16, 8, 2 +GET_PIXELS 24, 8, 3 +GET_PIXELS 32, 8, 3 +GET_PIXELS 48, 8, 3 +GET_PIXELS 64, 8, 3 + +GET_PIXELS 4, 10, 1 +GET_PIXELS 8, 10, 1 +GET_PIXELS 12, 10, 3 +GET_PIXELS 16, 10, 2 +GET_PIXELS 24, 10, 3 +GET_PIXELS 32, 10, 3 +GET_PIXELS 48, 10, 3 +GET_PIXELS 64, 10, 3 + +; hevc_qpel_h/v__8(int16_t *dst, ptrdiff_t dststride, +; uint8_t *src, ptrdiff_t srcstride, +; int height, int mx, int my, int *mcbuffer) + +; 8-bit qpel interpolation +; %1: block width +; %2: 0 - horizontal; 1 - vertical +%macro QPEL_8 2 +%if %2 + %define postfix v + %define mvfrac myq + %define coeffsaddr r5q + %define pixstride srcstrideq + %define pixstride3 r5q + %define src_m3 r6q +%else + %define postfix h + %define mvfrac mxq + %define coeffsaddr r6q + %define pixstride 1 + %define pixstride3 3 + %define src_m3 (srcq - 3) +%endif + + COMMON_DEFS %1, 8 + +cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my + and mvfrac, 0x3 + dec mvfrac + shl mvfrac, 4 + lea coeffsaddr, [hevc_qpel_coeffs8] + mova m0, [coeffsaddr + mvfrac] + + SPLATW m1, m0, 1 + SPLATW m2, m0, 2 + SPLATW m3, m0, 3 + SPLATW m0, m0, 0 + +%if %2 + lea pixstride3, [srcstrideq + 2 * srcstrideq] + mov src_m3, srcq + sub src_m3, pixstride3 +%endif + +.loop + +%assign i 0 +%rep nb_blocks + + 
BLOCK_DEFS i + + LOAD m4, [src_m3 + i * blocksize] + LOAD m5, [src_m3 + i * blocksize + 1 * pixstride] + punpcklbw m4, m5 + pmaddubsw m4, m0 + + LOAD m5, [src_m3 + i * blocksize + 2 * pixstride] + LOAD m6, [srcq + i * blocksize] + punpcklbw m5, m6 + pmaddubsw m5, m1 + paddsw m4, m5 + + LOAD m5, [srcq + i * blocksize + 1 * pixstride] + LOAD m6, [srcq + i * blocksize + 2 * pixstride] + punpcklbw m5, m6 + pmaddubsw m5, m2 + paddsw m4, m5 + + LOAD m5, [srcq + i * blocksize + pixstride3] + LOAD m6, [srcq + i * blocksize + 4 * pixstride] + punpcklbw m5, m6 + pmaddubsw m5, m3 + paddsw m4, m5 + + STORE [dstq + i * 2 * blocksize], m4 + +%assign i (i + 1) +%endrep + + add dstq, dststrideq + add srcq, srcstrideq +%if %2 + add src_m3, srcstrideq +%endif + + dec heightd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +QPEL_8 4, 0 +QPEL_8 8, 0 +QPEL_8 12, 0 +QPEL_8 16, 0 +QPEL_8 24, 0 +QPEL_8 32, 0 +QPEL_8 48, 0 +QPEL_8 64, 0 + +QPEL_8 4, 1 +QPEL_8 8, 1 +QPEL_8 12, 1 +QPEL_8 16, 1 +QPEL_8 24, 1 +QPEL_8 32, 1 +QPEL_8 48, 1 +QPEL_8 64, 1 + +; 16-bit qpel interpolation +; %1: block width +; %2: shift applied to the result +; %3: 0 - horizontal; 1 - vertical +%macro QPEL_16 3 +%if %3 + %define mvfrac myq + %define pixstride srcstrideq + %define pixstride3 sstride3q + %define src_m3 srcm3q +%else + %define mvfrac mxq + %define pixstride 2 + %define pixstride3 6 + %define src_m3 (srcq - 6) +%endif + + COMMON_DEFS %1, 16 + + and mvfrac, 0x3 + dec mvfrac + shl mvfrac, 4 + lea coeffsregq, [hevc_qpel_coeffs] + mova m0, [coeffsregq + mvfrac] + + pshufd m1, m0, 0x55 + pshufd m2, m0, 0xaa + pshufd m3, m0, 0xff + pshufd m0, m0, 0x00 + +%if %3 + lea sstride3q, [srcstrideq + 2 * srcstrideq] + mov srcm3q, srcq + sub srcm3q, sstride3q +%endif + +.loop + +%assign i 0 +%rep nb_blocks + + BLOCK_DEFS i + + LOAD m4, [src_m3 + i * 2 * blocksize] + LOAD m5, [src_m3 + i * 2 * blocksize + 1 * pixstride] + LOAD m6, [src_m3 + i * 2 * blocksize + 2 * pixstride] + LOAD m7, [srcq + i * 2 * blocksize + 0 * pixstride] + LOAD m8, [srcq + i * 2 * blocksize + 1 * pixstride] + LOAD m9, [srcq + i * 2 * blocksize + 2 * pixstride] + LOAD m10, [srcq + i * 2 * blocksize + pixstride3] + LOAD m11, [srcq + i * 2 * blocksize + 4 * pixstride] + + punpcklwd m12, m4, m5 + pmaddwd m12, m0 + + punpcklwd m13, m6, m7 + pmaddwd m13, m1 + paddd m12, m13 + + punpcklwd m13, m8, m9 + pmaddwd m13, m2 + paddd m12, m13 + + punpcklwd m13, m10, m11 + pmaddwd m13, m3 + paddd m12, m13 + psrad m12, %2 + + %if block_truncated == 0 + punpckhwd m4, m5 + pmaddwd m4, m0 + + punpckhwd m6, m7 + pmaddwd m6, m1 + paddd m4, m6 + + punpckhwd m8, m9 + pmaddwd m8, m2 + paddd m4, m8 + + punpckhwd m10, m11 + pmaddwd m10, m3 + paddd m4, m10 + + psrad m4, %2 + %endif + packssdw m12, m4 + STORE [dstq + i * 2 * blocksize], m12 + +%assign i (i + 1) +%endrep + + add dstq, dststrideq + add srcq, srcstrideq +%if %3 + add srcm3q, srcstrideq +%endif + + dec heightd + jg .loop + RET +%endmacro + +%if ARCH_X86_64 + +%macro QPEL_H_10 1 +cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg +QPEL_16 %1, 2, 0 +%endmacro + +INIT_XMM avx +QPEL_H_10 4 +QPEL_H_10 8 +QPEL_H_10 12 +QPEL_H_10 16 +QPEL_H_10 24 +QPEL_H_10 32 +QPEL_H_10 48 +QPEL_H_10 64 + +%macro QPEL_V_10 1 +cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg +QPEL_16 %1, 2, 1 +%endmacro + +INIT_XMM avx +QPEL_V_10 4 +QPEL_V_10 8 +QPEL_V_10 12 +QPEL_V_10 16 +QPEL_V_10 24 +QPEL_V_10 32 +QPEL_V_10 48 +QPEL_V_10 64 + +; 
hevc_qpel_hv_(int16_t *dst, ptrdiff_t dststride, +; uint8_t *src, ptrdiff_t srcstride, +; int height, int mx, int my, int *mcbuffer) + +%macro QPEL_HV 1 +cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg +QPEL_16 %1, 6, 1 +%endmacro + +INIT_XMM avx +QPEL_HV 4 +QPEL_HV 8 +QPEL_HV 12 +QPEL_HV 16 +QPEL_HV 24 +QPEL_HV 32 +QPEL_HV 48 +QPEL_HV 64 + +%endif ; ARCH_X86_64 + +; hevc_epel_h/v__8(int16_t *dst, ptrdiff_t dststride, +; uint8_t *src, ptrdiff_t srcstride, +; int height, int mx, int my, int *mcbuffer) + +; 8-bit epel interpolation +; %1: block width +; %2: 0 - horizontal; 1 - vertical +%macro EPEL_8 2 +%if %2 + %define postfix v + %define mvfrac myq + %define coeffsaddr r5q + %define pixstride srcstrideq + %define pixstride3 r5q +%else + %define postfix h + %define mvfrac mxq + %define coeffsaddr r6q + %define pixstride 1 + %define pixstride3 3 +%endif + + COMMON_DEFS %1, 8 + +cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my + and mvfrac, 0x7 + dec mvfrac + shl mvfrac, 4 + lea coeffsaddr, [hevc_epel_coeffs8] + movq m0, [coeffsaddr + mvfrac] + + SPLATW m1, m0, 1 + SPLATW m0, m0, 0 + +%if %2 + lea pixstride3, [srcstrideq + 2 * srcstrideq] +%endif + sub srcq, pixstride + +.loop + +%assign i 0 +%rep nb_blocks + + BLOCK_DEFS i + + LOAD m2, [srcq + i * blocksize + 0 * pixstride] + LOAD m3, [srcq + i * blocksize + 1 * pixstride] + LOAD m4, [srcq + i * blocksize + 2 * pixstride] + LOAD m5, [srcq + i * blocksize + pixstride3] + + punpcklbw m2, m3 + punpcklbw m4, m5 + + pmaddubsw m2, m0 + pmaddubsw m4, m1 + + paddsw m2, m4 + + STORE [dstq + i * 2 * blocksize], m2 + +%assign i (i + 1) +%endrep + + add dstq, dststrideq + add srcq, srcstrideq + + dec heightd + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +EPEL_8 4, 0 +EPEL_8 8, 0 +EPEL_8 12, 0 +EPEL_8 16, 0 +EPEL_8 24, 0 +EPEL_8 32, 0 + +EPEL_8 4, 1 +EPEL_8 8, 1 +EPEL_8 12, 1 +EPEL_8 16, 1 +EPEL_8 24, 1 +EPEL_8 32, 1 + +%macro EPEL_16 3 +%if %3 + %define mvfrac myq + %define pixstride srcstrideq + %define pixstride3 sstride3q +%else + %define mvfrac mxq + %define pixstride 2 + %define pixstride3 6 +%endif + + COMMON_DEFS %1, 16 + + and mvfrac, 0x7 + dec mvfrac + shl mvfrac, 5 + lea coeffsregq, [hevc_epel_coeffs] + mova m0, [coeffsregq + mvfrac] + + pshufd m1, m0, 0x55 + pshufd m0, m0, 0x00 + +%if %3 + lea sstride3q, [srcstrideq + 2 * srcstrideq] +%endif + sub srcq, pixstride + +.loop + +%assign i 0 +%rep nb_blocks + + BLOCK_DEFS i + + LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride] + LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride] + LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride] + LOAD m5, [srcq + i * 2 * blocksize + pixstride3] + + punpcklwd m6, m2, m3 + punpcklwd m7, m4, m5 + pmaddwd m6, m0 + pmaddwd m7, m1 + paddd m6, m7 + psrad m6, %2 + + %if block_truncated == 0 + punpckhwd m2, m3 + punpckhwd m4, m5 + pmaddwd m2, m0 + pmaddwd m4, m1 + paddd m2, m4 + psrad m2, %2 + %endif + packssdw m6, m2 + STORE [dstq + i * 2 * blocksize], m6 + +%assign i (i + 1) +%endrep + + add dstq, dststrideq + add srcq, srcstrideq + + dec heightd + jg .loop + RET +%endmacro + +%if ARCH_X86_64 + +%macro EPEL_H_10 1 +cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg +EPEL_16 %1, 2, 0 +%endmacro + +INIT_XMM avx +EPEL_H_10 4 +EPEL_H_10 8 +EPEL_H_10 12 +EPEL_H_10 16 +EPEL_H_10 24 +EPEL_H_10 32 + +%macro EPEL_V_10 1 +cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, 
srcstride, height, mx, my, sstride3, coeffsreg +EPEL_16 %1, 2, 1 +%endmacro + +INIT_XMM avx +EPEL_V_10 4 +EPEL_V_10 8 +EPEL_V_10 12 +EPEL_V_10 16 +EPEL_V_10 24 +EPEL_V_10 32 + +; hevc_epel_hv__8(int16_t *dst, ptrdiff_t dststride, +; int16_t *src, ptrdiff_t srcstride, +; int height, int mx, int my, int *mcbuffer) + +%macro EPEL_HV 1 +cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg +EPEL_16 %1, 6, 1 +%endmacro + +INIT_XMM avx +EPEL_HV 4 +EPEL_HV 8 +EPEL_HV 12 +EPEL_HV 16 +EPEL_HV 24 +EPEL_HV 32 + +%endif ; ARCH_X86_64 + +; hevc_put_unweighted_pred__(pixel *dst, ptrdiff_t dststride, +; int16_t *src, ptrdiff_t srcstride, +; int height) + +%macro AVG 5 + %if %3 + %if %4 == 4 + movq %5, %2 + paddsw %1, %5 + %else + paddsw %1, %2 + %endif + %endif +%endmacro + +; %1: 0 - one source; 1 - two sources +; %2: width +; %3: bit depth +%macro PUT_PRED 3 +%if %1 +cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height +%else +cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height +%endif + +%assign shift 14 + %1 - %3 +%assign offset (1 << (shift - 1)) +%define offset_data pw_ %+ offset + + mova m0, [offset_data] + +%if %3 > 8 + %define STORE_BLOCK movu + %define STORE_HALF movq + + %assign pixel_max ((1 << %3) - 1) + %define pw_pixel_max pw_ %+ pixel_max + pxor m1, m1 + mova m2, [pw_pixel_max] +%else + %define STORE_BLOCK movq + %define STORE_HALF movd +%endif + +.loop +%assign i 0 +%rep (%2 + 7) / 8 + + %if (i + 1) * 8 > %2 + %define LOAD movq + %define STORE STORE_HALF + %else + %define LOAD mova + %define STORE STORE_BLOCK + %endif + + LOAD m3, [srcq + 16 * i] + AVG m3, [src2q + 16 * i], %1, %3 - i * 8, m4 + + paddsw m3, m0 + psraw m3, shift + + %if %3 == 8 + packuswb m3, m3 + STORE [dstq + 8 * i], m3 + %else + CLIPW m3, m1, m2 + STORE [dstq + 16 * i], m3 + %endif +%assign i (i + 1) +%endrep + + add dstq, dststrideq + add srcq, srcstrideq +%if %1 + add src2q, srcstrideq +%endif + + dec heightd + jg .loop + RET +%endmacro + +INIT_XMM sse2 +PUT_PRED 0, 4, 8 +PUT_PRED 1, 4, 8 +PUT_PRED 0, 8, 8 +PUT_PRED 1, 8, 8 +PUT_PRED 0, 12, 8 +PUT_PRED 1, 12, 8 +PUT_PRED 0, 16, 8 +PUT_PRED 1, 16, 8 +PUT_PRED 0, 24, 8 +PUT_PRED 1, 24, 8 +PUT_PRED 0, 32, 8 +PUT_PRED 1, 32, 8 +PUT_PRED 0, 48, 8 +PUT_PRED 1, 48, 8 +PUT_PRED 0, 64, 8 +PUT_PRED 1, 64, 8 + +PUT_PRED 0, 4, 10 +PUT_PRED 1, 4, 10 +PUT_PRED 0, 8, 10 +PUT_PRED 1, 8, 10 +PUT_PRED 0, 12, 10 +PUT_PRED 1, 12, 10 +PUT_PRED 0, 16, 10 +PUT_PRED 1, 16, 10 +PUT_PRED 0, 24, 10 +PUT_PRED 1, 24, 10 +PUT_PRED 0, 32, 10 +PUT_PRED 1, 32, 10 +PUT_PRED 0, 48, 10 +PUT_PRED 1, 48, 10 +PUT_PRED 0, 64, 10 +PUT_PRED 1, 64, 10 + +%macro PUT_WEIGHTED_PRED 3 +%if %1 +cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height +%else +cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height +%endif + + and denomd, 0xff + movsx weight0d, weight0w + movsx offset0d, offset0w +%if %1 + movsx weight1d, weight1w + movsx offset1d, offset1w +%endif + + add denomd, 14 + %1 - %3 + movd m0, denomd + +%if %3 > 8 + %assign pixel_max ((1 << %3) - 1) + %define pw_pixel_max pw_ %+ pixel_max + pxor m4, m4 + mova m5, [pw_pixel_max] + + shl offset0d, %3 - 8 +%if %1 + shl offset1d, %3 - 8 +%endif +%endif + +%if %1 + lea offset0d, [offset0d + offset1d + 1] +%else + lea offset0d, [2 * offset0d + 
1] +%endif + movd m1, offset0d + SPLATD m1 + pslld m1, m0 + psrad m1, 1 + + movd m2, weight0d + SPLATD m2 +%if %1 + movd m3, weight1d + SPLATD m3 +%endif + +.loop +%assign i 0 +%rep (%2 + 3) / 4 + + pmovsxwd m6, [src0q + 8 * i] + pmulld m6, m2 + +%if %1 + pmovsxwd m7, [src1q + 8 * i] + pmulld m7, m3 + paddd m6, m7 +%endif + + paddd m6, m1 + psrad m6, m0 + + packssdw m6, m6 + +%if %3 > 8 + CLIPW m6, m4, m5 + movq [dstq + 8 * i], m6 +%else + packuswb m6, m6 + movd [dstq + 4 * i], m6 +%endif + +%assign i (i + 1) +%endrep + + add dstq, dststrideq + add src0q, srcstrideq +%if %1 + add src1q, srcstrideq +%endif + + dec heightd + jg .loop + RET +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse4 +PUT_WEIGHTED_PRED 0, 4, 8 +PUT_WEIGHTED_PRED 1, 4, 8 +PUT_WEIGHTED_PRED 0, 8, 8 +PUT_WEIGHTED_PRED 1, 8, 8 +PUT_WEIGHTED_PRED 0, 12, 8 +PUT_WEIGHTED_PRED 1, 12, 8 +PUT_WEIGHTED_PRED 0, 16, 8 +PUT_WEIGHTED_PRED 1, 16, 8 +PUT_WEIGHTED_PRED 0, 24, 8 +PUT_WEIGHTED_PRED 1, 24, 8 +PUT_WEIGHTED_PRED 0, 32, 8 +PUT_WEIGHTED_PRED 1, 32, 8 +PUT_WEIGHTED_PRED 0, 48, 8 +PUT_WEIGHTED_PRED 1, 48, 8 +PUT_WEIGHTED_PRED 0, 64, 8 +PUT_WEIGHTED_PRED 1, 64, 8 + +PUT_WEIGHTED_PRED 0, 4, 10 +PUT_WEIGHTED_PRED 1, 4, 10 +PUT_WEIGHTED_PRED 0, 8, 10 +PUT_WEIGHTED_PRED 1, 8, 10 +PUT_WEIGHTED_PRED 0, 12, 10 +PUT_WEIGHTED_PRED 1, 12, 10 +PUT_WEIGHTED_PRED 0, 16, 10 +PUT_WEIGHTED_PRED 1, 16, 10 +PUT_WEIGHTED_PRED 0, 24, 10 +PUT_WEIGHTED_PRED 1, 24, 10 +PUT_WEIGHTED_PRED 0, 32, 10 +PUT_WEIGHTED_PRED 1, 32, 10 +PUT_WEIGHTED_PRED 0, 48, 10 +PUT_WEIGHTED_PRED 1, 48, 10 +PUT_WEIGHTED_PRED 0, 64, 10 +PUT_WEIGHTED_PRED 1, 64, 10 + +%endif ; ARCH_X86_64 diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c index 04203c22a0..d0c17769b7 100644 --- a/libavcodec/x86/hevcdsp_init.c +++ b/libavcodec/x86/hevcdsp_init.c @@ -45,27 +45,260 @@ LFC_FUNCS(uint8_t, 10) LFL_FUNCS(uint8_t, 8) LFL_FUNCS(uint8_t, 10) +#define GET_PIXELS(width, depth, cf) \ +void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer); + +GET_PIXELS(4, 8, sse2) +GET_PIXELS(8, 8, sse2) +GET_PIXELS(12, 8, sse2) +GET_PIXELS(16, 8, sse2) +GET_PIXELS(24, 8, sse2) +GET_PIXELS(32, 8, sse2) +GET_PIXELS(48, 8, sse2) +GET_PIXELS(64, 8, sse2) + +GET_PIXELS(4, 10, sse2) +GET_PIXELS(8, 10, sse2) +GET_PIXELS(12, 10, sse2) +GET_PIXELS(16, 10, sse2) +GET_PIXELS(24, 10, sse2) +GET_PIXELS(32, 10, sse2) +GET_PIXELS(48, 10, sse2) +GET_PIXELS(64, 10, sse2) + +/* those are independent of the bit depth, so declared separately */ +#define INTERP_HV_FUNC(width, cf) \ +void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \ + int16_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer); \ +void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \ + int16_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer); + +INTERP_HV_FUNC(4, avx) +INTERP_HV_FUNC(8, avx) +INTERP_HV_FUNC(12, avx) +INTERP_HV_FUNC(16, avx) +INTERP_HV_FUNC(24, avx) +INTERP_HV_FUNC(32, avx) +INTERP_HV_FUNC(48, avx) +INTERP_HV_FUNC(64, avx) + +#if ARCH_X86_64 +#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \ +static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer) \ +{ \ + const ptrdiff_t stride = FFALIGN(width + 7, 8); \ + ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## 
cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \ + height + 7, mx, my, mcbuffer); \ + ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride, \ + height, mx, my, mcbuffer); \ +} +#else +#define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) +#endif + +#define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \ +void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer); \ +void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer); \ +QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) + +QPEL_FUNCS(4, 8, ssse3, ssse3, avx) +QPEL_FUNCS(8, 8, ssse3, ssse3, avx) +QPEL_FUNCS(12, 8, ssse3, ssse3, avx) +QPEL_FUNCS(16, 8, ssse3, ssse3, avx) +QPEL_FUNCS(24, 8, ssse3, ssse3, avx) +QPEL_FUNCS(32, 8, ssse3, ssse3, avx) +QPEL_FUNCS(48, 8, ssse3, ssse3, avx) +QPEL_FUNCS(64, 8, ssse3, ssse3, avx) + +QPEL_FUNCS(4, 10, avx, avx, avx) +QPEL_FUNCS(8, 10, avx, avx, avx) +QPEL_FUNCS(12, 10, avx, avx, avx) +QPEL_FUNCS(16, 10, avx, avx, avx) +QPEL_FUNCS(24, 10, avx, avx, avx) +QPEL_FUNCS(32, 10, avx, avx, avx) +QPEL_FUNCS(48, 10, avx, avx, avx) +QPEL_FUNCS(64, 10, avx, avx, avx) + +#if ARCH_X86_64 +#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \ +static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer) \ +{ \ + const ptrdiff_t stride = FFALIGN(width + 3, 8); \ + ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride, \ + height + 3, mx, my, mcbuffer); \ + ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride, \ + height, mx, my, mcbuffer); \ +} +#else +#define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) +#endif + +#define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \ +void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer); \ +void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int height, int mx, int my, int16_t *mcbuffer); \ +EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) + +EPEL_FUNCS(4, 8, ssse3, ssse3, avx) +EPEL_FUNCS(8, 8, ssse3, ssse3, avx) +EPEL_FUNCS(12, 8, ssse3, ssse3, avx) +EPEL_FUNCS(16, 8, ssse3, ssse3, avx) +EPEL_FUNCS(24, 8, ssse3, ssse3, avx) +EPEL_FUNCS(32, 8, ssse3, ssse3, avx) + +EPEL_FUNCS(4, 10, avx, avx, avx) +EPEL_FUNCS(8, 10, avx, avx, avx) +EPEL_FUNCS(12, 10, avx, avx, avx) +EPEL_FUNCS(16, 10, avx, avx, avx) +EPEL_FUNCS(24, 10, avx, avx, avx) +EPEL_FUNCS(32, 10, avx, avx, avx) + +#define PUT_PRED(width, depth, cf_uw, cf_w) \ +void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src, ptrdiff_t srcstride, \ + int height); \ +void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src1, int16_t *src2, \ + ptrdiff_t srcstride, int height); \ +void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \ + uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src, ptrdiff_t srcstride, \ + int height); \ +void 
ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \ + int16_t offset0, int16_t offset1, \ + uint8_t *dst, ptrdiff_t dststride, \ + int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \ + int height); + +PUT_PRED(4, 8, sse2, sse4) +PUT_PRED(8, 8, sse2, sse4) +PUT_PRED(12, 8, sse2, sse4) +PUT_PRED(16, 8, sse2, sse4) +PUT_PRED(24, 8, sse2, sse4) +PUT_PRED(32, 8, sse2, sse4) +PUT_PRED(48, 8, sse2, sse4) +PUT_PRED(64, 8, sse2, sse4) + +PUT_PRED(4, 10, sse2, sse4) +PUT_PRED(8, 10, sse2, sse4) +PUT_PRED(12, 10, sse2, sse4) +PUT_PRED(16, 10, sse2, sse4) +PUT_PRED(24, 10, sse2, sse4) +PUT_PRED(32, 10, sse2, sse4) +PUT_PRED(48, 10, sse2, sse4) +PUT_PRED(64, 10, sse2, sse4) + void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) { int cpu_flags = av_get_cpu_flags(); +#define SET_LUMA_FUNCS(tabname, funcname, depth, cf) \ + c->tabname[0] = funcname ## _4_ ## depth ## _ ## cf; \ + c->tabname[1] = funcname ## _8_ ## depth ## _ ## cf; \ + c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \ + c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \ + c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \ + c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \ + c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \ + c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf; + +#define SET_CHROMA_FUNCS(tabname, funcname, depth, cf) \ + c->tabname[1] = funcname ## _4_ ## depth ## _ ## cf; \ + c->tabname[3] = funcname ## _8_ ## depth ## _ ## cf; \ + c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \ + c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \ + c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \ + c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf; + +#define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS (put_hevc_qpel[v][h], name, depth, cf) +#define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf) + if (bit_depth == 8) { if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2; + + SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels); + SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels); + + SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 8, sse2); + SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 8, sse2); + SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 8, sse2); + SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2); } - if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) { - c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3; - c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3; + if (EXTERNAL_SSSE3(cpu_flags)) { + SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h); + SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v); + SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h); + SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v); } } else if (bit_depth == 10) { if (EXTERNAL_SSE2(cpu_flags)) { c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2; + + SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels); + SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels); + + SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 10, sse2); + SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2); + 
            SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
+            SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
+        }
+    }
+
+#if ARCH_X86_64
+    if (bit_depth == 8) {
+        if (EXTERNAL_SSSE3(cpu_flags)) {
+            c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
+            c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
+        }
+
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 8, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 8, sse4);
+            SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 8, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
         }
-        if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
+
+        if (EXTERNAL_AVX(cpu_flags)) {
+            SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
+            SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
+        }
+    } else if (bit_depth == 10) {
+        if (EXTERNAL_SSSE3(cpu_flags)) {
             c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
             c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
         }
+        if (EXTERNAL_SSE4(cpu_flags)) {
+            SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 10, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 10, sse4);
+            SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 10, sse4);
+            SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
+        }
+        if (EXTERNAL_AVX(cpu_flags)) {
+            SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
+            SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
+            SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
+            SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
+            SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
+            SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
+        }
     }
+#endif /* ARCH_X86_64 */
 }
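
The hv paths above are built by running two single-direction passes through the mcbuffer scratch area (see the QPEL_FUNC_HV/EPEL_FUNC_HV wrappers in hevcdsp_init.c). As a reading aid, here is a stand-alone scalar C model of the 8-bit luma (qpel) case. It is an illustration written against the standard HEVC filter definitions, not code from the patch; the names qpel_hv_ref_8bit, qpel_filters and tmp are invented for this note.

#include <stddef.h>
#include <stdint.h>

/* HEVC 8-tap luma filters for fractional positions 1..3 (position 0 is the
 * plain copy handled by the get_pixels functions). */
static const int8_t qpel_filters[3][8] = {
    { -1, 4, -10, 58, 17,  -5, 1,  0 },
    { -1, 4, -11, 40, 40, -11, 4, -1 },
    {  0, 1,  -5, 17, 58, -10, 4, -1 },
};

/* Two-pass hv interpolation: a horizontal pass over height + 7 rows into a
 * 16-bit scratch buffer (no shift for 8-bit input), then a vertical pass over
 * the scratch scaled back by 6 bits.  mx & 3 and my & 3 must be 1..3 here.
 * Strides are in elements for dst/tmp and in bytes for src, which is a
 * simplification of the asm interface. */
static void qpel_hv_ref_8bit(int16_t *dst, ptrdiff_t dststride,
                             const uint8_t *src, ptrdiff_t srcstride,
                             int width, int height, int mx, int my,
                             int16_t *tmp /* at least width * (height + 7) */)
{
    const int8_t *fh = qpel_filters[(mx & 3) - 1];
    const int8_t *fv = qpel_filters[(my & 3) - 1];

    /* pass 1: filter rows -3 .. height + 3 horizontally */
    src -= 3 * srcstride;
    for (int y = 0; y < height + 7; y++) {
        for (int x = 0; x < width; x++) {
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += fh[k] * src[x + k - 3];
            tmp[y * width + x] = sum;
        }
        src += srcstride;
    }

    /* pass 2: filter the intermediates vertically; row 3 of tmp lines up with
     * the first source row, which is why the asm wrapper hands
     * mcbuffer + 3 * stride to the second pass */
    tmp += 3 * width;
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += fv[k] * tmp[(y + k - 3) * width + x];
            dst[y * dststride + x] = sum >> 6;
        }
    }
}

The result is the usual 14-bit intermediate, which the put_unweighted_pred/put_weighted_pred functions later scale and clip to the pixel range.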
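
PUT_WEIGHTED_PRED folds the rounding constant and the output offset into the single addend it keeps in m1: offset0 is rewritten as 2*offset0 + 1, shifted left by the full shift and then right by one. A small scalar check of that identity for the uni-directional case; the helper name is made up for this note, and an arithmetic right shift for negative values is assumed (matching the asm's psrad).

#include <assert.h>

/* shift == denom + 14 - bit_depth, exactly as set up at the top of the macro */
static int weighted_sample_ref(int src, int weight, int offset, int shift)
{
    /* the addend the asm precomputes:
     * ((2*offset + 1) << shift) >> 1 == (offset << shift) + (1 << (shift - 1)) */
    int addend = ((2 * offset + 1) << shift) >> 1;
    int folded = (src * weight + addend) >> shift;

    /* textbook form of the HEVC weighted-prediction equation */
    int spec = ((src * weight + (1 << (shift - 1))) >> shift) + offset;

    assert(folded == spec);
    return folded; /* still needs clipping to the pixel range (packuswb/CLIPW) */
}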