From 502ab21af0ca68f76d6112722c46d2f35c004053 Mon Sep 17 00:00:00 2001
From: Loren Merritt
Date: Tue, 18 Jun 2013 21:30:42 +0000
Subject: x86: lpc: simd av_update_lls 4x-6x faster on sandybridge

Signed-off-by: Luca Barbato
---
 libavutil/lls.c          |   8 +-
 libavutil/lls.h          |  12 ++-
 libavutil/x86/Makefile   |   2 +
 libavutil/x86/lls.asm    | 196 +++++++++++++++++++++++++++++++++++++++++++++++
 libavutil/x86/lls_init.c |  38 +++++++++
 5 files changed, 250 insertions(+), 6 deletions(-)
 create mode 100644 libavutil/x86/lls.asm
 create mode 100644 libavutil/x86/lls_init.c

(limited to 'libavutil')

diff --git a/libavutil/lls.c b/libavutil/lls.c
index 5a3e4485c8..f87c2cd153 100644
--- a/libavutil/lls.c
+++ b/libavutil/lls.c
@@ -46,8 +46,8 @@ static void update_lls(LLSModel *m, double *var)
 void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order)
 {
     int i, j, k;
-    double (*factor)[MAX_VARS + 1] = (void *) &m->covariance[1][0];
-    double (*covar) [MAX_VARS + 1] = (void *) &m->covariance[1][1];
+    double (*factor)[MAX_VARS_ALIGN] = (void *) &m->covariance[1][0];
+    double (*covar) [MAX_VARS_ALIGN] = (void *) &m->covariance[1][1];
     double *covar_y = m->covariance[0];
     int count = m->indep_count;
 
@@ -117,6 +117,8 @@ av_cold void avpriv_init_lls(LLSModel *m, int indep_count)
     m->indep_count = indep_count;
     m->update_lls = update_lls;
     m->evaluate_lls = evaluate_lls;
+    if (ARCH_X86)
+        ff_init_lls_x86(m); /* may replace the C update_lls set above with a SIMD version */
 }
 
 #if FF_API_LLS_PRIVATE
@@ -154,7 +156,7 @@ int main(void)
     avpriv_init_lls(&m, 3);
 
     for (i = 0; i < 100; i++) {
-        double var[4];
+        LOCAL_ALIGNED(32, double, var, [4]); /* update_lls now documents var as 32-byte aligned */
         double eval;
 
         var[0] = (av_lfg_get(&lfg) / (double) UINT_MAX - 0.5) * 2;
diff --git a/libavutil/lls.h b/libavutil/lls.h
index 81834402f7..27c0d5e3fe 100644
--- a/libavutil/lls.h
+++ b/libavutil/lls.h
@@ -23,9 +23,12 @@
 #ifndef AVUTIL_LLS_H
 #define AVUTIL_LLS_H
 
+#include "common.h"
+#include "mem.h"
 #include "version.h"
 
 #define MAX_VARS 32
+#define MAX_VARS_ALIGN FFALIGN(MAX_VARS+1,4) /* keep in sync with MAX_VARS_ALIGN (MAX_VARS+4) in x86/lls.asm */
 
 //FIXME avoid direct access to LLSModel from outside
 
@@ -33,26 +36,29 @@
  * Linear least squares model.
  */
 typedef struct LLSModel {
-    double covariance[MAX_VARS + 1][MAX_VARS + 1];
-    double coeff[MAX_VARS][MAX_VARS];
+    DECLARE_ALIGNED(32, double, covariance[MAX_VARS_ALIGN][MAX_VARS_ALIGN]);
+    DECLARE_ALIGNED(32, double, coeff[MAX_VARS][MAX_VARS]);
     double variance[MAX_VARS];
     int indep_count;
     /**
      * Take the outer-product of var[] with itself, and add to the covariance matrix.
      * @param m this context
      * @param var training samples, starting with the value to be predicted
+     *            32-byte aligned, and any padding elements must be initialized
+     *            (i.e not denormal/nan).
      */
     void (*update_lls)(struct LLSModel *m, double *var);
     /**
      * Inner product of var[] and the LPC coefs.
      * @param m this context
-     * @param var training samples, excluding the value to be predicted
+     * @param var training samples, excluding the value to be predicted. unaligned.
      * @param order lpc order
      */
     double (*evaluate_lls)(struct LLSModel *m, double *var, int order);
 } LLSModel;
 
 void avpriv_init_lls(LLSModel *m, int indep_count);
+void ff_init_lls_x86(LLSModel *m);
 void avpriv_solve_lls(LLSModel *m, double threshold, unsigned short min_order);
 
 #if FF_API_LLS_PRIVATE
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index ae07470b17..1e19082233 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -1,6 +1,8 @@
 OBJS += x86/cpu.o                                                       \
         x86/float_dsp_init.o                                            \
+        x86/lls_init.o                                                  \
 
 YASM-OBJS += x86/cpuid.o                                                \
              x86/emms.o                                                 \
              x86/float_dsp.o                                            \
+             x86/lls.o                                                  \
diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm
new file mode 100644
index 0000000000..92c00fcda1
--- /dev/null
+++ b/libavutil/x86/lls.asm
@@ -0,0 +1,196 @@
+;******************************************************************************
+;* linear least squares model
+;*
+;* Copyright (c) 2013 Loren Merritt
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "x86util.asm"
+
+SECTION .text
+
+%define MAX_VARS 32
+%define MAX_VARS_ALIGN (MAX_VARS+4)
+%define COVAR_STRIDE MAX_VARS_ALIGN*8
+%define COVAR(x,y) [covarq + (x)*8 + (y)*COVAR_STRIDE]
+
+struc LLSModel
+    .covariance:  resq MAX_VARS_ALIGN*MAX_VARS_ALIGN ; same element count as covariance[][] in lls.h; first member, so offset 0
+    .coeff:       resq MAX_VARS*MAX_VARS
+    .variance:    resq MAX_VARS
+    .indep_count: resd 1
+endstruc
+
+%macro ADDPD_MEM 2
+%if cpuflag(avx)
+    vaddpd %2, %1 ; %2 += memory operand %1
+%else
+    addpd  %2, %1 ; %2 += memory operand %1
+%endif
+    mova   %1, %2 ; store the sum back; register %2 is left holding the sum
+%endmacro
+
+INIT_XMM sse2
+%define movdqa movaps
+cglobal update_lls, 2,5,8, ctx, var, i, j, covar2
+    %define covarq ctxq
+    mov      id, [ctxq + LLSModel.indep_count] ; i = indep_count
+    lea      varq, [varq + iq*8]               ; advance var by indep_count doubles
+    neg      iq                                ; i = -indep_count; the loops below step i up towards 0
+    mov      covar2q, covarq                   ; covarq aliases ctxq here, so covar2 = model base
+.loopi:
+    ; Compute all 3 pairwise products of a 2x2 block that lies on the diagonal
+    mova     m1, [varq + iq*8]
+    mova     m3, [varq + iq*8 + 16]
+    pshufd   m4, m1, q1010 ; presumably low double of m1 in both lanes (x86inc qNNNN selector) - confirm
+    pshufd   m5, m1, q3232 ; presumably high double of m1 in both lanes - confirm
+    pshufd   m6, m3, q1010
+    pshufd   m7, m3, q3232
+    mulpd    m0, m1, m4
+    mulpd    m1, m1, m5
+    lea      covarq, [covar2q + 16] ; overwrites ctxq (covarq alias); covar2q keeps the running base
+    ADDPD_MEM COVAR(-2,0), m0
+    ADDPD_MEM COVAR(-2,1), m1
+    lea      jq, [iq + 2]
+    cmp      jd, -2
+    jg .skip4x4
+.loop4x4:
+    ; Compute all 16 pairwise products of a 4x4 block
+    mulpd    m0, m4, m3
+    mulpd    m1, m5, m3
+    mulpd    m2, m6, m3
+    mulpd    m3, m3, m7
+    ADDPD_MEM COVAR(0,0), m0
+    ADDPD_MEM COVAR(0,1), m1
+    ADDPD_MEM COVAR(0,2), m2
+    ADDPD_MEM COVAR(0,3), m3
+    mova     m3, [varq + jq*8 + 16]
+    mulpd    m0, m4, m3
+    mulpd    m1, m5, m3
+    mulpd    m2, m6, m3
+    mulpd    m3, m3, m7
+    ADDPD_MEM COVAR(2,0), m0
+    ADDPD_MEM COVAR(2,1), m1
+    ADDPD_MEM COVAR(2,2), m2
+    ADDPD_MEM COVAR(2,3), m3
+    mova     m3, [varq + jq*8 + 32] ; preload the pair used by the next pass or by the 2x4 tail
+    add      covarq, 32             ; 4 doubles to the right
+    add      jq, 4
+    cmp      jd, -2
+    jle .loop4x4
+.skip4x4:
+    test     jd, jd
+    jg .skip2x4
+    mulpd    m4, m3
+    mulpd    m5, m3
+    mulpd    m6, m3
+    mulpd    m7, m3
+    ADDPD_MEM COVAR(0,0), m4
+    ADDPD_MEM COVAR(0,1), m5
+    ADDPD_MEM COVAR(0,2), m6
+    ADDPD_MEM COVAR(0,3), m7
+.skip2x4:
+    add      iq, 4
+    add      covar2q, 4*COVAR_STRIDE+32 ; next diagonal block: 4 rows down, 4 doubles across
+    cmp      id, -2
+    jle .loopi
+    test     id, id
+    jg .ret
+    mov      jq, iq
+    %define covarq covar2q
+.loop2x1:
+    movsd    m0, [varq + iq*8]
+    movlhps  m0, m0 ; copy the low double into the high lane
+    mulpd    m0, [varq + jq*8]
+    ADDPD_MEM COVAR(0,0), m0
+    inc      iq
+    add      covarq, COVAR_STRIDE ; one row down
+    test     id, id
+    jle .loop2x1
+.ret:
+    REP_RET
+
+INIT_YMM avx
+cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2
+    %define covarq ctxq
+    mov      countd, [ctxq + LLSModel.indep_count] ; count = indep_count
+    lea      count2d, [countq-2]                   ; count2 = count - 2
+    xor      id, id                                ; i = 0
+.loopi:
+    ; Compute all 10 pairwise products of a 4x4 block that lies on the diagonal
+    mova     ymm1, [varq + iq*8]
+    vbroadcastsd ymm4, [varq + iq*8]
+    vbroadcastsd ymm5, [varq + iq*8 + 8]
+    vbroadcastsd ymm6, [varq + iq*8 + 16]
+    vbroadcastsd ymm7, [varq + iq*8 + 24]
+    vextractf128 xmm3, ymm1, 1 ; xmm3 = upper two doubles of ymm1
+    vmulpd   ymm0, ymm1, ymm4
+    vmulpd   ymm1, ymm1, ymm5
+    vmulpd   xmm2, xmm3, xmm6
+    vmulpd   xmm3, xmm3, xmm7
+    ADDPD_MEM COVAR(iq  ,0), ymm0
+    ADDPD_MEM COVAR(iq  ,1), ymm1
+    ADDPD_MEM COVAR(iq+2,2), xmm2
+    ADDPD_MEM COVAR(iq+2,3), xmm3
+    lea      jd, [iq + 4]
+    cmp      jd, count2d
+    jg .skip4x4
+.loop4x4:
+    ; Compute all 16 pairwise products of a 4x4 block
+    mova     ymm3, [varq + jq*8]
+    vmulpd   ymm0, ymm3, ymm4
+    vmulpd   ymm1, ymm3, ymm5
+    vmulpd   ymm2, ymm3, ymm6
+    vmulpd   ymm3, ymm3, ymm7
+    ADDPD_MEM COVAR(jq,0), ymm0
+    ADDPD_MEM COVAR(jq,1), ymm1
+    ADDPD_MEM COVAR(jq,2), ymm2
+    ADDPD_MEM COVAR(jq,3), ymm3
+    add      jd, 4
+    cmp      jd, count2d
+    jle .loop4x4
+.skip4x4:
+    cmp      jd, countd
+    jg .skip2x4
+    mova     xmm3, [varq + jq*8]
+    vmulpd   xmm0, xmm3, xmm4
+    vmulpd   xmm1, xmm3, xmm5
+    vmulpd   xmm2, xmm3, xmm6
+    vmulpd   xmm3, xmm3, xmm7
+    ADDPD_MEM COVAR(jq,0), xmm0
+    ADDPD_MEM COVAR(jq,1), xmm1
+    ADDPD_MEM COVAR(jq,2), xmm2
+    ADDPD_MEM COVAR(jq,3), xmm3
+.skip2x4:
+    add      id, 4
+    add      covarq, 4*COVAR_STRIDE ; 4 rows down; the column offset comes from iq/jq inside COVAR()
+    cmp      id, count2d
+    jle .loopi
+    cmp      id, countd
+    jg .ret
+    mov      jd, id
+.loop2x1:
+    vmovddup xmm0, [varq + iq*8] ; var[i] duplicated into both lanes
+    vmulpd   xmm0, [varq + jq*8]
+    ADDPD_MEM COVAR(jq,0), xmm0
+    inc      id
+    add      covarq, COVAR_STRIDE ; one row down
+    cmp      id, countd
+    jle .loop2x1
+.ret:
+    REP_RET
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
new file mode 100644
index 0000000000..8a80f83002
--- /dev/null
+++ b/libavutil/x86/lls_init.c
@@ -0,0 +1,38 @@
+/*
+ * linear least squares model
+ *
+ * Copyright (c) 2013 Loren Merritt
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/lls.h"
+#include "libavutil/x86/cpu.h"
+
+void ff_update_lls_sse2(LLSModel *m, double *var); /* presumably the INIT_XMM sse2 update_lls in x86/lls.asm - confirm name mangling */
+void ff_update_lls_avx(LLSModel *m, double *var);  /* presumably the INIT_YMM avx update_lls in x86/lls.asm - confirm name mangling */
+
+av_cold void ff_init_lls_x86(LLSModel *m)
+{
+    int cpu_flags = av_get_cpu_flags();
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        m->update_lls = ff_update_lls_sse2; /* overrides whatever update_lls the caller installed */
+    }
+    if (EXTERNAL_AVX(cpu_flags)) {
+        m->update_lls = ff_update_lls_avx; /* tested after SSE2, so AVX takes precedence when both match */
+    }
+}
-- 
cgit v1.2.3