author     James Darnley <james.darnley@gmail.com>    2014-08-12 23:22:03 +0200
committer  Michael Niedermayer <michaelni@gmx.at>      2014-08-13 03:09:26 +0200
commit     54a51d384055a771ba1eeef3c2f399bd03fa2663
tree       2d7e770fb0fcfebaba0b4ef02f9714e07109f4c2   /libavcodec/x86/flac_dsp_gpl.asm
parent     a8592db9bb787e6cd3aece69ce211cb97bd718cd
lavc/flacenc: partially unroll loop in flac_enc_lpc_16
It now processes 12 samples per iteration, up from 4. That makes the function a further 1.8 to 3.2 times faster, and 3.6 to 5.7 times faster overall; runtime is reduced by a further 2 to 18%, and by 4 to 50% overall. The same test conditions as before apply.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
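For context, the inline comments in the diff below spell out the computation: for each output sample, p accumulates coefs[j] * smp[i-j-1] over the prediction order, and the residual is res[i] = smp[i] - (p >> shift). The following scalar C sketch models that loop; the function name and the assumption that smp[] has at least `order` samples of history before index 0 are illustrative, not FFmpeg's exact C reference.

#include <stdint.h>

/* Scalar model of the residual computation the SSE4 routine vectorizes.
 * Accumulation is deliberately 32-bit, mirroring pmulld/paddd/psrad below;
 * this path is only meaningful when the products and sums fit in 32 bits. */
static void lpc_residual_scalar(int32_t *res, const int32_t *smp, int len,
                                int order, const int32_t *coefs, int shift)
{
    for (int i = 0; i < len; i++) {
        int32_t p = 0;
        for (int j = 0; j < order; j++)
            p += coefs[j] * smp[i - j - 1];  /* p += c * s */
        res[i] = smp[i] - (p >> shift);      /* res[i] = smp[i] - (p >> shift) */
    }
}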
Diffstat (limited to 'libavcodec/x86/flac_dsp_gpl.asm')
-rw-r--r--  libavcodec/x86/flac_dsp_gpl.asm | 26
1 file changed, 21 insertions, 5 deletions
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index 1f28be132a..cedf0837a7 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -26,13 +26,13 @@ SECTION_TEXT
 INIT_XMM sse4
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_16, 5, 7, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 5, 6
     %define length r2d
     movsxd orderq, orderd
 %else
-    cglobal flac_enc_lpc_16, 5, 6, 4, 0, res, smp, len, order, coefs
+    cglobal flac_enc_lpc_16, 5, 6, 8, 0, res, smp, len, order, coefs
     DECLARE_REG_TMP 2, 5
     %define length r2mp
 %endif
@@ -59,6 +59,8 @@ neg orderq
 .looplen:
     pxor m0, m0
+    pxor m4, m4
+    pxor m6, m6
     mov posj, orderq
     xor negj, negj
@@ -66,20 +68,34 @@ neg orderq
     movd m2, [coefsq+posj*4] ; c = coefs[j]
     SPLATD m2
     movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+    movu m5, [smpq+negj*4-4+mmsize]
+    movu m7, [smpq+negj*4-4+mmsize*2]
     pmulld m1, m2
+    pmulld m5, m2
+    pmulld m7, m2
     paddd m0, m1 ; p += c * s
+    paddd m4, m5
+    paddd m6, m7
     dec negj
     inc posj
     jnz .looporder
     psrad m0, m3 ; p >>= shift
+    psrad m4, m3
+    psrad m6, m3
     movu m1, [smpq]
+    movu m5, [smpq+mmsize]
+    movu m7, [smpq+mmsize*2]
     psubd m1, m0 ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
     movu [resq], m1 ; res[i] = smp[i] - (p >> shift)
+    movu [resq+mmsize], m5
+    movu [resq+mmsize*2], m7
-    add resq, mmsize
-    add smpq, mmsize
-    sub length, mmsize/4
+    add resq, 3*mmsize
+    add smpq, 3*mmsize
+    sub length, (3*mmsize)/4
     jg .looplen
     RET
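To make the unroll factor concrete: with INIT_XMM, mmsize is 16 bytes, so each register holds four 32-bit values; the three accumulators m0/m4/m6 therefore cover 12 residuals per pass of .looplen, which is why the pointers now advance by 3*mmsize and length drops by (3*mmsize)/4 = 12. Below is a rough scalar model of that structure, with hypothetical names and no handling of lengths that are not a multiple of 12 (none is shown in the loop above either).

#include <stdint.h>

/* Scalar model of the 3x-unrolled loop: p[0..3], p[4..7], p[8..11]
 * stand in for the four lanes of m0, m4 and m6 respectively. */
static void lpc_residual_unrolled(int32_t *res, const int32_t *smp, int len,
                                  int order, const int32_t *coefs, int shift)
{
    while (len > 0) {
        int32_t p[12] = { 0 };
        for (int j = 0; j < order; j++) {
            int32_t c = coefs[j];               /* SPLATD: one coef for all lanes */
            for (int k = 0; k < 12; k++)
                p[k] += c * smp[k - j - 1];     /* pmulld + paddd per lane */
        }
        for (int k = 0; k < 12; k++)
            res[k] = smp[k] - (p[k] >> shift);  /* psrad, psubd, movu */
        res += 12;                              /* add resq, 3*mmsize */
        smp += 12;                              /* add smpq, 3*mmsize */
        len -= 12;                              /* sub length, (3*mmsize)/4 */
    }
}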