From fde82ca7e4e72226da99d86a9fa9689136461d14 Mon Sep 17 00:00:00 2001
From: Justin Ruggles <justin.ruggles@gmail.com>
Date: Sat, 17 Oct 2009 21:00:39 +0000
Subject: Move autocorrelation function from flacenc.c to lpc.c. Also rename
 the corresponding dsputil functions and remove their dependency on the FLAC
 encoder. Fixes Issue1486.

Originally committed as revision 20266 to svn://svn.ffmpeg.org/ffmpeg/trunk
---
 libavcodec/Makefile             |   2 +-
 libavcodec/dsputil.c            |   8 +--
 libavcodec/dsputil.h            |   2 +-
 libavcodec/flacenc.c            |  63 ------------------
 libavcodec/lpc.c                |  64 +++++++++++++++++-
 libavcodec/x86/dsputilenc_mmx.c |   6 +-
 libavcodec/x86/flacdsp_mmx.c    | 139 ----------------------------------------
 libavcodec/x86/lpc_mmx.c        | 139 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 209 insertions(+), 214 deletions(-)
 delete mode 100644 libavcodec/x86/flacdsp_mmx.c
 create mode 100644 libavcodec/x86/lpc_mmx.c

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 789ece0be0..85acb5f202 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -456,7 +456,6 @@ YASM-OBJS-$(CONFIG_GPL)                += x86/h264_deblock_sse2.o       \
 
 MMX-OBJS-$(CONFIG_CAVS_DECODER)        += x86/cavsdsp_mmx.o
 MMX-OBJS-$(CONFIG_ENCODERS)            += x86/dsputilenc_mmx.o
-MMX-OBJS-$(CONFIG_FLAC_ENCODER)        += x86/flacdsp_mmx.o
 MMX-OBJS-$(CONFIG_GPL)                 += x86/idct_mmx.o
 MMX-OBJS-$(CONFIG_SNOW_DECODER)        += x86/snowdsp_mmx.o
 MMX-OBJS-$(CONFIG_VC1_DECODER)         += x86/vc1dsp_mmx.o
@@ -474,6 +473,7 @@ OBJS-$(HAVE_MMX)                       += x86/cpuid.o                   \
                                           x86/fft.o                     \
                                           x86/idct_mmx_xvid.o           \
                                           x86/idct_sse2_xvid.o          \
+                                          x86/lpc_mmx.o                 \
                                           x86/motion_est_mmx.o          \
                                           x86/mpegvideo_mmx.o           \
                                           x86/simple_idct_mmx.o         \
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index be35ec3cff..aae86bcb21 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -45,8 +45,8 @@ void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
 /* ac3dec.c */
 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
 
-/* flacenc.c */
-void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
+/* lpc.c */
+void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
 
 /* pngdec.c */
 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
@@ -4837,9 +4837,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
 #if CONFIG_AC3_DECODER
     c->ac3_downmix = ff_ac3_downmix_c;
 #endif
-#if CONFIG_FLAC_ENCODER
-    c->flac_compute_autocorr = ff_flac_compute_autocorr;
-#endif
+    c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
     c->vector_fmul = vector_fmul_c;
     c->vector_fmul_reverse = vector_fmul_reverse_c;
     c->vector_fmul_add = vector_fmul_add_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 6cb5af2cc5..58524b26e4 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -386,7 +386,7 @@ typedef struct DSPContext {
     void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
     void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
     /* no alignment needed */
-    void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
+    void (*lpc_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
     /* assume len is a multiple of 8, and arrays are 16-byte aligned */
     void (*vector_fmul)(float *dst, const float *src, int len);
     void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 81a35a0d54..c1f862a684 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -552,69 +552,6 @@ static uint32_t calc_rice_params_lpc(RiceContext *rc, int pmin, int pmax,
     return bits;
 }
 
-/**
- * Apply Welch window function to audio block
- */
-static void apply_welch_window(const int32_t *data, int len, double *w_data)
-{
-    int i, n2;
-    double w;
-    double c;
-
-    assert(!(len&1)); //the optimization in r11881 does not support odd len
-                      //if someone wants odd len extend the change in r11881
-
-    n2 = (len >> 1);
-    c = 2.0 / (len - 1.0);
-
-    w_data+=n2;
-      data+=n2;
-    for(i=0; i<n2; i++) {
-        w = c - n2 + i;
-        w = 1.0 - (w * w);
-        w_data[-i-1] = data[-i-1] * w;
-        w_data[+i  ] = data[+i  ] * w;
-    }
-}
-
-/**
- * Calculates autocorrelation data from audio samples
- * A Welch window function is applied before calculation.
- */
-void ff_flac_compute_autocorr(const int32_t *data, int len, int lag,
-                              double *autoc)
-{
-    int i, j;
-    double tmp[len + lag + 1];
-    double *data1= tmp + lag;
-
-    apply_welch_window(data, len, data1);
-
-    for(j=0; j<lag; j++)
-        data1[j-lag]= 0.0;
-    data1[len] = 0.0;
-
-    for(j=0; j<lag; j+=2){
-        double sum0 = 1.0, sum1 = 1.0;
-        for(i=j; i<len; i++){
-            sum0 += data1[i] * data1[i-j];
-            sum1 += data1[i] * data1[i-j-1];
-        }
-        autoc[j  ] = sum0;
-        autoc[j+1] = sum1;
-    }
-
-    if(j==lag){
-        double sum = 1.0;
-        for(i=j-1; i<len; i+=2){
-            sum += data1[i  ] * data1[i-j  ]
-                 + data1[i+1] * data1[i-j+1];
-        }
-        autoc[j] = sum;
-    }
-}
-
-
 static void encode_residual_verbatim(int32_t *res, int32_t *smp, int n)
 {
     assert(n > 0);
diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index 896db51759..49e41d8c34 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -26,6 +26,68 @@
 #include "lpc.h"
 
 
+/**
+ * Apply Welch window function to audio block
+ */
+static void apply_welch_window(const int32_t *data, int len, double *w_data)
+{
+    int i, n2;
+    double w;
+    double c;
+
+    assert(!(len&1)); //the optimization in r11881 does not support odd len
+                      //if someone wants odd len extend the change in r11881
+
+    n2 = (len >> 1);
+    c = 2.0 / (len - 1.0);
+
+    w_data+=n2;
+      data+=n2;
+    for(i=0; i<n2; i++) {
+        w = c - n2 + i;
+        w = 1.0 - (w * w);
+        w_data[-i-1] = data[-i-1] * w;
+        w_data[+i  ] = data[+i  ] * w;
+    }
+}
+
+/**
+ * Calculates autocorrelation data from audio samples
+ * A Welch window function is applied before calculation.
+ */
+void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag,
+                             double *autoc)
+{
+    int i, j;
+    double tmp[len + lag + 1];
+    double *data1= tmp + lag;
+
+    apply_welch_window(data, len, data1);
+
+    for(j=0; j<lag; j++)
+        data1[j-lag]= 0.0;
+    data1[len] = 0.0;
+
+    for(j=0; j<lag; j+=2){
+        double sum0 = 1.0, sum1 = 1.0;
+        for(i=j; i<len; i++){
+            sum0 += data1[i] * data1[i-j];
+            sum1 += data1[i] * data1[i-j-1];
+        }
+        autoc[j  ] = sum0;
+        autoc[j+1] = sum1;
+    }
+
+    if(j==lag){
+        double sum = 1.0;
+        for(i=j-1; i<len; i+=2){
+            sum += data1[i  ] * data1[i-j  ]
+                 + data1[i+1] * data1[i-j+1];
+        }
+        autoc[j] = sum;
+    }
+}
+
 /**
  * Quantize LPC coefficients
  */
@@ -115,7 +177,7 @@ int ff_lpc_calc_coefs(DSPContext *s,
     assert(max_order >= MIN_LPC_ORDER && max_order <= MAX_LPC_ORDER && use_lpc > 0);
 
     if(use_lpc == 1){
-        s->flac_compute_autocorr(samples, blocksize, max_order, autoc);
+        s->lpc_compute_autocorr(samples, blocksize, max_order, autoc);
 
         compute_lpc_coefs(autoc, max_order, &lpc[0][0], MAX_LPC_ORDER, 0, 1);
 
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 6059436252..2bf0fe1328 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -1348,8 +1348,7 @@ static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int si
 #endif //HAVE_SSSE3
 
 
-/* FLAC specific */
-void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
+void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                    double *autoc);
 
 
@@ -1414,8 +1413,7 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->sum_abs_dctelem= sum_abs_dctelem_sse2;
             c->hadamard8_diff[0]= hadamard8_diff16_sse2;
             c->hadamard8_diff[1]= hadamard8_diff_sse2;
-            if (CONFIG_FLAC_ENCODER)
-                c->flac_compute_autocorr = ff_flac_compute_autocorr_sse2;
+            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
         }
 
 #if HAVE_SSSE3
diff --git a/libavcodec/x86/flacdsp_mmx.c b/libavcodec/x86/flacdsp_mmx.c
deleted file mode 100644
index 01c0d7ae8a..0000000000
--- a/libavcodec/x86/flacdsp_mmx.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * MMX optimized FLAC DSP utils
- * Copyright (c) 2007 Loren Merritt
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/x86_cpu.h"
-#include "dsputil_mmx.h"
-
-static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
-{
-    double c = 2.0 / (len-1.0);
-    int n2 = len>>1;
-    x86_reg i = -n2*sizeof(int32_t);
-    x86_reg j =  n2*sizeof(int32_t);
-    __asm__ volatile(
-        "movsd   %0,     %%xmm7                \n\t"
-        "movapd  "MANGLE(ff_pd_1)", %%xmm6     \n\t"
-        "movapd  "MANGLE(ff_pd_2)", %%xmm5     \n\t"
-        "movlhps %%xmm7, %%xmm7                \n\t"
-        "subpd   %%xmm5, %%xmm7                \n\t"
-        "addsd   %%xmm6, %%xmm7                \n\t"
-        ::"m"(c)
-    );
-#define WELCH(MOVPD, offset)\
-    __asm__ volatile(\
-        "1:                                    \n\t"\
-        "movapd   %%xmm7,  %%xmm1              \n\t"\
-        "mulpd    %%xmm1,  %%xmm1              \n\t"\
-        "movapd   %%xmm6,  %%xmm0              \n\t"\
-        "subpd    %%xmm1,  %%xmm0              \n\t"\
-        "pshufd   $0x4e,   %%xmm0, %%xmm1      \n\t"\
-        "cvtpi2pd (%3,%0), %%xmm2              \n\t"\
-        "cvtpi2pd "#offset"*4(%3,%1), %%xmm3   \n\t"\
-        "mulpd    %%xmm0,  %%xmm2              \n\t"\
-        "mulpd    %%xmm1,  %%xmm3              \n\t"\
-        "movapd   %%xmm2, (%2,%0,2)            \n\t"\
-        MOVPD"    %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
-        "subpd    %%xmm5,  %%xmm7              \n\t"\
-        "sub      $8,      %1                  \n\t"\
-        "add      $8,      %0                  \n\t"\
-        "jl 1b                                 \n\t"\
-        :"+&r"(i), "+&r"(j)\
-        :"r"(w_data+n2), "r"(data+n2)\
-    );
-    if(len&1)
-        WELCH("movupd", -1)
-    else
-        WELCH("movapd", -2)
-#undef WELCH
-}
-
-void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
-                                   double *autoc)
-{
-    double tmp[len + lag + 2];
-    double *data1 = tmp + lag;
-    int j;
-
-    if((x86_reg)data1 & 15)
-        data1++;
-
-    apply_welch_window_sse2(data, len, data1);
-
-    for(j=0; j<lag; j++)
-        data1[j-lag]= 0.0;
-    data1[len] = 0.0;
-
-    for(j=0; j<lag; j+=2){
-        x86_reg i = -len*sizeof(double);
-        if(j == lag-2) {
-            __asm__ volatile(
-                "movsd    "MANGLE(ff_pd_1)", %%xmm0 \n\t"
-                "movsd    "MANGLE(ff_pd_1)", %%xmm1 \n\t"
-                "movsd    "MANGLE(ff_pd_1)", %%xmm2 \n\t"
-                "1:                                 \n\t"
-                "movapd   (%4,%0), %%xmm3           \n\t"
-                "movupd -8(%5,%0), %%xmm4           \n\t"
-                "movapd   (%5,%0), %%xmm5           \n\t"
-                "mulpd     %%xmm3, %%xmm4           \n\t"
-                "mulpd     %%xmm3, %%xmm5           \n\t"
-                "mulpd -16(%5,%0), %%xmm3           \n\t"
-                "addpd     %%xmm4, %%xmm1           \n\t"
-                "addpd     %%xmm5, %%xmm0           \n\t"
-                "addpd     %%xmm3, %%xmm2           \n\t"
-                "add       $16,    %0               \n\t"
-                "jl 1b                              \n\t"
-                "movhlps   %%xmm0, %%xmm3           \n\t"
-                "movhlps   %%xmm1, %%xmm4           \n\t"
-                "movhlps   %%xmm2, %%xmm5           \n\t"
-                "addsd     %%xmm3, %%xmm0           \n\t"
-                "addsd     %%xmm4, %%xmm1           \n\t"
-                "addsd     %%xmm5, %%xmm2           \n\t"
-                "movsd     %%xmm0, %1               \n\t"
-                "movsd     %%xmm1, %2               \n\t"
-                "movsd     %%xmm2, %3               \n\t"
-                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
-                :"r"(data1+len), "r"(data1+len-j)
-            );
-        } else {
-            __asm__ volatile(
-                "movsd    "MANGLE(ff_pd_1)", %%xmm0 \n\t"
-                "movsd    "MANGLE(ff_pd_1)", %%xmm1 \n\t"
-                "1:                                 \n\t"
-                "movapd   (%3,%0), %%xmm3           \n\t"
-                "movupd -8(%4,%0), %%xmm4           \n\t"
-                "mulpd     %%xmm3, %%xmm4           \n\t"
-                "mulpd    (%4,%0), %%xmm3           \n\t"
-                "addpd     %%xmm4, %%xmm1           \n\t"
-                "addpd     %%xmm3, %%xmm0           \n\t"
-                "add       $16,    %0               \n\t"
-                "jl 1b                              \n\t"
-                "movhlps   %%xmm0, %%xmm3           \n\t"
-                "movhlps   %%xmm1, %%xmm4           \n\t"
-                "addsd     %%xmm3, %%xmm0           \n\t"
-                "addsd     %%xmm4, %%xmm1           \n\t"
-                "movsd     %%xmm0, %1               \n\t"
-                "movsd     %%xmm1, %2               \n\t"
-                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
-                :"r"(data1+len), "r"(data1+len-j)
-            );
-        }
-    }
-}
diff --git a/libavcodec/x86/lpc_mmx.c b/libavcodec/x86/lpc_mmx.c
new file mode 100644
index 0000000000..40fc678b7c
--- /dev/null
+++ b/libavcodec/x86/lpc_mmx.c
@@ -0,0 +1,139 @@
+/*
+ * MMX optimized LPC DSP utils
+ * Copyright (c) 2007 Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86_cpu.h"
+#include "dsputil_mmx.h"
+
+static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
+{
+    double c = 2.0 / (len-1.0);
+    int n2 = len>>1;
+    x86_reg i = -n2*sizeof(int32_t);
+    x86_reg j =  n2*sizeof(int32_t);
+    __asm__ volatile(
+        "movsd   %0,     %%xmm7                \n\t"
+        "movapd  "MANGLE(ff_pd_1)", %%xmm6     \n\t"
+        "movapd  "MANGLE(ff_pd_2)", %%xmm5     \n\t"
+        "movlhps %%xmm7, %%xmm7                \n\t"
+        "subpd   %%xmm5, %%xmm7                \n\t"
+        "addsd   %%xmm6, %%xmm7                \n\t"
+        ::"m"(c)
+    );
+#define WELCH(MOVPD, offset)\
+    __asm__ volatile(\
+        "1:                                    \n\t"\
+        "movapd   %%xmm7,  %%xmm1              \n\t"\
+        "mulpd    %%xmm1,  %%xmm1              \n\t"\
+        "movapd   %%xmm6,  %%xmm0              \n\t"\
+        "subpd    %%xmm1,  %%xmm0              \n\t"\
+        "pshufd   $0x4e,   %%xmm0, %%xmm1      \n\t"\
+        "cvtpi2pd (%3,%0), %%xmm2              \n\t"\
+        "cvtpi2pd "#offset"*4(%3,%1), %%xmm3   \n\t"\
+        "mulpd    %%xmm0,  %%xmm2              \n\t"\
+        "mulpd    %%xmm1,  %%xmm3              \n\t"\
+        "movapd   %%xmm2, (%2,%0,2)            \n\t"\
+        MOVPD"    %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
+        "subpd    %%xmm5,  %%xmm7              \n\t"\
+        "sub      $8,      %1                  \n\t"\
+        "add      $8,      %0                  \n\t"\
+        "jl 1b                                 \n\t"\
+        :"+&r"(i), "+&r"(j)\
+        :"r"(w_data+n2), "r"(data+n2)\
+    );
+    if(len&1)
+        WELCH("movupd", -1)
+    else
+        WELCH("movapd", -2)
+#undef WELCH
+}
+
+void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
+                                   double *autoc)
+{
+    double tmp[len + lag + 2];
+    double *data1 = tmp + lag;
+    int j;
+
+    if((x86_reg)data1 & 15)
+        data1++;
+
+    apply_welch_window_sse2(data, len, data1);
+
+    for(j=0; j<lag; j++)
+        data1[j-lag]= 0.0;
+    data1[len] = 0.0;
+
+    for(j=0; j<lag; j+=2){
+        x86_reg i = -len*sizeof(double);
+        if(j == lag-2) {
+            __asm__ volatile(
+                "movsd    "MANGLE(ff_pd_1)", %%xmm0 \n\t"
+                "movsd    "MANGLE(ff_pd_1)", %%xmm1 \n\t"
+                "movsd    "MANGLE(ff_pd_1)", %%xmm2 \n\t"
+                "1:                                 \n\t"
+                "movapd   (%4,%0), %%xmm3           \n\t"
+                "movupd -8(%5,%0), %%xmm4           \n\t"
+                "movapd   (%5,%0), %%xmm5           \n\t"
+                "mulpd     %%xmm3, %%xmm4           \n\t"
+                "mulpd     %%xmm3, %%xmm5           \n\t"
+                "mulpd -16(%5,%0), %%xmm3           \n\t"
+                "addpd     %%xmm4, %%xmm1           \n\t"
+                "addpd     %%xmm5, %%xmm0           \n\t"
+                "addpd     %%xmm3, %%xmm2           \n\t"
+                "add       $16,    %0               \n\t"
+                "jl 1b                              \n\t"
+                "movhlps   %%xmm0, %%xmm3           \n\t"
+                "movhlps   %%xmm1, %%xmm4           \n\t"
+                "movhlps   %%xmm2, %%xmm5           \n\t"
+                "addsd     %%xmm3, %%xmm0           \n\t"
+                "addsd     %%xmm4, %%xmm1           \n\t"
+                "addsd     %%xmm5, %%xmm2           \n\t"
+                "movsd     %%xmm0, %1               \n\t"
+                "movsd     %%xmm1, %2               \n\t"
+                "movsd     %%xmm2, %3               \n\t"
+                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
+                :"r"(data1+len), "r"(data1+len-j)
+            );
+        } else {
+            __asm__ volatile(
+                "movsd    "MANGLE(ff_pd_1)", %%xmm0 \n\t"
+                "movsd    "MANGLE(ff_pd_1)", %%xmm1 \n\t"
+                "1:                                 \n\t"
+                "movapd   (%3,%0), %%xmm3           \n\t"
+                "movupd -8(%4,%0), %%xmm4           \n\t"
+                "mulpd     %%xmm3, %%xmm4           \n\t"
+                "mulpd    (%4,%0), %%xmm3           \n\t"
+                "addpd     %%xmm4, %%xmm1           \n\t"
+                "addpd     %%xmm3, %%xmm0           \n\t"
+                "add       $16,    %0               \n\t"
+                "jl 1b                              \n\t"
+                "movhlps   %%xmm0, %%xmm3           \n\t"
+                "movhlps   %%xmm1, %%xmm4           \n\t"
+                "addsd     %%xmm3, %%xmm0           \n\t"
+                "addsd     %%xmm4, %%xmm1           \n\t"
+                "movsd     %%xmm0, %1               \n\t"
+                "movsd     %%xmm1, %2               \n\t"
+                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
+                :"r"(data1+len), "r"(data1+len-j)
+            );
+        }
+    }
+}
-- 
cgit v1.2.3