4 files changed, 158 insertions, 110 deletions
diff --git a/libavcodec/flacdsp.c b/libavcodec/flacdsp.c
index 3f4f71076c..b916869321 100644
--- a/libavcodec/flacdsp.c
+++ b/libavcodec/flacdsp.c
@@ -26,6 +26,7 @@
 #define SAMPLE_SIZE 16
 #define PLANAR 0
 #include "flacdsp_template.c"
+#include "flacdsp_lpc_template.c"
 
 #undef  PLANAR
 #define PLANAR 1
@@ -36,6 +37,7 @@
 #define SAMPLE_SIZE 32
 #define PLANAR 0
 #include "flacdsp_template.c"
+#include "flacdsp_lpc_template.c"
 
 #undef  PLANAR
 #define PLANAR 1
@@ -86,10 +88,13 @@ static void flac_lpc_32_c(int32_t *decoded, const int coeffs[32],
 av_cold void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt,
                              int bps)
 {
-    if (bps > 16)
+    if (bps > 16) {
         c->lpc            = flac_lpc_32_c;
-    else
+        c->lpc_encode     = flac_lpc_encode_c_32;
+    } else {
         c->lpc            = flac_lpc_16_c;
+        c->lpc_encode     = flac_lpc_encode_c_16;
+    }
 
     switch (fmt) {
     case AV_SAMPLE_FMT_S32:
diff --git a/libavcodec/flacdsp.h b/libavcodec/flacdsp.h
index 429daab346..33184b589d 100644
--- a/libavcodec/flacdsp.h
+++ b/libavcodec/flacdsp.h
@@ -27,6 +27,8 @@ typedef struct FLACDSPContext {
                            int len, int shift);
     void (*lpc)(int32_t *samples, const int coeffs[32], int order,
                 int qlevel, int len);
+    void (*lpc_encode)(int32_t *res, const int32_t *smp, int len, int order,
+                       const int32_t *coefs, int shift);
 } FLACDSPContext;
 
 void ff_flacdsp_init(FLACDSPContext *c, enum AVSampleFormat fmt, int bps);
diff --git a/libavcodec/flacdsp_lpc_template.c b/libavcodec/flacdsp_lpc_template.c
new file mode 100644
index 0000000000..269e64b3a8
--- /dev/null
+++ b/libavcodec/flacdsp_lpc_template.c
@@ -0,0 +1,141 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+#include "libavutil/avutil.h"
+#include "mathops.h"
+
+#undef FUNC
+#undef sum_type
+#undef MUL
+#undef CLIP
+#undef FSUF
+
+#define FUNC(n) AV_JOIN(n ## _, SAMPLE_SIZE)
+
+#if SAMPLE_SIZE == 32
+#   define sum_type  int64_t
+#   define MUL(a, b) MUL64(a, b)
+#   define CLIP(x) av_clipl_int32(x)
+#else
+#   define sum_type  int32_t
+#   define MUL(a, b) ((a) * (b))
+#   define CLIP(x) (x)
+#endif
+
+#define LPC1(x) {           \
+    int c = coefs[(x)-1];   \
+    p0   += MUL(c, s);      \
+    s     = smp[i-(x)+1];   \
+    p1   += MUL(c, s);      \
+}
+
+static av_always_inline void FUNC(lpc_encode_unrolled)(int32_t *res,
+                                  const int32_t *smp, int len, int order,
+                                  const int32_t *coefs, int shift, int big)
+{
+    int i;
+    for (i = order; i < len; i += 2) {
+        int s  = smp[i-order];
+        sum_type p0 = 0, p1 = 0;
+        if (big) {
+            switch (order) {
+            case 32: LPC1(32)
+            case 31: LPC1(31)
+            case 30: LPC1(30)
+            case 29: LPC1(29)
+            case 28: LPC1(28)
+            case 27: LPC1(27)
+            case 26: LPC1(26)
+            case 25: LPC1(25)
+            case 24: LPC1(24)
+            case 23: LPC1(23)
+            case 22: LPC1(22)
+            case 21: LPC1(21)
+            case 20: LPC1(20)
+            case 19: LPC1(19)
+            case 18: LPC1(18)
+            case 17: LPC1(17)
+            case 16: LPC1(16)
+            case 15: LPC1(15)
+            case 14: LPC1(14)
+            case 13: LPC1(13)
+            case 12: LPC1(12)
+            case 11: LPC1(11)
+            case 10: LPC1(10)
+            case  9: LPC1( 9)
+                     LPC1( 8)
+                     LPC1( 7)
+                     LPC1( 6)
+                     LPC1( 5)
+                     LPC1( 4)
+                     LPC1( 3)
+                     LPC1( 2)
+                     LPC1( 1)
+            }
+        } else {
+            switch (order) {
+            case  8: LPC1( 8)
+            case  7: LPC1( 7)
+            case  6: LPC1( 6)
+            case  5: LPC1( 5)
+            case  4: LPC1( 4)
+            case  3: LPC1( 3)
+            case  2: LPC1( 2)
+            case  1: LPC1( 1)
+            }
+        }
+        res[i  ] = smp[i  ] - CLIP(p0 >> shift);
+        res[i+1] = smp[i+1] - CLIP(p1 >> shift);
+    }
+}
+
+static void FUNC(flac_lpc_encode_c)(int32_t *res, const int32_t *smp, int len,
+                                    int order, const int32_t *coefs, int shift)
+{
+    int i;
+    for (i = 0; i < order; i++)
+        res[i] = smp[i];
+#if CONFIG_SMALL
+    for (i = order; i < len; i += 2) {
+        int j;
+        int s  = smp[i];
+        sum_type p0 = 0, p1 = 0;
+        for (j = 0; j < order; j++) {
+            int c = coefs[j];
+            p1   += MUL(c, s);
+            s     = smp[i-j-1];
+            p0   += MUL(c, s);
+        }
+        res[i  ] = smp[i  ] - CLIP(p0 >> shift);
+        res[i+1] = smp[i+1] - CLIP(p1 >> shift);
+    }
+#else
+    switch (order) {
+    case  1: FUNC(lpc_encode_unrolled)(res, smp, len,     1, coefs, shift, 0); break;
+    case  2: FUNC(lpc_encode_unrolled)(res, smp, len,     2, coefs, shift, 0); break;
+    case  3: FUNC(lpc_encode_unrolled)(res, smp, len,     3, coefs, shift, 0); break;
+    case  4: FUNC(lpc_encode_unrolled)(res, smp, len,     4, coefs, shift, 0); break;
+    case  5: FUNC(lpc_encode_unrolled)(res, smp, len,     5, coefs, shift, 0); break;
+    case  6: FUNC(lpc_encode_unrolled)(res, smp, len,     6, coefs, shift, 0); break;
+    case  7: FUNC(lpc_encode_unrolled)(res, smp, len,     7, coefs, shift, 0); break;
+    case  8: FUNC(lpc_encode_unrolled)(res, smp, len,     8, coefs, shift, 0); break;
+    default: FUNC(lpc_encode_unrolled)(res, smp, len, order, coefs, shift, 1); break;
+    }
+#endif
+}
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 115664a779..71024f272e 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -31,6 +31,7 @@
 #include "lpc.h"
 #include "flac.h"
 #include "flacdata.h"
+#include "flacdsp.h"
 
 #define FLAC_SUBFRAME_CONSTANT  0
 #define FLAC_SUBFRAME_VERBATIM  1
@@ -106,6 +107,7 @@ typedef struct FlacEncodeContext {
     uint8_t *md5_buffer;
     unsigned int md5_buffer_size;
     DSPContext dsp;
+    FLACDSPContext flac_dsp;
 } FlacEncodeContext;
 
 
@@ -385,6 +387,7 @@ static av_cold int flac_encode_init(AVCodecContext *avctx)
                       s->options.max_prediction_order, FF_LPC_TYPE_LEVINSON);
 
     ff_dsputil_init(&s->dsp, avctx);
+    ff_flacdsp_init(&s->flac_dsp, avctx->sample_fmt, 16);
 
     dprint_compression_options(s);
 
@@ -684,110 +687,6 @@ static void encode_residual_fixed(int32_t *res, const int32_t *smp, int n,
 }
 
 
-#define LPC1(x) {\
-    int c = coefs[(x)-1];\
-    p0   += c * s;\
-    s     = smp[i-(x)+1];\
-    p1   += c * s;\
-}
-
-static av_always_inline void encode_residual_lpc_unrolled(int32_t *res,
-                                    const int32_t *smp, int n, int order,
-                                    const int32_t *coefs, int shift, int big)
-{
-    int i;
-    for (i = order; i < n; i += 2) {
-        int s  = smp[i-order];
-        int p0 = 0, p1 = 0;
-        if (big) {
-            switch (order) {
-            case 32: LPC1(32)
-            case 31: LPC1(31)
-            case 30: LPC1(30)
-            case 29: LPC1(29)
-            case 28: LPC1(28)
-            case 27: LPC1(27)
-            case 26: LPC1(26)
-            case 25: LPC1(25)
-            case 24: LPC1(24)
-            case 23: LPC1(23)
-            case 22: LPC1(22)
-            case 21: LPC1(21)
-            case 20: LPC1(20)
-            case 19: LPC1(19)
-            case 18: LPC1(18)
-            case 17: LPC1(17)
-            case 16: LPC1(16)
-            case 15: LPC1(15)
-            case 14: LPC1(14)
-            case 13: LPC1(13)
-            case 12: LPC1(12)
-            case 11: LPC1(11)
-            case 10: LPC1(10)
-            case  9: LPC1( 9)
-                     LPC1( 8)
-                     LPC1( 7)
-                     LPC1( 6)
-                     LPC1( 5)
-                     LPC1( 4)
-                     LPC1( 3)
-                     LPC1( 2)
-                     LPC1( 1)
-            }
-        } else {
-            switch (order) {
-            case  8: LPC1( 8)
-            case  7: LPC1( 7)
-            case  6: LPC1( 6)
-            case  5: LPC1( 5)
-            case  4: LPC1( 4)
-            case  3: LPC1( 3)
-            case  2: LPC1( 2)
-            case  1: LPC1( 1)
-            }
-        }
-        res[i  ] = smp[i  ] - (p0 >> shift);
-        res[i+1] = smp[i+1] - (p1 >> shift);
-    }
-}
-
-
-static void encode_residual_lpc(int32_t *res, const int32_t *smp, int n,
-                                int order, const int32_t *coefs, int shift)
-{
-    int i;
-    for (i = 0; i < order; i++)
-        res[i] = smp[i];
-#if CONFIG_SMALL
-    for (i = order; i < n; i += 2) {
-        int j;
-        int s  = smp[i];
-        int p0 = 0, p1 = 0;
-        for (j = 0; j < order; j++) {
-            int c = coefs[j];
-            p1   += c * s;
-            s     = smp[i-j-1];
-            p0   += c * s;
-        }
-        res[i  ] = smp[i  ] - (p0 >> shift);
-        res[i+1] = smp[i+1] - (p1 >> shift);
-    }
-#else
-    switch (order) {
-    case  1: encode_residual_lpc_unrolled(res, smp, n, 1, coefs, shift, 0); break;
-    case  2: encode_residual_lpc_unrolled(res, smp, n, 2, coefs, shift, 0); break;
-    case  3: encode_residual_lpc_unrolled(res, smp, n, 3, coefs, shift, 0); break;
-    case  4: encode_residual_lpc_unrolled(res, smp, n, 4, coefs, shift, 0); break;
-    case  5: encode_residual_lpc_unrolled(res, smp, n, 5, coefs, shift, 0); break;
-    case  6: encode_residual_lpc_unrolled(res, smp, n, 6, coefs, shift, 0); break;
-    case  7: encode_residual_lpc_unrolled(res, smp, n, 7, coefs, shift, 0); break;
-    case  8: encode_residual_lpc_unrolled(res, smp, n, 8, coefs, shift, 0); break;
-    default: encode_residual_lpc_unrolled(res, smp, n, order, coefs, shift, 1); break;
-    }
-#endif
-}
-
-
 static int encode_residual_ch(FlacEncodeContext *s, int ch)
 {
     int i, n;
@@ -869,7 +768,8 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             order = min_order + (((max_order-min_order+1) * (i+1)) / levels)-1;
             if (order < 0)
                 order = 0;
-            encode_residual_lpc(res, smp, n, order+1, coefs[order], shift[order]);
+            s->flac_dsp.lpc_encode(res, smp, n, order+1, coefs[order],
+                                   shift[order]);
             bits[i] = find_subframe_rice_params(s, sub, order+1);
             if (bits[i] < bits[opt_index]) {
                 opt_index = i;
@@ -883,7 +783,7 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
         opt_order = 0;
         bits[0]   = UINT32_MAX;
         for (i = min_order-1; i < max_order; i++) {
-            encode_residual_lpc(res, smp, n, i+1, coefs[i], shift[i]);
+            s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
             bits[i] = find_subframe_rice_params(s, sub, i+1);
             if (bits[i] < bits[opt_order])
                 opt_order = i;
@@ -901,7 +801,7 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
             for (i = last-step; i <= last+step; i += step) {
                 if (i < min_order-1 || i >= max_order || bits[i] < UINT32_MAX)
                     continue;
-                encode_residual_lpc(res, smp, n, i+1, coefs[i], shift[i]);
+                s->flac_dsp.lpc_encode(res, smp, n, i+1, coefs[i], shift[i]);
                 bits[i] = find_subframe_rice_params(s, sub, i+1);
                 if (bits[i] < bits[opt_order])
                     opt_order = i;
@@ -916,7 +816,7 @@ static int encode_residual_ch(FlacEncodeContext *s, int ch)
     for (i = 0; i < sub->order; i++)
         sub->coefs[i] = coefs[sub->order-1][i];
 
-    encode_residual_lpc(res, smp, n, sub->order, sub->coefs, sub->shift);
+    s->flac_dsp.lpc_encode(res, smp, n, sub->order, sub->coefs, sub->shift);
 
     find_subframe_rice_params(s, sub, sub->order);