6 files changed, 9 insertions, 38 deletions
diff --git a/libavcodec/acelp_pitch_delay.c b/libavcodec/acelp_pitch_delay.c
index 395247dd2a..214a272c32 100644
--- a/libavcodec/acelp_pitch_delay.c
+++ b/libavcodec/acelp_pitch_delay.c
@@ -106,7 +106,7 @@ int16_t ff_acelp_decode_gain_code(
         mr_energy += quant_energy[i] * ma_prediction_coeff[i];
 
     mr_energy = gain_corr_factor * exp(M_LN10 / (20 << 23) * mr_energy) /
-                sqrt(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size, 0));
+                sqrt(dsp->scalarproduct_int16(fc_v, fc_v, subframe_size));
     return mr_energy >> 12;
 }
 
diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c
index 68e5b3ed42..b2931fe525 100644
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -171,8 +171,7 @@ void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
 
 void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, int blocksize);
 
-int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len,
-                                    int shift);
+int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
 int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3, int len, int mul);
 
diff --git a/libavcodec/arm/int_neon.S b/libavcodec/arm/int_neon.S
index 8bb58afb18..ea479bb580 100644
--- a/libavcodec/arm/int_neon.S
+++ b/libavcodec/arm/int_neon.S
@@ -29,32 +29,8 @@ function ff_scalarproduct_int16_neon, export=1
         vmov.i16        q1,  #0
         vmov.i16        q2,  #0
         vmov.i16        q3,  #0
-        negs            r3,  r3
-        beq             2f
-
-        vdup.s32        q12, r3
 1:      vld1.16         {d16-d17}, [r0]!
         vld1.16         {d20-d21}, [r1,:128]!
-        vmull.s16       q12, d16,  d20
-        vld1.16         {d18-d19}, [r0]!
-        vmull.s16       q13, d17,  d21
-        vld1.16         {d22-d23}, [r1,:128]!
-        vmull.s16       q14, d18,  d22
-        vmull.s16       q15, d19,  d23
-        vshl.s32        q8,  q12,  q12
-        vshl.s32        q9,  q13,  q12
-        vadd.s32        q0,  q0,   q8
-        vshl.s32        q10, q14,  q12
-        vadd.s32        q1,  q1,   q9
-        vshl.s32        q11, q15,  q12
-        vadd.s32        q2,  q2,   q10
-        vadd.s32        q3,  q3,   q11
-        subs            r2,  r2,   #16
-        bne             1b
-        b               3f
-
-2:      vld1.16         {d16-d17}, [r0]!
-        vld1.16         {d20-d21}, [r1,:128]!
         vmlal.s16       q0,  d16,  d20
         vld1.16         {d18-d19}, [r0]!
         vmlal.s16       q1,  d17,  d21
@@ -62,9 +38,9 @@ function ff_scalarproduct_int16_neon, export=1
         vmlal.s16       q2,  d18,  d22
         vmlal.s16       q3,  d19,  d23
         subs            r2,  r2,   #16
-        bne             2b
+        bne             1b
 
-3:      vpadd.s32       d16, d0,   d1
+        vpadd.s32       d16, d0,   d1
         vpadd.s32       d17, d2,   d3
         vpadd.s32       d10, d4,   d5
         vpadd.s32       d11, d6,   d7
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index 29c5976596..f5b7d076d1 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -2559,12 +2559,12 @@ static void vector_clipf_c(float *dst, const float *src, float min, float max, i
     }
 }
 
-static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
+static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
 {
     int res = 0;
 
     while (order--)
-        res += (*v1++ * *v2++) >> shift;
+        res += *v1++ * *v2++;
 
     return res;
 }
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index 0a6165685e..aa026e15f5 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -536,9 +536,8 @@ typedef struct DSPContext {
     /**
      * Calculate scalar product of two vectors.
      * @param len length of vectors, should be multiple of 16
-     * @param shift number of bits to discard from product
      */
-    int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len, int shift);
+    int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len);
     /* ape functions */
     /**
      * Calculate scalar product of v1 and v2,
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 09940d147d..da08bdab50 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -35,13 +35,12 @@ pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 SECTION_TEXT
 
 %macro SCALARPRODUCT 1
-; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
-cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
+; int32_t scalarproduct_int16(const int16_t *v1, const int16_t *v2, int order)
+cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
     shl orderq, 1
     add v1q, orderq
     add v2q, orderq
     neg orderq
-    movd    m3, shiftm
     pxor    m2, m2
 .loop:
     movu    m0, [v1q + orderq]
@@ -55,10 +54,8 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
 %if mmsize == 16
     movhlps m0, m2
     paddd   m2, m0
-    psrad   m2, m3
     pshuflw m0, m2, 0x4e
 %else
-    psrad   m2, m3
     pshufw  m0, m2, 0x4e
 %endif
     paddd   m2, m0