Merge remote-tracking branch 'newdev/master'

* newdev/master: avio: make udp_set_remote_url/get_local_port internal. asfdec: also subtract preroll when reading simple index object matroskaenc: remove a variable that's unused after bc17bd9. avio: cosmetics - nicer vertical alignment. Remove unnecessary icc version checks Disable 'attribute "foo" ignored' warnings from icc rtsp: Don't use a locale dependent format string Add xd55 codec tag for XDCAM HD422 720p25 CBR files. configure: get libavcodec version from new version.h header lavc: move the version macros to a new installed header. matroskaenc: simplify get_aac_sample_rates by using ff_mpeg4audio_get_config Do not use format string "%0.3f" for RTSP Range field. Add apply_window_int16() to DSPContext with x86-optimized versions and use it in the ac3_fixed encoder. Document usage of import libraries created by dlltool configure: Set the correct lib target for arm/wince dlltool fate: simplify regression-funcs.sh fate: add support for multithread testing Conflicts: libavformat/rtspdec.c libavutil/attributes.h libavutil/internal.h libavutil/mem.h Merged-by: Michael Niedermayer <michaelni@gmx.at>
author: Michael Niedermayer <michaelni@gmx.at> 2011-03-24 02:12:17 +0100
committer: Michael Niedermayer <michaelni@gmx.at> 2011-03-24 02:16:11 +0100
commit: 2fd41c9067fc67b40f80e9cbd4787018009040db (patch)
tree: 378cc399057a6089f3f06bc62f0eff97d3ada56b /libavcodec
parent: 00dc0206cb5b351a66d7cce77b8a65fabe6ea7da (diff)
parent: 4ec153bb66a95da46c98e269bd0aa787e6172ed3 (diff)
11 files changed, 275 insertions, 66 deletions
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 4d3fbe1771..ef91ee3159 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -3,7 +3,7 @@ include $(SUBDIR)../config.mak
 NAME = avcodec
 FFLIBS = avutil
 
-HEADERS = avcodec.h avfft.h dxva2.h opt.h vaapi.h vdpau.h xvmc.h
+HEADERS = avcodec.h avfft.h dxva2.h opt.h vaapi.h vdpau.h version.h xvmc.h
 
 OBJS = allcodecs.o                                                      \
        audioconvert.o                                                   \
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 4413e5bd6b..72b0291727 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -167,7 +167,7 @@ static av_cold int mdct_init(AVCodecContext *avctx, AC3MDCTContext *mdct,
 static void mdct512(AC3MDCTContext *mdct, CoefType *out, SampleType *in);
 
 static void apply_window(DSPContext *dsp, SampleType *output, const SampleType *input,
-                         const SampleType *window, int n);
+                         const SampleType *window, unsigned int len);
 
 static int normalize_samples(AC3EncodeContext *s);
 
diff --git a/libavcodec/ac3enc_fixed.c b/libavcodec/ac3enc_fixed.c
index d1bb429ccc..f682aa625f 100644
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -252,15 +252,9 @@ static void mdct512(AC3MDCTContext *mdct, int32_t *out, int16_t *in)
  * Apply KBD window to input samples prior to MDCT.
  */
 static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
-                         const int16_t *window, int n)
+                         const int16_t *window, unsigned int len)
 {
-    int i;
-    int n2 = n >> 1;
-
-    for (i = 0; i < n2; i++) {
-        output[i]     = MUL16(input[i],     window[i]) >> 15;
-        output[n-i-1] = MUL16(input[n-i-1], window[i]) >> 15;
-    }
+    dsp->apply_window_int16(output, input, window, len);
 }
 
 
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index e0783f9aa3..f5b01f7d6f 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -83,9 +83,9 @@ static void mdct512(AC3MDCTContext *mdct, float *out, float *in)
  * Apply KBD window to input samples prior to MDCT.
  */
 static void apply_window(DSPContext *dsp, float *output, const float *input,
-                         const float *window, int n)
+                         const float *window, unsigned int len)
 {
-    dsp->vector_fmul(output, input, window, n);
+    dsp->vector_fmul(output, input, window, len);
 }
 
 
diff --git a/libavcodec/ac3tab.c b/libavcodec/ac3tab.c
index f7b030209c..fb8e9c7b52 100644
--- a/libavcodec/ac3tab.c
+++ b/libavcodec/ac3tab.c
@@ -141,7 +141,7 @@ const uint8_t ff_ac3_rematrix_band_tab[5] = { 13, 25, 37, 61, 253 };
 /* AC-3 MDCT window */
 
 /* MDCT window */
-const int16_t ff_ac3_window[AC3_WINDOW_SIZE/2] = {
+DECLARE_ALIGNED(16, const int16_t, ff_ac3_window)[AC3_WINDOW_SIZE/2] = {
     4,    7,   12,   16,   21,   28,   34,   42,
    51,   61,   72,   84,   97,  111,  127,  145,
   164,  184,  207,  231,  257,  285,  315,  347,
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 0f9201d8e4..5a9411f633 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -31,57 +31,7 @@
 #include "libavutil/avutil.h"
 #include "libavutil/cpu.h"
 
-#define LIBAVCODEC_VERSION_MAJOR 52
-#define LIBAVCODEC_VERSION_MINOR 114
-#define LIBAVCODEC_VERSION_MICRO  0
-
-#define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
-                                               LIBAVCODEC_VERSION_MINOR, \
-                                               LIBAVCODEC_VERSION_MICRO)
-#define LIBAVCODEC_VERSION      AV_VERSION(LIBAVCODEC_VERSION_MAJOR,    \
-                                           LIBAVCODEC_VERSION_MINOR,    \
-                                           LIBAVCODEC_VERSION_MICRO)
-#define LIBAVCODEC_BUILD        LIBAVCODEC_VERSION_INT
-
-#define LIBAVCODEC_IDENT        "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
-
-/**
- * Those FF_API_* defines are not part of public API.
- * They may change, break or disappear at any time.
- */
-#ifndef FF_API_PALETTE_CONTROL
-#define FF_API_PALETTE_CONTROL  (LIBAVCODEC_VERSION_MAJOR < 54)
-#endif
-#ifndef FF_API_MM_FLAGS
-#define FF_API_MM_FLAGS         (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_OPT_SHOW
-#define FF_API_OPT_SHOW         (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_AUDIO_OLD
-#define FF_API_AUDIO_OLD        (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_VIDEO_OLD
-#define FF_API_VIDEO_OLD        (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_SUBTITLE_OLD
-#define FF_API_SUBTITLE_OLD     (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_USE_LPC
-#define FF_API_USE_LPC          (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_SET_STRING_OLD
-#define FF_API_SET_STRING_OLD   (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_INOFFICIAL
-#define FF_API_INOFFICIAL       (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_OLD_SAMPLE_FMT
-#define FF_API_OLD_SAMPLE_FMT   (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
-#ifndef FF_API_OLD_AUDIOCONVERT
-#define FF_API_OLD_AUDIOCONVERT (LIBAVCODEC_VERSION_MAJOR < 53)
-#endif
+#include "libavcodec/version.h"
 
 #if LIBAVCODEC_VERSION_MAJOR < 53
 #   define FF_INTERNALC_MEM_TYPE unsigned int
diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
index d4e538cd07..951236c8c3 100644
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3895,6 +3895,19 @@ static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, co
     return res;
 }
 
+static void apply_window_int16_c(int16_t *output, const int16_t *input,
+                                 const int16_t *window, unsigned int len)
+{
+    int i;
+    int len2 = len >> 1;
+
+    for (i = 0; i < len2; i++) {
+        int16_t w       = window[i];
+        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
+        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
+    }
+}
+
 #define W0 2048
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4369,6 +4382,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
     c->vector_clipf = vector_clipf_c;
     c->scalarproduct_int16 = scalarproduct_int16_c;
     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
+    c->apply_window_int16 = apply_window_int16_c;
     c->scalarproduct_float = scalarproduct_float_c;
     c->butterflies_float = butterflies_float_c;
     c->vector_fmul_scalar = vector_fmul_scalar_c;
diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
index d42abe8956..4f1f650dc8 100644
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -526,6 +526,20 @@ typedef struct DSPContext {
      */
     int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
 
+    /**
+     * Apply symmetric window in 16-bit fixed-point.
+     * @param output destination array
+     *               constraints: 16-byte aligned
+     * @param input  source array
+     *               constraints: 16-byte aligned
+     * @param window window array
+     *               constraints: 16-byte aligned, at least len/2 elements
+     * @param len    full window length
+     *               constraints: multiple of ? greater than zero
+     */
+    void (*apply_window_int16)(int16_t *output, const int16_t *input,
+                               const int16_t *window, unsigned int len);
+
     /* rv30 functions */
     qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
     qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
diff --git a/libavcodec/version.h b/libavcodec/version.h
new file mode 100644
index 0000000000..fe608c78db
--- /dev/null
+++ b/libavcodec/version.h
@@ -0,0 +1,75 @@
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_VERSION_H
+#define AVCODEC_VERSION_H
+
+#define LIBAVCODEC_VERSION_MAJOR 52
+#define LIBAVCODEC_VERSION_MINOR 114
+#define LIBAVCODEC_VERSION_MICRO  0
+
+#define LIBAVCODEC_VERSION_INT  AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \
+                                               LIBAVCODEC_VERSION_MINOR, \
+                                               LIBAVCODEC_VERSION_MICRO)
+#define LIBAVCODEC_VERSION      AV_VERSION(LIBAVCODEC_VERSION_MAJOR,    \
+                                           LIBAVCODEC_VERSION_MINOR,    \
+                                           LIBAVCODEC_VERSION_MICRO)
+#define LIBAVCODEC_BUILD        LIBAVCODEC_VERSION_INT
+
+#define LIBAVCODEC_IDENT        "Lavc" AV_STRINGIFY(LIBAVCODEC_VERSION)
+
+/**
+ * Those FF_API_* defines are not part of public API.
+ * They may change, break or disappear at any time.
+ */
+#ifndef FF_API_PALETTE_CONTROL
+#define FF_API_PALETTE_CONTROL  (LIBAVCODEC_VERSION_MAJOR < 54)
+#endif
+#ifndef FF_API_MM_FLAGS
+#define FF_API_MM_FLAGS         (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_OPT_SHOW
+#define FF_API_OPT_SHOW         (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_AUDIO_OLD
+#define FF_API_AUDIO_OLD        (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_VIDEO_OLD
+#define FF_API_VIDEO_OLD        (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_SUBTITLE_OLD
+#define FF_API_SUBTITLE_OLD     (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_USE_LPC
+#define FF_API_USE_LPC          (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_SET_STRING_OLD
+#define FF_API_SET_STRING_OLD   (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_INOFFICIAL
+#define FF_API_INOFFICIAL       (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_OLD_SAMPLE_FMT
+#define FF_API_OLD_SAMPLE_FMT   (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+#ifndef FF_API_OLD_AUDIOCONVERT
+#define FF_API_OLD_AUDIOCONVERT (LIBAVCODEC_VERSION_MAJOR < 53)
+#endif
+
+#endif /* AVCODEC_VERSION_H */
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 3272556a74..c163a16848 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2393,6 +2393,20 @@ int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int or
 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
+void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
+                                      const int16_t *window, unsigned int len);
+
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2754,6 +2768,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_YASM
             c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
+            if (avctx->flags & CODEC_FLAG_BITEXACT) {
+                c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
+            } else {
+                c->apply_window_int16 = ff_apply_window_int16_mmxext;
+            }
 #endif
         }
         if(mm_flags & AV_CPU_FLAG_SSE){
@@ -2776,13 +2795,30 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 #if HAVE_YASM
             c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
+            if (avctx->flags & CODEC_FLAG_BITEXACT) {
+                c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
+            } else {
+                if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+                    c->apply_window_int16 = ff_apply_window_int16_sse2;
+                }
+            }
 
             c->emulated_edge_mc = emulated_edge_mc_sse;
             c->gmc= gmc_sse;
 #endif
         }
-        if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
-            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+        if (mm_flags & AV_CPU_FLAG_SSSE3) {
+#if HAVE_YASM
+            if (mm_flags & AV_CPU_FLAG_ATOM) {
+                c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
+            } else {
+                c->apply_window_int16 = ff_apply_window_int16_ssse3;
+            }
+            if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
+                c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
+            }
+#endif
+        }
     }
 
     if (CONFIG_ENCODERS)
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 53884a6b40..4658e5e3fa 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -27,6 +27,8 @@ pb_zzzzzzzz77777777: times 8 db -1
 pb_7: times 8 db 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
+pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+pd_16384: times 4 dd 16384
 
 section .text align=16
 
@@ -202,6 +204,130 @@ SCALARPRODUCT_LOOP 0
     RET
 
 
+;-----------------------------------------------------------------------------
+; void ff_apply_window_int16(int16_t *output, const int16_t *input,
+;                            const int16_t *window, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro REVERSE_WORDS_MMXEXT 1-2
+    pshufw   %1, %1, 0x1B
+%endmacro
+
+%macro REVERSE_WORDS_SSE2 1-2
+    pshuflw  %1, %1, 0x1B
+    pshufhw  %1, %1, 0x1B
+    pshufd   %1, %1, 0x4E
+%endmacro
+
+%macro REVERSE_WORDS_SSSE3 2
+    pshufb  %1, %2
+%endmacro
+
+; dst = (dst * src) >> 15
+; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
+; in from the pmullw result.
+%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
+    mova    %3, %1
+    pmulhw  %1, %2
+    pmullw  %3, %2
+    psrlw   %3, 15
+    psllw   %1, 1
+    por     %1, %3
+%endmacro
+
+; dst = ((dst * src) + (1<<14)) >> 15
+%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
+    pmulhrsw   %1, %2
+%endmacro
+
+%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
+cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
+    lea     offset2q, [offsetq-mmsize]
+%if %2
+    mova          m5, [pd_16384]
+%elifidn %1, ssse3
+    mova          m5, [pb_revwords]
+    ALIGN 16
+%endif
+.loop:
+%if %2
+    ; This version expands 16-bit to 32-bit, multiplies by the window,
+    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
+    ; save to the output. The window is reversed for the second half.
+    mova          m3, [windowq+offset2q]
+    mova          m4, [ inputq+offset2q]
+    pxor          m0, m0
+    punpcklwd     m0, m3
+    punpcklwd     m1, m4
+    pmaddwd       m0, m1
+    paddd         m0, m5
+    psrad         m0, 15
+    pxor          m2, m2
+    punpckhwd     m2, m3
+    punpckhwd     m1, m4
+    pmaddwd       m2, m1
+    paddd         m2, m5
+    psrad         m2, 15
+    packssdw      m0, m2
+    mova  [outputq+offset2q], m0
+    REVERSE_WORDS m3
+    mova          m4, [ inputq+offsetq]
+    pxor          m0, m0
+    punpcklwd     m0, m3
+    punpcklwd     m1, m4
+    pmaddwd       m0, m1
+    paddd         m0, m5
+    psrad         m0, 15
+    pxor          m2, m2
+    punpckhwd     m2, m3
+    punpckhwd     m1, m4
+    pmaddwd       m2, m1
+    paddd         m2, m5
+    psrad         m2, 15
+    packssdw      m0, m2
+    mova  [outputq+offsetq], m0
+%elif %3
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The ssse3 version is bit-identical.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    pmulhrsw      m1, m0
+    REVERSE_WORDS m0, m5
+    pmulhrsw      m0, [ inputq+offsetq ]
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m0
+%else
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
+    ; therefore are not bit-identical to the C version.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    mova          m2, [ inputq+offsetq ]
+    MUL16FIXED    m1, m0, m3
+    REVERSE_WORDS m0
+    MUL16FIXED    m2, m0, m3
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m2
+%endif
+    add      offsetd, mmsize
+    sub     offset2d, mmsize
+    jae .loop
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
+%define MUL16FIXED MUL16FIXED_MMXEXT
+APPLY_WINDOW_INT16 mmxext,     0, 0
+APPLY_WINDOW_INT16 mmxext_ba,  1, 0
+INIT_XMM
+%define REVERSE_WORDS REVERSE_WORDS_SSE2
+APPLY_WINDOW_INT16 sse2,       0, 0
+APPLY_WINDOW_INT16 sse2_ba,    1, 0
+APPLY_WINDOW_INT16 ssse3_atom, 0, 1
+%define REVERSE_WORDS REVERSE_WORDS_SSSE3
+APPLY_WINDOW_INT16 ssse3,      0, 1
+
 
 ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
author	Michael Niedermayer <michaelni@gmx.at>	2011-03-24 02:12:17 +0100
committer	Michael Niedermayer <michaelni@gmx.at>	2011-03-24 02:16:11 +0100
commit	2fd41c9067fc67b40f80e9cbd4787018009040db (patch)
tree	378cc399057a6089f3f06bc62f0eff97d3ada56b /libavcodec
parent	00dc0206cb5b351a66d7cce77b8a65fabe6ea7da (diff)
parent	4ec153bb66a95da46c98e269bd0aa787e6172ed3 (diff)