summary refs log tree commit diff
path: root/libavfilter/x86/vf_fspp.asm
diff options
context:
space:
mode:
Diffstat (limited to 'libavfilter/x86/vf_fspp.asm')
-rw-r--r-- libavfilter/x86/vf_fspp.asm 727
1 files changed, 727 insertions, 0 deletions
diff --git a/libavfilter/x86/vf_fspp.asm b/libavfilter/x86/vf_fspp.asm
new file mode 100644
index 0000000000..c7f8f64f1b
--- /dev/null
+++ b/libavfilter/x86/vf_fspp.asm
@@ -0,0 +1,727 @@
+;*****************************************************************************
+;* x86-optimized functions for fspp filter
+;*
+;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+;* Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+; pb_dither: 64-byte dither table (values 0..63); store_slice/store_slice2 consume it 8 bytes per output row
+pb_dither: db 0, 48, 12, 60, 3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
+ 8, 56, 4, 52, 11, 59, 7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
+ 2, 50, 14, 62, 1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
+ 10, 58, 6, 54, 9, 57, 5, 53, 42, 26, 38, 22, 41, 25, 37, 21
+pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14): the value scaled by 2^14, replicated into 4 words (same scheme below)
+pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
+pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
+pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
+pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
+pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
+pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
+pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
+pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
+pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)
+pw_4: times 4 dw 4 ; rounding bias added before the >>3 in row_idct
+pw_2: times 4 dw 2 ; rounding bias added before the >>2 in COLUMN_FDCT
+
+SECTION .text
+; DCTSIZE: row offsets below are written as DCTSIZE*row*2, i.e. rows of DCTSIZE 16-bit values
+%define DCTSIZE 8
+; everything below is assembled for MMX; m0-m7 are the 64-bit mm registers
+INIT_MMX mmx
+
+;void ff_store_slice_mmx(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+; dst[x] = clip_uint8((src[x] + (dither[x & 7] >> log2_scale)) >> (6 - log2_scale)); src[x] and the word 8 src rows above it are zeroed
+%if ARCH_X86_64
+cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+%define dst_strideq r2m
+%define src_strideq r3m
+ mov widthq, r4m ; x86-32: the strides stay in their stack slots, the rest is loaded by hand
+ mov dither_heightq, r5m ; height (turned into an end pointer below)
+ mov ditherq, r6m ; log2_scale (ditherq is reused as the table pointer later)
+%endif
+ add widthq, 7
+ mov tmpq, src_strideq
+ and widthq, ~7 ; width rounded up to a multiple of 8 pixels
+ sub dst_strideq, widthq ; dst_stride = bytes to skip at the end of each dst row
+ movd m5, ditherd ; m5 = log2_scale (shift applied to the dither row)
+ xor ditherq, -1 ; ~log2_scale = -log2_scale - 1
+ mov tmp2q, tmpq
+ add ditherq, 7 ; 6 - log2_scale
+ neg tmpq
+ sub tmp2q, widthq
+ movd m2, ditherd ; m2 = 6 - log2_scale (final right shift)
+ add tmp2q, tmp2q ; (src_stride - width) * 2: words -> bytes
+ lea ditherq, [pb_dither] ; from here on ditherq walks the dither table
+ mov src_strideq, tmp2q ; src_stride = bytes to skip at the end of each src row
+ shl tmpq, 4 ; tmpq = -16 * src_stride bytes = 8 src rows back
+ lea dither_heightq, [ditherq+dither_heightq*8] ; end pointer: 8 table bytes per row
+ pxor m7, m7 ; m7 = 0 (unpack helper and the value stored back to src)
+
+.loop_height:
+ movq m3, [ditherq] ; 8 dither bytes for this row
+ movq m4, m3
+ punpcklbw m3, m7 ; m3 = dither[0..3] as words
+ punpckhbw m4, m7 ; m4 = dither[4..7] as words
+ mov tmp2q, widthq ; tmp2q = pixels left in this row
+ psraw m3, m5 ; dither >> log2_scale
+ psraw m4, m5
+
+.loop_width:
+ movq [srcq+tmpq], m7 ; clear the 8 words located 8 rows above
+ movq m0, [srcq]
+ movq m1, [srcq+8]
+ movq [srcq+tmpq+8], m7
+ paddw m0, m3 ; src + scaled dither
+ paddw m1, m4
+ movq [srcq], m7 ; clear the 8 words just read
+ psraw m0, m2 ; >> (6 - log2_scale)
+ psraw m1, m2
+ movq [srcq+8], m7
+ packuswb m0, m1 ; 8 words -> 8 bytes with unsigned saturation
+ add srcq, 16
+ movq [dstq], m0
+ add dstq, 8
+ sub tmp2q, 8
+ jg .loop_width
+
+ add srcq, src_strideq ; skip to the start of the next src row
+ add ditherq, 8 ; next row of the dither table
+ add dstq, dst_strideq ; skip to the start of the next dst row
+ cmp ditherq, dither_heightq
+ jb .loop_height ; both operands are addresses: compare unsigned
+ RET
+
+;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride,
+; ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
+; dst[x] = clip_uint8((src[x] + src[x + 16*src_stride] + (dither[x & 7] >> log2_scale)) >> (6 - log2_scale)); only the word 16 rows below is zeroed
+%if ARCH_X86_64
+cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
+%else
+cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
+%define dst_strideq r2m
+%define src_strideq r3m
+ mov dstq, dstm ; x86-32: the strides stay in their stack slots, the rest is loaded by hand
+ mov srcq, srcm
+ mov widthq, r4m
+ mov dither_heightq, r5m ; height (turned into an end pointer below)
+ mov ditherq, r6m ; log2_scale (ditherq is reused as the table pointer later)
+%endif
+ add widthq, 7
+ mov tmpq, src_strideq
+ and widthq, ~7 ; width rounded up to a multiple of 8 pixels
+ sub dst_strideq, widthq ; dst_stride = bytes to skip at the end of each dst row
+ movd m5, ditherd ; m5 = log2_scale (shift applied to the dither row)
+ xor ditherq, -1 ; ~log2_scale = -log2_scale - 1
+ mov tmp2q, tmpq
+ add ditherq, 7 ; 6 - log2_scale
+ sub tmp2q, widthq
+ movd m2, ditherd ; m2 = 6 - log2_scale (final right shift)
+ add tmp2q, tmp2q ; (src_stride - width) * 2: words -> bytes
+ lea ditherq, [pb_dither] ; from here on ditherq walks the dither table
+ mov src_strideq, tmp2q ; src_stride = bytes to skip at the end of each src row
+ shl tmpq, 5 ; tmpq = 32 * src_stride bytes = 16 src rows ahead
+ lea dither_heightq, [ditherq+dither_heightq*8] ; end pointer: 8 table bytes per row
+ pxor m7, m7 ; m7 = 0 (unpack helper and the value stored back to src)
+
+.loop_height:
+ movq m3, [ditherq] ; 8 dither bytes for this row
+ movq m4, m3
+ punpcklbw m3, m7 ; m3 = dither[0..3] as words
+ punpckhbw m4, m7 ; m4 = dither[4..7] as words
+ mov tmp2q, widthq ; tmp2q = pixels left in this row
+ psraw m3, m5 ; dither >> log2_scale
+ psraw m4, m5
+
+.loop_width:
+ movq m0, [srcq]
+ movq m1, [srcq+8]
+ paddw m0, m3 ; src + scaled dither
+ paddw m0, [srcq+tmpq] ; + the word 16 rows below
+ paddw m1, m4
+ movq m6, [srcq+tmpq+8]
+ movq [srcq+tmpq], m7 ; clear the row-16 words once they are consumed
+ psraw m0, m2 ; >> (6 - log2_scale)
+ paddw m1, m6
+ movq [srcq+tmpq+8], m7
+ psraw m1, m2
+ packuswb m0, m1 ; 8 words -> 8 bytes with unsigned saturation
+ movq [dstq], m0
+ add srcq, 16
+ add dstq, 8
+ sub tmp2q, 8
+ jg .loop_width
+
+ add srcq, src_strideq ; skip to the start of the next src row
+ add ditherq, 8 ; next row of the dither table
+ add dstq, dst_strideq ; skip to the start of the next dst row
+ cmp ditherq, dither_heightq
+ jb .loop_height ; both operands are addresses: compare unsigned
+ RET
+
+;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q); thr_adr[i] = low 16 bits of thr_adr_noq[i] * q, i = 0..63
+cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
+ movd m7, qd ; q in the low word of m7
+ movq m0, [thrnq+8*0] ; loads run a few quadwords ahead of the matching stores
+ punpcklwd m7, m7
+ movq m1, [thrnq+8*1]
+ punpckldq m7, m7 ; q replicated into all four words
+ pmullw m0, m7
+ movq m2, [thrnq+8*2]
+ pmullw m1, m7
+ movq m3, [thrnq+8*3]
+ pmullw m2, m7
+ movq [thrq+8*0], m0
+ movq m4, [thrnq+8*4]
+ pmullw m3, m7
+ movq [thrq+8*1], m1
+ movq m5, [thrnq+8*5]
+ pmullw m4, m7
+ movq [thrq+8*2], m2
+ movq m6, [thrnq+8*6]
+ pmullw m5, m7
+ movq [thrq+8*3], m3
+ movq m0, [thrnq+8*7] ; m0-m6 are recycled from here on
+ pmullw m6, m7
+ movq [thrq+8*4], m4
+ movq m1, [thrnq+8*8]
+ pmullw m0, m7
+ movq [thrq+8*5], m5
+ movq m2, [thrnq+8*9]
+ pmullw m1, m7
+ movq [thrq+8*6], m6
+ movq m3, [thrnq+8*10]
+ pmullw m2, m7
+ movq [thrq+8*7], m0
+ movq m4, [thrnq+8*11]
+ pmullw m3, m7
+ movq [thrq+8*8], m1
+ movq m5, [thrnq+8*12]
+ pmullw m4, m7
+ movq [thrq+8*9], m2
+ movq m6, [thrnq+8*13]
+ pmullw m5, m7
+ movq [thrq+8*10], m3
+ movq m0, [thrnq+8*14]
+ pmullw m6, m7
+ movq [thrq+8*11], m4
+ movq m1, [thrnq+8*15] ; last of the 16 quadwords
+ pmullw m0, m7
+ movq [thrq+8*12], m5
+ pmullw m1, m7
+ movq [thrq+8*13], m6
+ movq [thrq+8*14], m0
+ movq [thrq+8*15], m1
+ RET
+
+%macro COLUMN_FDCT 1-3 0, 0 ; %1 = label taken when the full COLUMN_IDCT is needed, %2 = byte offset into thr, %3 = extra bytes added to src/out
+ movq m1, [srcq+DCTSIZE*0*2] ; one movq = 4 adjacent columns of a row; rows are DCTSIZE*2 bytes apart
+ movq m7, [srcq+DCTSIZE*3*2]
+ movq m0, m1 ; m0 = row0
+ paddw m1, [srcq+DCTSIZE*7*2] ; m1 = row0+row7
+ movq m3, m7 ; m3 = row3
+ paddw m7, [srcq+DCTSIZE*4*2] ; m7 = row3+row4
+ movq m5, m1
+ movq m6, [srcq+DCTSIZE*1*2]
+ psubw m1, m7 ; m1 = (row0+row7)-(row3+row4)
+ movq m2, [srcq+DCTSIZE*2*2]
+ movq m4, m6 ; m4 = row1
+ paddw m6, [srcq+DCTSIZE*6*2] ; m6 = row1+row6
+ paddw m5, m7 ; m5 = (row0+row7)+(row3+row4)
+ paddw m2, [srcq+DCTSIZE*5*2] ; m2 = row2+row5
+ movq m7, m6
+ paddw m6, m2
+ psubw m7, m2
+ movq m2, m5
+ paddw m5, m6 ; m5 = sum of all eight rows
+ psubw m2, m6 ; m2 = (row0+row7+row3+row4)-(row1+row6+row2+row5)
+ paddw m7, m1
+ movq m6, [thrq+4*16+%2] ; m6 = threshold t used for m2
+ psllw m7, 2
+ psubw m5, [thrq+%2] ; threshold m5 against [thrq+%2] and m2 against m6: the psubw/paddusw/paddw/psubusw
+ psubw m2, m6 ; sequence with one value t zeroes inputs in [-t, t) and keeps the rest (t >= 0)
+ paddusw m5, [thrq+%2]
+ paddusw m2, m6
+ pmulhw m7, [pw_2D41]
+ paddw m5, [thrq+%2]
+ paddw m2, m6
+ psubusw m5, [thrq+%2]
+ psubusw m2, m6
+ paddw m5, [pw_2] ; rounding bias for the >>2 below
+ movq m6, m2
+ paddw m2, m5
+ psubw m5, m6
+ movq m6, m1
+ paddw m1, m7
+ psubw m1, [thrq+2*16+%2] ; same threshold sequence on m1, t = [thrq+2*16+%2]
+ psubw m6, m7
+ movq m7, [thrq+6*16+%2] ; m7 = threshold t used for m6
+ psraw m5, 2
+ paddusw m1, [thrq+2*16+%2]
+ psubw m6, m7
+ paddw m1, [thrq+2*16+%2]
+ paddusw m6, m7
+ psubusw m1, [thrq+2*16+%2]
+ paddw m6, m7
+ psubw m3, [srcq+DCTSIZE*4*2] ; m3 = row3-row4
+ psubusw m6, m7
+ movq m7, m1
+ psraw m2, 2
+ psubw m4, [srcq+DCTSIZE*6*2] ; m4 = row1-row6
+ psubw m1, m6
+ psubw m0, [srcq+DCTSIZE*7*2] ; m0 = row0-row7
+ paddw m6, m7
+ psraw m6, 2
+ movq m7, m2
+ pmulhw m1, [pw_5A82]
+ paddw m2, m6
+ movq [rsp], m2 ; stack scratch: [rsp], [rsp+8], [rsp+8*2], [rsp+8*3] are read back below and by COLUMN_IDCT
+ psubw m7, m6
+ movq m2, [srcq+DCTSIZE*2*2]
+ psubw m1, m6
+ psubw m2, [srcq+DCTSIZE*5*2] ; m2 = row2-row5
+ movq m6, m5
+ movq [rsp+8*3], m7
+ paddw m3, m2
+ paddw m2, m4
+ paddw m4, m0
+ movq m7, m3
+ psubw m3, m4
+ psllw m3, 2
+ psllw m7, 2
+ pmulhw m3, [pw_187E]
+ psllw m4, 2
+ pmulhw m7, [pw_22A3]
+ psllw m2, 2
+ pmulhw m4, [pw_539F]
+ paddw m5, m1
+ pmulhw m2, [pw_2D41]
+ psubw m6, m1
+ paddw m7, m3
+ movq [rsp+8], m5
+ paddw m4, m3
+ movq m3, [thrq+3*16+%2] ; m3 = threshold t used for m1
+ movq m1, m0
+ movq [rsp+8*2], m6
+ psubw m1, m2
+ paddw m0, m2
+ movq m5, m1
+ movq m2, [thrq+5*16+%2] ; m2 = threshold t used for m5
+ psubw m1, m7
+ paddw m5, m7
+ psubw m1, m3
+ movq m7, [thrq+16+%2] ; m7 = threshold t used for m0
+ psubw m5, m2
+ movq m6, m0
+ paddw m0, m4
+ paddusw m1, m3
+ psubw m6, m4
+ movq m4, [thrq+7*16+%2] ; m4 = threshold t used for m6
+ psubw m0, m7
+ psubw m6, m4
+ paddusw m5, m2
+ paddusw m6, m4
+ paddw m1, m3
+ paddw m5, m2
+ paddw m6, m4
+ psubusw m1, m3
+ psubusw m5, m2
+ psubusw m6, m4
+ movq m4, m1 ; m4 = m1|m5|m6: the three thresholded values tested below
+ por m4, m5
+ paddusw m0, m7
+ por m4, m6
+ paddw m0, m7
+ packssdw m4, m4 ; fold the 64 bits into the low 32 (a non-zero dword stays non-zero)
+ psubusw m0, m7
+ movd tmpd, m4
+ or tmpd, tmpd
+ jnz %1 ; something survived: leave for COLUMN_IDCT with m0, m1, m5, m6 live
+ movq m4, [rsp] ; shortcut: m1, m5 and m6 are all zero, only m0 contributes from that group
+ movq m1, m0
+ pmulhw m0, [pw_3642]
+ movq m2, m1
+ movq m5, [outq+DCTSIZE*0*2]
+ movq m3, m2
+ pmulhw m1, [pw_2441]
+ paddw m5, m4
+ movq m6, [rsp+8]
+ psraw m3, 2
+ pmulhw m2, [pw_0CBB]
+ psubw m4, m3
+ movq m7, [outq+DCTSIZE*1*2]
+ paddw m5, m3
+ movq [outq+DCTSIZE*7*2], m4 ; out row 7 is overwritten
+ paddw m7, m6
+ movq m3, [rsp+8*2]
+ psubw m6, m0
+ movq m4, [outq+DCTSIZE*2*2]
+ paddw m7, m0
+ movq [outq], m5 ; out rows 0-5 are accumulated (old value + new)
+ paddw m4, m3
+ movq [outq+DCTSIZE*6*2], m6 ; out row 6 is overwritten
+ psubw m3, m1
+ movq m5, [outq+DCTSIZE*5*2]
+ paddw m4, m1
+ movq m6, [outq+DCTSIZE*3*2]
+ paddw m5, m3
+ movq m0, [rsp+8*3]
+ add srcq, 8+%3 ; advance src by 4 columns plus %3 bytes
+ movq [outq+DCTSIZE*1*2], m7
+ paddw m6, m0
+ movq [outq+DCTSIZE*2*2], m4
+ psubw m0, m2
+ movq m7, [outq+DCTSIZE*4*2]
+ paddw m6, m2
+ movq [outq+DCTSIZE*5*2], m5
+ paddw m7, m0
+ movq [outq+DCTSIZE*3*2], m6
+ movq [outq+DCTSIZE*4*2], m7
+ add outq, 8+%3 ; advance out by the same amount
+%endmacro
+
+%macro COLUMN_IDCT 0-1 0 ; %1 = extra bytes added to src/out
+ movq m3, m5 ; entered via COLUMN_FDCT's jnz: m0, m1, m5, m6 hold its thresholded values, [rsp..rsp+8*3] its stack scratch
+ psubw m5, m1
+ psllw m5, 1
+ paddw m3, m1
+ movq m2, m0
+ psubw m0, m6
+ movq m1, m5
+ psllw m0, 1
+ pmulhw m1, [pw_AC62]
+ paddw m5, m0
+ pmulhw m5, [pw_3B21]
+ paddw m2, m6
+ pmulhw m0, [pw_22A3]
+ movq m7, m2
+ movq m4, [rsp] ; scratch 0
+ psubw m2, m3
+ psllw m2, 1
+ paddw m7, m3
+ pmulhw m2, [pw_2D41]
+ movq m6, m4
+ psraw m7, 2
+ paddw m4, [outq] ; out row 0 is accumulated
+ psubw m6, m7
+ movq m3, [rsp+8] ; scratch 1
+ paddw m4, m7
+ movq [outq+DCTSIZE*7*2], m6 ; out row 7 is overwritten
+ paddw m1, m5
+ movq [outq], m4
+ psubw m1, m7
+ movq m7, [rsp+8*2] ; scratch 2
+ psubw m0, m5
+ movq m6, [rsp+8*3] ; scratch 3
+ movq m5, m3
+ paddw m3, [outq+DCTSIZE*1*2] ; out row 1 is accumulated
+ psubw m5, m1
+ psubw m2, m1
+ paddw m3, m1
+ movq [outq+DCTSIZE*6*2], m5 ; out row 6 is overwritten
+ movq m4, m7
+ paddw m7, [outq+DCTSIZE*2*2] ; out row 2 is accumulated
+ psubw m4, m2
+ paddw m4, [outq+DCTSIZE*5*2] ; out row 5 is accumulated
+ paddw m7, m2
+ movq [outq+DCTSIZE*1*2], m3
+ paddw m0, m2
+ movq [outq+DCTSIZE*2*2], m7
+ movq m1, m6
+ paddw m6, [outq+DCTSIZE*4*2] ; out row 4 is accumulated
+ psubw m1, m0
+ paddw m1, [outq+DCTSIZE*3*2] ; out row 3 is accumulated
+ paddw m6, m0
+ movq [outq+DCTSIZE*5*2], m4
+ add srcq, 8+%1 ; advance src by 4 columns plus %1 bytes
+ movq [outq+DCTSIZE*4*2], m6
+ movq [outq+DCTSIZE*3*2], m1
+ add outq, 8+%1 ; advance out by the same amount
+%endmacro
+
+;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt); 32 bytes of stack scratch, cnt drops by 2 per pass
+cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
+.lo_fdct: ; first 4 columns: thr offset 0, pointers advance 8 bytes
+ COLUMN_FDCT .lo_idct, 0, 0
+ jmp .hi_fdct ; shortcut path taken, skip the full inverse
+
+.lo_idct: ; reached only through the jnz inside COLUMN_FDCT
+ COLUMN_IDCT 0
+
+.hi_fdct: ; next 4 columns: thr offset 8, pointers advance 8+16 bytes
+ COLUMN_FDCT .hi_idct, 8, 16
+ sub cntd, 2
+ jg .lo_fdct
+ RET
+
+.hi_idct: ; full inverse for the second half, then the same loop tail
+ COLUMN_IDCT 16
+ sub cntd, 2
+ jg .lo_fdct
+ RET
+
+;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt); per pass: 4 workspace rows of 8 words -> (x+4)>>3 added into an 8-row x 4-word patch of output_adr
+cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
+ add strideq, strideq ; stride doubled: used below as the byte distance between rows of 16-bit words
+ lea stride3q, [strideq+strideq*2] ; stride3 = 3 rows in bytes
+.loop:
+ movq m0, [srcq+DCTSIZE*0*2] ; words 0-3 of four consecutive src rows
+ movq m1, [srcq+DCTSIZE*1*2]
+ movq m4, m0
+ movq m2, [srcq+DCTSIZE*2*2]
+ punpcklwd m0, m1 ; the punpck* steps transpose the 4x4 word tile
+ movq m3, [srcq+DCTSIZE*3*2]
+ punpckhwd m4, m1
+ movq m7, m2
+ punpcklwd m2, m3
+ movq m6, m0
+ punpckldq m0, m2
+ punpckhdq m6, m2
+ movq m5, m0
+ punpckhwd m7, m3
+ psubw m0, m6
+ pmulhw m0, [pw_5A82]
+ movq m2, m4
+ punpckldq m4, m7
+ paddw m5, m6
+ punpckhdq m2, m7
+ movq m1, m4
+ psllw m0, 2
+ paddw m4, m2
+ movq m3, [srcq+DCTSIZE*0*2+8] ; words 4-7 of the same four rows
+ psubw m1, m2
+ movq m2, [srcq+DCTSIZE*1*2+8]
+ psubw m0, m5
+ movq m6, m4
+ paddw m4, m5
+ psubw m6, m5
+ movq m7, m1
+ movq m5, [srcq+DCTSIZE*2*2+8]
+ paddw m1, m0
+ movq [rsp], m4 ; spill, reloaded twice below
+ movq m4, m3
+ movq [rsp+8], m6 ; spill, reloaded twice below
+ punpcklwd m3, m2 ; second 4x4 transpose
+ movq m6, [srcq+DCTSIZE*3*2+8]
+ punpckhwd m4, m2
+ movq m2, m5
+ punpcklwd m5, m6
+ psubw m7, m0
+ punpckhwd m2, m6
+ movq m0, m3
+ punpckldq m3, m5
+ punpckhdq m0, m5
+ movq m5, m4
+ movq m6, m3
+ punpckldq m4, m2
+ psubw m3, m0
+ punpckhdq m5, m2
+ paddw m6, m0
+ movq m2, m4
+ movq m0, m3
+ psubw m4, m5
+ pmulhw m0, [pw_AC62]
+ paddw m3, m4
+ pmulhw m3, [pw_3B21]
+ paddw m2, m5
+ pmulhw m4, [pw_22A3]
+ movq m5, m2
+ psubw m2, m6
+ paddw m5, m6
+ pmulhw m2, [pw_2D41]
+ paddw m0, m3
+ psllw m0, 3
+ psubw m4, m3
+ movq m6, [rsp]
+ movq m3, m1
+ psllw m4, 3
+ psubw m0, m5
+ psllw m2, 3
+ paddw m1, m0
+ psubw m2, m0
+ psubw m3, m0
+ paddw m4, m2
+ movq m0, m7
+ paddw m7, m2
+ psubw m0, m2
+ movq m2, [pw_4] ; rounding bias for every >>3 below
+ psubw m6, m5
+ paddw m5, [rsp]
+ paddw m1, m2
+ paddw m5, m2
+ psraw m1, 3
+ paddw m7, m2
+ psraw m5, 3
+ paddw m5, [dstq] ; accumulate into dst row 0
+ psraw m7, 3
+ paddw m1, [dstq+strideq*1] ; dst row 1
+ paddw m0, m2
+ paddw m7, [dstq+strideq*2] ; dst row 2
+ paddw m3, m2
+ movq [dstq], m5
+ paddw m6, m2
+ movq [dstq+strideq*1], m1
+ psraw m0, 3
+ movq [dstq+strideq*2], m7
+ add dstq, stride3q ; dstq now points at row 3
+ movq m5, [rsp+8]
+ psraw m3, 3
+ paddw m0, [dstq+strideq*2] ; dst row 5
+ psubw m5, m4
+ paddw m3, [dstq+stride3q*1] ; dst row 6
+ psraw m6, 3
+ paddw m4, [rsp+8]
+ paddw m5, m2
+ paddw m6, [dstq+strideq*4] ; dst row 7
+ paddw m4, m2
+ movq [dstq+strideq*2], m0
+ psraw m5, 3
+ paddw m5, [dstq] ; dst row 3
+ psraw m4, 3
+ paddw m4, [dstq+strideq*1] ; dst row 4
+ add srcq, DCTSIZE*2*4 ; next four src rows
+ movq [dstq+stride3q*1], m3
+ movq [dstq+strideq*4], m6
+ movq [dstq], m5
+ movq [dstq+strideq*1], m4
+ sub dstq, stride3q ; back to row 0 ...
+ add dstq, 8 ; ... and four words to the right
+ dec cntd ; loop counter by its declared name (was raw r3d), matching row_fdct
+ jnz .loop
+ RET
+
+;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt); per pass: an 8-row x 4-pixel strip of pixels -> 4 rows of 8 words written to data (one row per pixel column)
+cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
+ lea stride3q, [strideq+strideq*2] ; stride3 = 3 * line_size
+.loop:
+ movd m0, [pixq] ; row 0, 4 pixels
+ pxor m7, m7 ; zero for the byte -> word unpacks
+ movd m1, [pixq+strideq*1] ; row 1
+ punpcklbw m0, m7
+ movd m2, [pixq+strideq*2] ; row 2
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ add pixq,stride3q ; pixq now points at row 3
+ movq m5, m0
+ movd m3, [pixq+strideq*4] ; row 7
+ movq m6, m1
+ movd m4, [pixq+stride3q*1] ; row 6
+ punpcklbw m3, m7
+ psubw m5, m3 ; m5 = row0-row7
+ punpcklbw m4, m7
+ paddw m0, m3 ; m0 = row0+row7
+ psubw m6, m4 ; m6 = row1-row6
+ movd m3, [pixq+strideq*2] ; row 5
+ paddw m1, m4 ; m1 = row1+row6
+ movq [rsp], m5 ; spill row0-row7
+ punpcklbw m3, m7
+ movq [rsp+8], m6 ; spill row1-row6
+ movq m4, m2
+ movd m5, [pixq] ; row 3
+ paddw m2, m3 ; m2 = row2+row5
+ movd m6, [pixq+strideq*1] ; row 4
+ punpcklbw m5, m7
+ psubw m4, m3 ; m4 = row2-row5
+ punpcklbw m6, m7
+ movq m3, m5
+ paddw m5, m6 ; m5 = row3+row4
+ psubw m3, m6 ; m3 = row3-row4
+ movq m6, m0
+ movq m7, m1 ; m7 is no longer zero from here on
+ psubw m0, m5
+ psubw m1, m2
+ paddw m7, m2
+ paddw m1, m0
+ movq m2, m7
+ psllw m1, 2
+ paddw m6, m5
+ pmulhw m1, [pw_2D41]
+ paddw m7, m6
+ psubw m6, m2
+ movq m5, m0
+ movq m2, m7
+ punpcklwd m7, m6 ; the punpck* steps regroup the results so each output row holds one pixel column
+ paddw m0, m1
+ punpckhwd m2, m6
+ psubw m5, m1
+ movq m6, m0
+ movq m1, [rsp+8] ; reload row1-row6
+ punpcklwd m0, m5
+ punpckhwd m6, m5
+ movq m5, m0
+ punpckldq m0, m7
+ paddw m3, m4
+ punpckhdq m5, m7
+ movq m7, m6
+ movq [srcq+DCTSIZE*0*2], m0 ; srcq is the data (output) pointer: first 4 words of output rows 0-3
+ punpckldq m6, m2
+ movq [srcq+DCTSIZE*1*2], m5
+ punpckhdq m7, m2
+ movq [srcq+DCTSIZE*2*2], m6
+ paddw m4, m1
+ movq [srcq+DCTSIZE*3*2], m7
+ psllw m3, 2
+ movq m2, [rsp] ; reload row0-row7
+ psllw m4, 2
+ pmulhw m4, [pw_2D41]
+ paddw m1, m2
+ psllw m1, 2
+ movq m0, m3
+ pmulhw m0, [pw_22A3]
+ psubw m3, m1
+ pmulhw m3, [pw_187E]
+ movq m5, m2
+ pmulhw m1, [pw_539F]
+ psubw m2, m4
+ paddw m5, m4
+ movq m6, m2
+ paddw m0, m3
+ movq m7, m5
+ paddw m2, m0
+ psubw m6, m0
+ movq m4, m2
+ paddw m1, m3
+ punpcklwd m2, m6
+ paddw m5, m1
+ punpckhwd m4, m6
+ psubw m7, m1
+ movq m6, m5
+ punpcklwd m5, m7
+ punpckhwd m6, m7
+ movq m7, m2
+ punpckldq m2, m5
+ sub pixq, stride3q ; back to row 0
+ punpckhdq m7, m5
+ movq m5, m4
+ movq [srcq+DCTSIZE*0*2+8], m2 ; last 4 words of output rows 0-3
+ punpckldq m4, m6
+ movq [srcq+DCTSIZE*1*2+8], m7
+ punpckhdq m5, m6
+ movq [srcq+DCTSIZE*2*2+8], m4
+ add pixq, 4 ; next 4 pixel columns
+ movq [srcq+DCTSIZE*3*2+8], m5
+ add srcq, DCTSIZE*4*2 ; next four output rows
+ dec cntd
+ jnz .loop
+ RET