summary refs log tree commit diff
path: root/libswscale/rgb2rgb_template.c
diff options
context:
space:
mode:
author Ivo van Poorten <ivop@euronet.nl> 2007-04-16 21:41:03 +0000
committer Ivo van Poorten <ivop@euronet.nl> 2007-04-16 21:41:03 +0000
commit b38d487466e68bd6baf2889017d2a751831560f0 (patch)
tree 63c54f05c133132d55b330ac607e7671e1a32a9f /libswscale/rgb2rgb_template.c
parent 4f99f932689523dd2a77656ca1b75e00178c3559 (diff)
New implementation of rgb32tobgr32
The previous implementation segfaulted with MMX enabled when fed an image smaller than the size of the units the MMX code processed.
The new code:
- is faster for MMX, MMX2 and plain C
- processes small images correctly
- is LGPL
Originally committed as revision 23009 to svn://svn.mplayerhq.hu/mplayer/trunk/libswscale
Diffstat (limited to 'libswscale/rgb2rgb_template.c')
-rw-r--r-- libswscale/rgb2rgb_template.c 99
1 files changed, 58 insertions, 41 deletions
diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
index 6489a4db91..7147855fed 100644
--- a/libswscale/rgb2rgb_template.c
+++ b/libswscale/rgb2rgb_template.c
@@ -1364,49 +1364,66 @@ static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_
static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
+ uint8_t *d = dst, *s = (uint8_t *) src;
+ const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
-/* TODO: unroll this loop */
- asm volatile (
- "xor %%"REG_a", %%"REG_a" \n\t"
- ASMALIGN(4)
- "1: \n\t"
- PREFETCH" 32(%0, %%"REG_a") \n\t"
- "movq (%0, %%"REG_a"), %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm0, %%mm2 \n\t"
- "pslld $16, %%mm0 \n\t"
- "psrld $16, %%mm1 \n\t"
- "pand "MANGLE(mask32r)", %%mm0 \n\t"
- "pand "MANGLE(mask32g)", %%mm2 \n\t"
- "pand "MANGLE(mask32b)", %%mm1 \n\t"
- "por %%mm0, %%mm2 \n\t"
- "por %%mm1, %%mm2 \n\t"
- MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
- "add $8, %%"REG_a" \n\t"
- "cmp %2, %%"REG_a" \n\t"
- " jb 1b \n\t"
- :: "r" (src), "r"(dst), "r" (src_size-7)
- : "%"REG_a
- );
-
- __asm __volatile(SFENCE:::"memory");
- __asm __volatile(EMMS:::"memory");
-#else
- unsigned i;
- unsigned num_pixels = src_size >> 2;
- for(i=0; i<num_pixels; i++)
- {
-#ifdef WORDS_BIGENDIAN
- dst[4*i + 1] = src[4*i + 3];
- dst[4*i + 2] = src[4*i + 2];
- dst[4*i + 3] = src[4*i + 1];
-#else
- dst[4*i + 0] = src[4*i + 2];
- dst[4*i + 1] = src[4*i + 1];
- dst[4*i + 2] = src[4*i + 0];
-#endif
+ __asm __volatile(
+ " "PREFETCH" (%1) \n"
+ " movq %3, %%mm7 \n"
+ " pxor %4, %%mm7 \n"
+ " movq %%mm7, %%mm6 \n"
+ " pxor %5, %%mm7 \n"
+ " jmp 2f \n"
+ ASMALIGN(4)
+ "1: \n"
+ " "PREFETCH" 32(%1) \n"
+ " movq (%1), %%mm0 \n"
+ " movq 8(%1), %%mm1 \n"
+# ifdef HAVE_MMX2
+ " pshufw $177, %%mm0, %%mm3 \n"
+ " pshufw $177, %%mm1, %%mm5 \n"
+ " pand %%mm7, %%mm0 \n"
+ " pand %%mm6, %%mm3 \n"
+ " pand %%mm7, %%mm1 \n"
+ " pand %%mm6, %%mm5 \n"
+ " por %%mm3, %%mm0 \n"
+ " por %%mm5, %%mm1 \n"
+# else
+ " movq %%mm0, %%mm2 \n"
+ " movq %%mm1, %%mm4 \n"
+ " pand %%mm7, %%mm0 \n"
+ " pand %%mm6, %%mm2 \n"
+ " pand %%mm7, %%mm1 \n"
+ " pand %%mm6, %%mm4 \n"
+ " movq %%mm2, %%mm3 \n"
+ " movq %%mm4, %%mm5 \n"
+ " pslld $16, %%mm2 \n"
+ " psrld $16, %%mm3 \n"
+ " pslld $16, %%mm4 \n"
+ " psrld $16, %%mm5 \n"
+ " por %%mm2, %%mm0 \n"
+ " por %%mm4, %%mm1 \n"
+ " por %%mm3, %%mm0 \n"
+ " por %%mm5, %%mm1 \n"
+# endif
+ " "MOVNTQ" %%mm0, (%0) \n"
+ " "MOVNTQ" %%mm1, 8(%0) \n"
+ " add $16, %0 \n"
+ " add $16, %1 \n"
+ "2: \n"
+ " cmp %1, %2 \n"
+ " ja 1b \n"
+ " "SFENCE" \n"
+ " "EMMS" \n"
+ : "+r"(d), "+r"(s)
+ : "r" (end-15), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
+ : "memory");
+#endif
+ for (; s<end; s+=4, d+=4) {
+ int v = *(uint32_t *)s, g = v & 0xff00;
+ v &= 0xff00ff;
+ *(uint32_t *)d = (v>>16) + g + (v<<16);
}
-#endif
}
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)