From e74433a8e6fc00c8dbde293c97a3e45384c2c1d9 Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Tue, 14 Jan 2014 10:33:47 +0100 Subject: dsputil: Split clear_block*/fill_block* off into a separate context --- libavcodec/ppc/Makefile | 1 + libavcodec/ppc/blockdsp.c | 169 +++++++++++++++++++++++++++++++++++++++ libavcodec/ppc/dsputil_altivec.c | 14 ---- libavcodec/ppc/dsputil_ppc.c | 110 ------------------------- 4 files changed, 170 insertions(+), 124 deletions(-) create mode 100644 libavcodec/ppc/blockdsp.c (limited to 'libavcodec/ppc') diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index b78d4be8ae..bd78f8e728 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -1,5 +1,6 @@ OBJS += ppc/fmtconvert_altivec.o \ +OBJS-$(CONFIG_BLOCKDSP) += ppc/blockdsp.o OBJS-$(CONFIG_DSPUTIL) += ppc/dsputil_ppc.o OBJS-$(CONFIG_FFT) += ppc/fft_altivec.o OBJS-$(CONFIG_H264CHROMA) += ppc/h264chroma_init.o diff --git a/libavcodec/ppc/blockdsp.c b/libavcodec/ppc/blockdsp.c new file mode 100644 index 0000000000..679bc0454f --- /dev/null +++ b/libavcodec/ppc/blockdsp.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2002 Brian Foley + * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003-2004 Romain Dolbeau + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#if HAVE_ALTIVEC_H +#include +#endif +#include + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/types_altivec.h" +#include "libavcodec/blockdsp.h" + +/* ***** WARNING ***** WARNING ***** WARNING ***** */ +/* + * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with + * a cache line size not equal to 32 bytes. Fortunately all processors used + * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte + * cache lines. This is due to the use of the 'dcbz' instruction. It simply + * clears a single cache line to zero, so you need to know the cache line + * size to use it! It's absurd, but it's fast... + * + * update 24/06/2003: Apple released the G5 yesterday, with a PPC970. + * cache line size: 128 bytes. Oups. + * The semantics of dcbz was changed, it always clears 32 bytes. So the function + * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, + * which is defined to clear a cache line (as dcbz before). So we can still + * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. + * + * see + * and + */ +static void clear_blocks_dcbz32_ppc(int16_t *blocks) +{ + register int misal = (unsigned long) blocks & 0x00000010, i = 0; + + if (misal) { + ((unsigned long *) blocks)[0] = 0L; + ((unsigned long *) blocks)[1] = 0L; + ((unsigned long *) blocks)[2] = 0L; + ((unsigned long *) blocks)[3] = 0L; + i += 16; + } + for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32) + __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory"); + if (misal) { + ((unsigned long *) blocks)[188] = 0L; + ((unsigned long *) blocks)[189] = 0L; + ((unsigned long *) blocks)[190] = 0L; + ((unsigned long *) blocks)[191] = 0L; + i += 16; + } +} + +/* Same as above, when dcbzl clears a whole 128 bytes cache line + * i.e. the PPC970 AKA G5. */ +static void clear_blocks_dcbz128_ppc(int16_t *blocks) +{ +#if HAVE_DCBZL + register int misal = (unsigned long) blocks & 0x0000007f, i = 0; + + if (misal) { + /* We could probably also optimize this case, + * but there's not much point as the machines + * aren't available yet (2003-06-26). */ + memset(blocks, 0, sizeof(int16_t) * 6 * 64); + } else { + for (; i < sizeof(int16_t) * 6 * 64; i += 128) + __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory"); + } +#else + memset(blocks, 0, sizeof(int16_t) * 6 * 64); +#endif +} + +/* Check dcbz report how many bytes are set to 0 by dcbz. */ +/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect + * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the + * assembler knows about dcbzl ... */ +static long check_dcbzl_effect(void) +{ + long count = 0; +#if HAVE_DCBZL + register char *fakedata = av_malloc(1024); + register char *fakedata_middle; + register long zero = 0, i = 0; + + if (!fakedata) + return 0L; + + fakedata_middle = fakedata + 512; + + memset(fakedata, 0xFF, 1024); + + /* Below the constraint "b" seems to mean "address base register" + * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */ + __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero)); + + for (i = 0; i < 1024; i++) + if (fakedata[i] == (char) 0) + count++; + + av_free(fakedata); +#endif + + return count; +} + +#if HAVE_ALTIVEC +static void clear_block_altivec(int16_t *block) +{ + LOAD_ZERO; + vec_st(zero_s16v, 0, block); + vec_st(zero_s16v, 16, block); + vec_st(zero_s16v, 32, block); + vec_st(zero_s16v, 48, block); + vec_st(zero_s16v, 64, block); + vec_st(zero_s16v, 80, block); + vec_st(zero_s16v, 96, block); + vec_st(zero_s16v, 112, block); +} +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_blockdsp_init_ppc(BlockDSPContext *c, unsigned high_bit_depth) +{ + // common optimizations whether AltiVec is available or not + if (!high_bit_depth) { + switch (check_dcbzl_effect()) { + case 32: + c->clear_blocks = clear_blocks_dcbz32_ppc; + break; + case 128: + c->clear_blocks = clear_blocks_dcbz128_ppc; + break; + default: + break; + } + } + +#if HAVE_ALTIVEC + if (!PPC_ALTIVEC(av_get_cpu_flags())) + return; + + if (!high_bit_depth) + c->clear_block = clear_block_altivec; +#endif /* HAVE_ALTIVEC */ +} diff --git a/libavcodec/ppc/dsputil_altivec.c b/libavcodec/ppc/dsputil_altivec.c index 442be6c969..c3f90e91c4 100644 --- a/libavcodec/ppc/dsputil_altivec.c +++ b/libavcodec/ppc/dsputil_altivec.c @@ -558,19 +558,6 @@ static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1, } } -static void clear_block_altivec(int16_t *block) -{ - LOAD_ZERO; - vec_st(zero_s16v, 0, block); - vec_st(zero_s16v, 16, block); - vec_st(zero_s16v, 32, block); - vec_st(zero_s16v, 48, block); - vec_st(zero_s16v, 64, block); - vec_st(zero_s16v, 80, block); - vec_st(zero_s16v, 96, block); - vec_st(zero_s16v, 112, block); -} - static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, uint8_t *src, int stride, int h) { @@ -931,7 +918,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx, if (!high_bit_depth) { c->get_pixels = get_pixels_altivec; - c->clear_block = clear_block_altivec; } c->hadamard8_diff[0] = hadamard8_diff16_altivec; diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index 698f54562f..b92fbf0a2c 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -24,124 +24,14 @@ #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/mem.h" #include "libavutil/ppc/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" #include "dsputil_altivec.h" -/* ***** WARNING ***** WARNING ***** WARNING ***** */ -/* - * clear_blocks_dcbz32_ppc will not work properly on PowerPC processors with - * a cache line size not equal to 32 bytes. Fortunately all processors used - * by Apple up to at least the 7450 (AKA second generation G4) use 32-byte - * cache lines. This is due to the use of the 'dcbz' instruction. It simply - * clears a single cache line to zero, so you need to know the cache line - * size to use it! It's absurd, but it's fast... - * - * update 24/06/2003: Apple released the G5 yesterday, with a PPC970. - * cache line size: 128 bytes. Oups. - * The semantics of dcbz was changed, it always clears 32 bytes. So the function - * below will work, but will be slow. So I fixed check_dcbz_effect to use dcbzl, - * which is defined to clear a cache line (as dcbz before). So we can still - * distinguish, and use dcbz (32 bytes) or dcbzl (one cache line) as required. - * - * see - * and - */ -static void clear_blocks_dcbz32_ppc(int16_t *blocks) -{ - register int misal = (unsigned long) blocks & 0x00000010, i = 0; - - if (misal) { - ((unsigned long *) blocks)[0] = 0L; - ((unsigned long *) blocks)[1] = 0L; - ((unsigned long *) blocks)[2] = 0L; - ((unsigned long *) blocks)[3] = 0L; - i += 16; - } - for (; i < sizeof(int16_t) * 6 * 64 - 31; i += 32) - __asm__ volatile ("dcbz %0,%1" :: "b" (blocks), "r" (i) : "memory"); - if (misal) { - ((unsigned long *) blocks)[188] = 0L; - ((unsigned long *) blocks)[189] = 0L; - ((unsigned long *) blocks)[190] = 0L; - ((unsigned long *) blocks)[191] = 0L; - i += 16; - } -} - -/* Same as above, when dcbzl clears a whole 128 bytes cache line - * i.e. the PPC970 AKA G5. */ -static void clear_blocks_dcbz128_ppc(int16_t *blocks) -{ -#if HAVE_DCBZL - register int misal = (unsigned long) blocks & 0x0000007f, i = 0; - - if (misal) { - /* We could probably also optimize this case, - * but there's not much point as the machines - * aren't available yet (2003-06-26). */ - memset(blocks, 0, sizeof(int16_t) * 6 * 64); - } else { - for (; i < sizeof(int16_t) * 6 * 64; i += 128) - __asm__ volatile ("dcbzl %0,%1" :: "b" (blocks), "r" (i) : "memory"); - } -#else - memset(blocks, 0, sizeof(int16_t) * 6 * 64); -#endif -} - -/* Check dcbz report how many bytes are set to 0 by dcbz. */ -/* update 24/06/2003: Replace dcbz by dcbzl to get the intended effect - * (Apple "fixed" dcbz). Unfortunately this cannot be used unless the - * assembler knows about dcbzl ... */ -static long check_dcbzl_effect(void) -{ - long count = 0; -#if HAVE_DCBZL - register char *fakedata = av_malloc(1024); - register char *fakedata_middle; - register long zero = 0, i = 0; - - if (!fakedata) - return 0L; - - fakedata_middle = fakedata + 512; - - memset(fakedata, 0xFF, 1024); - - /* Below the constraint "b" seems to mean "address base register" - * in gcc-3.3 / RS/6000 speaks. Seems to avoid using r0, so.... */ - __asm__ volatile ("dcbzl %0, %1" :: "b" (fakedata_middle), "r" (zero)); - - for (i = 0; i < 1024; i++) - if (fakedata[i] == (char) 0) - count++; - - av_free(fakedata); -#endif - - return count; -} - av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { - // common optimizations whether AltiVec is available or not - if (!high_bit_depth) { - switch (check_dcbzl_effect()) { - case 32: - c->clear_blocks = clear_blocks_dcbz32_ppc; - break; - case 128: - c->clear_blocks = clear_blocks_dcbz128_ppc; - break; - default: - break; - } - } - if (PPC_ALTIVEC(av_get_cpu_flags())) { ff_dsputil_init_altivec(c, avctx, high_bit_depth); ff_int_init_altivec(c, avctx); -- cgit v1.2.3