From e3fcb14347466095839c2a3c47ebecff02da891e Mon Sep 17 00:00:00 2001 From: Diego Biurrun Date: Fri, 24 Jan 2014 11:55:16 +0100 Subject: dsputil: Split off IDCT bits into their own context --- configure | 40 +++--- doc/optimization.txt | 3 - libavcodec/Makefile | 5 +- libavcodec/aic.c | 23 ++-- libavcodec/arm/Makefile | 23 ++-- libavcodec/arm/dsputil_arm.S | 120 ---------------- libavcodec/arm/dsputil_arm.h | 4 - libavcodec/arm/dsputil_armv6.S | 27 ---- libavcodec/arm/dsputil_init_arm.c | 60 -------- libavcodec/arm/dsputil_init_armv5te.c | 43 ------ libavcodec/arm/dsputil_init_armv6.c | 18 --- libavcodec/arm/dsputil_init_neon.c | 53 ------- libavcodec/arm/dsputil_neon.S | 128 ----------------- libavcodec/arm/idctdsp_arm.S | 120 ++++++++++++++++ libavcodec/arm/idctdsp_arm.h | 34 +++++ libavcodec/arm/idctdsp_armv6.S | 48 +++++++ libavcodec/arm/idctdsp_init_arm.c | 98 +++++++++++++ libavcodec/arm/idctdsp_init_armv5te.c | 43 ++++++ libavcodec/arm/idctdsp_init_armv6.c | 48 +++++++ libavcodec/arm/idctdsp_init_neon.c | 53 +++++++ libavcodec/arm/idctdsp_neon.S | 128 +++++++++++++++++ libavcodec/asv.c | 1 - libavcodec/asv.h | 2 + libavcodec/asvdec.c | 16 ++- libavcodec/asvenc.c | 1 + libavcodec/cavs.c | 7 +- libavcodec/cavs.h | 4 +- libavcodec/cavsdsp.c | 2 +- libavcodec/dnxhddec.c | 52 +++---- libavcodec/dnxhdenc.c | 9 +- libavcodec/dsputil.c | 162 +--------------------- libavcodec/dsputil.h | 66 --------- libavcodec/dvdec.c | 9 +- libavcodec/dxva2_mpeg2.c | 2 +- libavcodec/eamad.c | 9 +- libavcodec/eatgq.c | 2 +- libavcodec/eatqi.c | 7 +- libavcodec/g2meet.c | 12 +- libavcodec/h263.c | 8 +- libavcodec/idctdsp.c | 197 ++++++++++++++++++++++++++ libavcodec/idctdsp.h | 104 ++++++++++++++ libavcodec/intrax8.c | 15 +- libavcodec/ljpegenc.c | 9 +- libavcodec/mdec.c | 20 +-- libavcodec/mimic.c | 10 +- libavcodec/mjpegdec.c | 10 +- libavcodec/mjpegdec.h | 4 +- libavcodec/mjpegenc_common.c | 2 +- libavcodec/mjpegenc_common.h | 2 +- libavcodec/mpeg12dec.c | 30 ++-- 
libavcodec/mpeg4videodec.c | 39 +++--- libavcodec/mpeg4videoenc.c | 28 ++-- libavcodec/mpegvideo.c | 52 +++---- libavcodec/mpegvideo.h | 2 + libavcodec/mpegvideo_enc.c | 20 +-- libavcodec/mpegvideo_xvmc.c | 2 +- libavcodec/msmpeg4.c | 10 +- libavcodec/nuv.c | 1 + libavcodec/ppc/Makefile | 2 +- libavcodec/ppc/dsputil_altivec.h | 3 - libavcodec/ppc/dsputil_ppc.c | 6 - libavcodec/ppc/idct_altivec.c | 221 ----------------------------- libavcodec/ppc/idctdsp.c | 245 +++++++++++++++++++++++++++++++++ libavcodec/proresdec.c | 2 +- libavcodec/proresdsp.c | 2 +- libavcodec/rtjpeg.c | 8 +- libavcodec/rtjpeg.h | 5 +- libavcodec/vc1dec.c | 116 +++++++++------- libavcodec/wmv2.c | 23 ++-- libavcodec/wmv2dsp.c | 2 +- libavcodec/x86/Makefile | 10 +- libavcodec/x86/cavsdsp.c | 3 +- libavcodec/x86/dsputil_init.c | 85 ------------ libavcodec/x86/dsputil_mmx.c | 135 ------------------ libavcodec/x86/dsputil_x86.h | 7 - libavcodec/x86/idct_mmx_xvid.c | 2 +- libavcodec/x86/idct_sse2_xvid.c | 2 +- libavcodec/x86/idctdsp.h | 31 +++++ libavcodec/x86/idctdsp_init.c | 106 ++++++++++++++ libavcodec/x86/idctdsp_mmx.c | 168 ++++++++++++++++++++++ libavcodec/x86/mpegvideoenc_template.c | 2 +- libavcodec/x86/proresdsp_init.c | 2 +- libavcodec/x86/simple_idct.c | 2 +- 83 files changed, 1788 insertions(+), 1449 deletions(-) delete mode 100644 libavcodec/arm/dsputil_arm.S delete mode 100644 libavcodec/arm/dsputil_init_armv5te.c delete mode 100644 libavcodec/arm/dsputil_init_neon.c delete mode 100644 libavcodec/arm/dsputil_neon.S create mode 100644 libavcodec/arm/idctdsp_arm.S create mode 100644 libavcodec/arm/idctdsp_arm.h create mode 100644 libavcodec/arm/idctdsp_armv6.S create mode 100644 libavcodec/arm/idctdsp_init_arm.c create mode 100644 libavcodec/arm/idctdsp_init_armv5te.c create mode 100644 libavcodec/arm/idctdsp_init_armv6.c create mode 100644 libavcodec/arm/idctdsp_init_neon.c create mode 100644 libavcodec/arm/idctdsp_neon.S create mode 100644 libavcodec/idctdsp.c create mode 100644 
libavcodec/idctdsp.h delete mode 100644 libavcodec/ppc/idct_altivec.c create mode 100644 libavcodec/ppc/idctdsp.c create mode 100644 libavcodec/x86/idctdsp.h create mode 100644 libavcodec/x86/idctdsp_init.c create mode 100644 libavcodec/x86/idctdsp_mmx.c diff --git a/configure b/configure index 7ea15aa9f7..be97868031 100755 --- a/configure +++ b/configure @@ -1546,6 +1546,7 @@ CONFIG_EXTRA=" huffman huffyuvdsp huffyuvencdsp + idctdsp intrax8 lgplv3 lpc @@ -1703,6 +1704,7 @@ threads_if_any="$THREADS_LIST" # subsystems dct_select="rdft" +dsputil_select="idctdsp" error_resilience_select="dsputil" intrax8_select="error_resilience" mdct_select="fft" @@ -1710,7 +1712,7 @@ rdft_select="fft" mpeg_er_select="error_resilience" mpegaudio_select="mpegaudiodsp" mpegaudiodsp_select="dct" -mpegvideo_select="blockdsp dsputil hpeldsp videodsp" +mpegvideo_select="blockdsp dsputil hpeldsp idctdsp videodsp" mpegvideoenc_select="dsputil mpegvideo qpeldsp" # decoders / encoders @@ -1720,16 +1722,16 @@ aac_latm_decoder_select="aac_decoder aac_latm_parser" ac3_decoder_select="ac3_parser ac3dsp bswapdsp mdct" ac3_encoder_select="ac3dsp audiodsp dsputil mdct" ac3_fixed_encoder_select="ac3dsp audiodsp dsputil mdct" -aic_decoder_select="dsputil golomb" +aic_decoder_select="golomb idctdsp" alac_encoder_select="lpc" als_decoder_select="bswapdsp" amrnb_decoder_select="lsp" amrwb_decoder_select="lsp" amv_decoder_select="sp5x_decoder" ape_decoder_select="bswapdsp" -asv1_decoder_select="blockdsp bswapdsp dsputil" +asv1_decoder_select="blockdsp bswapdsp idctdsp" asv1_encoder_select="bswapdsp dsputil" -asv2_decoder_select="blockdsp bswapdsp dsputil" +asv2_decoder_select="blockdsp bswapdsp idctdsp" asv2_encoder_select="bswapdsp dsputil" atrac1_decoder_select="mdct sinewin" atrac3_decoder_select="mdct" @@ -1737,23 +1739,23 @@ atrac3p_decoder_select="mdct sinewin" bink_decoder_select="blockdsp hpeldsp" binkaudio_dct_decoder_select="mdct rdft dct sinewin" binkaudio_rdft_decoder_select="mdct rdft sinewin" 
-cavs_decoder_select="blockdsp dsputil golomb h264chroma qpeldsp videodsp" +cavs_decoder_select="blockdsp golomb h264chroma idctdsp qpeldsp videodsp" cllc_decoder_select="bswapdsp" comfortnoise_encoder_select="lpc" cook_decoder_select="audiodsp mdct sinewin" cscd_decoder_select="lzo" cscd_decoder_suggest="zlib" dca_decoder_select="mdct" -dnxhd_decoder_select="blockdsp dsputil" -dnxhd_encoder_select="aandcttables blockdsp dsputil mpegvideoenc" -dvvideo_decoder_select="dsputil" +dnxhd_decoder_select="blockdsp idctdsp" +dnxhd_encoder_select="aandcttables blockdsp dsputil idctdsp mpegvideoenc" +dvvideo_decoder_select="idctdsp" dvvideo_encoder_select="dsputil" dxa_decoder_deps="zlib" eac3_decoder_select="ac3_decoder" eac3_encoder_select="ac3_encoder" -eamad_decoder_select="aandcttables blockdsp bswapdsp dsputil mpegvideo" -eatgq_decoder_select="aandcttables dsputil" -eatqi_decoder_select="aandcttables blockdsp bswapdsp dsputil mpeg1video_decoder" +eamad_decoder_select="aandcttables blockdsp bswapdsp idctdsp mpegvideo" +eatgq_decoder_select="aandcttables idctdsp" +eatqi_decoder_select="aandcttables blockdsp bswapdsp idctdsp mpeg1video_decoder" exr_decoder_deps="zlib" ffv1_decoder_select="golomb rangecoder" ffv1_encoder_select="rangecoder" @@ -1770,7 +1772,7 @@ flv_encoder_select="h263_encoder" fourxm_decoder_select="blockdsp bswapdsp" fraps_decoder_select="bswapdsp huffman" g2m_decoder_deps="zlib" -g2m_decoder_select="blockdsp dsputil" +g2m_decoder_select="blockdsp idctdsp" h261_decoder_select="mpeg_er mpegvideo" h261_encoder_select="aandcttables mpegvideoenc" h263_decoder_select="error_resilience h263_parser h263dsp mpeg_er mpegvideo qpeldsp" @@ -1790,12 +1792,12 @@ jpegls_decoder_select="golomb mjpeg_decoder" jpegls_encoder_select="golomb" jv_decoder_select="blockdsp" lagarith_decoder_select="huffyuvdsp" -ljpeg_encoder_select="aandcttables dsputil" +ljpeg_encoder_select="aandcttables idctdsp" loco_decoder_select="golomb" -mdec_decoder_select="blockdsp dsputil 
mpegvideo" +mdec_decoder_select="blockdsp idctdsp mpegvideo" metasound_decoder_select="lsp mdct sinewin" -mimic_decoder_select="blockdsp bswapdsp dsputil hpeldsp" -mjpeg_decoder_select="blockdsp dsputil hpeldsp" +mimic_decoder_select="blockdsp bswapdsp hpeldsp idctdsp" +mjpeg_decoder_select="blockdsp hpeldsp idctdsp" mjpeg_encoder_select="aandcttables mpegvideoenc" mjpegb_decoder_select="mjpeg_decoder" mlp_decoder_select="mlp_parser" @@ -1829,13 +1831,13 @@ mss2_decoder_select="error_resilience mpeg_er qpeldsp vc1_decoder" mxpeg_decoder_select="mjpeg_decoder" nellymoser_decoder_select="mdct sinewin" nellymoser_encoder_select="audio_frame_queue mdct sinewin" -nuv_decoder_select="dsputil lzo" +nuv_decoder_select="idctdsp lzo" on2avc_decoder_select="mdct" opus_decoder_deps="avresample" png_decoder_deps="zlib" png_encoder_deps="zlib" png_encoder_select="huffyuvencdsp" -prores_decoder_select="dsputil" +prores_decoder_select="idctdsp" prores_encoder_select="dsputil" qcelp_decoder_select="lsp" qdm2_decoder_select="mdct rdft mpegaudiodsp" @@ -1888,7 +1890,7 @@ wmav2_encoder_select="mdct sinewin" wmavoice_decoder_select="lsp rdft dct mdct sinewin" wmv1_decoder_select="h263_decoder" wmv1_encoder_select="h263_encoder" -wmv2_decoder_select="blockdsp h263_decoder intrax8 videodsp" +wmv2_decoder_select="blockdsp h263_decoder idctdsp intrax8 videodsp" wmv2_encoder_select="h263_encoder" wmv3_decoder_select="vc1_decoder" wmv3image_decoder_select="wmv3_decoder" diff --git a/doc/optimization.txt b/doc/optimization.txt index b51183fa34..b3dca645a8 100644 --- a/doc/optimization.txt +++ b/doc/optimization.txt @@ -136,9 +136,6 @@ dct_unquantize_mpeg2 dct_unquantize_h263 Used in MPEG-4/H.263 en/decoding. -FIXME remaining functions? -BTW, most of these functions are in dsputil.c/.h, some are in mpegvideo.c/.h. 
- Alignment: diff --git a/libavcodec/Makefile b/libavcodec/Makefile index bfe50f352e..dc374cb605 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile @@ -33,9 +33,8 @@ OBJS-$(CONFIG_BLOCKDSP) += blockdsp.o OBJS-$(CONFIG_BSWAPDSP) += bswapdsp.o OBJS-$(CONFIG_CABAC) += cabac.o OBJS-$(CONFIG_DCT) += dct.o dct32_fixed.o dct32_float.o +OBJS-$(CONFIG_DSPUTIL) += dsputil.o OBJS-$(CONFIG_DXVA2) += dxva2.o -OBJS-$(CONFIG_DSPUTIL) += dsputil.o faanidct.o \ - simple_idct.o jrevdct.o OBJS-$(CONFIG_ENCODERS) += faandct.o jfdctfst.o jfdctint.o OBJS-$(CONFIG_ERROR_RESILIENCE) += error_resilience.o FFT-OBJS-$(CONFIG_HARDCODED_TABLES) += cos_tables.o cos_fixed_tables.o @@ -51,6 +50,8 @@ OBJS-$(CONFIG_HPELDSP) += hpeldsp.o OBJS-$(CONFIG_HUFFMAN) += huffman.o OBJS-$(CONFIG_HUFFYUVDSP) += huffyuvdsp.o OBJS-$(CONFIG_HUFFYUVENCDSP) += huffyuvencdsp.o +OBJS-$(CONFIG_IDCTDSP) += idctdsp.o faanidct.o \ + simple_idct.o jrevdct.o OBJS-$(CONFIG_INTRAX8) += intrax8.o intrax8dsp.o OBJS-$(CONFIG_LIBXVID) += libxvid_rc.o OBJS-$(CONFIG_LPC) += lpc.o diff --git a/libavcodec/aic.c b/libavcodec/aic.c index 68ae728763..dac9d8b7fd 100644 --- a/libavcodec/aic.c +++ b/libavcodec/aic.c @@ -24,10 +24,10 @@ #include "avcodec.h" #include "bytestream.h" -#include "dsputil.h" #include "internal.h" #include "get_bits.h" #include "golomb.h" +#include "idctdsp.h" #include "unary.h" #define AIC_HDR_SIZE 24 @@ -139,7 +139,7 @@ static const uint8_t *aic_scan[NUM_BANDS] = { typedef struct AICContext { AVCodecContext *avctx; AVFrame *frame; - DSPContext dsp; + IDCTDSPContext idsp; ScanTable scantable; int num_x_slices; @@ -336,16 +336,15 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y, recombine_block_il(ctx->block, ctx->scantable.permutated, &base_y, &ext_y, blk); unquant_block(ctx->block, ctx->quant); - ctx->dsp.idct(ctx->block); + ctx->idsp.idct(ctx->block); if (!ctx->interlaced) { dst = Y + (blk >> 1) * 8 * ystride + (blk & 1) * 8; - ctx->dsp.put_signed_pixels_clamped(ctx->block, dst, - 
ystride); + ctx->idsp.put_signed_pixels_clamped(ctx->block, dst, ystride); } else { dst = Y + (blk & 1) * 8 + (blk >> 1) * ystride; - ctx->dsp.put_signed_pixels_clamped(ctx->block, dst, - ystride * 2); + ctx->idsp.put_signed_pixels_clamped(ctx->block, dst, + ystride * 2); } } Y += 16; @@ -354,9 +353,9 @@ static int aic_decode_slice(AICContext *ctx, int mb_x, int mb_y, recombine_block(ctx->block, ctx->scantable.permutated, &base_c, &ext_c); unquant_block(ctx->block, ctx->quant); - ctx->dsp.idct(ctx->block); - ctx->dsp.put_signed_pixels_clamped(ctx->block, C[blk], - ctx->frame->linesize[blk + 1]); + ctx->idsp.idct(ctx->block); + ctx->idsp.put_signed_pixels_clamped(ctx->block, C[blk], + ctx->frame->linesize[blk + 1]); C[blk] += 8; } } @@ -426,11 +425,11 @@ static av_cold int aic_decode_init(AVCodecContext *avctx) avctx->pix_fmt = AV_PIX_FMT_YUV420P; - ff_dsputil_init(&ctx->dsp, avctx); + ff_idctdsp_init(&ctx->idsp, avctx); for (i = 0; i < 64; i++) scan[i] = i; - ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable, scan); + ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, scan); ctx->mb_width = FFALIGN(avctx->width, 16) >> 4; ctx->mb_height = FFALIGN(avctx->height, 16) >> 4; diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile index eb92a8c953..3a3e244c4d 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile @@ -6,10 +6,7 @@ OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \ arm/ac3dsp_arm.o OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_arm.o OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_arm.o -OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_arm.o \ - arm/dsputil_arm.o \ - arm/jrevdct_arm.o \ - arm/simple_idct_arm.o +OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_arm.o OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o \ arm/fft_fixed_init_arm.o OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o @@ -18,6 +15,10 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o OBJS-$(CONFIG_HPELDSP) += 
arm/hpeldsp_init_arm.o \ arm/hpeldsp_arm.o +OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ + arm/idctdsp_arm.o \ + arm/jrevdct_arm.o \ + arm/simple_idct_arm.o OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o @@ -40,7 +41,7 @@ OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \ arm/rv40dsp_init_arm.o -ARMV5TE-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv5te.o \ +ARMV5TE-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv5te.o \ arm/simple_idct_armv5te.o ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \ arm/mpegvideo_armv5te_s.o @@ -51,11 +52,13 @@ ARMV5TE-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv5te.o ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv6.o \ - arm/dsputil_armv6.o \ - arm/simple_idct_armv6.o + arm/dsputil_armv6.o ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ arm/hpeldsp_armv6.o +ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \ + arm/idctdsp_armv6.o \ + arm/simple_idct_armv6.o ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o @@ -83,9 +86,6 @@ NEON-OBJS-$(CONFIG_AUDIODSP) += arm/audiodsp_init_neon.o \ arm/int_neon.o NEON-OBJS-$(CONFIG_BLOCKDSP) += arm/blockdsp_init_neon.o \ arm/blockdsp_neon.o -NEON-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_neon.o \ - arm/dsputil_neon.o \ - arm/simple_idct_neon.o NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \ arm/fft_fixed_neon.o NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o @@ -96,6 +96,9 @@ NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \ arm/hpeldsp_neon.o NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \ arm/hpeldsp_neon.o +NEON-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_neon.o \ + arm/idctdsp_neon.o \ + 
arm/simple_idct_neon.o NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \ arm/mdct_fixed_neon.o NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o diff --git a/libavcodec/arm/dsputil_arm.S b/libavcodec/arm/dsputil_arm.S deleted file mode 100644 index 82fcf2ae91..0000000000 --- a/libavcodec/arm/dsputil_arm.S +++ /dev/null @@ -1,120 +0,0 @@ -@ -@ ARMv4 optimized DSP utils -@ Copyright (c) 2004 AGAWA Koji -@ -@ This file is part of Libav. -@ -@ Libav is free software; you can redistribute it and/or -@ modify it under the terms of the GNU Lesser General Public -@ License as published by the Free Software Foundation; either -@ version 2.1 of the License, or (at your option) any later version. -@ -@ Libav is distributed in the hope that it will be useful, -@ but WITHOUT ANY WARRANTY; without even the implied warranty of -@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -@ Lesser General Public License for more details. -@ -@ You should have received a copy of the GNU Lesser General Public -@ License along with Libav; if not, write to the Free Software -@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -@ - -#include "config.h" -#include "libavutil/arm/asm.S" - -@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride) -function ff_add_pixels_clamped_arm, export=1, align=5 - push {r4-r10} - mov r10, #8 -1: - ldr r4, [r1] /* load dest */ - /* block[0] and block[1]*/ - ldrsh r5, [r0] - ldrsh r7, [r0, #2] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r6, r5 - add r8, r7, r8, lsr #8 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - it ne - movne r6, r5, lsr #24 - tst r8, #0x100 - it ne - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #4] /* moved form [A] */ - orr r9, r9, r8, lsl #8 - /* block[2] and block[3] */ - /* [A] */ - ldrsh r7, [r0, #6] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - it ne - movne r6, 
r5, lsr #24 - tst r8, #0x100 - it ne - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - ldr r4, [r1, #4] /* moved form [B] */ - orr r9, r9, r8, lsl #24 - /* store dest */ - ldrsh r5, [r0, #8] /* moved form [C] */ - str r9, [r1] - - /* load dest */ - /* [B] */ - /* block[4] and block[5] */ - /* [C] */ - ldrsh r7, [r0, #10] - and r6, r4, #0xFF - and r8, r4, #0xFF00 - add r6, r6, r5 - add r8, r7, r8, lsr #8 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - it ne - movne r6, r5, lsr #24 - tst r8, #0x100 - it ne - movne r8, r7, lsr #24 - mov r9, r6 - ldrsh r5, [r0, #12] /* moved from [D] */ - orr r9, r9, r8, lsl #8 - /* block[6] and block[7] */ - /* [D] */ - ldrsh r7, [r0, #14] - and r6, r4, #0xFF0000 - and r8, r4, #0xFF000000 - add r6, r5, r6, lsr #16 - add r8, r7, r8, lsr #24 - mvn r5, r5 - mvn r7, r7 - tst r6, #0x100 - it ne - movne r6, r5, lsr #24 - tst r8, #0x100 - it ne - movne r8, r7, lsr #24 - orr r9, r9, r6, lsl #16 - add r0, r0, #16 /* moved from [E] */ - orr r9, r9, r8, lsl #24 - subs r10, r10, #1 /* moved from [F] */ - /* store dest */ - str r9, [r1, #4] - - /* [E] */ - /* [F] */ - add r1, r1, r2 - bne 1b - - pop {r4-r10} - bx lr -endfunc diff --git a/libavcodec/arm/dsputil_arm.h b/libavcodec/arm/dsputil_arm.h index 6080203960..5b976aa3d6 100644 --- a/libavcodec/arm/dsputil_arm.h +++ b/libavcodec/arm/dsputil_arm.h @@ -24,11 +24,7 @@ #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" -void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx, - unsigned high_bit_depth); void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth); -void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx, - unsigned high_bit_depth); #endif /* AVCODEC_ARM_DSPUTIL_ARM_H */ diff --git a/libavcodec/arm/dsputil_armv6.S b/libavcodec/arm/dsputil_armv6.S index e667a47f94..b89171ff94 100644 --- a/libavcodec/arm/dsputil_armv6.S +++ b/libavcodec/arm/dsputil_armv6.S @@ -20,33 +20,6 @@ #include "libavutil/arm/asm.S" -function 
ff_add_pixels_clamped_armv6, export=1 - push {r4-r8,lr} - mov r3, #8 -1: - ldm r0!, {r4,r5,r12,lr} - ldrd r6, r7, [r1] - pkhbt r8, r4, r5, lsl #16 - pkhtb r5, r5, r4, asr #16 - pkhbt r4, r12, lr, lsl #16 - pkhtb lr, lr, r12, asr #16 - pld [r1, r2] - uxtab16 r8, r8, r6 - uxtab16 r5, r5, r6, ror #8 - uxtab16 r4, r4, r7 - uxtab16 lr, lr, r7, ror #8 - usat16 r8, #8, r8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 lr, #8, lr - orr r6, r8, r5, lsl #8 - orr r7, r4, lr, lsl #8 - subs r3, r3, #1 - strd_post r6, r7, r1, r2 - bgt 1b - pop {r4-r8,pc} -endfunc - function ff_get_pixels_armv6, export=1 pld [r1, r2] push {r4-r8, lr} diff --git a/libavcodec/arm/dsputil_init_arm.c b/libavcodec/arm/dsputil_init_arm.c index 33109088ee..a8c806a47f 100644 --- a/libavcodec/arm/dsputil_init_arm.c +++ b/libavcodec/arm/dsputil_init_arm.c @@ -28,71 +28,11 @@ #include "libavcodec/dsputil.h" #include "dsputil_arm.h" -void ff_j_rev_dct_arm(int16_t *data); -void ff_simple_idct_arm(int16_t *data); - -/* XXX: local hack */ -static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); -static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); - -void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest, - int line_size); - -/* XXX: those functions should be suppressed ASAP when all IDCTs are - * converted */ -static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_j_rev_dct_arm(block); - ff_put_pixels_clamped(block, dest, line_size); -} - -static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_j_rev_dct_arm(block); - ff_add_pixels_clamped(block, dest, line_size); -} - -static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_simple_idct_arm(block); - ff_put_pixels_clamped(block, dest, line_size); -} - -static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_simple_idct_arm(block); - 
ff_add_pixels_clamped(block, dest, line_size); -} - av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { int cpu_flags = av_get_cpu_flags(); - ff_put_pixels_clamped = c->put_pixels_clamped; - ff_add_pixels_clamped = c->add_pixels_clamped; - - if (!high_bit_depth) { - if (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_ARM) { - c->idct_put = j_rev_dct_arm_put; - c->idct_add = j_rev_dct_arm_add; - c->idct = ff_j_rev_dct_arm; - c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; - } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) { - c->idct_put = simple_idct_arm_put; - c->idct_add = simple_idct_arm_add; - c->idct = ff_simple_idct_arm; - c->idct_permutation_type = FF_NO_IDCT_PERM; - } - } - - c->add_pixels_clamped = ff_add_pixels_clamped_arm; - - if (have_armv5te(cpu_flags)) - ff_dsputil_init_armv5te(c, avctx, high_bit_depth); if (have_armv6(cpu_flags)) ff_dsputil_init_armv6(c, avctx, high_bit_depth); - if (have_neon(cpu_flags)) - ff_dsputil_init_neon(c, avctx, high_bit_depth); } diff --git a/libavcodec/arm/dsputil_init_armv5te.c b/libavcodec/arm/dsputil_init_armv5te.c deleted file mode 100644 index eb45b72088..0000000000 --- a/libavcodec/arm/dsputil_init_armv5te.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavutil/attributes.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_simple_idct_armv5te(int16_t *data); -void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data); -void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data); - -av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx, - unsigned high_bit_depth) -{ - if (!high_bit_depth && - (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { - c->idct_put = ff_simple_idct_put_armv5te; - c->idct_add = ff_simple_idct_add_armv5te; - c->idct = ff_simple_idct_armv5te; - c->idct_permutation_type = FF_NO_IDCT_PERM; - } -} diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c index 2b1002bfda..fab5e0d232 100644 --- a/libavcodec/arm/dsputil_init_armv6.c +++ b/libavcodec/arm/dsputil_init_armv6.c @@ -26,13 +26,6 @@ #include "libavcodec/mpegvideo.h" #include "dsputil_arm.h" -void ff_simple_idct_armv6(int16_t *data); -void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); -void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); - -void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels, - int line_size); - void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride); void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride); @@ -56,17 +49,6 @@ int ff_pix_sum_armv6(uint8_t *pix, int line_size); av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { - if (!high_bit_depth) { - if (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLEARMV6) { - 
c->idct_put = ff_simple_idct_put_armv6; - c->idct_add = ff_simple_idct_add_armv6; - c->idct = ff_simple_idct_armv6; - c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; - } - } - c->add_pixels_clamped = ff_add_pixels_clamped_armv6; - if (!high_bit_depth) c->get_pixels = ff_get_pixels_armv6; c->diff_pixels = ff_diff_pixels_armv6; diff --git a/libavcodec/arm/dsputil_init_neon.c b/libavcodec/arm/dsputil_init_neon.c deleted file mode 100644 index 9d4c76ce58..0000000000 --- a/libavcodec/arm/dsputil_init_neon.c +++ /dev/null @@ -1,53 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include - -#include "libavutil/attributes.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/dsputil.h" -#include "dsputil_arm.h" - -void ff_simple_idct_neon(int16_t *data); -void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data); -void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data); - -void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int); -void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int); -void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int); - -av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx, - unsigned high_bit_depth) -{ - if (!high_bit_depth) { - if (avctx->idct_algo == FF_IDCT_AUTO || - avctx->idct_algo == FF_IDCT_SIMPLENEON) { - c->idct_put = ff_simple_idct_put_neon; - c->idct_add = ff_simple_idct_add_neon; - c->idct = ff_simple_idct_neon; - c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; - } - } - - c->add_pixels_clamped = ff_add_pixels_clamped_neon; - c->put_pixels_clamped = ff_put_pixels_clamped_neon; - c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; -} diff --git a/libavcodec/arm/dsputil_neon.S b/libavcodec/arm/dsputil_neon.S deleted file mode 100644 index ed6f218380..0000000000 --- a/libavcodec/arm/dsputil_neon.S +++ /dev/null @@ -1,128 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_put_pixels_clamped_neon, export=1 - vld1.16 {d16-d19}, [r0,:128]! - vqmovun.s16 d0, q8 - vld1.16 {d20-d23}, [r0,:128]! - vqmovun.s16 d1, q9 - vld1.16 {d24-d27}, [r0,:128]! - vqmovun.s16 d2, q10 - vld1.16 {d28-d31}, [r0,:128]! - vqmovun.s16 d3, q11 - vst1.8 {d0}, [r1,:64], r2 - vqmovun.s16 d4, q12 - vst1.8 {d1}, [r1,:64], r2 - vqmovun.s16 d5, q13 - vst1.8 {d2}, [r1,:64], r2 - vqmovun.s16 d6, q14 - vst1.8 {d3}, [r1,:64], r2 - vqmovun.s16 d7, q15 - vst1.8 {d4}, [r1,:64], r2 - vst1.8 {d5}, [r1,:64], r2 - vst1.8 {d6}, [r1,:64], r2 - vst1.8 {d7}, [r1,:64], r2 - bx lr -endfunc - -function ff_put_signed_pixels_clamped_neon, export=1 - vmov.u8 d31, #128 - vld1.16 {d16-d17}, [r0,:128]! - vqmovn.s16 d0, q8 - vld1.16 {d18-d19}, [r0,:128]! - vqmovn.s16 d1, q9 - vld1.16 {d16-d17}, [r0,:128]! - vqmovn.s16 d2, q8 - vld1.16 {d18-d19}, [r0,:128]! - vadd.u8 d0, d0, d31 - vld1.16 {d20-d21}, [r0,:128]! - vadd.u8 d1, d1, d31 - vld1.16 {d22-d23}, [r0,:128]! - vadd.u8 d2, d2, d31 - vst1.8 {d0}, [r1,:64], r2 - vqmovn.s16 d3, q9 - vst1.8 {d1}, [r1,:64], r2 - vqmovn.s16 d4, q10 - vst1.8 {d2}, [r1,:64], r2 - vqmovn.s16 d5, q11 - vld1.16 {d24-d25}, [r0,:128]! - vadd.u8 d3, d3, d31 - vld1.16 {d26-d27}, [r0,:128]! 
- vadd.u8 d4, d4, d31 - vadd.u8 d5, d5, d31 - vst1.8 {d3}, [r1,:64], r2 - vqmovn.s16 d6, q12 - vst1.8 {d4}, [r1,:64], r2 - vqmovn.s16 d7, q13 - vst1.8 {d5}, [r1,:64], r2 - vadd.u8 d6, d6, d31 - vadd.u8 d7, d7, d31 - vst1.8 {d6}, [r1,:64], r2 - vst1.8 {d7}, [r1,:64], r2 - bx lr -endfunc - -function ff_add_pixels_clamped_neon, export=1 - mov r3, r1 - vld1.8 {d16}, [r1,:64], r2 - vld1.16 {d0-d1}, [r0,:128]! - vaddw.u8 q0, q0, d16 - vld1.8 {d17}, [r1,:64], r2 - vld1.16 {d2-d3}, [r0,:128]! - vqmovun.s16 d0, q0 - vld1.8 {d18}, [r1,:64], r2 - vaddw.u8 q1, q1, d17 - vld1.16 {d4-d5}, [r0,:128]! - vaddw.u8 q2, q2, d18 - vst1.8 {d0}, [r3,:64], r2 - vqmovun.s16 d2, q1 - vld1.8 {d19}, [r1,:64], r2 - vld1.16 {d6-d7}, [r0,:128]! - vaddw.u8 q3, q3, d19 - vqmovun.s16 d4, q2 - vst1.8 {d2}, [r3,:64], r2 - vld1.8 {d16}, [r1,:64], r2 - vqmovun.s16 d6, q3 - vld1.16 {d0-d1}, [r0,:128]! - vaddw.u8 q0, q0, d16 - vst1.8 {d4}, [r3,:64], r2 - vld1.8 {d17}, [r1,:64], r2 - vld1.16 {d2-d3}, [r0,:128]! - vaddw.u8 q1, q1, d17 - vst1.8 {d6}, [r3,:64], r2 - vqmovun.s16 d0, q0 - vld1.8 {d18}, [r1,:64], r2 - vld1.16 {d4-d5}, [r0,:128]! - vaddw.u8 q2, q2, d18 - vst1.8 {d0}, [r3,:64], r2 - vqmovun.s16 d2, q1 - vld1.8 {d19}, [r1,:64], r2 - vqmovun.s16 d4, q2 - vld1.16 {d6-d7}, [r0,:128]! - vaddw.u8 q3, q3, d19 - vst1.8 {d2}, [r3,:64], r2 - vqmovun.s16 d6, q3 - vst1.8 {d4}, [r3,:64], r2 - vst1.8 {d6}, [r3,:64], r2 - bx lr -endfunc diff --git a/libavcodec/arm/idctdsp_arm.S b/libavcodec/arm/idctdsp_arm.S new file mode 100644 index 0000000000..34f467e86f --- /dev/null +++ b/libavcodec/arm/idctdsp_arm.S @@ -0,0 +1,120 @@ +@ +@ ARMv4-optimized IDCT functions +@ Copyright (c) 2004 AGAWA Koji +@ +@ This file is part of Libav. +@ +@ Libav is free software; you can redistribute it and/or +@ modify it under the terms of the GNU Lesser General Public +@ License as published by the Free Software Foundation; either +@ version 2.1 of the License, or (at your option) any later version. 
+@ +@ Libav is distributed in the hope that it will be useful, +@ but WITHOUT ANY WARRANTY; without even the implied warranty of +@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +@ Lesser General Public License for more details. +@ +@ You should have received a copy of the GNU Lesser General Public +@ License along with Libav; if not, write to the Free Software +@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +@ + +#include "config.h" +#include "libavutil/arm/asm.S" + +@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride) +function ff_add_pixels_clamped_arm, export=1, align=5 + push {r4-r10} + mov r10, #8 +1: + ldr r4, [r1] /* load dest */ + /* block[0] and block[1]*/ + ldrsh r5, [r0] + ldrsh r7, [r0, #2] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #4] /* moved form [A] */ + orr r9, r9, r8, lsl #8 + /* block[2] and block[3] */ + /* [A] */ + ldrsh r7, [r0, #6] + and r6, r4, #0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + ldr r4, [r1, #4] /* moved form [B] */ + orr r9, r9, r8, lsl #24 + /* store dest */ + ldrsh r5, [r0, #8] /* moved form [C] */ + str r9, [r1] + + /* load dest */ + /* [B] */ + /* block[4] and block[5] */ + /* [C] */ + ldrsh r7, [r0, #10] + and r6, r4, #0xFF + and r8, r4, #0xFF00 + add r6, r6, r5 + add r8, r7, r8, lsr #8 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + mov r9, r6 + ldrsh r5, [r0, #12] /* moved from [D] */ + orr r9, r9, r8, lsl #8 + /* block[6] and block[7] */ + /* [D] */ + ldrsh r7, [r0, #14] + and r6, r4, 
#0xFF0000 + and r8, r4, #0xFF000000 + add r6, r5, r6, lsr #16 + add r8, r7, r8, lsr #24 + mvn r5, r5 + mvn r7, r7 + tst r6, #0x100 + it ne + movne r6, r5, lsr #24 + tst r8, #0x100 + it ne + movne r8, r7, lsr #24 + orr r9, r9, r6, lsl #16 + add r0, r0, #16 /* moved from [E] */ + orr r9, r9, r8, lsl #24 + subs r10, r10, #1 /* moved from [F] */ + /* store dest */ + str r9, [r1, #4] + + /* [E] */ + /* [F] */ + add r1, r1, r2 + bne 1b + + pop {r4-r10} + bx lr +endfunc diff --git a/libavcodec/arm/idctdsp_arm.h b/libavcodec/arm/idctdsp_arm.h new file mode 100644 index 0000000000..9012b82904 --- /dev/null +++ b/libavcodec/arm/idctdsp_arm.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_IDCTDSP_ARM_H +#define AVCODEC_ARM_IDCTDSP_ARM_H + +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" + +void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); + +#endif /* AVCODEC_ARM_IDCTDSP_ARM_H */ diff --git a/libavcodec/arm/idctdsp_armv6.S b/libavcodec/arm/idctdsp_armv6.S new file mode 100644 index 0000000000..c180d732fa --- /dev/null +++ b/libavcodec/arm/idctdsp_armv6.S @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_add_pixels_clamped_armv6, export=1 + push {r4-r8,lr} + mov r3, #8 +1: + ldm r0!, {r4,r5,r12,lr} + ldrd r6, r7, [r1] + pkhbt r8, r4, r5, lsl #16 + pkhtb r5, r5, r4, asr #16 + pkhbt r4, r12, lr, lsl #16 + pkhtb lr, lr, r12, asr #16 + pld [r1, r2] + uxtab16 r8, r8, r6 + uxtab16 r5, r5, r6, ror #8 + uxtab16 r4, r4, r7 + uxtab16 lr, lr, r7, ror #8 + usat16 r8, #8, r8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 lr, #8, lr + orr r6, r8, r5, lsl #8 + orr r7, r4, lr, lsl #8 + subs r3, r3, #1 + strd_post r6, r7, r1, r2 + bgt 1b + pop {r4-r8,pc} +endfunc diff --git a/libavcodec/arm/idctdsp_init_arm.c b/libavcodec/arm/idctdsp_init_arm.c new file mode 100644 index 0000000000..b4d189902d --- /dev/null +++ b/libavcodec/arm/idctdsp_init_arm.c @@ -0,0 +1,98 @@ +/* + * ARM-optimized IDCT functions + * Copyright (c) 2001 Lionel Ulmer + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/arm/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idctdsp_arm.h" + +void ff_j_rev_dct_arm(int16_t *data); +void ff_simple_idct_arm(int16_t *data); + +/* XXX: local hack */ +static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); +static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size); + +void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest, + int line_size); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + * converted */ +static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block) +{ + ff_j_rev_dct_arm(block); + ff_put_pixels_clamped(block, dest, line_size); +} + +static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block) +{ + ff_j_rev_dct_arm(block); + ff_add_pixels_clamped(block, dest, line_size); +} + +static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block) +{ + ff_simple_idct_arm(block); + ff_put_pixels_clamped(block, dest, line_size); +} + +static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block) +{ + ff_simple_idct_arm(block); + ff_add_pixels_clamped(block, dest, line_size); +} + +av_cold void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + ff_put_pixels_clamped = c->put_pixels_clamped; + ff_add_pixels_clamped = c->add_pixels_clamped; + + if (!high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_ARM) { + c->idct_put = j_rev_dct_arm_put; + c->idct_add = j_rev_dct_arm_add; + c->idct = ff_j_rev_dct_arm; +
c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; + } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) { + c->idct_put = simple_idct_arm_put; + c->idct_add = simple_idct_arm_add; + c->idct = ff_simple_idct_arm; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_arm; + + if (have_armv5te(cpu_flags)) + ff_idctdsp_init_armv5te(c, avctx, high_bit_depth); + if (have_armv6(cpu_flags)) + ff_idctdsp_init_armv6(c, avctx, high_bit_depth); + if (have_neon(cpu_flags)) + ff_idctdsp_init_neon(c, avctx, high_bit_depth); +} diff --git a/libavcodec/arm/idctdsp_init_armv5te.c b/libavcodec/arm/idctdsp_init_armv5te.c new file mode 100644 index 0000000000..e2492a5da7 --- /dev/null +++ b/libavcodec/arm/idctdsp_init_armv5te.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idctdsp_arm.h" + +void ff_simple_idct_armv5te(int16_t *data); +void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data); +void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data); + +av_cold void ff_idctdsp_init_armv5te(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!high_bit_depth && + (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) { + c->idct_put = ff_simple_idct_put_armv5te; + c->idct_add = ff_simple_idct_add_armv5te; + c->idct = ff_simple_idct_armv5te; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } +} diff --git a/libavcodec/arm/idctdsp_init_armv6.c b/libavcodec/arm/idctdsp_init_armv6.c new file mode 100644 index 0000000000..e92f471220 --- /dev/null +++ b/libavcodec/arm/idctdsp_init_armv6.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2009 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idctdsp_arm.h" + +void ff_simple_idct_armv6(int16_t *data); +void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); +void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); + +void ff_add_pixels_clamped_armv6(const int16_t *block, uint8_t *pixels, + int line_size); + +av_cold void ff_idctdsp_init_armv6(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLEARMV6) { + c->idct_put = ff_simple_idct_put_armv6; + c->idct_add = ff_simple_idct_add_armv6; + c->idct = ff_simple_idct_armv6; + c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; + } + } + c->add_pixels_clamped = ff_add_pixels_clamped_armv6; +} diff --git a/libavcodec/arm/idctdsp_init_neon.c b/libavcodec/arm/idctdsp_init_neon.c new file mode 100644 index 0000000000..17905973fb --- /dev/null +++ b/libavcodec/arm/idctdsp_init_neon.c @@ -0,0 +1,53 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> + +#include "libavutil/attributes.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "idctdsp_arm.h" + +void ff_simple_idct_neon(int16_t *data); +void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data); +void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data); + +void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int); +void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int); +void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int); + +av_cold void ff_idctdsp_init_neon(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + if (!high_bit_depth) { + if (avctx->idct_algo == FF_IDCT_AUTO || + avctx->idct_algo == FF_IDCT_SIMPLENEON) { + c->idct_put = ff_simple_idct_put_neon; + c->idct_add = ff_simple_idct_add_neon; + c->idct = ff_simple_idct_neon; + c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM; + } + } + + c->add_pixels_clamped = ff_add_pixels_clamped_neon; + c->put_pixels_clamped = ff_put_pixels_clamped_neon; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon; +} diff --git a/libavcodec/arm/idctdsp_neon.S b/libavcodec/arm/idctdsp_neon.S new file mode 100644 index 0000000000..7095879bae --- /dev/null +++ b/libavcodec/arm/idctdsp_neon.S @@ -0,0 +1,128 @@ +/* + * ARM-NEON-optimized IDCT functions + * Copyright (c) 2008 Mans Rullgard + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version.
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/arm/asm.S" + +function ff_put_pixels_clamped_neon, export=1 + vld1.16 {d16-d19}, [r0,:128]! + vqmovun.s16 d0, q8 + vld1.16 {d20-d23}, [r0,:128]! + vqmovun.s16 d1, q9 + vld1.16 {d24-d27}, [r0,:128]! + vqmovun.s16 d2, q10 + vld1.16 {d28-d31}, [r0,:128]! + vqmovun.s16 d3, q11 + vst1.8 {d0}, [r1,:64], r2 + vqmovun.s16 d4, q12 + vst1.8 {d1}, [r1,:64], r2 + vqmovun.s16 d5, q13 + vst1.8 {d2}, [r1,:64], r2 + vqmovun.s16 d6, q14 + vst1.8 {d3}, [r1,:64], r2 + vqmovun.s16 d7, q15 + vst1.8 {d4}, [r1,:64], r2 + vst1.8 {d5}, [r1,:64], r2 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_put_signed_pixels_clamped_neon, export=1 + vmov.u8 d31, #128 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d0, q8 + vld1.16 {d18-d19}, [r0,:128]! + vqmovn.s16 d1, q9 + vld1.16 {d16-d17}, [r0,:128]! + vqmovn.s16 d2, q8 + vld1.16 {d18-d19}, [r0,:128]! + vadd.u8 d0, d0, d31 + vld1.16 {d20-d21}, [r0,:128]! + vadd.u8 d1, d1, d31 + vld1.16 {d22-d23}, [r0,:128]! + vadd.u8 d2, d2, d31 + vst1.8 {d0}, [r1,:64], r2 + vqmovn.s16 d3, q9 + vst1.8 {d1}, [r1,:64], r2 + vqmovn.s16 d4, q10 + vst1.8 {d2}, [r1,:64], r2 + vqmovn.s16 d5, q11 + vld1.16 {d24-d25}, [r0,:128]! + vadd.u8 d3, d3, d31 + vld1.16 {d26-d27}, [r0,:128]! 
+ vadd.u8 d4, d4, d31 + vadd.u8 d5, d5, d31 + vst1.8 {d3}, [r1,:64], r2 + vqmovn.s16 d6, q12 + vst1.8 {d4}, [r1,:64], r2 + vqmovn.s16 d7, q13 + vst1.8 {d5}, [r1,:64], r2 + vadd.u8 d6, d6, d31 + vadd.u8 d7, d7, d31 + vst1.8 {d6}, [r1,:64], r2 + vst1.8 {d7}, [r1,:64], r2 + bx lr +endfunc + +function ff_add_pixels_clamped_neon, export=1 + mov r3, r1 + vld1.8 {d16}, [r1,:64], r2 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vaddw.u8 q1, q1, d17 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vld1.16 {d6-d7}, [r0,:128]! + vaddw.u8 q3, q3, d19 + vqmovun.s16 d4, q2 + vst1.8 {d2}, [r3,:64], r2 + vld1.8 {d16}, [r1,:64], r2 + vqmovun.s16 d6, q3 + vld1.16 {d0-d1}, [r0,:128]! + vaddw.u8 q0, q0, d16 + vst1.8 {d4}, [r3,:64], r2 + vld1.8 {d17}, [r1,:64], r2 + vld1.16 {d2-d3}, [r0,:128]! + vaddw.u8 q1, q1, d17 + vst1.8 {d6}, [r3,:64], r2 + vqmovun.s16 d0, q0 + vld1.8 {d18}, [r1,:64], r2 + vld1.16 {d4-d5}, [r0,:128]! + vaddw.u8 q2, q2, d18 + vst1.8 {d0}, [r3,:64], r2 + vqmovun.s16 d2, q1 + vld1.8 {d19}, [r1,:64], r2 + vqmovun.s16 d4, q2 + vld1.16 {d6-d7}, [r0,:128]! 
+ vaddw.u8 q3, q3, d19 + vst1.8 {d2}, [r3,:64], r2 + vqmovun.s16 d6, q3 + vst1.8 {d4}, [r3,:64], r2 + vst1.8 {d6}, [r3,:64], r2 + bx lr +endfunc diff --git a/libavcodec/asv.c b/libavcodec/asv.c index dba9e840c7..71c5e5f5b8 100644 --- a/libavcodec/asv.c +++ b/libavcodec/asv.c @@ -84,7 +84,6 @@ av_cold void ff_asv_common_init(AVCodecContext *avctx) { ASV1Context * const a = avctx->priv_data; ff_bswapdsp_init(&a->bbdsp); - ff_dsputil_init(&a->dsp, avctx); a->mb_width = (avctx->width + 15) / 16; a->mb_height = (avctx->height + 15) / 16; diff --git a/libavcodec/asv.h b/libavcodec/asv.h index 037e646969..3f8d56cf8b 100644 --- a/libavcodec/asv.h +++ b/libavcodec/asv.h @@ -34,6 +34,7 @@ #include "blockdsp.h" #include "bswapdsp.h" #include "dsputil.h" +#include "idctdsp.h" #include "get_bits.h" #include "put_bits.h" @@ -42,6 +43,7 @@ typedef struct ASV1Context{ BlockDSPContext bdsp; BswapDSPContext bbdsp; DSPContext dsp; + IDCTDSPContext idsp; PutBitContext pb; GetBitContext gb; ScanTable scantable; diff --git a/libavcodec/asvdec.c b/libavcodec/asvdec.c index c785d151ec..252f88ab6e 100644 --- a/libavcodec/asvdec.c +++ b/libavcodec/asvdec.c @@ -30,6 +30,7 @@ #include "avcodec.h" #include "blockdsp.h" #include "put_bits.h" +#include "idctdsp.h" #include "internal.h" #include "mathops.h" #include "mpeg12data.h" @@ -190,14 +191,14 @@ static inline void idct_put(ASV1Context *a, AVFrame *frame, int mb_x, int mb_y) uint8_t *dest_cb = frame->data[1] + (mb_y * 8 * frame->linesize[1]) + mb_x * 8; uint8_t *dest_cr = frame->data[2] + (mb_y * 8 * frame->linesize[2]) + mb_x * 8; - a->dsp.idct_put(dest_y , linesize, block[0]); - a->dsp.idct_put(dest_y + 8, linesize, block[1]); - a->dsp.idct_put(dest_y + 8*linesize , linesize, block[2]); - a->dsp.idct_put(dest_y + 8*linesize + 8, linesize, block[3]); + a->idsp.idct_put(dest_y, linesize, block[0]); + a->idsp.idct_put(dest_y + 8, linesize, block[1]); + a->idsp.idct_put(dest_y + 8 * linesize, linesize, block[2]); + a->idsp.idct_put(dest_y + 8 
* linesize + 8, linesize, block[3]); if (!(a->avctx->flags&CODEC_FLAG_GRAY)) { - a->dsp.idct_put(dest_cb, frame->linesize[1], block[4]); - a->dsp.idct_put(dest_cr, frame->linesize[2], block[5]); + a->idsp.idct_put(dest_cb, frame->linesize[1], block[4]); + a->idsp.idct_put(dest_cr, frame->linesize[2], block[5]); } } @@ -283,8 +284,9 @@ static av_cold int decode_init(AVCodecContext *avctx) ff_asv_common_init(avctx); ff_blockdsp_init(&a->bdsp, avctx); + ff_idctdsp_init(&a->idsp, avctx); init_vlcs(a); - ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_asv_scantab); + ff_init_scantable(a->idsp.idct_permutation, &a->scantable, ff_asv_scantab); avctx->pix_fmt = AV_PIX_FMT_YUV420P; a->inv_qscale = avctx->extradata[0]; diff --git a/libavcodec/asvenc.c b/libavcodec/asvenc.c index 47b766ac9e..e8c6d00197 100644 --- a/libavcodec/asvenc.c +++ b/libavcodec/asvenc.c @@ -247,6 +247,7 @@ static av_cold int encode_init(AVCodecContext *avctx){ avctx->coded_frame->key_frame = 1; ff_asv_common_init(avctx); + ff_dsputil_init(&a->dsp, avctx); if(avctx->global_quality == 0) avctx->global_quality= 4*FF_QUALITY_SCALE; diff --git a/libavcodec/cavs.c b/libavcodec/cavs.c index 21bc1edc23..2be50a7c4a 100644 --- a/libavcodec/cavs.c +++ b/libavcodec/cavs.c @@ -29,6 +29,7 @@ #include "get_bits.h" #include "golomb.h" #include "h264chroma.h" +#include "idctdsp.h" #include "mathops.h" #include "qpeldsp.h" #include "cavs.h" @@ -760,13 +761,13 @@ av_cold int ff_cavs_init(AVCodecContext *avctx) AVSContext *h = avctx->priv_data; ff_blockdsp_init(&h->bdsp, avctx); - ff_dsputil_init(&h->dsp, avctx); ff_h264chroma_init(&h->h264chroma, 8); + ff_idctdsp_init(&h->idsp, avctx); ff_videodsp_init(&h->vdsp, 8); ff_cavsdsp_init(&h->cdsp, avctx); - ff_init_scantable_permutation(h->dsp.idct_permutation, + ff_init_scantable_permutation(h->idsp.idct_permutation, h->cdsp.idct_perm); - ff_init_scantable(h->dsp.idct_permutation, &h->scantable, ff_zigzag_direct); + ff_init_scantable(h->idsp.idct_permutation, 
&h->scantable, ff_zigzag_direct); h->avctx = avctx; avctx->pix_fmt = AV_PIX_FMT_YUV420P; diff --git a/libavcodec/cavs.h b/libavcodec/cavs.h index c5a10b556b..cfae05576b 100644 --- a/libavcodec/cavs.h +++ b/libavcodec/cavs.h @@ -24,8 +24,8 @@ #include "cavsdsp.h" #include "blockdsp.h" -#include "dsputil.h" #include "h264chroma.h" +#include "idctdsp.h" #include "get_bits.h" #include "videodsp.h" @@ -162,9 +162,9 @@ typedef struct AVSFrame { typedef struct AVSContext { AVCodecContext *avctx; - DSPContext dsp; BlockDSPContext bdsp; H264ChromaContext h264chroma; + IDCTDSPContext idsp; VideoDSPContext vdsp; CAVSDSPContext cdsp; GetBitContext gb; diff --git a/libavcodec/cavsdsp.c b/libavcodec/cavsdsp.c index 666dc7f03f..958e3c5ef2 100644 --- a/libavcodec/cavsdsp.c +++ b/libavcodec/cavsdsp.c @@ -24,7 +24,7 @@ #include -#include "dsputil.h" +#include "idctdsp.h" #include "mathops.h" #include "cavsdsp.h" #include "libavutil/common.h" diff --git a/libavcodec/dnxhddec.c b/libavcodec/dnxhddec.c index 3bd8ffecd6..ca67990156 100644 --- a/libavcodec/dnxhddec.c +++ b/libavcodec/dnxhddec.c @@ -28,7 +28,7 @@ #include "blockdsp.h" #include "get_bits.h" #include "dnxhddata.h" -#include "dsputil.h" +#include "idctdsp.h" #include "internal.h" typedef struct DNXHDContext { @@ -42,7 +42,7 @@ typedef struct DNXHDContext { int cur_field; ///< current interlaced field VLC ac_vlc, dc_vlc, run_vlc; int last_dc[3]; - DSPContext dsp; + IDCTDSPContext idsp; DECLARE_ALIGNED(16, int16_t, blocks)[12][64]; ScanTable scantable; const CIDEntry *cid_table; @@ -95,7 +95,7 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid) ctx->cid_table->run_bits, 1, 1, ctx->cid_table->run_codes, 2, 2, 0); - ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable, + ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, ff_zigzag_direct); ctx->cid = cid; } @@ -136,7 +136,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, ctx->avctx->bits_per_raw_sample = 10; if (ctx->bit_depth != 10) 
{ ff_blockdsp_init(&ctx->bdsp, ctx->avctx); - ff_dsputil_init(&ctx->dsp, ctx->avctx); + ff_idctdsp_init(&ctx->idsp, ctx->avctx); ctx->bit_depth = 10; ctx->decode_dct_block = dnxhd_decode_dct_block_10_444; } @@ -146,7 +146,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, ctx->avctx->bits_per_raw_sample = 10; if (ctx->bit_depth != 10) { ff_blockdsp_init(&ctx->bdsp, ctx->avctx); - ff_dsputil_init(&ctx->dsp, ctx->avctx); + ff_idctdsp_init(&ctx->idsp, ctx->avctx); ctx->bit_depth = 10; ctx->decode_dct_block = dnxhd_decode_dct_block_10; } @@ -155,7 +155,7 @@ static int dnxhd_decode_header(DNXHDContext *ctx, AVFrame *frame, ctx->avctx->bits_per_raw_sample = 8; if (ctx->bit_depth != 8) { ff_blockdsp_init(&ctx->bdsp, ctx->avctx); - ff_dsputil_init(&ctx->dsp, ctx->avctx); + ff_idctdsp_init(&ctx->idsp, ctx->avctx); ctx->bit_depth = 8; ctx->decode_dct_block = dnxhd_decode_dct_block_8; } @@ -340,34 +340,34 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, AVFrame *frame, dct_y_offset = dct_linesize_luma << 3; dct_x_offset = 8 << shift1; if (!ctx->is_444) { - ctx->dsp.idct_put(dest_y, dct_linesize_luma, ctx->blocks[0]); - ctx->dsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, ctx->blocks[1]); - ctx->dsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, ctx->blocks[4]); - ctx->dsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]); + ctx->idsp.idct_put(dest_y, dct_linesize_luma, ctx->blocks[0]); + ctx->idsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, ctx->blocks[1]); + ctx->idsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, ctx->blocks[4]); + ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]); if (!(ctx->avctx->flags & CODEC_FLAG_GRAY)) { dct_y_offset = dct_linesize_chroma << 3; - ctx->dsp.idct_put(dest_u, dct_linesize_chroma, ctx->blocks[2]); - ctx->dsp.idct_put(dest_v, dct_linesize_chroma, ctx->blocks[3]); - ctx->dsp.idct_put(dest_u + dct_y_offset, 
dct_linesize_chroma, ctx->blocks[6]); - ctx->dsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]); + ctx->idsp.idct_put(dest_u, dct_linesize_chroma, ctx->blocks[2]); + ctx->idsp.idct_put(dest_v, dct_linesize_chroma, ctx->blocks[3]); + ctx->idsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]); + ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]); } } else { - ctx->dsp.idct_put(dest_y, dct_linesize_luma, ctx->blocks[0]); - ctx->dsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, ctx->blocks[1]); - ctx->dsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, ctx->blocks[6]); - ctx->dsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]); + ctx->idsp.idct_put(dest_y, dct_linesize_luma, ctx->blocks[0]); + ctx->idsp.idct_put(dest_y + dct_x_offset, dct_linesize_luma, ctx->blocks[1]); + ctx->idsp.idct_put(dest_y + dct_y_offset, dct_linesize_luma, ctx->blocks[6]); + ctx->idsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[7]); if (!(ctx->avctx->flags & CODEC_FLAG_GRAY)) { dct_y_offset = dct_linesize_chroma << 3; - ctx->dsp.idct_put(dest_u, dct_linesize_chroma, ctx->blocks[2]); - ctx->dsp.idct_put(dest_u + dct_x_offset, dct_linesize_chroma, ctx->blocks[3]); - ctx->dsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[8]); - ctx->dsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]); - ctx->dsp.idct_put(dest_v, dct_linesize_chroma, ctx->blocks[4]); - ctx->dsp.idct_put(dest_v + dct_x_offset, dct_linesize_chroma, ctx->blocks[5]); - ctx->dsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[10]); - ctx->dsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]); + ctx->idsp.idct_put(dest_u, dct_linesize_chroma, ctx->blocks[2]); + ctx->idsp.idct_put(dest_u + dct_x_offset, dct_linesize_chroma, ctx->blocks[3]); + ctx->idsp.idct_put(dest_u + dct_y_offset, 
dct_linesize_chroma, ctx->blocks[8]); + ctx->idsp.idct_put(dest_u + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[9]); + ctx->idsp.idct_put(dest_v, dct_linesize_chroma, ctx->blocks[4]); + ctx->idsp.idct_put(dest_v + dct_x_offset, dct_linesize_chroma, ctx->blocks[5]); + ctx->idsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[10]); + ctx->idsp.idct_put(dest_v + dct_y_offset + dct_x_offset, dct_linesize_chroma, ctx->blocks[11]); } } diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c index b85027b6d2..223791acbf 100644 --- a/libavcodec/dnxhdenc.c +++ b/libavcodec/dnxhdenc.c @@ -200,14 +200,14 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias) if (ctx->cid_table->bit_depth == 8) { for (i = 1; i < 64; i++) { - int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]]; + int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]]; weight_matrix[j] = ctx->cid_table->luma_weight[i]; } ff_convert_matrix(&ctx->m, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix, ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1); for (i = 1; i < 64; i++) { - int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]]; + int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]]; weight_matrix[j] = ctx->cid_table->chroma_weight[i]; } ff_convert_matrix(&ctx->m, ctx->qmatrix_c, ctx->qmatrix_c16, @@ -228,7 +228,7 @@ static av_cold int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias) // 10-bit for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) { for (i = 1; i < 64; i++) { - int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]]; + int j = ctx->m.idsp.idct_permutation[ff_zigzag_direct[i]]; /* The quantization formula from the VC-3 standard is: * quantized = sign(block[i]) * floor(abs(block[i]/s) * p / @@ -308,6 +308,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx) ff_blockdsp_init(&ctx->bdsp, avctx); ff_dsputil_init(&ctx->m.dsp, avctx); + ff_idctdsp_init(&ctx->m.idsp, avctx); 
ff_dct_common_init(&ctx->m); if (!ctx->m.dct_quantize) ctx->m.dct_quantize = ff_dct_quantize_c; @@ -634,7 +635,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, if (avctx->mb_decision == FF_MB_DECISION_RD || !RC_VARIANCE) { dnxhd_unquantize_c(ctx, block, i, qscale, last_index); - ctx->m.dsp.idct(block); + ctx->m.idsp.idct(block); ssd += dnxhd_ssd_block(block, src_block); } } diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index ca0c8ef622..5e5ad93956 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -33,7 +33,6 @@ #include "dsputil.h" #include "simple_idct.h" #include "faandct.h" -#include "faanidct.h" #include "imgconvert.h" #include "mathops.h" #include "mpegvideo.h" @@ -48,60 +47,6 @@ uint32_t ff_square_tab[512] = { 0, }; #define BIT_DEPTH 8 #include "dsputilenc_template.c" -av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st, - const uint8_t *src_scantable) -{ - int i, end; - - st->scantable = src_scantable; - - for (i = 0; i < 64; i++) { - int j = src_scantable[i]; - st->permutated[i] = permutation[j]; - } - - end = -1; - for (i = 0; i < 64; i++) { - int j = st->permutated[i]; - if (j > end) - end = j; - st->raster_end[i] = end; - } -} - -av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation, - int idct_permutation_type) -{ - int i; - - if (ARCH_X86) - if (ff_init_scantable_permutation_x86(idct_permutation, - idct_permutation_type)) - return; - - switch (idct_permutation_type) { - case FF_NO_IDCT_PERM: - for (i = 0; i < 64; i++) - idct_permutation[i] = i; - break; - case FF_LIBMPEG2_IDCT_PERM: - for (i = 0; i < 64; i++) - idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); - break; - case FF_TRANSPOSE_IDCT_PERM: - for (i = 0; i < 64; i++) - idct_permutation[i] = ((i & 7) << 3) | (i >> 3); - break; - case FF_PARTTRANS_IDCT_PERM: - for (i = 0; i < 64; i++) - idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3); - break; - default: - av_log(NULL, 
AV_LOG_ERROR, - "Internal error, IDCT permutation not set\n"); - } -} - static int pix_sum_c(uint8_t *pix, int line_size) { int s = 0, i, j; @@ -259,68 +204,6 @@ static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1, } } -static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels, - int line_size) -{ - int i; - - /* read the pixels */ - for (i = 0; i < 8; i++) { - pixels[0] = av_clip_uint8(block[0]); - pixels[1] = av_clip_uint8(block[1]); - pixels[2] = av_clip_uint8(block[2]); - pixels[3] = av_clip_uint8(block[3]); - pixels[4] = av_clip_uint8(block[4]); - pixels[5] = av_clip_uint8(block[5]); - pixels[6] = av_clip_uint8(block[6]); - pixels[7] = av_clip_uint8(block[7]); - - pixels += line_size; - block += 8; - } -} - -static void put_signed_pixels_clamped_c(const int16_t *block, - uint8_t *restrict pixels, - int line_size) -{ - int i, j; - - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - if (*block < -128) - *pixels = 0; - else if (*block > 127) - *pixels = 255; - else - *pixels = (uint8_t) (*block + 128); - block++; - pixels++; - } - pixels += (line_size - 8); - } -} - -static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels, - int line_size) -{ - int i; - - /* read the pixels */ - for (i = 0; i < 8; i++) { - pixels[0] = av_clip_uint8(pixels[0] + block[0]); - pixels[1] = av_clip_uint8(pixels[1] + block[1]); - pixels[2] = av_clip_uint8(pixels[2] + block[2]); - pixels[3] = av_clip_uint8(pixels[3] + block[3]); - pixels[4] = av_clip_uint8(pixels[4] + block[4]); - pixels[5] = av_clip_uint8(pixels[5] + block[5]); - pixels[6] = av_clip_uint8(pixels[6] + block[6]); - pixels[7] = av_clip_uint8(pixels[7] + block[7]); - pixels += line_size; - block += 8; - } -} - static int sum_abs_dctelem_c(int16_t *block) { int sum = 0, i; @@ -967,7 +850,7 @@ static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2, s->dct_unquantize_inter(s, temp, 0, s->qscale); } - s->dsp.idct_add(lsrc2, 8, temp); + 
s->idsp.idct_add(lsrc2, 8, temp); distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8); @@ -1138,18 +1021,6 @@ WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c) WRAPPER8_16_SQ(rd8x8_c, rd16_c) WRAPPER8_16_SQ(bit8x8_c, bit16_c) -static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block) -{ - ff_j_rev_dct(block); - put_pixels_clamped_c(block, dest, line_size); -} - -static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block) -{ - ff_j_rev_dct(block); - add_pixels_clamped_c(block, dest, line_size); -} - /* draw the edges of width 'w' of an image of size width, height */ // FIXME: Check that this is OK for MPEG-4 interlaced. static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height, @@ -1209,36 +1080,8 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) } #endif /* CONFIG_ENCODERS */ - if (avctx->bits_per_raw_sample == 10) { - c->idct_put = ff_simple_idct_put_10; - c->idct_add = ff_simple_idct_add_10; - c->idct = ff_simple_idct_10; - c->idct_permutation_type = FF_NO_IDCT_PERM; - } else { - if (avctx->idct_algo == FF_IDCT_INT) { - c->idct_put = jref_idct_put; - c->idct_add = jref_idct_add; - c->idct = ff_j_rev_dct; - c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM; - } else if (avctx->idct_algo == FF_IDCT_FAAN) { - c->idct_put = ff_faanidct_put; - c->idct_add = ff_faanidct_add; - c->idct = ff_faanidct; - c->idct_permutation_type = FF_NO_IDCT_PERM; - } else { // accurate/default - c->idct_put = ff_simple_idct_put_8; - c->idct_add = ff_simple_idct_add_8; - c->idct = ff_simple_idct_8; - c->idct_permutation_type = FF_NO_IDCT_PERM; - } - } - c->diff_pixels = diff_pixels_c; - c->put_pixels_clamped = put_pixels_clamped_c; - c->put_signed_pixels_clamped = put_signed_pixels_clamped_c; - c->add_pixels_clamped = add_pixels_clamped_c; - c->sum_abs_dctelem = sum_abs_dctelem_c; c->pix_sum = pix_sum_c; @@ -1309,7 +1152,4 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) ff_dsputil_init_ppc(c, avctx, 
high_bit_depth); if (ARCH_X86) ff_dsputil_init_x86(c, avctx, high_bit_depth); - - ff_init_scantable_permutation(c->idct_permutation, - c->idct_permutation_type); } diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index b271dccf82..dfbca5a9f7 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -44,22 +44,6 @@ typedef int (*me_cmp_func)(struct MpegEncContext *c, uint8_t *blk1 /* align width (8 or 16) */, uint8_t *blk2 /* align 1 */, int line_size, int h); -/** - * Scantable. - */ -typedef struct ScanTable { - const uint8_t *scantable; - uint8_t permutated[64]; - uint8_t raster_end[64]; -} ScanTable; - -void ff_init_scantable(uint8_t *permutation, ScanTable *st, - const uint8_t *src_scantable); -void ff_init_scantable_permutation(uint8_t *idct_permutation, - int idct_permutation_type); -int ff_init_scantable_permutation_x86(uint8_t *idct_permutation, - int idct_permutation_type); - /** * DSPContext. */ @@ -72,15 +56,6 @@ typedef struct DSPContext { const uint8_t *s1 /* align 8 */, const uint8_t *s2 /* align 8 */, int stride); - void (*put_pixels_clamped)(const int16_t *block /* align 16 */, - uint8_t *pixels /* align 8 */, - int line_size); - void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */, - uint8_t *pixels /* align 8 */, - int line_size); - void (*add_pixels_clamped)(const int16_t *block /* align 16 */, - uint8_t *pixels /* align 8 */, - int line_size); int (*sum_abs_dctelem)(int16_t *block /* align 16 */); int (*pix_sum)(uint8_t *pix, int line_size); @@ -112,47 +87,6 @@ typedef struct DSPContext { void (*fdct)(int16_t *block /* align 16 */); void (*fdct248)(int16_t *block /* align 16 */); - /* IDCT really */ - void (*idct)(int16_t *block /* align 16 */); - - /** - * block -> idct -> clip to unsigned 8 bit -> dest. - * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...) 
- * @param line_size size in bytes of a horizontal line of dest - */ - void (*idct_put)(uint8_t *dest /* align 8 */, - int line_size, int16_t *block /* align 16 */); - - /** - * block -> idct -> add dest -> clip to unsigned 8 bit -> dest. - * @param line_size size in bytes of a horizontal line of dest - */ - void (*idct_add)(uint8_t *dest /* align 8 */, - int line_size, int16_t *block /* align 16 */); - - /** - * IDCT input permutation. - * Several optimized IDCTs need a permutated input (relative to the - * normal order of the reference IDCT). - * This permutation must be performed before the idct_put/add. - * Note, normally this can be merged with the zigzag/alternate scan
- * An example to avoid confusion: - * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...) - * - (x -> reference DCT -> reference IDCT -> x) - * - (x -> reference DCT -> simple_mmx_perm = idct_permutation - * -> simple_idct_mmx -> x) - * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant - * -> simple_idct_mmx -> ...) - */ - uint8_t idct_permutation[64]; - int idct_permutation_type; -#define FF_NO_IDCT_PERM 1 -#define FF_LIBMPEG2_IDCT_PERM 2 -#define FF_SIMPLE_IDCT_PERM 3 -#define FF_TRANSPOSE_IDCT_PERM 4 -#define FF_PARTTRANS_IDCT_PERM 5 -#define FF_SSE2_IDCT_PERM 6 - int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale); void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale); diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c index 9a559dbd45..a03914d23e 100644 --- a/libavcodec/dvdec.c +++ b/libavcodec/dvdec.c @@ -39,6 +39,7 @@ #include "libavutil/imgutils.h" #include "libavutil/pixdesc.h" #include "avcodec.h" +#include "idctdsp.h" #include "internal.h" #include "get_bits.h" #include "put_bits.h" @@ -61,17 +62,17 @@ static const int dv_iweight_bits = 14; static av_cold int dvvideo_decode_init(AVCodecContext *avctx) { DVVideoContext *s = avctx->priv_data; - DSPContext dsp; + IDCTDSPContext idsp; int i; - ff_dsputil_init(&dsp, avctx); + ff_idctdsp_init(&idsp, avctx); for (i = 0; i < 64; i++) - s->dv_zigzag[0][i] = dsp.idct_permutation[ff_zigzag_direct[i]]; + s->dv_zigzag[0][i] = idsp.idct_permutation[ff_zigzag_direct[i]]; memcpy(s->dv_zigzag[1], ff_dv_zigzag248_direct, sizeof(s->dv_zigzag[1])); - s->idct_put[0] = dsp.idct_put; + s->idct_put[0] = idsp.idct_put; s->idct_put[1] = ff_simple_idct248_put; return ff_dvvideo_init(avctx); diff --git a/libavcodec/dxva2_mpeg2.c b/libavcodec/dxva2_mpeg2.c index 044e669332..f6ef5e52ab 100644 --- a/libavcodec/dxva2_mpeg2.c +++ b/libavcodec/dxva2_mpeg2.c @@ -110,7 +110,7 @@ static void fill_quantization_matrices(AVCodecContext *avctx, for (i = 
0; i < 4; i++) qm->bNewQmatrix[i] = 1; for (i = 0; i < 64; i++) { - int n = s->dsp.idct_permutation[ff_zigzag_direct[i]]; + int n = s->idsp.idct_permutation[ff_zigzag_direct[i]]; qm->Qmatrix[0][i] = s->intra_matrix[n];; qm->Qmatrix[1][i] = s->inter_matrix[n];; qm->Qmatrix[2][i] = s->chroma_intra_matrix[n];; diff --git a/libavcodec/eamad.c b/libavcodec/eamad.c index 8fe1575a2f..9edf344857 100644 --- a/libavcodec/eamad.c +++ b/libavcodec/eamad.c @@ -33,6 +33,7 @@ #include "get_bits.h" #include "aandcttab.h" #include "eaidct.h" +#include "idctdsp.h" #include "internal.h" #include "mpeg12.h" #include "mpeg12data.h" @@ -47,7 +48,7 @@ typedef struct MadContext { AVCodecContext *avctx; BlockDSPContext bdsp; BswapDSPContext bbdsp; - DSPContext dsp; + IDCTDSPContext idsp; AVFrame *last_frame; GetBitContext gb; void *bitstream_buf; @@ -66,9 +67,9 @@ static av_cold int decode_init(AVCodecContext *avctx) avctx->pix_fmt = AV_PIX_FMT_YUV420P; ff_blockdsp_init(&s->bdsp, avctx); ff_bswapdsp_init(&s->bbdsp); - ff_dsputil_init(&s->dsp, avctx); - ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM); - ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct); + ff_idctdsp_init(&s->idsp, avctx); + ff_init_scantable_permutation(s->idsp.idct_permutation, FF_NO_IDCT_PERM); + ff_init_scantable(s->idsp.idct_permutation, &s->scantable, ff_zigzag_direct); ff_mpeg12_init_vlcs(); s->last_frame = av_frame_alloc(); diff --git a/libavcodec/eatgq.c b/libavcodec/eatgq.c index 1ead5f7adc..d8320c9f1d 100644 --- a/libavcodec/eatgq.c +++ b/libavcodec/eatgq.c @@ -32,7 +32,7 @@ #define BITSTREAM_READER_LE #include "get_bits.h" #include "bytestream.h" -#include "dsputil.h" +#include "idctdsp.h" #include "aandcttab.h" #include "eaidct.h" #include "internal.h" diff --git a/libavcodec/eatqi.c b/libavcodec/eatqi.c index 36ec2e4ff3..60d80e9621 100644 --- a/libavcodec/eatqi.c +++ b/libavcodec/eatqi.c @@ -32,6 +32,7 @@ #include "get_bits.h" #include "aandcttab.h" #include 
"eaidct.h" +#include "idctdsp.h" #include "internal.h" #include "mpeg12.h" #include "mpegvideo.h" @@ -51,9 +52,9 @@ static av_cold int tqi_decode_init(AVCodecContext *avctx) s->avctx = avctx; ff_blockdsp_init(&s->bdsp, avctx); ff_bswapdsp_init(&t->bsdsp); - ff_dsputil_init(&s->dsp, avctx); - ff_init_scantable_permutation(s->dsp.idct_permutation, FF_NO_IDCT_PERM); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); + ff_idctdsp_init(&s->idsp, avctx); + ff_init_scantable_permutation(s->idsp.idct_permutation, FF_NO_IDCT_PERM); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); s->qscale = 1; avctx->time_base = (AVRational){1, 15}; avctx->pix_fmt = AV_PIX_FMT_YUV420P; diff --git a/libavcodec/g2meet.c b/libavcodec/g2meet.c index 9660155619..456045dbc2 100644 --- a/libavcodec/g2meet.c +++ b/libavcodec/g2meet.c @@ -31,7 +31,7 @@ #include "avcodec.h" #include "blockdsp.h" #include "bytestream.h" -#include "dsputil.h" +#include "idctdsp.h" #include "get_bits.h" #include "internal.h" #include "mjpeg.h" @@ -74,7 +74,7 @@ static const uint8_t chroma_quant[64] = { typedef struct JPGContext { BlockDSPContext bdsp; - DSPContext dsp; + IDCTDSPContext idsp; ScanTable scantable; VLC dc_vlc[2], ac_vlc[2]; @@ -153,8 +153,8 @@ static av_cold int jpg_init(AVCodecContext *avctx, JPGContext *c) return ret; ff_blockdsp_init(&c->bdsp, avctx); - ff_dsputil_init(&c->dsp, avctx); - ff_init_scantable(c->dsp.idct_permutation, &c->scantable, + ff_idctdsp_init(&c->idsp, avctx); + ff_init_scantable(c->idsp.idct_permutation, &c->scantable, ff_zigzag_direct); return 0; @@ -279,13 +279,13 @@ static int jpg_decode_data(JPGContext *c, int width, int height, if ((ret = jpg_decode_block(c, &gb, 0, c->block[i + j * 2])) != 0) return ret; - c->dsp.idct(c->block[i + j * 2]); + c->idsp.idct(c->block[i + j * 2]); } } for (i = 1; i < 3; i++) { if ((ret = jpg_decode_block(c, &gb, i, c->block[i + 3])) != 0) return ret; - c->dsp.idct(c->block[i + 
3]); + c->idsp.idct(c->block[i + 3]); } for (j = 0; j < 16; j++) { diff --git a/libavcodec/h263.c b/libavcodec/h263.c index 6d5ffc0b23..9019548a9e 100644 --- a/libavcodec/h263.c +++ b/libavcodec/h263.c @@ -267,7 +267,7 @@ void ff_h263_pred_acdc(MpegEncContext * s, int16_t *block, int n) if (a != 1024) { ac_val -= 16; for(i=1;i<8;i++) { - block[s->dsp.idct_permutation[i<<3]] += ac_val[i]; + block[s->idsp.idct_permutation[i << 3]] += ac_val[i]; } pred_dc = a; } @@ -276,7 +276,7 @@ void ff_h263_pred_acdc(MpegEncContext * s, int16_t *block, int n) if (c != 1024) { ac_val -= 16 * wrap; for(i=1;i<8;i++) { - block[s->dsp.idct_permutation[i ]] += ac_val[i + 8]; + block[s->idsp.idct_permutation[i]] += ac_val[i + 8]; } pred_dc = c; } @@ -304,10 +304,10 @@ void ff_h263_pred_acdc(MpegEncContext * s, int16_t *block, int n) /* left copy */ for(i=1;i<8;i++) - ac_val1[i ] = block[s->dsp.idct_permutation[i<<3]]; + ac_val1[i] = block[s->idsp.idct_permutation[i << 3]]; /* top copy */ for(i=1;i<8;i++) - ac_val1[8 + i] = block[s->dsp.idct_permutation[i ]]; + ac_val1[8 + i] = block[s->idsp.idct_permutation[i]]; } int16_t *ff_h263_pred_motion(MpegEncContext * s, int block, int dir, diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c new file mode 100644 index 0000000000..8542ab35aa --- /dev/null +++ b/libavcodec/idctdsp.c @@ -0,0 +1,197 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/common.h" +#include "avcodec.h" +#include "dct.h" +#include "faanidct.h" +#include "idctdsp.h" +#include "simple_idct.h" + +av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st, + const uint8_t *src_scantable) +{ + int i, end; + + st->scantable = src_scantable; + + for (i = 0; i < 64; i++) { + int j = src_scantable[i]; + st->permutated[i] = permutation[j]; + } + + end = -1; + for (i = 0; i < 64; i++) { + int j = st->permutated[i]; + if (j > end) + end = j; + st->raster_end[i] = end; + } +} + +av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation, + int idct_permutation_type) +{ + int i; + + if (ARCH_X86) + if (ff_init_scantable_permutation_x86(idct_permutation, + idct_permutation_type)) + return; + + switch (idct_permutation_type) { + case FF_NO_IDCT_PERM: + for (i = 0; i < 64; i++) + idct_permutation[i] = i; + break; + case FF_LIBMPEG2_IDCT_PERM: + for (i = 0; i < 64; i++) + idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); + break; + case FF_TRANSPOSE_IDCT_PERM: + for (i = 0; i < 64; i++) + idct_permutation[i] = ((i & 7) << 3) | (i >> 3); + break; + case FF_PARTTRANS_IDCT_PERM: + for (i = 0; i < 64; i++) + idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3); + break; + default: + av_log(NULL, AV_LOG_ERROR, + "Internal error, IDCT permutation not set\n"); + } +} + +static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels, + int line_size) +{ + int i; + + /* read the pixels */ + for (i = 0; i < 8; i++) { + pixels[0] = av_clip_uint8(block[0]); + pixels[1] = av_clip_uint8(block[1]); + pixels[2] = av_clip_uint8(block[2]); + pixels[3] = av_clip_uint8(block[3]); + pixels[4] = 
av_clip_uint8(block[4]); + pixels[5] = av_clip_uint8(block[5]); + pixels[6] = av_clip_uint8(block[6]); + pixels[7] = av_clip_uint8(block[7]); + + pixels += line_size; + block += 8; + } +} + +static void put_signed_pixels_clamped_c(const int16_t *block, + uint8_t *restrict pixels, + int line_size) +{ + int i, j; + + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + if (*block < -128) + *pixels = 0; + else if (*block > 127) + *pixels = 255; + else + *pixels = (uint8_t) (*block + 128); + block++; + pixels++; + } + pixels += (line_size - 8); + } +} + +static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels, + int line_size) +{ + int i; + + /* read the pixels */ + for (i = 0; i < 8; i++) { + pixels[0] = av_clip_uint8(pixels[0] + block[0]); + pixels[1] = av_clip_uint8(pixels[1] + block[1]); + pixels[2] = av_clip_uint8(pixels[2] + block[2]); + pixels[3] = av_clip_uint8(pixels[3] + block[3]); + pixels[4] = av_clip_uint8(pixels[4] + block[4]); + pixels[5] = av_clip_uint8(pixels[5] + block[5]); + pixels[6] = av_clip_uint8(pixels[6] + block[6]); + pixels[7] = av_clip_uint8(pixels[7] + block[7]); + pixels += line_size; + block += 8; + } +} + +static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block) +{ + ff_j_rev_dct(block); + put_pixels_clamped_c(block, dest, line_size); +} + +static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block) +{ + ff_j_rev_dct(block); + add_pixels_clamped_c(block, dest, line_size); +} + +av_cold void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx) +{ + const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8; + + if (avctx->bits_per_raw_sample == 10) { + c->idct_put = ff_simple_idct_put_10; + c->idct_add = ff_simple_idct_add_10; + c->idct = ff_simple_idct_10; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } else { + if (avctx->idct_algo == FF_IDCT_INT) { + c->idct_put = jref_idct_put; + c->idct_add = jref_idct_add; + c->idct = ff_j_rev_dct; + c->idct_permutation_type = 
FF_LIBMPEG2_IDCT_PERM; + } else if (avctx->idct_algo == FF_IDCT_FAAN) { + c->idct_put = ff_faanidct_put; + c->idct_add = ff_faanidct_add; + c->idct = ff_faanidct; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } else { // accurate/default + c->idct_put = ff_simple_idct_put_8; + c->idct_add = ff_simple_idct_add_8; + c->idct = ff_simple_idct_8; + c->idct_permutation_type = FF_NO_IDCT_PERM; + } + } + + c->put_pixels_clamped = put_pixels_clamped_c; + c->put_signed_pixels_clamped = put_signed_pixels_clamped_c; + c->add_pixels_clamped = add_pixels_clamped_c; + + if (ARCH_ARM) + ff_idctdsp_init_arm(c, avctx, high_bit_depth); + if (ARCH_PPC) + ff_idctdsp_init_ppc(c, avctx, high_bit_depth); + if (ARCH_X86) + ff_idctdsp_init_x86(c, avctx, high_bit_depth); + + ff_init_scantable_permutation(c->idct_permutation, + c->idct_permutation_type); +} diff --git a/libavcodec/idctdsp.h b/libavcodec/idctdsp.h new file mode 100644 index 0000000000..e3a2317679 --- /dev/null +++ b/libavcodec/idctdsp.h @@ -0,0 +1,104 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_IDCTDSP_H +#define AVCODEC_IDCTDSP_H + +#include <stdint.h> + +#include "avcodec.h" + +/** + * Scantable.
+ */ +typedef struct ScanTable { + const uint8_t *scantable; + uint8_t permutated[64]; + uint8_t raster_end[64]; +} ScanTable; + +void ff_init_scantable(uint8_t *permutation, ScanTable *st, + const uint8_t *src_scantable); +void ff_init_scantable_permutation(uint8_t *idct_permutation, + int idct_permutation_type); +int ff_init_scantable_permutation_x86(uint8_t *idct_permutation, + int idct_permutation_type); + +typedef struct IDCTDSPContext { + /* pixel ops : interface with DCT */ + void (*put_pixels_clamped)(const int16_t *block /* align 16 */, + uint8_t *pixels /* align 8 */, + int line_size); + void (*put_signed_pixels_clamped)(const int16_t *block /* align 16 */, + uint8_t *pixels /* align 8 */, + int line_size); + void (*add_pixels_clamped)(const int16_t *block /* align 16 */, + uint8_t *pixels /* align 8 */, + int line_size); + + void (*idct)(int16_t *block /* align 16 */); + + /** + * block -> idct -> clip to unsigned 8 bit -> dest. + * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...) + * @param line_size size in bytes of a horizontal line of dest + */ + void (*idct_put)(uint8_t *dest /* align 8 */, + int line_size, int16_t *block /* align 16 */); + + /** + * block -> idct -> add dest -> clip to unsigned 8 bit -> dest. + * @param line_size size in bytes of a horizontal line of dest + */ + void (*idct_add)(uint8_t *dest /* align 8 */, + int line_size, int16_t *block /* align 16 */); + + /** + * IDCT input permutation. + * Several optimized IDCTs need a permutated input (relative to the + * normal order of the reference IDCT). + * This permutation must be performed before the idct_put/add. + * Note, normally this can be merged with the zigzag/alternate scan
+ * An example to avoid confusion: + * - (->decode coeffs -> zigzag reorder -> dequant -> reference IDCT -> ...) + * - (x -> reference DCT -> reference IDCT -> x) + * - (x -> reference DCT -> simple_mmx_perm = idct_permutation + * -> simple_idct_mmx -> x) + * - (-> decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant + * -> simple_idct_mmx -> ...) + */ + uint8_t idct_permutation[64]; + int idct_permutation_type; +#define FF_NO_IDCT_PERM 1 +#define FF_LIBMPEG2_IDCT_PERM 2 +#define FF_SIMPLE_IDCT_PERM 3 +#define FF_TRANSPOSE_IDCT_PERM 4 +#define FF_PARTTRANS_IDCT_PERM 5 +#define FF_SSE2_IDCT_PERM 6 +} IDCTDSPContext; + +void ff_idctdsp_init(IDCTDSPContext *c, AVCodecContext *avctx); + +void ff_idctdsp_init_arm(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); +void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth); + +#endif /* AVCODEC_IDCTDSP_H */ diff --git a/libavcodec/intrax8.c b/libavcodec/intrax8.c index 2bda7233f7..d37eb793cd 100644 --- a/libavcodec/intrax8.c +++ b/libavcodec/intrax8.c @@ -24,6 +24,7 @@ #include "avcodec.h" #include "error_resilience.h" #include "get_bits.h" +#include "idctdsp.h" #include "mpegvideo.h" #include "msmpeg4data.h" #include "intrax8huf.h" @@ -440,7 +441,7 @@ lut2[q>12][c]={ static void x8_ac_compensation(IntraX8Context * const w, int const direction, int const dc_level){ MpegEncContext * const s= w->s; int t; -#define B(x,y) s->block[0][s->dsp.idct_permutation[(x)+(y)*8]] +#define B(x, y) s->block[0][s->idsp.idct_permutation[(x) + (y) * 8]] #define T(x) ((x) * dc_level + 0x8000) >> 16; switch(direction){ case 0: @@ -646,9 +647,9 @@ static int x8_decode_intra_mb(IntraX8Context* const w, const int chroma){ s->current_picture.f->linesize[!!chroma] ); } if(!zeros_only) - s->dsp.idct_add ( s->dest[chroma], - s->current_picture.f->linesize[!!chroma], - s->block[0] ); + 
s->idsp.idct_add(s->dest[chroma], + s->current_picture.f->linesize[!!chroma], + s->block[0]); block_placed: @@ -698,9 +699,9 @@ av_cold void ff_intrax8_common_init(IntraX8Context * w, MpegEncContext * const s assert(s->mb_width>0); w->prediction_table=av_mallocz(s->mb_width*2*2);//two rows, 2 blocks per cannon mb - ff_init_scantable(s->dsp.idct_permutation, &w->scantable[0], ff_wmv1_scantable[0]); - ff_init_scantable(s->dsp.idct_permutation, &w->scantable[1], ff_wmv1_scantable[2]); - ff_init_scantable(s->dsp.idct_permutation, &w->scantable[2], ff_wmv1_scantable[3]); + ff_init_scantable(s->idsp.idct_permutation, &w->scantable[0], ff_wmv1_scantable[0]); + ff_init_scantable(s->idsp.idct_permutation, &w->scantable[1], ff_wmv1_scantable[2]); + ff_init_scantable(s->idsp.idct_permutation, &w->scantable[2], ff_wmv1_scantable[3]); ff_intrax8dsp_init(&w->dsp); } diff --git a/libavcodec/ljpegenc.c b/libavcodec/ljpegenc.c index 9f06818667..fbb024b9a6 100644 --- a/libavcodec/ljpegenc.c +++ b/libavcodec/ljpegenc.c @@ -35,7 +35,7 @@ #include "libavutil/pixdesc.h" #include "avcodec.h" -#include "dsputil.h" +#include "idctdsp.h" #include "internal.h" #include "mjpegenc_common.h" #include "mpegvideo.h" @@ -43,7 +43,7 @@ #include "mjpegenc.h" typedef struct LJpegEncContext { - DSPContext dsp; + IDCTDSPContext idsp; ScanTable scantable; uint16_t matrix[64]; @@ -285,8 +285,9 @@ static av_cold int ljpeg_encode_init(AVCodecContext *avctx) s->scratch = av_malloc_array(avctx->width + 1, sizeof(*s->scratch)); - ff_dsputil_init(&s->dsp, avctx); - ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct); + ff_idctdsp_init(&s->idsp, avctx); + ff_init_scantable(s->idsp.idct_permutation, &s->scantable, + ff_zigzag_direct); av_pix_fmt_get_chroma_sub_sample(avctx->pix_fmt, &chroma_h_shift, &chroma_v_shift); diff --git a/libavcodec/mdec.c b/libavcodec/mdec.c index b4213972b7..6b70e37e76 100644 --- a/libavcodec/mdec.c +++ b/libavcodec/mdec.c @@ -29,6 +29,7 @@ #include "avcodec.h" 
#include "blockdsp.h" +#include "idctdsp.h" #include "mpegvideo.h" #include "mpeg12.h" #include "thread.h" @@ -36,7 +37,7 @@ typedef struct MDECContext { AVCodecContext *avctx; BlockDSPContext bdsp; - DSPContext dsp; + IDCTDSPContext idsp; ThreadFrame frame; GetBitContext gb; ScanTable scantable; @@ -146,14 +147,14 @@ static inline void idct_put(MDECContext *a, AVFrame *frame, int mb_x, int mb_y) uint8_t *dest_cb = frame->data[1] + (mb_y * 8 * frame->linesize[1]) + mb_x * 8; uint8_t *dest_cr = frame->data[2] + (mb_y * 8 * frame->linesize[2]) + mb_x * 8; - a->dsp.idct_put(dest_y, linesize, block[0]); - a->dsp.idct_put(dest_y + 8, linesize, block[1]); - a->dsp.idct_put(dest_y + 8 * linesize, linesize, block[2]); - a->dsp.idct_put(dest_y + 8 * linesize + 8, linesize, block[3]); + a->idsp.idct_put(dest_y, linesize, block[0]); + a->idsp.idct_put(dest_y + 8, linesize, block[1]); + a->idsp.idct_put(dest_y + 8 * linesize, linesize, block[2]); + a->idsp.idct_put(dest_y + 8 * linesize + 8, linesize, block[3]); if (!(a->avctx->flags & CODEC_FLAG_GRAY)) { - a->dsp.idct_put(dest_cb, frame->linesize[1], block[4]); - a->dsp.idct_put(dest_cr, frame->linesize[2], block[5]); + a->idsp.idct_put(dest_cb, frame->linesize[1], block[4]); + a->idsp.idct_put(dest_cr, frame->linesize[2], block[5]); } } @@ -215,9 +216,10 @@ static av_cold int decode_init(AVCodecContext *avctx) a->avctx = avctx; ff_blockdsp_init(&a->bdsp, avctx); - ff_dsputil_init(&a->dsp, avctx); + ff_idctdsp_init(&a->idsp, avctx); ff_mpeg12_init_vlcs(); - ff_init_scantable(a->dsp.idct_permutation, &a->scantable, ff_zigzag_direct); + ff_init_scantable(a->idsp.idct_permutation, &a->scantable, + ff_zigzag_direct); if (avctx->idct_algo == FF_IDCT_AUTO) avctx->idct_algo = FF_IDCT_SIMPLE; diff --git a/libavcodec/mimic.c b/libavcodec/mimic.c index 4d21b5165e..88ee5d380c 100644 --- a/libavcodec/mimic.c +++ b/libavcodec/mimic.c @@ -29,8 +29,8 @@ #include "get_bits.h" #include "bytestream.h" #include "bswapdsp.h" -#include 
"dsputil.h" #include "hpeldsp.h" +#include "idctdsp.h" #include "thread.h" #define MIMIC_HEADER_SIZE 20 @@ -56,8 +56,8 @@ typedef struct { ScanTable scantable; BlockDSPContext bdsp; BswapDSPContext bbdsp; - DSPContext dsp; HpelDSPContext hdsp; + IDCTDSPContext idsp; VLC vlc; /* Kept in the context so multithreading can have a constant to read from */ @@ -151,9 +151,9 @@ static av_cold int mimic_decode_init(AVCodecContext *avctx) } ff_blockdsp_init(&ctx->bdsp, avctx); ff_bswapdsp_init(&ctx->bbdsp); - ff_dsputil_init(&ctx->dsp, avctx); ff_hpeldsp_init(&ctx->hdsp, avctx->flags); - ff_init_scantable(ctx->dsp.idct_permutation, &ctx->scantable, col_zag); + ff_idctdsp_init(&ctx->idsp, avctx); + ff_init_scantable(ctx->idsp.idct_permutation, &ctx->scantable, col_zag); for (i = 0; i < FF_ARRAY_ELEMS(ctx->frames); i++) { ctx->frames[i].f = av_frame_alloc(); @@ -302,7 +302,7 @@ static int decode(MimicContext *ctx, int quality, int num_coeffs, "block.\n"); return ret; } - ctx->dsp.idct_put(dst, stride, ctx->dct_block); + ctx->idsp.idct_put(dst, stride, ctx->dct_block); } else { unsigned int backref = get_bits(&ctx->gb, 4); int index = (ctx->cur_index + backref) & 15; diff --git a/libavcodec/mjpegdec.c b/libavcodec/mjpegdec.c index f674539507..d9a73d8426 100644 --- a/libavcodec/mjpegdec.c +++ b/libavcodec/mjpegdec.c @@ -36,6 +36,7 @@ #include "libavutil/opt.h" #include "avcodec.h" #include "blockdsp.h" +#include "idctdsp.h" #include "internal.h" #include "mjpeg.h" #include "mjpegdec.h" @@ -95,8 +96,9 @@ av_cold int ff_mjpeg_decode_init(AVCodecContext *avctx) s->avctx = avctx; ff_blockdsp_init(&s->bdsp, avctx); ff_hpeldsp_init(&s->hdsp, avctx->flags); - ff_dsputil_init(&s->dsp, avctx); - ff_init_scantable(s->dsp.idct_permutation, &s->scantable, ff_zigzag_direct); + ff_idctdsp_init(&s->idsp, avctx); + ff_init_scantable(s->idsp.idct_permutation, &s->scantable, + ff_zigzag_direct); s->buffer_size = 0; s->buffer = NULL; s->start_code = -1; @@ -889,7 +891,7 @@ static int 
mjpeg_decode_scan(MJpegDecodeContext *s, int nb_components, int Ah, "error y=%d x=%d\n", mb_y, mb_x); return AVERROR_INVALIDDATA; } - s->dsp.idct_put(ptr, linesize[c], s->block); + s->idsp.idct_put(ptr, linesize[c], s->block); } } else { int block_idx = s->block_stride[c] * (v * mb_y + y) + @@ -1002,7 +1004,7 @@ static int mjpeg_decode_scan_progressive_ac(MJpegDecodeContext *s, int ss, reference_data + block_offset, linesize, 8); } else { - s->dsp.idct_put(ptr, linesize, *block); + s->idsp.idct_put(ptr, linesize, *block); ptr += 8; } } diff --git a/libavcodec/mjpegdec.h b/libavcodec/mjpegdec.h index 0d1dd9ee03..aa4703a24d 100644 --- a/libavcodec/mjpegdec.h +++ b/libavcodec/mjpegdec.h @@ -35,8 +35,8 @@ #include "avcodec.h" #include "blockdsp.h" #include "get_bits.h" -#include "dsputil.h" #include "hpeldsp.h" +#include "idctdsp.h" #define MAX_COMPONENTS 4 @@ -97,8 +97,8 @@ typedef struct MJpegDecodeContext { uint64_t coefs_finished[MAX_COMPONENTS]; ///< bitmask of which coefs have been completely decoded (progressive mode) ScanTable scantable; BlockDSPContext bdsp; - DSPContext dsp; HpelDSPContext hdsp; + IDCTDSPContext idsp; int restart_interval; int restart_count; diff --git a/libavcodec/mjpegenc_common.c b/libavcodec/mjpegenc_common.c index 3dba414f1d..adb335e5e1 100644 --- a/libavcodec/mjpegenc_common.c +++ b/libavcodec/mjpegenc_common.c @@ -26,7 +26,7 @@ #include "libavutil/pixfmt.h" #include "avcodec.h" -#include "dsputil.h" +#include "idctdsp.h" #include "put_bits.h" #include "mjpegenc_common.h" #include "mjpeg.h" diff --git a/libavcodec/mjpegenc_common.h b/libavcodec/mjpegenc_common.h index 57dc9ddb5b..b48911e364 100644 --- a/libavcodec/mjpegenc_common.h +++ b/libavcodec/mjpegenc_common.h @@ -24,7 +24,7 @@ #include #include "avcodec.h" -#include "dsputil.h" +#include "idctdsp.h" #include "put_bits.h" void ff_mjpeg_encode_picture_header(AVCodecContext *avctx, PutBitContext *pb, diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c index 
0bf3c20c37..aa98454d05 100644 --- a/libavcodec/mpeg12dec.c +++ b/libavcodec/mpeg12dec.c @@ -33,8 +33,8 @@ #include "avcodec.h" #include "bytestream.h" -#include "dsputil.h" #include "error_resilience.h" +#include "idctdsp.h" #include "internal.h" #include "mpeg_er.h" #include "mpeg12.h" @@ -1100,7 +1100,7 @@ static av_cold int mpeg_decode_init(AVCodecContext *avctx) /* we need some permutation to store matrices, * until MPV_common_init() sets the real permutation. */ for (i = 0; i < 64; i++) - s2->dsp.idct_permutation[i] = i; + s2->idsp.idct_permutation[i] = i; ff_MPV_decode_defaults(s2); @@ -1309,15 +1309,15 @@ static int mpeg_decode_postinit(AVCodecContext *avctx) /* Quantization matrices may need reordering * if DCT permutation is changed. */ - memcpy(old_permutation, s->dsp.idct_permutation, 64 * sizeof(uint8_t)); + memcpy(old_permutation, s->idsp.idct_permutation, 64 * sizeof(uint8_t)); if (ff_MPV_common_init(s) < 0) return -2; - quant_matrix_rebuild(s->intra_matrix, old_permutation, s->dsp.idct_permutation); - quant_matrix_rebuild(s->inter_matrix, old_permutation, s->dsp.idct_permutation); - quant_matrix_rebuild(s->chroma_intra_matrix, old_permutation, s->dsp.idct_permutation); - quant_matrix_rebuild(s->chroma_inter_matrix, old_permutation, s->dsp.idct_permutation); + quant_matrix_rebuild(s->intra_matrix, old_permutation, s->idsp.idct_permutation); + quant_matrix_rebuild(s->inter_matrix, old_permutation, s->idsp.idct_permutation); + quant_matrix_rebuild(s->chroma_intra_matrix, old_permutation, s->idsp.idct_permutation); + quant_matrix_rebuild(s->chroma_inter_matrix, old_permutation, s->idsp.idct_permutation); s1->mpeg_enc_ctx_allocated = 1; } @@ -1469,7 +1469,7 @@ static int load_matrix(MpegEncContext *s, uint16_t matrix0[64], int i; for (i = 0; i < 64; i++) { - int j = s->dsp.idct_permutation[ff_zigzag_direct[i]]; + int j = s->idsp.idct_permutation[ff_zigzag_direct[i]]; int v = get_bits(&s->gb, 8); if (v == 0) { av_log(s->avctx, AV_LOG_ERROR, "matrix 
damaged\n"); @@ -1561,11 +1561,11 @@ static void mpeg_decode_picture_coding_extension(Mpeg1Context *s1) } if (s->alternate_scan) { - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan); } else { - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); } /* composite display not parsed */ @@ -2070,7 +2070,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, load_matrix(s, s->chroma_intra_matrix, s->intra_matrix, 1); } else { for (i = 0; i < 64; i++) { - j = s->dsp.idct_permutation[i]; + j = s->idsp.idct_permutation[i]; v = ff_mpeg1_default_intra_matrix[i]; s->intra_matrix[j] = v; s->chroma_intra_matrix[j] = v; @@ -2080,7 +2080,7 @@ static int mpeg1_decode_sequence(AVCodecContext *avctx, load_matrix(s, s->chroma_inter_matrix, s->inter_matrix, 0); } else { for (i = 0; i < 64; i++) { - int j = s->dsp.idct_permutation[i]; + int j = s->idsp.idct_permutation[i]; v = ff_mpeg1_default_non_intra_matrix[i]; s->inter_matrix[j] = v; s->chroma_inter_matrix[j] = v; @@ -2142,7 +2142,7 @@ static int vcr2_init_sequence(AVCodecContext *avctx) s1->mpeg_enc_ctx_allocated = 1; for (i = 0; i < 64; i++) { - int j = s->dsp.idct_permutation[i]; + int j = s->idsp.idct_permutation[i]; v = ff_mpeg1_default_intra_matrix[i]; s->intra_matrix[j] = v; s->chroma_intra_matrix[j] = v; diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c index 0e3e5803b1..a4e7725753 100644 --- 
a/libavcodec/mpeg4videodec.c +++ b/libavcodec/mpeg4videodec.c @@ -21,6 +21,7 @@ */ #include "error_resilience.h" +#include "idctdsp.h" #include "internal.h" #include "mpegutils.h" #include "mpegvideo.h" @@ -71,11 +72,11 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir) n == 1 || n == 3) { /* same qscale */ for (i = 1; i < 8; i++) - block[s->dsp.idct_permutation[i << 3]] += ac_val[i]; + block[s->idsp.idct_permutation[i << 3]] += ac_val[i]; } else { /* different qscale, we must rescale */ for (i = 1; i < 8; i++) - block[s->dsp.idct_permutation[i << 3]] += ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale); + block[s->idsp.idct_permutation[i << 3]] += ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale); } } else { const int xy = s->mb_x + s->mb_y * s->mb_stride - s->mb_stride; @@ -86,21 +87,21 @@ void ff_mpeg4_pred_ac(MpegEncContext *s, int16_t *block, int n, int dir) n == 2 || n == 3) { /* same qscale */ for (i = 1; i < 8; i++) - block[s->dsp.idct_permutation[i]] += ac_val[i + 8]; + block[s->idsp.idct_permutation[i]] += ac_val[i + 8]; } else { /* different qscale, we must rescale */ for (i = 1; i < 8; i++) - block[s->dsp.idct_permutation[i]] += ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale); + block[s->idsp.idct_permutation[i]] += ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale); } } } /* left copy */ for (i = 1; i < 8; i++) - ac_val1[i] = block[s->dsp.idct_permutation[i << 3]]; + ac_val1[i] = block[s->idsp.idct_permutation[i << 3]]; /* top copy */ for (i = 1; i < 8; i++) - ac_val1[8 + i] = block[s->dsp.idct_permutation[i]]; + ac_val1[8 + i] = block[s->idsp.idct_permutation[i]]; } /** @@ -1815,7 +1816,7 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb) /* load default matrixes */ for (i = 0; i < 64; i++) { - int j = s->dsp.idct_permutation[i]; + int j = s->idsp.idct_permutation[i]; v = ff_mpeg4_default_intra_matrix[i]; s->intra_matrix[j] = v; s->chroma_intra_matrix[j] = v; @@ -1835,14 +1836,14 @@ 
static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb) break; last = v; - j = s->dsp.idct_permutation[ff_zigzag_direct[i]]; + j = s->idsp.idct_permutation[ff_zigzag_direct[i]]; s->intra_matrix[j] = last; s->chroma_intra_matrix[j] = last; } /* replicate last value */ for (; i < 64; i++) { - int j = s->dsp.idct_permutation[ff_zigzag_direct[i]]; + int j = s->idsp.idct_permutation[ff_zigzag_direct[i]]; s->intra_matrix[j] = last; s->chroma_intra_matrix[j] = last; } @@ -1858,14 +1859,14 @@ static int decode_vol_header(Mpeg4DecContext *ctx, GetBitContext *gb) break; last = v; - j = s->dsp.idct_permutation[ff_zigzag_direct[i]]; + j = s->idsp.idct_permutation[ff_zigzag_direct[i]]; s->inter_matrix[j] = v; s->chroma_inter_matrix[j] = v; } /* replicate last value */ for (; i < 64; i++) { - int j = s->dsp.idct_permutation[ff_zigzag_direct[i]]; + int j = s->idsp.idct_permutation[ff_zigzag_direct[i]]; s->inter_matrix[j] = last; s->chroma_inter_matrix[j] = last; } @@ -2219,15 +2220,15 @@ static int decode_vop_header(Mpeg4DecContext *ctx, GetBitContext *gb) } if (s->alternate_scan) { - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_alternate_vertical_scan); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan); } else { - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct); - 
ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan); } if (s->pict_type == AV_PICTURE_TYPE_S && diff --git a/libavcodec/mpeg4videoenc.c b/libavcodec/mpeg4videoenc.c index b95752fe49..f120932443 100644 --- a/libavcodec/mpeg4videoenc.c +++ b/libavcodec/mpeg4videoenc.c @@ -110,11 +110,11 @@ static inline void restore_ac_coeffs(MpegEncContext *s, int16_t block[6][64], if (dir[n]) { /* top prediction */ for (i = 1; i < 8; i++) - block[n][s->dsp.idct_permutation[i]] = ac_val[i + 8]; + block[n][s->idsp.idct_permutation[i]] = ac_val[i + 8]; } else { /* left prediction */ for (i = 1; i < 8; i++) - block[n][s->dsp.idct_permutation[i << 3]] = ac_val[i]; + block[n][s->idsp.idct_permutation[i << 3]] = ac_val[i]; } } } @@ -152,17 +152,17 @@ static inline int decide_ac_pred(MpegEncContext *s, int16_t block[6][64], if (s->mb_y == 0 || s->qscale == qscale_table[xy] || n == 2 || n == 3) { /* same qscale */ for (i = 1; i < 8; i++) { - const int level = block[n][s->dsp.idct_permutation[i]]; - block[n][s->dsp.idct_permutation[i]] = level - ac_val[i + 8]; - ac_val1[i] = block[n][s->dsp.idct_permutation[i << 3]]; + const int level = block[n][s->idsp.idct_permutation[i]]; + block[n][s->idsp.idct_permutation[i]] = level - ac_val[i + 8]; + ac_val1[i] = block[n][s->idsp.idct_permutation[i << 3]]; ac_val1[i + 8] = level; } } else { /* different qscale, we must rescale */ for (i = 1; i < 8; i++) { - const int 
level = block[n][s->dsp.idct_permutation[i]]; - block[n][s->dsp.idct_permutation[i]] = level - ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale); - ac_val1[i] = block[n][s->dsp.idct_permutation[i << 3]]; + const int level = block[n][s->idsp.idct_permutation[i]]; + block[n][s->idsp.idct_permutation[i]] = level - ROUNDED_DIV(ac_val[i + 8] * qscale_table[xy], s->qscale); + ac_val1[i] = block[n][s->idsp.idct_permutation[i << 3]]; ac_val1[i + 8] = level; } } @@ -174,18 +174,18 @@ static inline int decide_ac_pred(MpegEncContext *s, int16_t block[6][64], if (s->mb_x == 0 || s->qscale == qscale_table[xy] || n == 1 || n == 3) { /* same qscale */ for (i = 1; i < 8; i++) { - const int level = block[n][s->dsp.idct_permutation[i << 3]]; - block[n][s->dsp.idct_permutation[i << 3]] = level - ac_val[i]; + const int level = block[n][s->idsp.idct_permutation[i << 3]]; + block[n][s->idsp.idct_permutation[i << 3]] = level - ac_val[i]; ac_val1[i] = level; - ac_val1[i + 8] = block[n][s->dsp.idct_permutation[i]]; + ac_val1[i + 8] = block[n][s->idsp.idct_permutation[i]]; } } else { /* different qscale, we must rescale */ for (i = 1; i < 8; i++) { - const int level = block[n][s->dsp.idct_permutation[i << 3]]; - block[n][s->dsp.idct_permutation[i << 3]] = level - ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale); + const int level = block[n][s->idsp.idct_permutation[i << 3]]; + block[n][s->idsp.idct_permutation[i << 3]] = level - ROUNDED_DIV(ac_val[i] * qscale_table[xy], s->qscale); ac_val1[i] = level; - ac_val1[i + 8] = block[n][s->dsp.idct_permutation[i]]; + ac_val1[i + 8] = block[n][s->idsp.idct_permutation[i]]; } } st[n] = s->intra_v_scantable.permutated; diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c index fb63d6afda..a4a37d4931 100644 --- a/libavcodec/mpegvideo.c +++ b/libavcodec/mpegvideo.c @@ -34,7 +34,7 @@ #include "libavutil/timer.h" #include "avcodec.h" #include "blockdsp.h" -#include "dsputil.h" +#include "idctdsp.h" #include "internal.h" #include 
"mathops.h" #include "mpegutils.h" @@ -380,6 +380,7 @@ av_cold int ff_dct_common_init(MpegEncContext *s) ff_blockdsp_init(&s->bdsp, s->avctx); ff_dsputil_init(&s->dsp, s->avctx); ff_hpeldsp_init(&s->hdsp, s->avctx->flags); + ff_idctdsp_init(&s->idsp, s->avctx); ff_mpegvideodsp_init(&s->mdsp); ff_videodsp_init(&s->vdsp, s->avctx->bits_per_raw_sample); @@ -403,14 +404,14 @@ av_cold int ff_dct_common_init(MpegEncContext *s) * note: only wmv uses different ones */ if (s->alternate_scan) { - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable , ff_alternate_vertical_scan); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable , ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_alternate_vertical_scan); } else { - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable , ff_zigzag_direct); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable , ff_zigzag_direct); + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct); } - ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_alternate_horizontal_scan); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_alternate_vertical_scan); return 0; } @@ -2041,7 +2042,7 @@ static inline void put_dct(MpegEncContext *s, int16_t *block, int i, uint8_t *dest, int line_size, int qscale) { s->dct_unquantize_intra(s, block, i, qscale); - s->dsp.idct_put (dest, line_size, block); + s->idsp.idct_put(dest, line_size, block); } /* add block[] to dest[] */ @@ -2049,7 +2050,7 @@ static inline void add_dct(MpegEncContext 
*s, int16_t *block, int i, uint8_t *dest, int line_size) { if (s->block_last_index[i] >= 0) { - s->dsp.idct_add (dest, line_size, block); + s->idsp.idct_add(dest, line_size, block); } } @@ -2059,7 +2060,7 @@ static inline void add_dequant_dct(MpegEncContext *s, if (s->block_last_index[i] >= 0) { s->dct_unquantize_inter(s, block, i, qscale); - s->dsp.idct_add (dest, line_size, block); + s->idsp.idct_add(dest, line_size, block); } } @@ -2127,7 +2128,8 @@ FF_ENABLE_DEPRECATION_WARNINGS av_log(s->avctx, AV_LOG_DEBUG, "DCT coeffs of MB at %dx%d:\n", s->mb_x, s->mb_y); for(i=0; i<6; i++){ for(j=0; j<64; j++){ - av_log(s->avctx, AV_LOG_DEBUG, "%5d", block[i][s->dsp.idct_permutation[j]]); + av_log(s->avctx, AV_LOG_DEBUG, "%5d", + block[i][s->idsp.idct_permutation[j]]); } av_log(s->avctx, AV_LOG_DEBUG, "\n"); } @@ -2304,29 +2306,29 @@ FF_ENABLE_DEPRECATION_WARNINGS } } }else{ - s->dsp.idct_put(dest_y , dct_linesize, block[0]); - s->dsp.idct_put(dest_y + block_size, dct_linesize, block[1]); - s->dsp.idct_put(dest_y + dct_offset , dct_linesize, block[2]); - s->dsp.idct_put(dest_y + dct_offset + block_size, dct_linesize, block[3]); + s->idsp.idct_put(dest_y, dct_linesize, block[0]); + s->idsp.idct_put(dest_y + block_size, dct_linesize, block[1]); + s->idsp.idct_put(dest_y + dct_offset, dct_linesize, block[2]); + s->idsp.idct_put(dest_y + dct_offset + block_size, dct_linesize, block[3]); if(!CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ if(s->chroma_y_shift){ - s->dsp.idct_put(dest_cb, uvlinesize, block[4]); - s->dsp.idct_put(dest_cr, uvlinesize, block[5]); + s->idsp.idct_put(dest_cb, uvlinesize, block[4]); + s->idsp.idct_put(dest_cr, uvlinesize, block[5]); }else{ dct_linesize = uvlinesize << s->interlaced_dct; dct_offset = s->interlaced_dct ? 
uvlinesize : uvlinesize * 8; - s->dsp.idct_put(dest_cb, dct_linesize, block[4]); - s->dsp.idct_put(dest_cr, dct_linesize, block[5]); - s->dsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]); - s->dsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]); + s->idsp.idct_put(dest_cb, dct_linesize, block[4]); + s->idsp.idct_put(dest_cr, dct_linesize, block[5]); + s->idsp.idct_put(dest_cb + dct_offset, dct_linesize, block[6]); + s->idsp.idct_put(dest_cr + dct_offset, dct_linesize, block[7]); if(!s->chroma_x_shift){//Chroma444 - s->dsp.idct_put(dest_cb + 8, dct_linesize, block[8]); - s->dsp.idct_put(dest_cr + 8, dct_linesize, block[9]); - s->dsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]); - s->dsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]); + s->idsp.idct_put(dest_cb + 8, dct_linesize, block[8]); + s->idsp.idct_put(dest_cr + 8, dct_linesize, block[9]); + s->idsp.idct_put(dest_cb + 8 + dct_offset, dct_linesize, block[10]); + s->idsp.idct_put(dest_cr + 8 + dct_offset, dct_linesize, block[11]); } } }//gray diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h index 191dac0d3a..27c72dafe3 100644 --- a/libavcodec/mpegvideo.h +++ b/libavcodec/mpegvideo.h @@ -35,6 +35,7 @@ #include "get_bits.h" #include "h263dsp.h" #include "hpeldsp.h" +#include "idctdsp.h" #include "mpegvideodsp.h" #include "put_bits.h" #include "ratecontrol.h" @@ -352,6 +353,7 @@ typedef struct MpegEncContext { BlockDSPContext bdsp; DSPContext dsp; ///< pointers for accelerated dsp functions HpelDSPContext hdsp; + IDCTDSPContext idsp; MpegVideoDSPContext mdsp; QpelDSPContext qdsp; VideoDSPContext vdsp; diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index 3baf37a5be..65e2a8c8c7 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -38,6 +38,7 @@ #include "avcodec.h" #include "dct.h" #include "dsputil.h" +#include "idctdsp.h" #include "mpeg12.h" #include "mpegvideo.h" #include "h261.h" @@ -86,7 +87,7 @@ void 
ff_convert_matrix(MpegEncContext *s, int (*qmat)[64], dsp->fdct == ff_jpeg_fdct_islow_10 || dsp->fdct == ff_faandct) { for (i = 0; i < 64; i++) { - const int j = dsp->idct_permutation[i]; + const int j = s->idsp.idct_permutation[i]; /* 16 <= qscale * quant_matrix[i] <= 7905 * Assume x = ff_aanscales[i] * qscale * quant_matrix[i] * 19952 <= x <= 249205026 @@ -98,7 +99,7 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64], } } else if (dsp->fdct == ff_fdct_ifast) { for (i = 0; i < 64; i++) { - const int j = dsp->idct_permutation[i]; + const int j = s->idsp.idct_permutation[i]; /* 16 <= qscale * quant_matrix[i] <= 7905 * Assume x = ff_aanscales[i] * qscale * quant_matrix[i] * 19952 <= x <= 249205026 @@ -111,7 +112,7 @@ void ff_convert_matrix(MpegEncContext *s, int (*qmat)[64], } } else { for (i = 0; i < 64; i++) { - const int j = dsp->idct_permutation[i]; + const int j = s->idsp.idct_permutation[i]; /* We can safely suppose that 16 <= quant_matrix[i] <= 255 * Assume x = qscale * quant_matrix[i] * So 16 <= x <= 7905 @@ -755,7 +756,7 @@ av_cold int ff_MPV_encode_init(AVCodecContext *avctx) /* init q matrix */ for (i = 0; i < 64; i++) { - int j = s->dsp.idct_permutation[i]; + int j = s->idsp.idct_permutation[i]; if (CONFIG_MPEG4_ENCODER && s->codec_id == AV_CODEC_ID_MPEG4 && s->mpeg_quant) { s->intra_matrix[j] = ff_mpeg4_default_intra_matrix[i]; @@ -3360,7 +3361,7 @@ static int encode_picture(MpegEncContext *s, int picture_number) if (s->out_format == FMT_MJPEG) { /* for mjpeg, we do include qscale in the matrix */ for(i=1;i<64;i++){ - int j= s->dsp.idct_permutation[i]; + int j = s->idsp.idct_permutation[i]; s->intra_matrix[j] = av_clip_uint8((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3); } @@ -3589,7 +3590,7 @@ static int dct_quantize_trellis_c(MpegEncContext *s, if(s->out_format == FMT_H263){ unquant_coeff= alevel*qmul + qadd; }else{ //MPEG1 - j= s->dsp.idct_permutation[ scantable[i] ]; //FIXME optimize + j = s->idsp.idct_permutation[scantable[i]]; // 
FIXME: optimize if(s->mb_intra){ unquant_coeff = (int)( alevel * qscale * s->intra_matrix[j]) >> 3; unquant_coeff = (unquant_coeff - 1) | 1; @@ -3795,7 +3796,7 @@ static int messed_sign=0; #endif if(basis[0][0] == 0) - build_basis(s->dsp.idct_permutation); + build_basis(s->idsp.idct_permutation); qmul= qscale*2; qadd= (qscale-1)|1; @@ -4214,8 +4215,9 @@ int ff_dct_quantize_c(MpegEncContext *s, *overflow= s->max_qcoeff < max; //overflow might have happened /* we need this permutation so that we correct the IDCT, we only permute the !=0 elements */ - if (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM) - ff_block_permute(block, s->dsp.idct_permutation, scantable, last_non_zero); + if (s->idsp.idct_permutation_type != FF_NO_IDCT_PERM) + ff_block_permute(block, s->idsp.idct_permutation, + scantable, last_non_zero); return last_non_zero; } diff --git a/libavcodec/mpegvideo_xvmc.c b/libavcodec/mpegvideo_xvmc.c index aa6f49ade6..a8e068b76c 100644 --- a/libavcodec/mpegvideo_xvmc.c +++ b/libavcodec/mpegvideo_xvmc.c @@ -307,7 +307,7 @@ void ff_xvmc_decode_mb(MpegEncContext *s) if (s->mb_intra && (render->idct || !render->unsigned_intra)) *s->pblocks[i][0] -= 1 << 10; if (!render->idct) { - s->dsp.idct(*s->pblocks[i]); + s->idsp.idct(*s->pblocks[i]); /* It is unclear if MC hardware requires pixel diff values to be * in the range [-255;255]. TODO: Clipping if such hardware is * ever found. 
As of now it would only be an unnecessary diff --git a/libavcodec/msmpeg4.c b/libavcodec/msmpeg4.c index f0eaa9bb5b..95b5c93ea3 100644 --- a/libavcodec/msmpeg4.c +++ b/libavcodec/msmpeg4.c @@ -28,7 +28,7 @@ */ #include "avcodec.h" -#include "dsputil.h" +#include "idctdsp.h" #include "mpegvideo.h" #include "msmpeg4.h" #include "libavutil/x86/asm.h" @@ -136,10 +136,10 @@ av_cold void ff_msmpeg4_common_init(MpegEncContext *s) if(s->msmpeg4_version>=4){ - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable , ff_wmv1_scantable[1]); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, ff_wmv1_scantable[2]); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, ff_wmv1_scantable[3]); - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable , ff_wmv1_scantable[0]); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_wmv1_scantable[1]); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_wmv1_scantable[2]); + ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_wmv1_scantable[3]); + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_wmv1_scantable[0]); } //Note the default tables are set in common_init in mpegvideo.c diff --git a/libavcodec/nuv.c b/libavcodec/nuv.c index 1bd848d519..c31ff11222 100644 --- a/libavcodec/nuv.c +++ b/libavcodec/nuv.c @@ -28,6 +28,7 @@ #include "libavutil/lzo.h" #include "libavutil/imgutils.h" #include "avcodec.h" +#include "idctdsp.h" #include "internal.h" #include "rtjpeg.h" diff --git a/libavcodec/ppc/Makefile b/libavcodec/ppc/Makefile index c6c0bcb241..ee0c18c09e 100644 --- a/libavcodec/ppc/Makefile +++ b/libavcodec/ppc/Makefile @@ -9,6 +9,7 @@ OBJS-$(CONFIG_H264DSP) += ppc/h264dsp.o OBJS-$(CONFIG_H264QPEL) += ppc/h264qpel.o OBJS-$(CONFIG_HPELDSP) += ppc/hpeldsp_altivec.o OBJS-$(CONFIG_HUFFYUVDSP) += ppc/huffyuvdsp_altivec.o +OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o 
OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ ppc/mpegvideodsp.o @@ -24,7 +25,6 @@ OBJS-$(CONFIG_VP8_DECODER) += ppc/vp8dsp_altivec.o ALTIVEC-OBJS-$(CONFIG_DSPUTIL) += ppc/dsputil_altivec.o \ ppc/fdct_altivec.o \ - ppc/idct_altivec.o \ FFT-OBJS-$(HAVE_GNU_AS) += ppc/fft_altivec_s.o ALTIVEC-OBJS-$(CONFIG_FFT) += $(FFT-OBJS-yes) diff --git a/libavcodec/ppc/dsputil_altivec.h b/libavcodec/ppc/dsputil_altivec.h index 42da933dfa..be5fd58669 100644 --- a/libavcodec/ppc/dsputil_altivec.h +++ b/libavcodec/ppc/dsputil_altivec.h @@ -29,9 +29,6 @@ void ff_fdct_altivec(int16_t *block); -void ff_idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -void ff_idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth); diff --git a/libavcodec/ppc/dsputil_ppc.c b/libavcodec/ppc/dsputil_ppc.c index 778d3e1247..b54111310e 100644 --- a/libavcodec/ppc/dsputil_ppc.c +++ b/libavcodec/ppc/dsputil_ppc.c @@ -42,12 +42,6 @@ av_cold void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx, c->fdct = ff_fdct_altivec; } #endif //CONFIG_ENCODERS - if ((avctx->idct_algo == FF_IDCT_AUTO) || - (avctx->idct_algo == FF_IDCT_ALTIVEC)) { - c->idct_put = ff_idct_put_altivec; - c->idct_add = ff_idct_add_altivec; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } } } } diff --git a/libavcodec/ppc/idct_altivec.c b/libavcodec/ppc/idct_altivec.c deleted file mode 100644 index 82fd9296f0..0000000000 --- a/libavcodec/ppc/idct_altivec.c +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2001 Michel Lespinasse - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* NOTE: This code is based on GPL code from the libmpeg2 project. The - * author, Michel Lespinasses, has given explicit permission to release - * under LGPL as part of Libav. - * - * Libav integration by Dieter Shirley - * - * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 - * project. I've deleted all of the libmpeg2-specific code, renamed the - * functions and reordered the function parameters. The only change to the - * IDCT function itself was to factor out the partial transposition, and to - * perform a full transpose at the end of the function. 
*/ - -#include -#include -#include "config.h" -#if HAVE_ALTIVEC_H -#include -#endif - -#include "libavutil/ppc/types_altivec.h" -#include "dsputil_altivec.h" - -#define IDCT_HALF \ - /* 1st stage */ \ - t1 = vec_mradds(a1, vx7, vx1); \ - t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7)); \ - t7 = vec_mradds(a2, vx5, vx3); \ - t3 = vec_mradds(ma2, vx3, vx5); \ - \ - /* 2nd stage */ \ - t5 = vec_adds(vx0, vx4); \ - t0 = vec_subs(vx0, vx4); \ - t2 = vec_mradds(a0, vx6, vx2); \ - t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6)); \ - t6 = vec_adds(t8, t3); \ - t3 = vec_subs(t8, t3); \ - t8 = vec_subs(t1, t7); \ - t1 = vec_adds(t1, t7); \ - \ - /* 3rd stage */ \ - t7 = vec_adds(t5, t2); \ - t2 = vec_subs(t5, t2); \ - t5 = vec_adds(t0, t4); \ - t0 = vec_subs(t0, t4); \ - t4 = vec_subs(t8, t3); \ - t3 = vec_adds(t8, t3); \ - \ - /* 4th stage */ \ - vy0 = vec_adds(t7, t1); \ - vy7 = vec_subs(t7, t1); \ - vy1 = vec_mradds(c4, t3, t5); \ - vy6 = vec_mradds(mc4, t3, t5); \ - vy2 = vec_mradds(c4, t4, t0); \ - vy5 = vec_mradds(mc4, t4, t0); \ - vy3 = vec_adds(t2, t6); \ - vy4 = vec_subs(t2, t6) - -#define IDCT \ - vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ - vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ - \ - vec_s16 c4 = vec_splat(constants[0], 0); \ - vec_s16 a0 = vec_splat(constants[0], 1); \ - vec_s16 a1 = vec_splat(constants[0], 2); \ - vec_s16 a2 = vec_splat(constants[0], 3); \ - vec_s16 mc4 = vec_splat(constants[0], 4); \ - vec_s16 ma2 = vec_splat(constants[0], 5); \ - vec_s16 bias = (vec_s16) vec_splat((vec_s32) constants[0], 3); \ - \ - vec_s16 zero = vec_splat_s16(0); \ - vec_u16 shift = vec_splat_u16(4); \ - \ - vec_s16 vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero); \ - vec_s16 vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero); \ - vec_s16 vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero); \ - vec_s16 vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero); \ - vec_s16 vx4 = vec_mradds(vec_sl(block[4], shift), 
constants[1], zero); \ - vec_s16 vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero); \ - vec_s16 vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero); \ - vec_s16 vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero); \ - \ - IDCT_HALF; \ - \ - vx0 = vec_mergeh(vy0, vy4); \ - vx1 = vec_mergel(vy0, vy4); \ - vx2 = vec_mergeh(vy1, vy5); \ - vx3 = vec_mergel(vy1, vy5); \ - vx4 = vec_mergeh(vy2, vy6); \ - vx5 = vec_mergel(vy2, vy6); \ - vx6 = vec_mergeh(vy3, vy7); \ - vx7 = vec_mergel(vy3, vy7); \ - \ - vy0 = vec_mergeh(vx0, vx4); \ - vy1 = vec_mergel(vx0, vx4); \ - vy2 = vec_mergeh(vx1, vx5); \ - vy3 = vec_mergel(vx1, vx5); \ - vy4 = vec_mergeh(vx2, vx6); \ - vy5 = vec_mergel(vx2, vx6); \ - vy6 = vec_mergeh(vx3, vx7); \ - vy7 = vec_mergel(vx3, vx7); \ - \ - vx0 = vec_adds(vec_mergeh(vy0, vy4), bias); \ - vx1 = vec_mergel(vy0, vy4); \ - vx2 = vec_mergeh(vy1, vy5); \ - vx3 = vec_mergel(vy1, vy5); \ - vx4 = vec_mergeh(vy2, vy6); \ - vx5 = vec_mergel(vy2, vy6); \ - vx6 = vec_mergeh(vy3, vy7); \ - vx7 = vec_mergel(vy3, vy7); \ - \ - IDCT_HALF; \ - \ - shift = vec_splat_u16(6); \ - vx0 = vec_sra(vy0, shift); \ - vx1 = vec_sra(vy1, shift); \ - vx2 = vec_sra(vy2, shift); \ - vx3 = vec_sra(vy3, shift); \ - vx4 = vec_sra(vy4, shift); \ - vx5 = vec_sra(vy5, shift); \ - vx6 = vec_sra(vy6, shift); \ - vx7 = vec_sra(vy7, shift) - -static const vec_s16 constants[5] = { - { 23170, 13573, 6518, 21895, -23170, -21895, 32, 31 }, - { 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725 }, - { 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521 }, - { 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692 }, - { 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722 } -}; - -void ff_idct_put_altivec(uint8_t *dest, int stride, int16_t *blk) -{ - vec_s16 *block = (vec_s16 *) blk; - vec_u8 tmp; - - IDCT; - -#define COPY(dest, src) \ - tmp = vec_packsu(src, src); \ - vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \ - vec_ste((vec_u32) tmp, 4, (unsigned int 
*) dest) - - COPY(dest, vx0); - dest += stride; - COPY(dest, vx1); - dest += stride; - COPY(dest, vx2); - dest += stride; - COPY(dest, vx3); - dest += stride; - COPY(dest, vx4); - dest += stride; - COPY(dest, vx5); - dest += stride; - COPY(dest, vx6); - dest += stride; - COPY(dest, vx7); -} - -void ff_idct_add_altivec(uint8_t *dest, int stride, int16_t *blk) -{ - vec_s16 *block = (vec_s16 *) blk; - vec_u8 tmp; - vec_s16 tmp2, tmp3; - vec_u8 perm0; - vec_u8 perm1; - vec_u8 p0, p1, p; - - IDCT; - - p0 = vec_lvsl(0, dest); - p1 = vec_lvsl(stride, dest); - p = vec_splat_u8(-1); - perm0 = vec_mergeh(p, p0); - perm1 = vec_mergeh(p, p1); - -#define ADD(dest, src, perm) \ - /* *(uint64_t *) &tmp = *(uint64_t *) dest; */ \ - tmp = vec_ld(0, dest); \ - tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm); \ - tmp3 = vec_adds(tmp2, src); \ - tmp = vec_packsu(tmp3, tmp3); \ - vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \ - vec_ste((vec_u32) tmp, 4, (unsigned int *) dest) - - ADD(dest, vx0, perm0); - dest += stride; - ADD(dest, vx1, perm1); - dest += stride; - ADD(dest, vx2, perm0); - dest += stride; - ADD(dest, vx3, perm1); - dest += stride; - ADD(dest, vx4, perm0); - dest += stride; - ADD(dest, vx5, perm1); - dest += stride; - ADD(dest, vx6, perm0); - dest += stride; - ADD(dest, vx7, perm1); -} diff --git a/libavcodec/ppc/idctdsp.c b/libavcodec/ppc/idctdsp.c new file mode 100644 index 0000000000..8a1d2903d8 --- /dev/null +++ b/libavcodec/ppc/idctdsp.c @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2001 Michel Lespinasse + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* NOTE: This code is based on GPL code from the libmpeg2 project. The + * author, Michel Lespinasses, has given explicit permission to release + * under LGPL as part of Libav. + * + * Libav integration by Dieter Shirley + * + * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 + * project. I've deleted all of the libmpeg2-specific code, renamed the + * functions and reordered the function parameters. The only change to the + * IDCT function itself was to factor out the partial transposition, and to + * perform a full transpose at the end of the function. 
*/ + +#include +#include +#include "config.h" +#if HAVE_ALTIVEC_H +#include +#endif + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/ppc/cpu.h" +#include "libavutil/ppc/types_altivec.h" +#include "libavcodec/idctdsp.h" + +#if HAVE_ALTIVEC + +#define IDCT_HALF \ + /* 1st stage */ \ + t1 = vec_mradds(a1, vx7, vx1); \ + t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7)); \ + t7 = vec_mradds(a2, vx5, vx3); \ + t3 = vec_mradds(ma2, vx3, vx5); \ + \ + /* 2nd stage */ \ + t5 = vec_adds(vx0, vx4); \ + t0 = vec_subs(vx0, vx4); \ + t2 = vec_mradds(a0, vx6, vx2); \ + t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6)); \ + t6 = vec_adds(t8, t3); \ + t3 = vec_subs(t8, t3); \ + t8 = vec_subs(t1, t7); \ + t1 = vec_adds(t1, t7); \ + \ + /* 3rd stage */ \ + t7 = vec_adds(t5, t2); \ + t2 = vec_subs(t5, t2); \ + t5 = vec_adds(t0, t4); \ + t0 = vec_subs(t0, t4); \ + t4 = vec_subs(t8, t3); \ + t3 = vec_adds(t8, t3); \ + \ + /* 4th stage */ \ + vy0 = vec_adds(t7, t1); \ + vy7 = vec_subs(t7, t1); \ + vy1 = vec_mradds(c4, t3, t5); \ + vy6 = vec_mradds(mc4, t3, t5); \ + vy2 = vec_mradds(c4, t4, t0); \ + vy5 = vec_mradds(mc4, t4, t0); \ + vy3 = vec_adds(t2, t6); \ + vy4 = vec_subs(t2, t6) + +#define IDCT \ + vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ + vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \ + \ + vec_s16 c4 = vec_splat(constants[0], 0); \ + vec_s16 a0 = vec_splat(constants[0], 1); \ + vec_s16 a1 = vec_splat(constants[0], 2); \ + vec_s16 a2 = vec_splat(constants[0], 3); \ + vec_s16 mc4 = vec_splat(constants[0], 4); \ + vec_s16 ma2 = vec_splat(constants[0], 5); \ + vec_s16 bias = (vec_s16) vec_splat((vec_s32) constants[0], 3); \ + \ + vec_s16 zero = vec_splat_s16(0); \ + vec_u16 shift = vec_splat_u16(4); \ + \ + vec_s16 vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero); \ + vec_s16 vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero); \ + vec_s16 vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero); \ + vec_s16 vx3 = 
vec_mradds(vec_sl(block[3], shift), constants[4], zero); \ + vec_s16 vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero); \ + vec_s16 vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero); \ + vec_s16 vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero); \ + vec_s16 vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero); \ + \ + IDCT_HALF; \ + \ + vx0 = vec_mergeh(vy0, vy4); \ + vx1 = vec_mergel(vy0, vy4); \ + vx2 = vec_mergeh(vy1, vy5); \ + vx3 = vec_mergel(vy1, vy5); \ + vx4 = vec_mergeh(vy2, vy6); \ + vx5 = vec_mergel(vy2, vy6); \ + vx6 = vec_mergeh(vy3, vy7); \ + vx7 = vec_mergel(vy3, vy7); \ + \ + vy0 = vec_mergeh(vx0, vx4); \ + vy1 = vec_mergel(vx0, vx4); \ + vy2 = vec_mergeh(vx1, vx5); \ + vy3 = vec_mergel(vx1, vx5); \ + vy4 = vec_mergeh(vx2, vx6); \ + vy5 = vec_mergel(vx2, vx6); \ + vy6 = vec_mergeh(vx3, vx7); \ + vy7 = vec_mergel(vx3, vx7); \ + \ + vx0 = vec_adds(vec_mergeh(vy0, vy4), bias); \ + vx1 = vec_mergel(vy0, vy4); \ + vx2 = vec_mergeh(vy1, vy5); \ + vx3 = vec_mergel(vy1, vy5); \ + vx4 = vec_mergeh(vy2, vy6); \ + vx5 = vec_mergel(vy2, vy6); \ + vx6 = vec_mergeh(vy3, vy7); \ + vx7 = vec_mergel(vy3, vy7); \ + \ + IDCT_HALF; \ + \ + shift = vec_splat_u16(6); \ + vx0 = vec_sra(vy0, shift); \ + vx1 = vec_sra(vy1, shift); \ + vx2 = vec_sra(vy2, shift); \ + vx3 = vec_sra(vy3, shift); \ + vx4 = vec_sra(vy4, shift); \ + vx5 = vec_sra(vy5, shift); \ + vx6 = vec_sra(vy6, shift); \ + vx7 = vec_sra(vy7, shift) + +static const vec_s16 constants[5] = { + { 23170, 13573, 6518, 21895, -23170, -21895, 32, 31 }, + { 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725 }, + { 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521 }, + { 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692 }, + { 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722 } +}; + +static void idct_put_altivec(uint8_t *dest, int stride, int16_t *blk) +{ + vec_s16 *block = (vec_s16 *) blk; + vec_u8 tmp; + + IDCT; + +#define COPY(dest, src) \ + tmp = 
vec_packsu(src, src); \ + vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \ + vec_ste((vec_u32) tmp, 4, (unsigned int *) dest) + + COPY(dest, vx0); + dest += stride; + COPY(dest, vx1); + dest += stride; + COPY(dest, vx2); + dest += stride; + COPY(dest, vx3); + dest += stride; + COPY(dest, vx4); + dest += stride; + COPY(dest, vx5); + dest += stride; + COPY(dest, vx6); + dest += stride; + COPY(dest, vx7); +} + +static void idct_add_altivec(uint8_t *dest, int stride, int16_t *blk) +{ + vec_s16 *block = (vec_s16 *) blk; + vec_u8 tmp; + vec_s16 tmp2, tmp3; + vec_u8 perm0; + vec_u8 perm1; + vec_u8 p0, p1, p; + + IDCT; + + p0 = vec_lvsl(0, dest); + p1 = vec_lvsl(stride, dest); + p = vec_splat_u8(-1); + perm0 = vec_mergeh(p, p0); + perm1 = vec_mergeh(p, p1); + +#define ADD(dest, src, perm) \ + /* *(uint64_t *) &tmp = *(uint64_t *) dest; */ \ + tmp = vec_ld(0, dest); \ + tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, perm); \ + tmp3 = vec_adds(tmp2, src); \ + tmp = vec_packsu(tmp3, tmp3); \ + vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \ + vec_ste((vec_u32) tmp, 4, (unsigned int *) dest) + + ADD(dest, vx0, perm0); + dest += stride; + ADD(dest, vx1, perm1); + dest += stride; + ADD(dest, vx2, perm0); + dest += stride; + ADD(dest, vx3, perm1); + dest += stride; + ADD(dest, vx4, perm0); + dest += stride; + ADD(dest, vx5, perm1); + dest += stride; + ADD(dest, vx6, perm0); + dest += stride; + ADD(dest, vx7, perm1); +} + +#endif /* HAVE_ALTIVEC */ + +av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ +#if HAVE_ALTIVEC + if (PPC_ALTIVEC(av_get_cpu_flags())) { + if (!high_bit_depth) { + if ((avctx->idct_algo == FF_IDCT_AUTO) || + (avctx->idct_algo == FF_IDCT_ALTIVEC)) { + c->idct_add = idct_add_altivec; + c->idct_put = idct_put_altivec; + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + } + } + } +#endif /* HAVE_ALTIVEC */ +} diff --git a/libavcodec/proresdec.c b/libavcodec/proresdec.c index 144fa26f8c..03f63d9dce 
100644 --- a/libavcodec/proresdec.c +++ b/libavcodec/proresdec.c @@ -34,7 +34,7 @@ #include "libavutil/intmath.h" #include "avcodec.h" -#include "dsputil.h" +#include "idctdsp.h" #include "internal.h" #include "proresdata.h" #include "proresdsp.h" diff --git a/libavcodec/proresdsp.c b/libavcodec/proresdsp.c index 1d60897cc9..1d92d360d2 100644 --- a/libavcodec/proresdsp.c +++ b/libavcodec/proresdsp.c @@ -23,7 +23,7 @@ #include "config.h" #include "libavutil/attributes.h" #include "libavutil/common.h" -#include "dsputil.h" +#include "idctdsp.h" #include "proresdsp.h" #include "simple_idct.h" diff --git a/libavcodec/rtjpeg.c b/libavcodec/rtjpeg.c index 3188e6f6aa..67eeff8f4a 100644 --- a/libavcodec/rtjpeg.c +++ b/libavcodec/rtjpeg.c @@ -121,7 +121,7 @@ int ff_rtjpeg_decode_frame_yuv420(RTJpegContext *c, AVFrame *f, if (res < 0) \ return res; \ if (res > 0) \ - c->dsp.idct_put(dst, stride, block); \ + c->idsp.idct_put(dst, stride, block); \ } while (0) int16_t *block = c->block; BLOCK(c->lquant, y1, f->linesize[0]); @@ -159,7 +159,7 @@ void ff_rtjpeg_decode_init(RTJpegContext *c, int width, int height, const uint32_t *lquant, const uint32_t *cquant) { int i; for (i = 0; i < 64; i++) { - int p = c->dsp.idct_permutation[i]; + int p = c->idsp.idct_permutation[i]; c->lquant[p] = lquant[i]; c->cquant[p] = cquant[i]; } @@ -171,13 +171,13 @@ void ff_rtjpeg_init(RTJpegContext *c, AVCodecContext *avctx) { int i; - ff_dsputil_init(&c->dsp, avctx); + ff_idctdsp_init(&c->idsp, avctx); for (i = 0; i < 64; i++) { int z = ff_zigzag_direct[i]; z = ((z << 3) | (z >> 3)) & 63; // rtjpeg uses a transposed variant // permute the scan and quantization tables for the chosen idct - c->scan[i] = c->dsp.idct_permutation[z]; + c->scan[i] = c->idsp.idct_permutation[z]; } } diff --git a/libavcodec/rtjpeg.h b/libavcodec/rtjpeg.h index 23609b3eb9..cd300797c5 100644 --- a/libavcodec/rtjpeg.h +++ b/libavcodec/rtjpeg.h @@ -23,15 +23,16 @@ #define AVCODEC_RTJPEG_H #include -#include "dsputil.h" + 
#include "libavutil/mem.h" +#include "idctdsp.h" #define RTJPEG_FILE_VERSION 0 #define RTJPEG_HEADER_SIZE 12 typedef struct RTJpegContext { int w, h; - DSPContext dsp; + IDCTDSPContext idsp; uint8_t scan[64]; uint32_t lquant[64]; uint32_t cquant[64]; diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c index 6d6c1ec4fa..c83bb4fb77 100644 --- a/libavcodec/vc1dec.c +++ b/libavcodec/vc1dec.c @@ -109,24 +109,24 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v) fieldtx = v->fieldtx_plane[topleft_mb_pos]; stride_y = s->linesize << fieldtx; v_dist = (16 - fieldtx) >> (fieldtx == 0); - s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0], - s->dest[0] - 16 * s->linesize - 16, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][1], - s->dest[0] - 16 * s->linesize - 8, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][2], - s->dest[0] - v_dist * s->linesize - 16, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3], - s->dest[0] - v_dist * s->linesize - 8, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4], - s->dest[1] - 8 * s->uvlinesize - 8, - s->uvlinesize); - s->dsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5], - s->dest[2] - 8 * s->uvlinesize - 8, - s->uvlinesize); + s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][0], + s->dest[0] - 16 * s->linesize - 16, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][1], + s->dest[0] - 16 * s->linesize - 8, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][2], + s->dest[0] - v_dist * s->linesize - 16, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][3], + s->dest[0] - v_dist * s->linesize - 8, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][4], + s->dest[1] - 8 * s->uvlinesize - 8, + s->uvlinesize); + 
s->idsp.put_signed_pixels_clamped(v->block[v->topleft_blk_idx][5], + s->dest[2] - 8 * s->uvlinesize - 8, + s->uvlinesize); } if (s->mb_x == s->mb_width - 1) { top_mb_pos = (s->mb_y - 1) * s->mb_stride + s->mb_x; @@ -134,24 +134,24 @@ static void vc1_put_signed_blocks_clamped(VC1Context *v) fieldtx = v->fieldtx_plane[top_mb_pos]; stride_y = s->linesize << fieldtx; v_dist = fieldtx ? 15 : 8; - s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0], - s->dest[0] - 16 * s->linesize, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][1], - s->dest[0] - 16 * s->linesize + 8, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][2], - s->dest[0] - v_dist * s->linesize, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3], - s->dest[0] - v_dist * s->linesize + 8, - stride_y); - s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4], - s->dest[1] - 8 * s->uvlinesize, - s->uvlinesize); - s->dsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5], - s->dest[2] - 8 * s->uvlinesize, - s->uvlinesize); + s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][0], + s->dest[0] - 16 * s->linesize, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][1], + s->dest[0] - 16 * s->linesize + 8, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][2], + s->dest[0] - v_dist * s->linesize, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][3], + s->dest[0] - v_dist * s->linesize + 8, + stride_y); + s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][4], + s->dest[1] - 8 * s->uvlinesize, + s->uvlinesize); + s->idsp.put_signed_pixels_clamped(v->block[v->top_blk_idx][5], + s->dest[2] - 8 * s->uvlinesize, + s->uvlinesize); } } @@ -3280,7 +3280,7 @@ static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n, v->vc1dsp.vc1_inv_trans_8x8_dc(dst, linesize, block); else { v->vc1dsp.vc1_inv_trans_8x8(block); - 
s->dsp.add_pixels_clamped(block, dst, linesize); + s->idsp.add_pixels_clamped(block, dst, linesize); } } break; @@ -3611,7 +3611,10 @@ static int vc1_decode_p_mb(VC1Context *v) if (v->rangeredfrm) for (j = 0; j < 64; j++) s->block[i][j] <<= 1; - s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize); + s->idsp.put_signed_pixels_clamped(s->block[i], + s->dest[dst_idx] + off, + i & 4 ? s->uvlinesize + : s->linesize); if (v->pq >= 9 && v->overlap) { if (v->c_avail) v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize); @@ -3719,8 +3722,10 @@ static int vc1_decode_p_mb(VC1Context *v) if (v->rangeredfrm) for (j = 0; j < 64; j++) s->block[i][j] <<= 1; - s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, - (i & 4) ? s->uvlinesize : s->linesize); + s->idsp.put_signed_pixels_clamped(s->block[i], + s->dest[dst_idx] + off, + (i & 4) ? s->uvlinesize + : s->linesize); if (v->pq >= 9 && v->overlap) { if (v->c_avail) v->vc1dsp.vc1_h_overlap(s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize); @@ -3869,7 +3874,9 @@ static int vc1_decode_p_mb_intfr(VC1Context *v) stride_y = s->uvlinesize; off = 0; } - s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, stride_y); + s->idsp.put_signed_pixels_clamped(s->block[i], + s->dest[dst_idx] + off, + stride_y); //TODO: loop filter } @@ -4031,7 +4038,10 @@ static int vc1_decode_p_mb_intfi(VC1Context *v) continue; v->vc1dsp.vc1_inv_trans_8x8(s->block[i]); off = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize); - s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize); + s->idsp.put_signed_pixels_clamped(s->block[i], + s->dest[dst_idx] + off, + (i & 4) ? 
s->uvlinesize + : s->linesize); // TODO: loop filter } } else { @@ -4233,7 +4243,10 @@ static void vc1_decode_b_mb(VC1Context *v) if (v->rangeredfrm) for (j = 0; j < 64; j++) s->block[i][j] <<= 1; - s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, i & 4 ? s->uvlinesize : s->linesize); + s->idsp.put_signed_pixels_clamped(s->block[i], + s->dest[dst_idx] + off, + i & 4 ? s->uvlinesize + : s->linesize); } else if (val) { vc1_decode_p_block(v, s->block[i], i, mquant, ttmb, first_block, s->dest[dst_idx] + off, @@ -4305,7 +4318,10 @@ static void vc1_decode_b_mb_intfi(VC1Context *v) for (j = 0; j < 64; j++) s->block[i][j] <<= 1; off = (i & 4) ? 0 : ((i & 1) * 8 + (i & 2) * 4 * s->linesize); - s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, (i & 4) ? s->uvlinesize : s->linesize); + s->idsp.put_signed_pixels_clamped(s->block[i], + s->dest[dst_idx] + off, + (i & 4) ? s->uvlinesize + : s->linesize); // TODO: yet to perform loop filter } } else { @@ -4524,7 +4540,9 @@ static int vc1_decode_b_mb_intfr(VC1Context *v) stride_y = s->uvlinesize; off = 0; } - s->dsp.put_signed_pixels_clamped(s->block[i], s->dest[dst_idx] + off, stride_y); + s->idsp.put_signed_pixels_clamped(s->block[i], + s->dest[dst_idx] + off, + stride_y); } } else { s->mb_intra = v->is_intra[s->mb_x] = 0; @@ -4828,12 +4846,16 @@ static void vc1_decode_i_blocks(VC1Context *v) if (v->rangeredfrm) for (j = 0; j < 64; j++) s->block[k][j] <<= 1; - s->dsp.put_signed_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize); + s->idsp.put_signed_pixels_clamped(s->block[k], dst[k], + k & 4 ? s->uvlinesize + : s->linesize); } else { if (v->rangeredfrm) for (j = 0; j < 64; j++) s->block[k][j] = (s->block[k][j] - 64) << 1; - s->dsp.put_pixels_clamped(s->block[k], dst[k], k & 4 ? s->uvlinesize : s->linesize); + s->idsp.put_pixels_clamped(s->block[k], dst[k], + k & 4 ? 
s->uvlinesize + : s->linesize); } } diff --git a/libavcodec/wmv2.c b/libavcodec/wmv2.c index bd799d0e8b..b6c7bc0a9f 100644 --- a/libavcodec/wmv2.c +++ b/libavcodec/wmv2.c @@ -19,6 +19,7 @@ */ #include "avcodec.h" +#include "idctdsp.h" #include "mpegvideo.h" #include "msmpeg4data.h" #include "simple_idct.h" @@ -30,24 +31,24 @@ av_cold void ff_wmv2_common_init(Wmv2Context * w){ ff_blockdsp_init(&s->bdsp, s->avctx); ff_wmv2dsp_init(&w->wdsp); - s->dsp.idct_permutation_type = w->wdsp.idct_perm; - ff_init_scantable_permutation(s->dsp.idct_permutation, + s->idsp.idct_permutation_type = w->wdsp.idct_perm; + ff_init_scantable_permutation(s->idsp.idct_permutation, w->wdsp.idct_perm); - ff_init_scantable(s->dsp.idct_permutation, &w->abt_scantable[0], + ff_init_scantable(s->idsp.idct_permutation, &w->abt_scantable[0], ff_wmv2_scantableA); - ff_init_scantable(s->dsp.idct_permutation, &w->abt_scantable[1], + ff_init_scantable(s->idsp.idct_permutation, &w->abt_scantable[1], ff_wmv2_scantableB); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, + ff_init_scantable(s->idsp.idct_permutation, &s->intra_scantable, ff_wmv1_scantable[1]); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_h_scantable, + ff_init_scantable(s->idsp.idct_permutation, &s->intra_h_scantable, ff_wmv1_scantable[2]); - ff_init_scantable(s->dsp.idct_permutation, &s->intra_v_scantable, + ff_init_scantable(s->idsp.idct_permutation, &s->intra_v_scantable, ff_wmv1_scantable[3]); - ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, + ff_init_scantable(s->idsp.idct_permutation, &s->inter_scantable, ff_wmv1_scantable[0]); - s->dsp.idct_put = w->wdsp.idct_put; - s->dsp.idct_add = w->wdsp.idct_add; - s->dsp.idct = NULL; + s->idsp.idct_put = w->wdsp.idct_put; + s->idsp.idct_add = w->wdsp.idct_add; + s->idsp.idct = NULL; } static void wmv2_add_block(Wmv2Context *w, int16_t *block1, uint8_t *dst, int stride, int n){ diff --git a/libavcodec/wmv2dsp.c b/libavcodec/wmv2dsp.c index 
dff49f47a4..49df43690a 100644 --- a/libavcodec/wmv2dsp.c +++ b/libavcodec/wmv2dsp.c @@ -19,7 +19,7 @@ #include "libavutil/attributes.h" #include "libavutil/common.h" #include "avcodec.h" -#include "dsputil.h" +#include "idctdsp.h" #include "mathops.h" #include "wmv2dsp.h" diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 13f9affdb2..14e58f9a9c 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -18,6 +18,7 @@ OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_init.o OBJS-$(CONFIG_HUFFYUVENCDSP) += x86/huffyuvencdsp_mmx.o +OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_init.o OBJS-$(CONFIG_LPC) += x86/lpc.o OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o \ @@ -49,13 +50,14 @@ OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o MMX-OBJS-$(CONFIG_AUDIODSP) += x86/audiodsp_mmx.o MMX-OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_mmx.o -MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ - x86/idct_mmx_xvid.o \ - x86/idct_sse2_xvid.o \ - x86/simple_idct.o +MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \ x86/hpeldsp_mmx.o MMX-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp_mmx.o +MMX-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp_mmx.o \ + x86/idct_mmx_xvid.o \ + x86/idct_sse2_xvid.o \ + x86/simple_idct.o MMX-OBJS-$(CONFIG_QPELDSP) += x86/fpel_mmx.o MMX-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_mmx.o diff --git a/libavcodec/x86/cavsdsp.c b/libavcodec/x86/cavsdsp.c index d5c441f1e5..f0e8cfcd17 100644 --- a/libavcodec/x86/cavsdsp.c +++ b/libavcodec/x86/cavsdsp.c @@ -28,9 +28,10 @@ #include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/cavsdsp.h" +#include "libavcodec/idctdsp.h" #include "constants.h" -#include "dsputil_x86.h" #include "fpel.h" +#include "idctdsp.h" #include "config.h" #if HAVE_MMX_INLINE diff --git a/libavcodec/x86/dsputil_init.c 
b/libavcodec/x86/dsputil_init.c index 74dab48e72..adc7aa95d6 100644 --- a/libavcodec/x86/dsputil_init.c +++ b/libavcodec/x86/dsputil_init.c @@ -22,97 +22,18 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/avcodec.h" #include "libavcodec/dsputil.h" -#include "libavcodec/simple_idct.h" #include "dsputil_x86.h" -#include "idct_xvid.h" - -/* Input permutation for the simple_idct_mmx */ -static const uint8_t simple_mmx_permutation[64] = { - 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, - 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, - 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, - 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, - 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, - 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, - 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, - 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, -}; - -static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; - -av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation, - int idct_permutation_type) -{ - int i; - - switch (idct_permutation_type) { - case FF_SIMPLE_IDCT_PERM: - for (i = 0; i < 64; i++) - idct_permutation[i] = simple_mmx_permutation[i]; - return 1; - case FF_SSE2_IDCT_PERM: - for (i = 0; i < 64; i++) - idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7]; - return 1; - } - - return 0; -} static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int cpu_flags, unsigned high_bit_depth) { #if HAVE_MMX_INLINE - c->put_pixels_clamped = ff_put_pixels_clamped_mmx; - c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; - c->add_pixels_clamped = ff_add_pixels_clamped_mmx; - if (!high_bit_depth) { c->draw_edges = ff_draw_edges_mmx; - - switch (avctx->idct_algo) { - case FF_IDCT_AUTO: - case FF_IDCT_SIMPLEMMX: - c->idct_put = ff_simple_idct_put_mmx; - c->idct_add = ff_simple_idct_add_mmx; - c->idct = ff_simple_idct_mmx; - c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; - break; - case FF_IDCT_XVIDMMX: - 
c->idct_put = ff_idct_xvid_mmx_put; - c->idct_add = ff_idct_xvid_mmx_add; - c->idct = ff_idct_xvid_mmx; - break; - } } #endif /* HAVE_MMX_INLINE */ } -static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, - int cpu_flags, unsigned high_bit_depth) -{ -#if HAVE_MMXEXT_INLINE - if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { - c->idct_put = ff_idct_xvid_mmxext_put; - c->idct_add = ff_idct_xvid_mmxext_add; - c->idct = ff_idct_xvid_mmxext; - } -#endif /* HAVE_MMXEXT_INLINE */ -} - -static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, - int cpu_flags, unsigned high_bit_depth) -{ -#if HAVE_SSE2_INLINE - if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { - c->idct_put = ff_idct_xvid_sse2_put; - c->idct_add = ff_idct_xvid_sse2_add; - c->idct = ff_idct_xvid_sse2; - c->idct_permutation_type = FF_SSE2_IDCT_PERM; - } -#endif /* HAVE_SSE2_INLINE */ -} - av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth) { @@ -121,12 +42,6 @@ av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx, if (X86_MMX(cpu_flags)) dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth); - if (X86_MMXEXT(cpu_flags)) - dsputil_init_mmxext(c, avctx, cpu_flags, high_bit_depth); - - if (X86_SSE2(cpu_flags)) - dsputil_init_sse2(c, avctx, cpu_flags, high_bit_depth); - if (CONFIG_ENCODERS) ff_dsputilenc_init_mmx(c, avctx, high_bit_depth); } diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 5fa047da7b..d205a48ea4 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -30,141 +30,6 @@ #if HAVE_INLINE_ASM -void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - const int16_t *p; - uint8_t *pix; - - /* read the pixels */ - p = block; - pix = pixels; - /* unrolled loop */ - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - 
"movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), - "r" (p) - : "memory"); - pix += line_size * 4; - p += 32; - - // if here would be an exact copy of the code above - // compiler would generate some very strange code - // thus using "r" - __asm__ volatile ( - "movq (%3), %%mm0 \n\t" - "movq 8(%3), %%mm1 \n\t" - "movq 16(%3), %%mm2 \n\t" - "movq 24(%3), %%mm3 \n\t" - "movq 32(%3), %%mm4 \n\t" - "movq 40(%3), %%mm5 \n\t" - "movq 48(%3), %%mm6 \n\t" - "movq 56(%3), %%mm7 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "packuswb %%mm7, %%mm6 \n\t" - "movq %%mm0, (%0) \n\t" - "movq %%mm2, (%0, %1) \n\t" - "movq %%mm4, (%0, %1, 2) \n\t" - "movq %%mm6, (%0, %2) \n\t" - :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), - "r" (p) - : "memory"); -} - -#define put_signed_pixels_clamped_mmx_half(off) \ - "movq "#off"(%2), %%mm1 \n\t" \ - "movq 16 + "#off"(%2), %%mm2 \n\t" \ - "movq 32 + "#off"(%2), %%mm3 \n\t" \ - "movq 48 + "#off"(%2), %%mm4 \n\t" \ - "packsswb 8 + "#off"(%2), %%mm1 \n\t" \ - "packsswb 24 + "#off"(%2), %%mm2 \n\t" \ - "packsswb 40 + "#off"(%2), %%mm3 \n\t" \ - "packsswb 56 + "#off"(%2), %%mm4 \n\t" \ - "paddb %%mm0, %%mm1 \n\t" \ - "paddb %%mm0, %%mm2 \n\t" \ - "paddb %%mm0, %%mm3 \n\t" \ - "paddb %%mm0, %%mm4 \n\t" \ - "movq %%mm1, (%0) \n\t" \ - "movq %%mm2, (%0, %3) \n\t" \ - "movq %%mm3, (%0, %3, 2) \n\t" \ - "movq %%mm4, (%0, %1) \n\t" - -void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - x86_reg line_skip = line_size; - x86_reg line_skip3; - - __asm__ volatile ( 
- "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" - "lea (%3, %3, 2), %1 \n\t" - put_signed_pixels_clamped_mmx_half(0) - "lea (%0, %3, 4), %0 \n\t" - put_signed_pixels_clamped_mmx_half(64) - : "+&r" (pixels), "=&r" (line_skip3) - : "r" (block), "r" (line_skip) - : "memory"); -} - -void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size) -{ - const int16_t *p; - uint8_t *pix; - int i; - - /* read the pixels */ - p = block; - pix = pixels; - MOVQ_ZERO(mm7); - i = 4; - do { - __asm__ volatile ( - "movq (%2), %%mm0 \n\t" - "movq 8(%2), %%mm1 \n\t" - "movq 16(%2), %%mm2 \n\t" - "movq 24(%2), %%mm3 \n\t" - "movq %0, %%mm4 \n\t" - "movq %1, %%mm6 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm4, %%mm0 \n\t" - "paddsw %%mm5, %%mm1 \n\t" - "movq %%mm6, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm6 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddsw %%mm6, %%mm2 \n\t" - "paddsw %%mm5, %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "packuswb %%mm3, %%mm2 \n\t" - "movq %%mm0, %0 \n\t" - "movq %%mm2, %1 \n\t" - : "+m" (*pix), "+m" (*(pix + line_size)) - : "r" (p) - : "memory"); - pix += line_size * 2; - p += 16; - } while (--i); -} - /* Draw the edges of width 'w' of an image of size width, height * this MMX version can only handle w == 8 || w == 16. 
*/ void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, diff --git a/libavcodec/x86/dsputil_x86.h b/libavcodec/x86/dsputil_x86.h index 4beb6c11ca..7e1e8af051 100644 --- a/libavcodec/x86/dsputil_x86.h +++ b/libavcodec/x86/dsputil_x86.h @@ -31,13 +31,6 @@ void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, unsigned high_bit_depth); void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx); -void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size); -void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size); -void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, - int line_size); - void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides); diff --git a/libavcodec/x86/idct_mmx_xvid.c b/libavcodec/x86/idct_mmx_xvid.c index 27723393bf..920ea4c0dc 100644 --- a/libavcodec/x86/idct_mmx_xvid.c +++ b/libavcodec/x86/idct_mmx_xvid.c @@ -44,8 +44,8 @@ #include "config.h" #include "libavcodec/avcodec.h" #include "libavutil/mem.h" -#include "dsputil_x86.h" #include "idct_xvid.h" +#include "idctdsp.h" #if HAVE_MMX_INLINE diff --git a/libavcodec/x86/idct_sse2_xvid.c b/libavcodec/x86/idct_sse2_xvid.c index 50655d6bc0..aadeb122c6 100644 --- a/libavcodec/x86/idct_sse2_xvid.c +++ b/libavcodec/x86/idct_sse2_xvid.c @@ -42,7 +42,7 @@ #include "libavutil/mem.h" #include "libavutil/x86/asm.h" #include "idct_xvid.h" -#include "dsputil_x86.h" +#include "idctdsp.h" #if HAVE_SSE2_INLINE diff --git a/libavcodec/x86/idctdsp.h b/libavcodec/x86/idctdsp.h new file mode 100644 index 0000000000..22df3dd758 --- /dev/null +++ b/libavcodec/x86/idctdsp.h @@ -0,0 +1,31 @@ +/* + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_X86_IDCTDSP_H +#define AVCODEC_X86_IDCTDSP_H + +#include <stdint.h> + +void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, + int line_size); +void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, + int line_size); +void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, + int line_size); + +#endif /* AVCODEC_X86_IDCTDSP_H */ diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c new file mode 100644 index 0000000000..9b68497502 --- /dev/null +++ b/libavcodec/x86/idctdsp_init.c @@ -0,0 +1,106 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/avcodec.h" +#include "libavcodec/idctdsp.h" +#include "libavcodec/simple_idct.h" +#include "idct_xvid.h" +#include "idctdsp.h" + +/* Input permutation for the simple_idct_mmx */ +static const uint8_t simple_mmx_permutation[64] = { + 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, + 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, + 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, + 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, + 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, + 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, + 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, + 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, +}; + +static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 }; + +av_cold int ff_init_scantable_permutation_x86(uint8_t *idct_permutation, + int idct_permutation_type) +{ + int i; + + switch (idct_permutation_type) { + case FF_SIMPLE_IDCT_PERM: + for (i = 0; i < 64; i++) + idct_permutation[i] = simple_mmx_permutation[i]; + return 1; + case FF_SSE2_IDCT_PERM: + for (i = 0; i < 64; i++) + idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7]; + return 1; + } + + return 0; +} + +av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx, + unsigned high_bit_depth) +{ + int cpu_flags = av_get_cpu_flags(); + + if (INLINE_MMX(cpu_flags)) { + c->put_pixels_clamped = ff_put_pixels_clamped_mmx; + c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; + c->add_pixels_clamped = ff_add_pixels_clamped_mmx; + + if (!high_bit_depth) { + switch (avctx->idct_algo) { + case FF_IDCT_AUTO: + case FF_IDCT_SIMPLEMMX: + c->idct_put = ff_simple_idct_put_mmx; + 
c->idct_add = ff_simple_idct_add_mmx; + c->idct = ff_simple_idct_mmx; + c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; + break; + case FF_IDCT_XVIDMMX: + c->idct_put = ff_idct_xvid_mmx_put; + c->idct_add = ff_idct_xvid_mmx_add; + c->idct = ff_idct_xvid_mmx; + break; + } + } + } + + if (INLINE_MMXEXT(cpu_flags)) { + if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { + c->idct_put = ff_idct_xvid_mmxext_put; + c->idct_add = ff_idct_xvid_mmxext_add; + c->idct = ff_idct_xvid_mmxext; + } + } + + if (INLINE_SSE2(cpu_flags)) { + if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) { + c->idct_put = ff_idct_xvid_sse2_put; + c->idct_add = ff_idct_xvid_sse2_add; + c->idct = ff_idct_xvid_sse2; + c->idct_permutation_type = FF_SSE2_IDCT_PERM; + } + } +} diff --git a/libavcodec/x86/idctdsp_mmx.c b/libavcodec/x86/idctdsp_mmx.c new file mode 100644 index 0000000000..7285b1d357 --- /dev/null +++ b/libavcodec/x86/idctdsp_mmx.c @@ -0,0 +1,168 @@ +/* + * SIMD-optimized IDCT-related routines + * Copyright (c) 2000, 2001 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer + * + * MMX optimization by Nick Kurshev + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" +#include "libavutil/cpu.h" +#include "libavutil/x86/asm.h" +#include "idctdsp.h" +#include "inline_asm.h" + +#if HAVE_INLINE_ASM + +void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, + int line_size) +{ + const int16_t *p; + uint8_t *pix; + + /* read the pixels */ + p = block; + pix = pixels; + /* unrolled loop */ + __asm__ volatile ( + "movq (%3), %%mm0 \n\t" + "movq 8(%3), %%mm1 \n\t" + "movq 16(%3), %%mm2 \n\t" + "movq 24(%3), %%mm3 \n\t" + "movq 32(%3), %%mm4 \n\t" + "movq 40(%3), %%mm5 \n\t" + "movq 48(%3), %%mm6 \n\t" + "movq 56(%3), %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), + "r" (p) + : "memory"); + pix += line_size * 4; + p += 32; + + // if here would be an exact copy of the code above + // compiler would generate some very strange code + // thus using "r" + __asm__ volatile ( + "movq (%3), %%mm0 \n\t" + "movq 8(%3), %%mm1 \n\t" + "movq 16(%3), %%mm2 \n\t" + "movq 24(%3), %%mm3 \n\t" + "movq 32(%3), %%mm4 \n\t" + "movq 40(%3), %%mm5 \n\t" + "movq 48(%3), %%mm6 \n\t" + "movq 56(%3), %%mm7 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "packuswb %%mm7, %%mm6 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm2, (%0, %1) \n\t" + "movq %%mm4, (%0, %1, 2) \n\t" + "movq %%mm6, (%0, %2) \n\t" + :: "r" (pix), "r" ((x86_reg) line_size), "r" ((x86_reg) line_size * 3), + "r" (p) + : "memory"); +} + +#define put_signed_pixels_clamped_mmx_half(off) \ + "movq "#off"(%2), %%mm1 
\n\t" \ + "movq 16 + "#off"(%2), %%mm2 \n\t" \ + "movq 32 + "#off"(%2), %%mm3 \n\t" \ + "movq 48 + "#off"(%2), %%mm4 \n\t" \ + "packsswb 8 + "#off"(%2), %%mm1 \n\t" \ + "packsswb 24 + "#off"(%2), %%mm2 \n\t" \ + "packsswb 40 + "#off"(%2), %%mm3 \n\t" \ + "packsswb 56 + "#off"(%2), %%mm4 \n\t" \ + "paddb %%mm0, %%mm1 \n\t" \ + "paddb %%mm0, %%mm2 \n\t" \ + "paddb %%mm0, %%mm3 \n\t" \ + "paddb %%mm0, %%mm4 \n\t" \ + "movq %%mm1, (%0) \n\t" \ + "movq %%mm2, (%0, %3) \n\t" \ + "movq %%mm3, (%0, %3, 2) \n\t" \ + "movq %%mm4, (%0, %1) \n\t" + +void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, + int line_size) +{ + x86_reg line_skip = line_size; + x86_reg line_skip3; + + __asm__ volatile ( + "movq "MANGLE(ff_pb_80)", %%mm0 \n\t" + "lea (%3, %3, 2), %1 \n\t" + put_signed_pixels_clamped_mmx_half(0) + "lea (%0, %3, 4), %0 \n\t" + put_signed_pixels_clamped_mmx_half(64) + : "+&r" (pixels), "=&r" (line_skip3) + : "r" (block), "r" (line_skip) + : "memory"); +} + +void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, + int line_size) +{ + const int16_t *p; + uint8_t *pix; + int i; + + /* read the pixels */ + p = block; + pix = pixels; + MOVQ_ZERO(mm7); + i = 4; + do { + __asm__ volatile ( + "movq (%2), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 16(%2), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "movq %0, %%mm4 \n\t" + "movq %1, %%mm6 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm4, %%mm0 \n\t" + "paddsw %%mm5, %%mm1 \n\t" + "movq %%mm6, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm6 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddsw %%mm6, %%mm2 \n\t" + "paddsw %%mm5, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, %0 \n\t" + "movq %%mm2, %1 \n\t" + : "+m" (*pix), "+m" (*(pix + line_size)) + : "r" (p) + : "memory"); + pix += line_size * 2; + p += 16; + } while (--i); +} + +#endif /* HAVE_INLINE_ASM */ diff --git 
a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c index d01ff1c0f8..fa590066d6 100644 --- a/libavcodec/x86/mpegvideoenc_template.c +++ b/libavcodec/x86/mpegvideoenc_template.c @@ -229,7 +229,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if(s->mb_intra) block[0]= level; else block[0]= temp_block[0]; - if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ + if (s->idsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM) { if(last_non_zero_p1 <= 1) goto end; block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; block[0x20] = temp_block[0x10]; diff --git a/libavcodec/x86/proresdsp_init.c b/libavcodec/x86/proresdsp_init.c index 68ad929067..a66fc70982 100644 --- a/libavcodec/x86/proresdsp_init.c +++ b/libavcodec/x86/proresdsp_init.c @@ -22,7 +22,7 @@ #include "libavutil/attributes.h" #include "libavutil/x86/cpu.h" -#include "libavcodec/dsputil.h" +#include "libavcodec/idctdsp.h" #include "libavcodec/proresdsp.h" void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, diff --git a/libavcodec/x86/simple_idct.c b/libavcodec/x86/simple_idct.c index a342110cd3..bbe5a67472 100644 --- a/libavcodec/x86/simple_idct.c +++ b/libavcodec/x86/simple_idct.c @@ -23,7 +23,7 @@ #include "libavutil/internal.h" #include "libavutil/mem.h" #include "libavutil/x86/asm.h" -#include "dsputil_x86.h" +#include "idctdsp.h" #if HAVE_INLINE_ASM -- cgit v1.2.3